Compare commits

..

2 Commits

Author SHA1 Message Date
John Spray
ed3e3b6f61 pageserver: enable setting a target disk range 2023-10-25 14:39:12 +01:00
John Spray
098ef0956b pageserver: publish disk eviction status 2023-10-25 14:35:32 +01:00
222 changed files with 9612 additions and 12690 deletions

View File

@@ -22,11 +22,5 @@ platforms = [
# "x86_64-pc-windows-msvc", # "x86_64-pc-windows-msvc",
] ]
[final-excludes]
# vm_monitor benefits from the same Cargo.lock as the rest of our artifacts, but
# it is built primarly in separate repo neondatabase/autoscaling and thus is excluded
# from depending on workspace-hack because most of the dependencies are not used.
workspace-members = ["vm_monitor"]
# Write out exact versions rather than a semver range. (Defaults to false.) # Write out exact versions rather than a semver range. (Defaults to false.)
# exact-versions = true # exact-versions = true

View File

@@ -17,9 +17,8 @@ assignees: ''
## Implementation ideas ## Implementation ideas
```[tasklist] ## Tasks
### Tasks - [ ]
```
## Other related tasks and Epics ## Other related tasks and Epics

View File

@@ -5,6 +5,4 @@ self-hosted-runner:
- small - small
- us-east-2 - us-east-2
config-variables: config-variables:
- REMOTE_STORAGE_AZURE_CONTAINER
- REMOTE_STORAGE_AZURE_REGION
- SLACK_UPCOMING_RELEASE_CHANNEL_ID - SLACK_UPCOMING_RELEASE_CHANNEL_ID

View File

@@ -203,10 +203,6 @@ runs:
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
BASE_S3_URL: ${{ steps.generate-report.outputs.base-s3-url }} BASE_S3_URL: ${{ steps.generate-report.outputs.base-s3-url }}
run: | run: |
if [ ! -d "${WORKDIR}/report/data/test-cases" ]; then
exit 0
fi
export DATABASE_URL=${REGRESS_TEST_RESULT_CONNSTR_NEW} export DATABASE_URL=${REGRESS_TEST_RESULT_CONNSTR_NEW}
./scripts/pysync ./scripts/pysync

View File

@@ -340,11 +340,11 @@ jobs:
# Run separate tests for real Azure Blob Storage # Run separate tests for real Azure Blob Storage
# XXX: replace region with `eu-central-1`-like region # XXX: replace region with `eu-central-1`-like region
export ENABLE_REAL_AZURE_REMOTE_STORAGE=y
export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV }}" export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV }}"
export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}" export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}"
export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}" export ENABLE_REAL_AZURE_REMOTE_STORAGE=y
export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}" export REMOTE_STORAGE_AZURE_CONTAINER=neon-github-sandbox
export REMOTE_STORAGE_AZURE_REGION=eastus2
# Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test test_real_azure ${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test test_real_azure
@@ -433,7 +433,7 @@ jobs:
rerun_flaky: true rerun_flaky: true
pg_version: ${{ matrix.pg_version }} pg_version: ${{ matrix.pg_version }}
env: env:
TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}
CHECK_ONDISK_DATA_COMPATIBILITY: nonempty CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
- name: Merge and upload coverage data - name: Merge and upload coverage data
@@ -468,7 +468,7 @@ jobs:
env: env:
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}" TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}"
# XXX: no coverage data handling here, since benchmarks are run on release builds, # XXX: no coverage data handling here, since benchmarks are run on release builds,
# while coverage is currently collected for the debug ones # while coverage is currently collected for the debug ones
@@ -723,7 +723,6 @@ jobs:
--cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
--context . --context .
--build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} --build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
--build-arg BUILD_TAG=${{ needs.tag.outputs.build-tag }}
--build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
--destination neondatabase/neon:${{needs.tag.outputs.build-tag}} --destination neondatabase/neon:${{needs.tag.outputs.build-tag}}
@@ -848,7 +847,7 @@ jobs:
run: run:
shell: sh -eu {0} shell: sh -eu {0}
env: env:
VM_BUILDER_VERSION: v0.18.5 VM_BUILDER_VERSION: v0.18.2
steps: steps:
- name: Checkout - name: Checkout

View File

@@ -2,7 +2,7 @@ name: Create Release Branch
on: on:
schedule: schedule:
- cron: '0 7 * * 5' - cron: '0 7 * * 2'
workflow_dispatch: workflow_dispatch:
jobs: jobs:

56
Cargo.lock generated
View File

@@ -170,12 +170,6 @@ dependencies = [
"backtrace", "backtrace",
] ]
[[package]]
name = "arc-swap"
version = "1.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bddcadddf5e9015d310179a59bb28c4d4b9920ad0f11e8e14dbadf654890c9a6"
[[package]] [[package]]
name = "archery" name = "archery"
version = "0.5.0" version = "0.5.0"
@@ -1615,6 +1609,16 @@ dependencies = [
"subtle", "subtle",
] ]
[[package]]
name = "ctor"
version = "0.1.26"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6d2301688392eb071b0bf1a37be05c469d3cc4dbbd95df672fe28ab021e6a096"
dependencies = [
"quote",
"syn 1.0.109",
]
[[package]] [[package]]
name = "ctr" name = "ctr"
version = "0.6.0" version = "0.6.0"
@@ -2710,10 +2714,11 @@ dependencies = [
[[package]] [[package]]
name = "log" name = "log"
version = "0.4.20" version = "0.4.17"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" checksum = "abb12e687cfb44aa40f41fc3978ef76448f9b6038cad6aef4259d3c095a2382e"
dependencies = [ dependencies = [
"cfg-if",
"value-bag", "value-bag",
] ]
@@ -3556,7 +3561,7 @@ dependencies = [
[[package]] [[package]]
name = "postgres" name = "postgres"
version = "0.19.4" version = "0.19.4"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=ce7260db5998fe27167da42503905a12e7ad9048#ce7260db5998fe27167da42503905a12e7ad9048" source = "git+https://github.com/neondatabase/rust-postgres.git?rev=7434d9388965a17a6d113e5dfc0e65666a03b4c2#7434d9388965a17a6d113e5dfc0e65666a03b4c2"
dependencies = [ dependencies = [
"bytes", "bytes",
"fallible-iterator", "fallible-iterator",
@@ -3569,7 +3574,7 @@ dependencies = [
[[package]] [[package]]
name = "postgres-native-tls" name = "postgres-native-tls"
version = "0.5.0" version = "0.5.0"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=ce7260db5998fe27167da42503905a12e7ad9048#ce7260db5998fe27167da42503905a12e7ad9048" source = "git+https://github.com/neondatabase/rust-postgres.git?rev=7434d9388965a17a6d113e5dfc0e65666a03b4c2#7434d9388965a17a6d113e5dfc0e65666a03b4c2"
dependencies = [ dependencies = [
"native-tls", "native-tls",
"tokio", "tokio",
@@ -3580,7 +3585,7 @@ dependencies = [
[[package]] [[package]]
name = "postgres-protocol" name = "postgres-protocol"
version = "0.6.4" version = "0.6.4"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=ce7260db5998fe27167da42503905a12e7ad9048#ce7260db5998fe27167da42503905a12e7ad9048" source = "git+https://github.com/neondatabase/rust-postgres.git?rev=7434d9388965a17a6d113e5dfc0e65666a03b4c2#7434d9388965a17a6d113e5dfc0e65666a03b4c2"
dependencies = [ dependencies = [
"base64 0.20.0", "base64 0.20.0",
"byteorder", "byteorder",
@@ -3598,7 +3603,7 @@ dependencies = [
[[package]] [[package]]
name = "postgres-types" name = "postgres-types"
version = "0.2.4" version = "0.2.4"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=ce7260db5998fe27167da42503905a12e7ad9048#ce7260db5998fe27167da42503905a12e7ad9048" source = "git+https://github.com/neondatabase/rust-postgres.git?rev=7434d9388965a17a6d113e5dfc0e65666a03b4c2#7434d9388965a17a6d113e5dfc0e65666a03b4c2"
dependencies = [ dependencies = [
"bytes", "bytes",
"fallible-iterator", "fallible-iterator",
@@ -4064,7 +4069,6 @@ dependencies = [
"aws-config", "aws-config",
"aws-credential-types", "aws-credential-types",
"aws-sdk-s3", "aws-sdk-s3",
"aws-smithy-async",
"aws-smithy-http", "aws-smithy-http",
"aws-types", "aws-types",
"azure_core", "azure_core",
@@ -4426,7 +4430,6 @@ dependencies = [
"itertools", "itertools",
"pageserver", "pageserver",
"rand 0.8.5", "rand 0.8.5",
"remote_storage",
"reqwest", "reqwest",
"serde", "serde",
"serde_json", "serde_json",
@@ -4485,7 +4488,6 @@ dependencies = [
"tokio", "tokio",
"tokio-io-timeout", "tokio-io-timeout",
"tokio-postgres", "tokio-postgres",
"tokio-stream",
"toml_edit", "toml_edit",
"tracing", "tracing",
"url", "url",
@@ -4688,16 +4690,6 @@ dependencies = [
"serde_derive", "serde_derive",
] ]
[[package]]
name = "serde_assert"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eda563240c1288b044209be1f0d38bb4d15044fb3e00dc354fbc922ab4733e80"
dependencies = [
"hashbrown 0.13.2",
"serde",
]
[[package]] [[package]]
name = "serde_derive" name = "serde_derive"
version = "1.0.183" version = "1.0.183"
@@ -5415,7 +5407,7 @@ dependencies = [
[[package]] [[package]]
name = "tokio-postgres" name = "tokio-postgres"
version = "0.7.7" version = "0.7.7"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=ce7260db5998fe27167da42503905a12e7ad9048#ce7260db5998fe27167da42503905a12e7ad9048" source = "git+https://github.com/neondatabase/rust-postgres.git?rev=7434d9388965a17a6d113e5dfc0e65666a03b4c2#7434d9388965a17a6d113e5dfc0e65666a03b4c2"
dependencies = [ dependencies = [
"async-trait", "async-trait",
"byteorder", "byteorder",
@@ -5958,7 +5950,6 @@ name = "utils"
version = "0.1.0" version = "0.1.0"
dependencies = [ dependencies = [
"anyhow", "anyhow",
"arc-swap",
"async-trait", "async-trait",
"bincode", "bincode",
"byteorder", "byteorder",
@@ -5985,7 +5976,6 @@ dependencies = [
"routerify", "routerify",
"sentry", "sentry",
"serde", "serde",
"serde_assert",
"serde_json", "serde_json",
"serde_with", "serde_with",
"signal-hook", "signal-hook",
@@ -6021,9 +6011,13 @@ checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d"
[[package]] [[package]]
name = "value-bag" name = "value-bag"
version = "1.4.2" version = "1.0.0-alpha.9"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4a72e1902dde2bd6441347de2b70b7f5d59bf157c6c62f0c44572607a1d55bbe" checksum = "2209b78d1249f7e6f3293657c9779fe31ced465df091bbd433a1cf88e916ec55"
dependencies = [
"ctor",
"version_check",
]
[[package]] [[package]]
name = "vcpkg" name = "vcpkg"
@@ -6056,6 +6050,7 @@ dependencies = [
"tokio-util", "tokio-util",
"tracing", "tracing",
"tracing-subscriber", "tracing-subscriber",
"workspace_hack",
] ]
[[package]] [[package]]
@@ -6483,7 +6478,6 @@ dependencies = [
"clap", "clap",
"clap_builder", "clap_builder",
"crossbeam-utils", "crossbeam-utils",
"dashmap",
"either", "either",
"fail", "fail",
"futures", "futures",

View File

@@ -36,7 +36,6 @@ license = "Apache-2.0"
## All dependency versions, used in the project ## All dependency versions, used in the project
[workspace.dependencies] [workspace.dependencies]
anyhow = { version = "1.0", features = ["backtrace"] } anyhow = { version = "1.0", features = ["backtrace"] }
arc-swap = "1.6"
async-compression = { version = "0.4.0", features = ["tokio", "gzip"] } async-compression = { version = "0.4.0", features = ["tokio", "gzip"] }
azure_core = "0.16" azure_core = "0.16"
azure_identity = "0.16" azure_identity = "0.16"
@@ -48,7 +47,6 @@ async-trait = "0.1"
aws-config = { version = "0.56", default-features = false, features=["rustls"] } aws-config = { version = "0.56", default-features = false, features=["rustls"] }
aws-sdk-s3 = "0.29" aws-sdk-s3 = "0.29"
aws-smithy-http = "0.56" aws-smithy-http = "0.56"
aws-smithy-async = { version = "0.56", default-features = false, features=["rt-tokio"] }
aws-credential-types = "0.56" aws-credential-types = "0.56"
aws-types = "0.56" aws-types = "0.56"
axum = { version = "0.6.20", features = ["ws"] } axum = { version = "0.6.20", features = ["ws"] }
@@ -67,7 +65,7 @@ comfy-table = "6.1"
const_format = "0.2" const_format = "0.2"
crc32c = "0.6" crc32c = "0.6"
crossbeam-utils = "0.8.5" crossbeam-utils = "0.8.5"
dashmap = { version = "5.5.0", features = ["raw-api"] } dashmap = "5.5.0"
either = "1.8" either = "1.8"
enum-map = "2.4.2" enum-map = "2.4.2"
enumset = "1.0.12" enumset = "1.0.12"
@@ -126,7 +124,6 @@ sentry = { version = "0.31", default-features = false, features = ["backtrace",
serde = { version = "1.0", features = ["derive"] } serde = { version = "1.0", features = ["derive"] }
serde_json = "1" serde_json = "1"
serde_with = "2.0" serde_with = "2.0"
serde_assert = "0.5.0"
sha2 = "0.10.2" sha2 = "0.10.2"
signal-hook = "0.3" signal-hook = "0.3"
smallvec = "1.11" smallvec = "1.11"
@@ -164,11 +161,11 @@ env_logger = "0.10"
log = "0.4" log = "0.4"
## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed ## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" } postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" } postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" } postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" } postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" } tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
## Other git libraries ## Other git libraries
heapless = { default-features=false, features=[], git = "https://github.com/japaric/heapless.git", rev = "644653bf3b831c6bb4963be2de24804acf5e5001" } # upstream release pending heapless = { default-features=false, features=[], git = "https://github.com/japaric/heapless.git", rev = "644653bf3b831c6bb4963be2de24804acf5e5001" } # upstream release pending
@@ -205,7 +202,7 @@ tonic-build = "0.9"
# This is only needed for proxy's tests. # This is only needed for proxy's tests.
# TODO: we should probably fork `tokio-postgres-rustls` instead. # TODO: we should probably fork `tokio-postgres-rustls` instead.
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" } tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
################# Binary contents sections ################# Binary contents sections

View File

@@ -27,7 +27,6 @@ RUN set -e \
FROM $REPOSITORY/$IMAGE:$TAG AS build FROM $REPOSITORY/$IMAGE:$TAG AS build
WORKDIR /home/nonroot WORKDIR /home/nonroot
ARG GIT_VERSION=local ARG GIT_VERSION=local
ARG BUILD_TAG
# Enable https://github.com/paritytech/cachepot to cache Rust crates' compilation results in Docker builds. # Enable https://github.com/paritytech/cachepot to cache Rust crates' compilation results in Docker builds.
# Set up cachepot to use an AWS S3 bucket for cache results, to reuse it between `docker build` invocations. # Set up cachepot to use an AWS S3 bucket for cache results, to reuse it between `docker build` invocations.
@@ -79,9 +78,9 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/pg_sni_router
COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/pagectl /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/pagectl /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local /usr/local/bin
COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/ COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/
COPY --from=pg-build /home/nonroot/pg_install/v15 /usr/local/v15/ COPY --from=pg-build /home/nonroot/pg_install/v15 /usr/local/v15/

View File

@@ -72,10 +72,6 @@ neon: postgres-headers walproposer-lib
# #
$(POSTGRES_INSTALL_DIR)/build/%/config.status: $(POSTGRES_INSTALL_DIR)/build/%/config.status:
+@echo "Configuring Postgres $* build" +@echo "Configuring Postgres $* build"
@test -s $(ROOT_PROJECT_DIR)/vendor/postgres-$*/configure || { \
echo "\nPostgres submodule not found in $(ROOT_PROJECT_DIR)/vendor/postgres-$*/, execute "; \
echo "'git submodule update --init --recursive --depth 2 --progress .' in project root.\n"; \
exit 1; }
mkdir -p $(POSTGRES_INSTALL_DIR)/build/$* mkdir -p $(POSTGRES_INSTALL_DIR)/build/$*
(cd $(POSTGRES_INSTALL_DIR)/build/$* && \ (cd $(POSTGRES_INSTALL_DIR)/build/$* && \
env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-$*/configure \ env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-$*/configure \

View File

@@ -156,7 +156,6 @@ fn main() -> Result<()> {
let path = Path::new(sp); let path = Path::new(sp);
let file = File::open(path)?; let file = File::open(path)?;
spec = Some(serde_json::from_reader(file)?); spec = Some(serde_json::from_reader(file)?);
live_config_allowed = true;
} else if let Some(id) = compute_id { } else if let Some(id) = compute_id {
if let Some(cp_base) = control_plane_uri { if let Some(cp_base) = control_plane_uri {
live_config_allowed = true; live_config_allowed = true;
@@ -278,26 +277,32 @@ fn main() -> Result<()> {
if #[cfg(target_os = "linux")] { if #[cfg(target_os = "linux")] {
use std::env; use std::env;
use tokio_util::sync::CancellationToken; use tokio_util::sync::CancellationToken;
let vm_monitor_addr = matches use tracing::warn;
.get_one::<String>("vm-monitor-addr") let vm_monitor_addr = matches.get_one::<String>("vm-monitor-addr");
.expect("--vm-monitor-addr should always be set because it has a default arg");
let file_cache_connstr = matches.get_one::<String>("filecache-connstr"); let file_cache_connstr = matches.get_one::<String>("filecache-connstr");
let cgroup = matches.get_one::<String>("cgroup"); let cgroup = matches.get_one::<String>("cgroup");
let file_cache_on_disk = matches.get_flag("file-cache-on-disk");
// Only make a runtime if we need to. // Only make a runtime if we need to.
// Note: it seems like you can make a runtime in an inner scope and // Note: it seems like you can make a runtime in an inner scope and
// if you start a task in it it won't be dropped. However, make it // if you start a task in it it won't be dropped. However, make it
// in the outermost scope just to be safe. // in the outermost scope just to be safe.
let rt = if env::var_os("AUTOSCALING").is_some() { let rt = match (env::var_os("AUTOSCALING"), vm_monitor_addr) {
Some( (None, None) => None,
(None, Some(_)) => {
warn!("--vm-monitor-addr option set but AUTOSCALING env var not present");
None
}
(Some(_), None) => {
panic!("AUTOSCALING env var present but --vm-monitor-addr option not set")
}
(Some(_), Some(_)) => Some(
tokio::runtime::Builder::new_multi_thread() tokio::runtime::Builder::new_multi_thread()
.worker_threads(4) .worker_threads(4)
.enable_all() .enable_all()
.build() .build()
.expect("failed to create tokio runtime for monitor") .expect("failed to create tokio runtime for monitor"),
) ),
} else {
None
}; };
// This token is used internally by the monitor to clean up all threads // This token is used internally by the monitor to clean up all threads
@@ -308,7 +313,8 @@ fn main() -> Result<()> {
Box::leak(Box::new(vm_monitor::Args { Box::leak(Box::new(vm_monitor::Args {
cgroup: cgroup.cloned(), cgroup: cgroup.cloned(),
pgconnstr: file_cache_connstr.cloned(), pgconnstr: file_cache_connstr.cloned(),
addr: vm_monitor_addr.clone(), addr: vm_monitor_addr.cloned().unwrap(),
file_cache_on_disk,
})), })),
token.clone(), token.clone(),
)) ))
@@ -480,8 +486,6 @@ fn cli() -> clap::Command {
.value_name("FILECACHE_CONNSTR"), .value_name("FILECACHE_CONNSTR"),
) )
.arg( .arg(
// DEPRECATED, NO LONGER DOES ANYTHING.
// See https://github.com/neondatabase/cloud/issues/7516
Arg::new("file-cache-on-disk") Arg::new("file-cache-on-disk")
.long("file-cache-on-disk") .long("file-cache-on-disk")
.action(clap::ArgAction::SetTrue), .action(clap::ArgAction::SetTrue),

View File

@@ -710,12 +710,8 @@ impl ComputeNode {
// `pg_ctl` for start / stop, so this just seems much easier to do as we already // `pg_ctl` for start / stop, so this just seems much easier to do as we already
// have opened connection to Postgres and superuser access. // have opened connection to Postgres and superuser access.
#[instrument(skip_all)] #[instrument(skip_all)]
fn pg_reload_conf(&self) -> Result<()> { fn pg_reload_conf(&self, client: &mut Client) -> Result<()> {
let pgctl_bin = Path::new(&self.pgbin).parent().unwrap().join("pg_ctl"); client.simple_query("SELECT pg_reload_conf()")?;
Command::new(pgctl_bin)
.args(["reload", "-D", &self.pgdata])
.output()
.expect("cannot run pg_ctl process");
Ok(()) Ok(())
} }
@@ -728,9 +724,9 @@ impl ComputeNode {
// Write new config // Write new config
let pgdata_path = Path::new(&self.pgdata); let pgdata_path = Path::new(&self.pgdata);
config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &spec, None)?; config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &spec, None)?;
self.pg_reload_conf()?;
let mut client = Client::connect(self.connstr.as_str(), NoTls)?; let mut client = Client::connect(self.connstr.as_str(), NoTls)?;
self.pg_reload_conf(&mut client)?;
// Proceed with post-startup configuration. Note, that order of operations is important. // Proceed with post-startup configuration. Note, that order of operations is important.
// Disable DDL forwarding because control plane already knows about these roles/databases. // Disable DDL forwarding because control plane already knows about these roles/databases.

View File

@@ -78,7 +78,7 @@ use regex::Regex;
use remote_storage::*; use remote_storage::*;
use serde_json; use serde_json;
use std::io::Read; use std::io::Read;
use std::num::NonZeroUsize; use std::num::{NonZeroU32, NonZeroUsize};
use std::path::Path; use std::path::Path;
use std::str; use std::str;
use tar::Archive; use tar::Archive;
@@ -281,6 +281,8 @@ pub fn init_remote_storage(remote_ext_config: &str) -> anyhow::Result<GenericRem
max_keys_per_list_response: None, max_keys_per_list_response: None,
}; };
let config = RemoteStorageConfig { let config = RemoteStorageConfig {
max_concurrent_syncs: NonZeroUsize::new(100).expect("100 != 0"),
max_sync_errors: NonZeroU32::new(100).expect("100 != 0"),
storage: RemoteStorageKind::AwsS3(config), storage: RemoteStorageKind::AwsS3(config),
}; };
GenericRemoteStorage::from_config(&config) GenericRemoteStorage::from_config(&config)

View File

@@ -1,7 +1,7 @@
//!
//! Various tools and helpers to handle cluster / compute node (Postgres) //! Various tools and helpers to handle cluster / compute node (Postgres)
//! configuration. //! configuration.
#![deny(unsafe_code)] //!
#![deny(clippy::undocumented_unsafe_blocks)]
pub mod checker; pub mod checker;
pub mod config; pub mod config;
pub mod configurator; pub mod configurator;

View File

@@ -193,16 +193,11 @@ impl Escaping for PgIdent {
/// Build a list of existing Postgres roles /// Build a list of existing Postgres roles
pub fn get_existing_roles(xact: &mut Transaction<'_>) -> Result<Vec<Role>> { pub fn get_existing_roles(xact: &mut Transaction<'_>) -> Result<Vec<Role>> {
let postgres_roles = xact let postgres_roles = xact
.query( .query("SELECT rolname, rolpassword FROM pg_catalog.pg_authid", &[])?
"SELECT rolname, rolpassword, rolreplication, rolbypassrls FROM pg_catalog.pg_authid",
&[],
)?
.iter() .iter()
.map(|row| Role { .map(|row| Role {
name: row.get("rolname"), name: row.get("rolname"),
encrypted_password: row.get("rolpassword"), encrypted_password: row.get("rolpassword"),
replication: Some(row.get("rolreplication")),
bypassrls: Some(row.get("rolbypassrls")),
options: None, options: None,
}) })
.collect(); .collect();

View File

@@ -24,7 +24,7 @@ fn do_control_plane_request(
) -> Result<ControlPlaneSpecResponse, (bool, String)> { ) -> Result<ControlPlaneSpecResponse, (bool, String)> {
let resp = reqwest::blocking::Client::new() let resp = reqwest::blocking::Client::new()
.get(uri) .get(uri)
.header("Authorization", format!("Bearer {}", jwt)) .header("Authorization", jwt)
.send() .send()
.map_err(|e| { .map_err(|e| {
( (
@@ -68,7 +68,7 @@ pub fn get_spec_from_control_plane(
base_uri: &str, base_uri: &str,
compute_id: &str, compute_id: &str,
) -> Result<Option<ComputeSpec>> { ) -> Result<Option<ComputeSpec>> {
let cp_uri = format!("{base_uri}/compute/api/v2/computes/{compute_id}/spec"); let cp_uri = format!("{base_uri}/management/api/v2/computes/{compute_id}/spec");
let jwt: String = match std::env::var("NEON_CONTROL_PLANE_TOKEN") { let jwt: String = match std::env::var("NEON_CONTROL_PLANE_TOKEN") {
Ok(v) => v, Ok(v) => v,
Err(_) => "".to_string(), Err(_) => "".to_string(),
@@ -265,8 +265,6 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
let action = if let Some(r) = pg_role { let action = if let Some(r) = pg_role {
if (r.encrypted_password.is_none() && role.encrypted_password.is_some()) if (r.encrypted_password.is_none() && role.encrypted_password.is_some())
|| (r.encrypted_password.is_some() && role.encrypted_password.is_none()) || (r.encrypted_password.is_some() && role.encrypted_password.is_none())
|| !r.bypassrls.unwrap_or(false)
|| !r.replication.unwrap_or(false)
{ {
RoleAction::Update RoleAction::Update
} else if let Some(pg_pwd) = &r.encrypted_password { } else if let Some(pg_pwd) = &r.encrypted_password {
@@ -298,8 +296,7 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
match action { match action {
RoleAction::None => {} RoleAction::None => {}
RoleAction::Update => { RoleAction::Update => {
let mut query: String = let mut query: String = format!("ALTER ROLE {} ", name.pg_quote());
format!("ALTER ROLE {} BYPASSRLS REPLICATION", name.pg_quote());
query.push_str(&role.to_pg_options()); query.push_str(&role.to_pg_options());
xact.execute(query.as_str(), &[])?; xact.execute(query.as_str(), &[])?;
} }

View File

@@ -2,6 +2,7 @@ use crate::{background_process, local_env::LocalEnv};
use anyhow::anyhow; use anyhow::anyhow;
use camino::Utf8PathBuf; use camino::Utf8PathBuf;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use serde_with::{serde_as, DisplayFromStr};
use std::{path::PathBuf, process::Child}; use std::{path::PathBuf, process::Child};
use utils::id::{NodeId, TenantId}; use utils::id::{NodeId, TenantId};
@@ -13,10 +14,12 @@ pub struct AttachmentService {
const COMMAND: &str = "attachment_service"; const COMMAND: &str = "attachment_service";
#[serde_as]
#[derive(Serialize, Deserialize)] #[derive(Serialize, Deserialize)]
pub struct AttachHookRequest { pub struct AttachHookRequest {
#[serde_as(as = "DisplayFromStr")]
pub tenant_id: TenantId, pub tenant_id: TenantId,
pub node_id: Option<NodeId>, pub pageserver_id: Option<NodeId>,
} }
#[derive(Serialize, Deserialize)] #[derive(Serialize, Deserialize)]
@@ -82,7 +85,7 @@ impl AttachmentService {
.control_plane_api .control_plane_api
.clone() .clone()
.unwrap() .unwrap()
.join("attach-hook") .join("attach_hook")
.unwrap(); .unwrap();
let client = reqwest::blocking::ClientBuilder::new() let client = reqwest::blocking::ClientBuilder::new()
.build() .build()
@@ -90,7 +93,7 @@ impl AttachmentService {
let request = AttachHookRequest { let request = AttachHookRequest {
tenant_id, tenant_id,
node_id: Some(pageserver_id), pageserver_id: Some(pageserver_id),
}; };
let response = client.post(url).json(&request).send()?; let response = client.post(url).json(&request).send()?;

View File

@@ -262,7 +262,7 @@ where
P: Into<Utf8PathBuf>, P: Into<Utf8PathBuf>,
{ {
let path: Utf8PathBuf = path.into(); let path: Utf8PathBuf = path.into();
// SAFETY: // SAFETY
// pre_exec is marked unsafe because it runs between fork and exec. // pre_exec is marked unsafe because it runs between fork and exec.
// Why is that dangerous in various ways? // Why is that dangerous in various ways?
// Long answer: https://github.com/rust-lang/rust/issues/39575 // Long answer: https://github.com/rust-lang/rust/issues/39575

View File

@@ -12,7 +12,6 @@ use hyper::{Body, Request, Response};
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use std::path::{Path, PathBuf}; use std::path::{Path, PathBuf};
use std::{collections::HashMap, sync::Arc}; use std::{collections::HashMap, sync::Arc};
use utils::http::endpoint::request_span;
use utils::logging::{self, LogFormat}; use utils::logging::{self, LogFormat};
use utils::signals::{ShutdownSignals, Signal}; use utils::signals::{ShutdownSignals, Signal};
@@ -172,7 +171,7 @@ async fn handle_re_attach(mut req: Request<Body>) -> Result<Response<Body>, ApiE
state.generation += 1; state.generation += 1;
response.tenants.push(ReAttachResponseTenant { response.tenants.push(ReAttachResponseTenant {
id: *t, id: *t,
gen: state.generation, generation: state.generation,
}); });
} }
} }
@@ -218,31 +217,14 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
.tenants .tenants
.entry(attach_req.tenant_id) .entry(attach_req.tenant_id)
.or_insert_with(|| TenantState { .or_insert_with(|| TenantState {
pageserver: attach_req.node_id, pageserver: attach_req.pageserver_id,
generation: 0, generation: 0,
}); });
if let Some(attaching_pageserver) = attach_req.node_id.as_ref() { if attach_req.pageserver_id.is_some() {
tenant_state.generation += 1; tenant_state.generation += 1;
tracing::info!(
tenant_id = %attach_req.tenant_id,
ps_id = %attaching_pageserver,
generation = %tenant_state.generation,
"issuing",
);
} else if let Some(ps_id) = tenant_state.pageserver {
tracing::info!(
tenant_id = %attach_req.tenant_id,
%ps_id,
generation = %tenant_state.generation,
"dropping",
);
} else {
tracing::info!(
tenant_id = %attach_req.tenant_id,
"no-op: tenant already has no pageserver");
} }
tenant_state.pageserver = attach_req.node_id; tenant_state.pageserver = attach_req.pageserver_id;
let generation = tenant_state.generation; let generation = tenant_state.generation;
locked.save().await.map_err(ApiError::InternalServerError)?; locked.save().await.map_err(ApiError::InternalServerError)?;
@@ -250,7 +232,7 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
json_response( json_response(
StatusCode::OK, StatusCode::OK,
AttachHookResponse { AttachHookResponse {
gen: attach_req.node_id.map(|_| generation), gen: attach_req.pageserver_id.map(|_| generation),
}, },
) )
} }
@@ -258,9 +240,9 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
fn make_router(persistent_state: PersistentState) -> RouterBuilder<hyper::Body, ApiError> { fn make_router(persistent_state: PersistentState) -> RouterBuilder<hyper::Body, ApiError> {
endpoint::make_router() endpoint::make_router()
.data(Arc::new(State::new(persistent_state))) .data(Arc::new(State::new(persistent_state)))
.post("/re-attach", |r| request_span(r, handle_re_attach)) .post("/re-attach", handle_re_attach)
.post("/validate", |r| request_span(r, handle_validate)) .post("/validate", handle_validate)
.post("/attach-hook", |r| request_span(r, handle_attach_hook)) .post("/attach_hook", handle_attach_hook)
} }
#[tokio::main] #[tokio::main]

View File

@@ -798,24 +798,6 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
ep.start(&auth_token, safekeepers, remote_ext_config)?; ep.start(&auth_token, safekeepers, remote_ext_config)?;
} }
} }
"reconfigure" => {
let endpoint_id = sub_args
.get_one::<String>("endpoint_id")
.ok_or_else(|| anyhow!("No endpoint ID provided to reconfigure"))?;
let endpoint = cplane
.endpoints
.get(endpoint_id.as_str())
.with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?;
let pageserver_id =
if let Some(id_str) = sub_args.get_one::<String>("endpoint-pageserver-id") {
Some(NodeId(
id_str.parse().context("while parsing pageserver id")?,
))
} else {
None
};
endpoint.reconfigure(pageserver_id)?;
}
"stop" => { "stop" => {
let endpoint_id = sub_args let endpoint_id = sub_args
.get_one::<String>("endpoint_id") .get_one::<String>("endpoint_id")
@@ -1387,12 +1369,6 @@ fn cli() -> Command {
.arg(safekeepers_arg) .arg(safekeepers_arg)
.arg(remote_ext_config_args) .arg(remote_ext_config_args)
) )
.subcommand(Command::new("reconfigure")
.about("Reconfigure the endpoint")
.arg(endpoint_pageserver_id_arg)
.arg(endpoint_id_arg.clone())
.arg(tenant_id_arg.clone())
)
.subcommand( .subcommand(
Command::new("stop") Command::new("stop")
.arg(endpoint_id_arg) .arg(endpoint_id_arg)

View File

@@ -46,6 +46,7 @@ use std::time::Duration;
use anyhow::{anyhow, bail, Context, Result}; use anyhow::{anyhow, bail, Context, Result};
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use serde_with::{serde_as, DisplayFromStr};
use utils::id::{NodeId, TenantId, TimelineId}; use utils::id::{NodeId, TenantId, TimelineId};
use crate::local_env::LocalEnv; use crate::local_env::LocalEnv;
@@ -56,10 +57,13 @@ use compute_api::responses::{ComputeState, ComputeStatus};
use compute_api::spec::{Cluster, ComputeMode, ComputeSpec}; use compute_api::spec::{Cluster, ComputeMode, ComputeSpec};
// contents of a endpoint.json file // contents of a endpoint.json file
#[serde_as]
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
pub struct EndpointConf { pub struct EndpointConf {
endpoint_id: String, endpoint_id: String,
#[serde_as(as = "DisplayFromStr")]
tenant_id: TenantId, tenant_id: TenantId,
#[serde_as(as = "DisplayFromStr")]
timeline_id: TimelineId, timeline_id: TimelineId,
mode: ComputeMode, mode: ComputeMode,
pg_port: u16, pg_port: u16,
@@ -410,32 +414,16 @@ impl Endpoint {
); );
} }
Ok(()) // Also wait for the compute_ctl process to die. It might have some cleanup
} // work to do after postgres stops, like syncing safekeepers, etc.
//
fn wait_for_compute_ctl_to_exit(&self) -> Result<()> {
// TODO use background_process::stop_process instead // TODO use background_process::stop_process instead
let pidfile_path = self.endpoint_path().join("compute_ctl.pid"); let pidfile_path = self.endpoint_path().join("compute_ctl.pid");
let pid: u32 = std::fs::read_to_string(pidfile_path)?.parse()?; let pid: u32 = std::fs::read_to_string(pidfile_path)?.parse()?;
let pid = nix::unistd::Pid::from_raw(pid as i32); let pid = nix::unistd::Pid::from_raw(pid as i32);
crate::background_process::wait_until_stopped("compute_ctl", pid)?; crate::background_process::wait_until_stopped("compute_ctl", pid)?;
Ok(())
}
fn read_postgresql_conf(&self) -> Result<String> { Ok(())
// Slurp the endpoints/<endpoint id>/postgresql.conf file into
// memory. We will include it in the spec file that we pass to
// `compute_ctl`, and `compute_ctl` will write it to the postgresql.conf
// in the data directory.
let postgresql_conf_path = self.endpoint_path().join("postgresql.conf");
match std::fs::read(&postgresql_conf_path) {
Ok(content) => Ok(String::from_utf8(content)?),
Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok("".to_string()),
Err(e) => Err(anyhow::Error::new(e).context(format!(
"failed to read config file in {}",
postgresql_conf_path.to_str().unwrap()
))),
}
} }
pub fn start( pub fn start(
@@ -448,7 +436,21 @@ impl Endpoint {
anyhow::bail!("The endpoint is already running"); anyhow::bail!("The endpoint is already running");
} }
let postgresql_conf = self.read_postgresql_conf()?; // Slurp the endpoints/<endpoint id>/postgresql.conf file into
// memory. We will include it in the spec file that we pass to
// `compute_ctl`, and `compute_ctl` will write it to the postgresql.conf
// in the data directory.
let postgresql_conf_path = self.endpoint_path().join("postgresql.conf");
let postgresql_conf = match std::fs::read(&postgresql_conf_path) {
Ok(content) => String::from_utf8(content)?,
Err(e) if e.kind() == std::io::ErrorKind::NotFound => "".to_string(),
Err(e) => {
return Err(anyhow::Error::new(e).context(format!(
"failed to read config file in {}",
postgresql_conf_path.to_str().unwrap()
)))
}
};
// We always start the compute node from scratch, so if the Postgres // We always start the compute node from scratch, so if the Postgres
// data dir exists from a previous launch, remove it first. // data dir exists from a previous launch, remove it first.
@@ -619,61 +621,6 @@ impl Endpoint {
} }
} }
pub fn reconfigure(&self, pageserver_id: Option<NodeId>) -> Result<()> {
let mut spec: ComputeSpec = {
let spec_path = self.endpoint_path().join("spec.json");
let file = std::fs::File::open(spec_path)?;
serde_json::from_reader(file)?
};
let postgresql_conf = self.read_postgresql_conf()?;
spec.cluster.postgresql_conf = Some(postgresql_conf);
if let Some(pageserver_id) = pageserver_id {
let endpoint_config_path = self.endpoint_path().join("endpoint.json");
let mut endpoint_conf: EndpointConf = {
let file = std::fs::File::open(&endpoint_config_path)?;
serde_json::from_reader(file)?
};
endpoint_conf.pageserver_id = pageserver_id;
std::fs::write(
endpoint_config_path,
serde_json::to_string_pretty(&endpoint_conf)?,
)?;
let pageserver =
PageServerNode::from_env(&self.env, self.env.get_pageserver_conf(pageserver_id)?);
let ps_http_conf = &pageserver.pg_connection_config;
let (host, port) = (ps_http_conf.host(), ps_http_conf.port());
spec.pageserver_connstring = Some(format!("postgresql://no_user@{host}:{port}"));
}
let client = reqwest::blocking::Client::new();
let response = client
.post(format!(
"http://{}:{}/configure",
self.http_address.ip(),
self.http_address.port()
))
.body(format!(
"{{\"spec\":{}}}",
serde_json::to_string_pretty(&spec)?
))
.send()?;
let status = response.status();
if !(status.is_client_error() || status.is_server_error()) {
Ok(())
} else {
let url = response.url().to_owned();
let msg = match response.text() {
Ok(err_body) => format!("Error: {}", err_body),
Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url),
};
Err(anyhow::anyhow!(msg))
}
}
pub fn stop(&self, destroy: bool) -> Result<()> { pub fn stop(&self, destroy: bool) -> Result<()> {
// If we are going to destroy data directory, // If we are going to destroy data directory,
// use immediate shutdown mode, otherwise, // use immediate shutdown mode, otherwise,
@@ -682,25 +629,15 @@ impl Endpoint {
// Postgres is always started from scratch, so stop // Postgres is always started from scratch, so stop
// without destroy only used for testing and debugging. // without destroy only used for testing and debugging.
// //
self.pg_ctl(
if destroy {
&["-m", "immediate", "stop"]
} else {
&["stop"]
},
&None,
)?;
// Also wait for the compute_ctl process to die. It might have some cleanup
// work to do after postgres stops, like syncing safekeepers, etc.
//
self.wait_for_compute_ctl_to_exit()?;
if destroy { if destroy {
self.pg_ctl(&["-m", "immediate", "stop"], &None)?;
println!( println!(
"Destroying postgres data directory '{}'", "Destroying postgres data directory '{}'",
self.pgdata().to_str().unwrap() self.pgdata().to_str().unwrap()
); );
std::fs::remove_dir_all(self.endpoint_path())?; std::fs::remove_dir_all(self.endpoint_path())?;
} else {
self.pg_ctl(&["stop"], &None)?;
} }
Ok(()) Ok(())
} }

View File

@@ -1,10 +1,11 @@
//! Local control plane. //
//! // Local control plane.
//! Can start, configure and stop postgres instances running as a local processes. //
//! // Can start, configure and stop postgres instances running as a local processes.
//! Intended to be used in integration tests and in CLI tools for //
//! local installations. // Intended to be used in integration tests and in CLI tools for
#![deny(clippy::undocumented_unsafe_blocks)] // local installations.
//
pub mod attachment_service; pub mod attachment_service;
mod background_process; mod background_process;

View File

@@ -8,6 +8,7 @@ use anyhow::{bail, ensure, Context};
use postgres_backend::AuthType; use postgres_backend::AuthType;
use reqwest::Url; use reqwest::Url;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use serde_with::{serde_as, DisplayFromStr};
use std::collections::HashMap; use std::collections::HashMap;
use std::env; use std::env;
use std::fs; use std::fs;
@@ -32,6 +33,7 @@ pub const DEFAULT_PG_VERSION: u32 = 15;
// to 'neon_local init --config=<path>' option. See control_plane/simple.conf for // to 'neon_local init --config=<path>' option. See control_plane/simple.conf for
// an example. // an example.
// //
#[serde_as]
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
pub struct LocalEnv { pub struct LocalEnv {
// Base directory for all the nodes (the pageserver, safekeepers and // Base directory for all the nodes (the pageserver, safekeepers and
@@ -57,6 +59,7 @@ pub struct LocalEnv {
// Default tenant ID to use with the 'neon_local' command line utility, when // Default tenant ID to use with the 'neon_local' command line utility, when
// --tenant_id is not explicitly specified. // --tenant_id is not explicitly specified.
#[serde(default)] #[serde(default)]
#[serde_as(as = "Option<DisplayFromStr>")]
pub default_tenant_id: Option<TenantId>, pub default_tenant_id: Option<TenantId>,
// used to issue tokens during e.g pg start // used to issue tokens during e.g pg start
@@ -81,6 +84,7 @@ pub struct LocalEnv {
// A `HashMap<String, HashMap<TenantId, TimelineId>>` would be more appropriate here, // A `HashMap<String, HashMap<TenantId, TimelineId>>` would be more appropriate here,
// but deserialization into a generic toml object as `toml::Value::try_from` fails with an error. // but deserialization into a generic toml object as `toml::Value::try_from` fails with an error.
// https://toml.io/en/v1.0.0 does not contain a concept of "a table inside another table". // https://toml.io/en/v1.0.0 does not contain a concept of "a table inside another table".
#[serde_as(as = "HashMap<_, Vec<(DisplayFromStr, DisplayFromStr)>>")]
branch_name_mappings: HashMap<String, Vec<(TenantId, TimelineId)>>, branch_name_mappings: HashMap<String, Vec<(TenantId, TimelineId)>>,
} }

View File

@@ -1,108 +0,0 @@
# Updating Postgres
## Minor Versions
When upgrading to a new minor version of Postgres, please follow these steps:
_Example: 15.4 is the new minor version to upgrade to from 15.3._
1. Clone the Neon Postgres repository if you have not done so already.
```shell
git clone git@github.com:neondatabase/postgres.git
```
1. Add the Postgres upstream remote.
```shell
git remote add upstream https://git.postgresql.org/git/postgresql.git
```
1. Create a new branch based on the stable branch you are updating.
```shell
git checkout -b my-branch REL_15_STABLE_neon
```
1. Tag the last commit on the stable branch you are updating.
```shell
git tag REL_15_3_neon
```
1. Push the new tag to the Neon Postgres repository.
```shell
git push origin REL_15_3_neon
```
1. Find the release tags you're looking for. They are of the form `REL_X_Y`.
1. Rebase the branch you created on the tag and resolve any conflicts.
```shell
git fetch upstream REL_15_4
git rebase REL_15_4
```
1. Run the Postgres test suite to make sure our commits have not affected
Postgres in a negative way.
```shell
make check
# OR
meson test -C builddir
```
1. Push your branch to the Neon Postgres repository.
```shell
git push origin my-branch
```
1. Clone the Neon repository if you have not done so already.
```shell
git clone git@github.com:neondatabase/neon.git
```
1. Create a new branch.
1. Change the `revisions.json` file to point at the HEAD of your Postgres
branch.
1. Update the Git submodule.
```shell
git submodule set-branch --branch my-branch vendor/postgres-v15
git submodule update --remote vendor/postgres-v15
```
1. Run the Neon test suite to make sure that Neon is still good to go on this
minor Postgres release.
```shell
./scripts/poetry -k pg15
```
1. Commit your changes.
1. Create a pull request, and wait for CI to go green.
1. Force push the rebased Postgres branches into the Neon Postgres repository.
```shell
git push --force origin my-branch:REL_15_STABLE_neon
```
It may require disabling various branch protections.
1. Update your Neon PR to point at the branches.
```shell
git submodule set-branch --branch REL_15_STABLE_neon vendor/postgres-v15
git commit --amend --no-edit
git push --force origin
```
1. Merge the pull request after getting approval(s) and CI completion.

View File

@@ -1,5 +1,3 @@
#![deny(unsafe_code)]
#![deny(clippy::undocumented_unsafe_blocks)]
pub mod requests; pub mod requests;
pub mod responses; pub mod responses;
pub mod spec; pub mod spec;

View File

@@ -6,6 +6,7 @@
use std::collections::HashMap; use std::collections::HashMap;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use serde_with::{serde_as, DisplayFromStr};
use utils::id::{TenantId, TimelineId}; use utils::id::{TenantId, TimelineId};
use utils::lsn::Lsn; use utils::lsn::Lsn;
@@ -18,6 +19,7 @@ pub type PgIdent = String;
/// Cluster spec or configuration represented as an optional number of /// Cluster spec or configuration represented as an optional number of
/// delta operations + final cluster state description. /// delta operations + final cluster state description.
#[serde_as]
#[derive(Clone, Debug, Default, Deserialize, Serialize)] #[derive(Clone, Debug, Default, Deserialize, Serialize)]
pub struct ComputeSpec { pub struct ComputeSpec {
pub format_version: f32, pub format_version: f32,
@@ -48,12 +50,12 @@ pub struct ComputeSpec {
// these, and instead set the "neon.tenant_id", "neon.timeline_id", // these, and instead set the "neon.tenant_id", "neon.timeline_id",
// etc. GUCs in cluster.settings. TODO: Once the control plane has been // etc. GUCs in cluster.settings. TODO: Once the control plane has been
// updated to fill these fields, we can make these non optional. // updated to fill these fields, we can make these non optional.
#[serde_as(as = "Option<DisplayFromStr>")]
pub tenant_id: Option<TenantId>, pub tenant_id: Option<TenantId>,
#[serde_as(as = "Option<DisplayFromStr>")]
pub timeline_id: Option<TimelineId>, pub timeline_id: Option<TimelineId>,
#[serde_as(as = "Option<DisplayFromStr>")]
pub pageserver_connstring: Option<String>, pub pageserver_connstring: Option<String>,
#[serde(default)] #[serde(default)]
pub safekeeper_connstrings: Vec<String>, pub safekeeper_connstrings: Vec<String>,
@@ -138,13 +140,14 @@ impl RemoteExtSpec {
} }
} }
#[serde_as]
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq, Deserialize, Serialize)] #[derive(Clone, Copy, Debug, Default, Eq, PartialEq, Deserialize, Serialize)]
pub enum ComputeMode { pub enum ComputeMode {
/// A read-write node /// A read-write node
#[default] #[default]
Primary, Primary,
/// A read-only node, pinned at a particular LSN /// A read-only node, pinned at a particular LSN
Static(Lsn), Static(#[serde_as(as = "DisplayFromStr")] Lsn),
/// A read-only node that follows the tip of the branch in hot standby mode /// A read-only node that follows the tip of the branch in hot standby mode
/// ///
/// Future versions may want to distinguish between replicas with hot standby /// Future versions may want to distinguish between replicas with hot standby
@@ -187,8 +190,6 @@ pub struct DeltaOp {
pub struct Role { pub struct Role {
pub name: PgIdent, pub name: PgIdent,
pub encrypted_password: Option<String>, pub encrypted_password: Option<String>,
pub replication: Option<bool>,
pub bypassrls: Option<bool>,
pub options: GenericOptions, pub options: GenericOptions,
} }

View File

@@ -1,6 +1,6 @@
//!
//! Shared code for consumption metics collection //! Shared code for consumption metics collection
#![deny(unsafe_code)] //!
#![deny(clippy::undocumented_unsafe_blocks)]
use chrono::{DateTime, Utc}; use chrono::{DateTime, Utc};
use rand::Rng; use rand::Rng;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};

View File

@@ -2,7 +2,6 @@
//! make sure that we use the same dep version everywhere. //! make sure that we use the same dep version everywhere.
//! Otherwise, we might not see all metrics registered via //! Otherwise, we might not see all metrics registered via
//! a default registry. //! a default registry.
#![deny(clippy::undocumented_unsafe_blocks)]
use once_cell::sync::Lazy; use once_cell::sync::Lazy;
use prometheus::core::{AtomicU64, Collector, GenericGauge, GenericGaugeVec}; use prometheus::core::{AtomicU64, Collector, GenericGauge, GenericGaugeVec};
pub use prometheus::opts; pub use prometheus::opts;
@@ -90,14 +89,14 @@ pub const DISK_WRITE_SECONDS_BUCKETS: &[f64] = &[
0.000_050, 0.000_100, 0.000_500, 0.001, 0.003, 0.005, 0.01, 0.05, 0.1, 0.3, 0.5, 0.000_050, 0.000_100, 0.000_500, 0.001, 0.003, 0.005, 0.01, 0.05, 0.1, 0.3, 0.5,
]; ];
pub fn set_build_info_metric(revision: &str, build_tag: &str) { pub fn set_build_info_metric(revision: &str) {
let metric = register_int_gauge_vec!( let metric = register_int_gauge_vec!(
"libmetrics_build_info", "libmetrics_build_info",
"Build/version information", "Build/version information",
&["revision", "build_tag"] &["revision"]
) )
.expect("Failed to register build info metric"); .expect("Failed to register build info metric");
metric.with_label_values(&[revision, build_tag]).set(1); metric.with_label_values(&[revision]).set(1);
} }
// Records I/O stats in a "cross-platform" way. // Records I/O stats in a "cross-platform" way.

View File

@@ -4,6 +4,7 @@
//! See docs/rfcs/025-generation-numbers.md //! See docs/rfcs/025-generation-numbers.md
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use serde_with::{serde_as, DisplayFromStr};
use utils::id::{NodeId, TenantId}; use utils::id::{NodeId, TenantId};
#[derive(Serialize, Deserialize)] #[derive(Serialize, Deserialize)]
@@ -11,10 +12,12 @@ pub struct ReAttachRequest {
pub node_id: NodeId, pub node_id: NodeId,
} }
#[serde_as]
#[derive(Serialize, Deserialize)] #[derive(Serialize, Deserialize)]
pub struct ReAttachResponseTenant { pub struct ReAttachResponseTenant {
#[serde_as(as = "DisplayFromStr")]
pub id: TenantId, pub id: TenantId,
pub gen: u32, pub generation: u32,
} }
#[derive(Serialize, Deserialize)] #[derive(Serialize, Deserialize)]
@@ -22,8 +25,10 @@ pub struct ReAttachResponse {
pub tenants: Vec<ReAttachResponseTenant>, pub tenants: Vec<ReAttachResponseTenant>,
} }
#[serde_as]
#[derive(Serialize, Deserialize)] #[derive(Serialize, Deserialize)]
pub struct ValidateRequestTenant { pub struct ValidateRequestTenant {
#[serde_as(as = "DisplayFromStr")]
pub id: TenantId, pub id: TenantId,
pub gen: u32, pub gen: u32,
} }
@@ -38,8 +43,10 @@ pub struct ValidateResponse {
pub tenants: Vec<ValidateResponseTenant>, pub tenants: Vec<ValidateResponseTenant>,
} }
#[serde_as]
#[derive(Serialize, Deserialize)] #[derive(Serialize, Deserialize)]
pub struct ValidateResponseTenant { pub struct ValidateResponseTenant {
#[serde_as(as = "DisplayFromStr")]
pub id: TenantId, pub id: TenantId,
pub valid: bool, pub valid: bool,
} }

View File

@@ -1,5 +1,3 @@
#![deny(unsafe_code)]
#![deny(clippy::undocumented_unsafe_blocks)]
use const_format::formatcp; use const_format::formatcp;
/// Public API types /// Public API types

View File

@@ -6,7 +6,7 @@ use std::{
use byteorder::{BigEndian, ReadBytesExt}; use byteorder::{BigEndian, ReadBytesExt};
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use serde_with::serde_as; use serde_with::{serde_as, DisplayFromStr};
use strum_macros; use strum_macros;
use utils::{ use utils::{
completion, completion,
@@ -110,6 +110,7 @@ impl TenantState {
// So, return `Maybe` while Attaching, making Console wait for the attach task to finish. // So, return `Maybe` while Attaching, making Console wait for the attach task to finish.
Self::Attaching | Self::Activating(ActivatingFrom::Attaching) => Maybe, Self::Attaching | Self::Activating(ActivatingFrom::Attaching) => Maybe,
// tenant mgr startup distinguishes attaching from loading via marker file. // tenant mgr startup distinguishes attaching from loading via marker file.
// If it's loading, there is no attach marker file, i.e., attach had finished in the past.
Self::Loading | Self::Activating(ActivatingFrom::Loading) => Attached, Self::Loading | Self::Activating(ActivatingFrom::Loading) => Attached,
// We only reach Active after successful load / attach. // We only reach Active after successful load / attach.
// So, call atttachment status Attached. // So, call atttachment status Attached.
@@ -174,19 +175,25 @@ pub enum TimelineState {
Broken { reason: String, backtrace: String }, Broken { reason: String, backtrace: String },
} }
#[serde_as]
#[derive(Serialize, Deserialize)] #[derive(Serialize, Deserialize)]
pub struct TimelineCreateRequest { pub struct TimelineCreateRequest {
#[serde_as(as = "DisplayFromStr")]
pub new_timeline_id: TimelineId, pub new_timeline_id: TimelineId,
#[serde(default)] #[serde(default)]
#[serde_as(as = "Option<DisplayFromStr>")]
pub ancestor_timeline_id: Option<TimelineId>, pub ancestor_timeline_id: Option<TimelineId>,
#[serde(default)] #[serde(default)]
#[serde_as(as = "Option<DisplayFromStr>")]
pub ancestor_start_lsn: Option<Lsn>, pub ancestor_start_lsn: Option<Lsn>,
pub pg_version: Option<u32>, pub pg_version: Option<u32>,
} }
#[serde_as]
#[derive(Serialize, Deserialize, Debug)] #[derive(Serialize, Deserialize, Debug)]
#[serde(deny_unknown_fields)] #[serde(deny_unknown_fields)]
pub struct TenantCreateRequest { pub struct TenantCreateRequest {
#[serde_as(as = "DisplayFromStr")]
pub new_tenant_id: TenantId, pub new_tenant_id: TenantId,
#[serde(default)] #[serde(default)]
#[serde(skip_serializing_if = "Option::is_none")] #[serde(skip_serializing_if = "Option::is_none")]
@@ -195,6 +202,7 @@ pub struct TenantCreateRequest {
pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it
} }
#[serde_as]
#[derive(Deserialize, Debug)] #[derive(Deserialize, Debug)]
#[serde(deny_unknown_fields)] #[serde(deny_unknown_fields)]
pub struct TenantLoadRequest { pub struct TenantLoadRequest {
@@ -271,26 +279,31 @@ pub struct LocationConfig {
pub tenant_conf: TenantConfig, pub tenant_conf: TenantConfig,
} }
#[serde_as]
#[derive(Serialize, Deserialize)] #[derive(Serialize, Deserialize)]
#[serde(transparent)] #[serde(transparent)]
pub struct TenantCreateResponse(pub TenantId); pub struct TenantCreateResponse(#[serde_as(as = "DisplayFromStr")] pub TenantId);
#[derive(Serialize)] #[derive(Serialize)]
pub struct StatusResponse { pub struct StatusResponse {
pub id: NodeId, pub id: NodeId,
} }
#[serde_as]
#[derive(Serialize, Deserialize, Debug)] #[derive(Serialize, Deserialize, Debug)]
#[serde(deny_unknown_fields)] #[serde(deny_unknown_fields)]
pub struct TenantLocationConfigRequest { pub struct TenantLocationConfigRequest {
#[serde_as(as = "DisplayFromStr")]
pub tenant_id: TenantId, pub tenant_id: TenantId,
#[serde(flatten)] #[serde(flatten)]
pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it
} }
#[serde_as]
#[derive(Serialize, Deserialize, Debug)] #[derive(Serialize, Deserialize, Debug)]
#[serde(deny_unknown_fields)] #[serde(deny_unknown_fields)]
pub struct TenantConfigRequest { pub struct TenantConfigRequest {
#[serde_as(as = "DisplayFromStr")]
pub tenant_id: TenantId, pub tenant_id: TenantId,
#[serde(flatten)] #[serde(flatten)]
pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it
@@ -362,8 +375,10 @@ pub enum TenantAttachmentStatus {
Failed { reason: String }, Failed { reason: String },
} }
#[serde_as]
#[derive(Serialize, Deserialize, Clone)] #[derive(Serialize, Deserialize, Clone)]
pub struct TenantInfo { pub struct TenantInfo {
#[serde_as(as = "DisplayFromStr")]
pub id: TenantId, pub id: TenantId,
// NB: intentionally not part of OpenAPI, we don't want to commit to a specific set of TenantState's // NB: intentionally not part of OpenAPI, we don't want to commit to a specific set of TenantState's
pub state: TenantState, pub state: TenantState,
@@ -374,22 +389,33 @@ pub struct TenantInfo {
} }
/// This represents the output of the "timeline_detail" and "timeline_list" API calls. /// This represents the output of the "timeline_detail" and "timeline_list" API calls.
#[serde_as]
#[derive(Debug, Serialize, Deserialize, Clone)] #[derive(Debug, Serialize, Deserialize, Clone)]
pub struct TimelineInfo { pub struct TimelineInfo {
#[serde_as(as = "DisplayFromStr")]
pub tenant_id: TenantId, pub tenant_id: TenantId,
#[serde_as(as = "DisplayFromStr")]
pub timeline_id: TimelineId, pub timeline_id: TimelineId,
#[serde_as(as = "Option<DisplayFromStr>")]
pub ancestor_timeline_id: Option<TimelineId>, pub ancestor_timeline_id: Option<TimelineId>,
#[serde_as(as = "Option<DisplayFromStr>")]
pub ancestor_lsn: Option<Lsn>, pub ancestor_lsn: Option<Lsn>,
#[serde_as(as = "DisplayFromStr")]
pub last_record_lsn: Lsn, pub last_record_lsn: Lsn,
#[serde_as(as = "Option<DisplayFromStr>")]
pub prev_record_lsn: Option<Lsn>, pub prev_record_lsn: Option<Lsn>,
#[serde_as(as = "DisplayFromStr")]
pub latest_gc_cutoff_lsn: Lsn, pub latest_gc_cutoff_lsn: Lsn,
#[serde_as(as = "DisplayFromStr")]
pub disk_consistent_lsn: Lsn, pub disk_consistent_lsn: Lsn,
/// The LSN that we have succesfully uploaded to remote storage /// The LSN that we have succesfully uploaded to remote storage
#[serde_as(as = "DisplayFromStr")]
pub remote_consistent_lsn: Lsn, pub remote_consistent_lsn: Lsn,
/// The LSN that we are advertizing to safekeepers /// The LSN that we are advertizing to safekeepers
#[serde_as(as = "DisplayFromStr")]
pub remote_consistent_lsn_visible: Lsn, pub remote_consistent_lsn_visible: Lsn,
pub current_logical_size: Option<u64>, // is None when timeline is Unloaded pub current_logical_size: Option<u64>, // is None when timeline is Unloaded
@@ -401,6 +427,7 @@ pub struct TimelineInfo {
pub timeline_dir_layer_file_size_sum: Option<u64>, pub timeline_dir_layer_file_size_sum: Option<u64>,
pub wal_source_connstr: Option<String>, pub wal_source_connstr: Option<String>,
#[serde_as(as = "Option<DisplayFromStr>")]
pub last_received_msg_lsn: Option<Lsn>, pub last_received_msg_lsn: Option<Lsn>,
/// the timestamp (in microseconds) of the last received message /// the timestamp (in microseconds) of the last received message
pub last_received_msg_ts: Option<u128>, pub last_received_msg_ts: Option<u128>,
@@ -497,13 +524,23 @@ pub struct LayerAccessStats {
pub residence_events_history: HistoryBufferWithDropCounter<LayerResidenceEvent, 16>, pub residence_events_history: HistoryBufferWithDropCounter<LayerResidenceEvent, 16>,
} }
#[serde_as]
#[derive(Debug, Clone, Serialize)] #[derive(Debug, Clone, Serialize)]
#[serde(tag = "kind")] #[serde(tag = "kind")]
pub enum InMemoryLayerInfo { pub enum InMemoryLayerInfo {
Open { lsn_start: Lsn }, Open {
Frozen { lsn_start: Lsn, lsn_end: Lsn }, #[serde_as(as = "DisplayFromStr")]
lsn_start: Lsn,
},
Frozen {
#[serde_as(as = "DisplayFromStr")]
lsn_start: Lsn,
#[serde_as(as = "DisplayFromStr")]
lsn_end: Lsn,
},
} }
#[serde_as]
#[derive(Debug, Clone, Serialize)] #[derive(Debug, Clone, Serialize)]
#[serde(tag = "kind")] #[serde(tag = "kind")]
pub enum HistoricLayerInfo { pub enum HistoricLayerInfo {
@@ -511,7 +548,9 @@ pub enum HistoricLayerInfo {
layer_file_name: String, layer_file_name: String,
layer_file_size: u64, layer_file_size: u64,
#[serde_as(as = "DisplayFromStr")]
lsn_start: Lsn, lsn_start: Lsn,
#[serde_as(as = "DisplayFromStr")]
lsn_end: Lsn, lsn_end: Lsn,
remote: bool, remote: bool,
access_stats: LayerAccessStats, access_stats: LayerAccessStats,
@@ -520,6 +559,7 @@ pub enum HistoricLayerInfo {
layer_file_name: String, layer_file_name: String,
layer_file_size: u64, layer_file_size: u64,
#[serde_as(as = "DisplayFromStr")]
lsn_start: Lsn, lsn_start: Lsn,
remote: bool, remote: bool,
access_stats: LayerAccessStats, access_stats: LayerAccessStats,

View File

@@ -2,8 +2,6 @@
//! To use, create PostgresBackend and run() it, passing the Handler //! To use, create PostgresBackend and run() it, passing the Handler
//! implementation determining how to process the queries. Currently its API //! implementation determining how to process the queries. Currently its API
//! is rather narrow, but we can extend it once required. //! is rather narrow, but we can extend it once required.
#![deny(unsafe_code)]
#![deny(clippy::undocumented_unsafe_blocks)]
use anyhow::Context; use anyhow::Context;
use bytes::Bytes; use bytes::Bytes;
use futures::pin_mut; use futures::pin_mut;
@@ -17,7 +15,7 @@ use std::{fmt, io};
use std::{future::Future, str::FromStr}; use std::{future::Future, str::FromStr};
use tokio::io::{AsyncRead, AsyncWrite}; use tokio::io::{AsyncRead, AsyncWrite};
use tokio_rustls::TlsAcceptor; use tokio_rustls::TlsAcceptor;
use tracing::{debug, error, info, trace, warn}; use tracing::{debug, error, info, trace};
use pq_proto::framed::{ConnectionError, Framed, FramedReader, FramedWriter}; use pq_proto::framed::{ConnectionError, Framed, FramedReader, FramedWriter};
use pq_proto::{ use pq_proto::{
@@ -35,11 +33,6 @@ pub enum QueryError {
/// We were instructed to shutdown while processing the query /// We were instructed to shutdown while processing the query
#[error("Shutting down")] #[error("Shutting down")]
Shutdown, Shutdown,
/// Authentication failure
#[error("Unauthorized: {0}")]
Unauthorized(std::borrow::Cow<'static, str>),
#[error("Simulated Connection Error")]
SimulatedConnectionError,
/// Some other error /// Some other error
#[error(transparent)] #[error(transparent)]
Other(#[from] anyhow::Error), Other(#[from] anyhow::Error),
@@ -54,9 +47,8 @@ impl From<io::Error> for QueryError {
impl QueryError { impl QueryError {
pub fn pg_error_code(&self) -> &'static [u8; 5] { pub fn pg_error_code(&self) -> &'static [u8; 5] {
match self { match self {
Self::Disconnected(_) | Self::SimulatedConnectionError => b"08006", // connection failure Self::Disconnected(_) => b"08006", // connection failure
Self::Shutdown => SQLSTATE_ADMIN_SHUTDOWN, Self::Shutdown => SQLSTATE_ADMIN_SHUTDOWN,
Self::Unauthorized(_) => SQLSTATE_INTERNAL_ERROR,
Self::Other(_) => SQLSTATE_INTERNAL_ERROR, // internal error Self::Other(_) => SQLSTATE_INTERNAL_ERROR, // internal error
} }
} }
@@ -250,7 +242,6 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> MaybeWriteOnly<IO> {
} }
} }
/// Cancellation safe as long as the underlying IO is cancellation safe.
async fn shutdown(&mut self) -> io::Result<()> { async fn shutdown(&mut self) -> io::Result<()> {
match self { match self {
MaybeWriteOnly::Full(framed) => framed.shutdown().await, MaybeWriteOnly::Full(framed) => framed.shutdown().await,
@@ -402,23 +393,13 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
shutdown_watcher: F, shutdown_watcher: F,
) -> Result<(), QueryError> ) -> Result<(), QueryError>
where where
F: Fn() -> S + Clone, F: Fn() -> S,
S: Future, S: Future,
{ {
let ret = self let ret = self.run_message_loop(handler, shutdown_watcher).await;
.run_message_loop(handler, shutdown_watcher.clone()) // socket might be already closed, e.g. if previously received error,
.await; // so ignore result.
self.framed.shutdown().await.ok();
tokio::select! {
_ = shutdown_watcher() => {
// do nothing; we most likely got already stopped by shutdown and will log it next.
}
_ = self.framed.shutdown() => {
// socket might be already closed, e.g. if previously received error,
// so ignore result.
},
}
match ret { match ret {
Ok(()) => Ok(()), Ok(()) => Ok(()),
Err(QueryError::Shutdown) => { Err(QueryError::Shutdown) => {
@@ -616,7 +597,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
if let Err(e) = handler.check_auth_jwt(self, jwt_response) { if let Err(e) = handler.check_auth_jwt(self, jwt_response) {
self.write_message_noflush(&BeMessage::ErrorResponse( self.write_message_noflush(&BeMessage::ErrorResponse(
&short_error(&e), &e.to_string(),
Some(e.pg_error_code()), Some(e.pg_error_code()),
))?; ))?;
return Err(e); return Err(e);
@@ -736,20 +717,12 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
trace!("got query {query_string:?}"); trace!("got query {query_string:?}");
if let Err(e) = handler.process_query(self, query_string).await { if let Err(e) = handler.process_query(self, query_string).await {
match e { log_query_error(query_string, &e);
QueryError::Shutdown => return Ok(ProcessMsgResult::Break), let short_error = short_error(&e);
QueryError::SimulatedConnectionError => { self.write_message_noflush(&BeMessage::ErrorResponse(
return Err(QueryError::SimulatedConnectionError) &short_error,
} Some(e.pg_error_code()),
e => { ))?;
log_query_error(query_string, &e);
let short_error = short_error(&e);
self.write_message_noflush(&BeMessage::ErrorResponse(
&short_error,
Some(e.pg_error_code()),
))?;
}
}
} }
self.write_message_noflush(&BeMessage::ReadyForQuery)?; self.write_message_noflush(&BeMessage::ReadyForQuery)?;
} }
@@ -975,8 +948,6 @@ pub fn short_error(e: &QueryError) -> String {
match e { match e {
QueryError::Disconnected(connection_error) => connection_error.to_string(), QueryError::Disconnected(connection_error) => connection_error.to_string(),
QueryError::Shutdown => "shutdown".to_string(), QueryError::Shutdown => "shutdown".to_string(),
QueryError::Unauthorized(_e) => "JWT authentication error".to_string(),
QueryError::SimulatedConnectionError => "simulated connection error".to_string(),
QueryError::Other(e) => format!("{e:#}"), QueryError::Other(e) => format!("{e:#}"),
} }
} }
@@ -993,15 +964,9 @@ fn log_query_error(query: &str, e: &QueryError) {
QueryError::Disconnected(other_connection_error) => { QueryError::Disconnected(other_connection_error) => {
error!("query handler for '{query}' failed with connection error: {other_connection_error:?}") error!("query handler for '{query}' failed with connection error: {other_connection_error:?}")
} }
QueryError::SimulatedConnectionError => {
error!("query handler for query '{query}' failed due to a simulated connection error")
}
QueryError::Shutdown => { QueryError::Shutdown => {
info!("query handler for '{query}' cancelled during tenant shutdown") info!("query handler for '{query}' cancelled during tenant shutdown")
} }
QueryError::Unauthorized(e) => {
warn!("query handler for '{query}' failed with authentication error: {e}");
}
QueryError::Other(e) => { QueryError::Other(e) => {
error!("query handler for '{query}' failed: {e:?}"); error!("query handler for '{query}' failed: {e:?}");
} }

View File

@@ -1,5 +1,3 @@
#![deny(unsafe_code)]
#![deny(clippy::undocumented_unsafe_blocks)]
use anyhow::{bail, Context}; use anyhow::{bail, Context};
use itertools::Itertools; use itertools::Itertools;
use std::borrow::Cow; use std::borrow::Cow;

View File

@@ -8,7 +8,6 @@
// modules included with the postgres_ffi macro depend on the types of the specific version's // modules included with the postgres_ffi macro depend on the types of the specific version's
// types, and trigger a too eager lint. // types, and trigger a too eager lint.
#![allow(clippy::duplicate_mod)] #![allow(clippy::duplicate_mod)]
#![deny(clippy::undocumented_unsafe_blocks)]
use bytes::Bytes; use bytes::Bytes;
use utils::bin_ser::SerializeError; use utils::bin_ser::SerializeError;
@@ -21,7 +20,6 @@ macro_rules! postgres_ffi {
pub mod bindings { pub mod bindings {
// bindgen generates bindings for a lot of stuff we don't need // bindgen generates bindings for a lot of stuff we don't need
#![allow(dead_code)] #![allow(dead_code)]
#![allow(clippy::undocumented_unsafe_blocks)]
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
include!(concat!( include!(concat!(

View File

@@ -14,7 +14,6 @@ macro_rules! xlog_utils_test {
($version:ident) => { ($version:ident) => {
#[path = "."] #[path = "."]
mod $version { mod $version {
#[allow(unused_imports)]
pub use postgres_ffi::$version::wal_craft_test_export::*; pub use postgres_ffi::$version::wal_craft_test_export::*;
#[allow(clippy::duplicate_mod)] #[allow(clippy::duplicate_mod)]
#[cfg(test)] #[cfg(test)]

View File

@@ -214,24 +214,27 @@ where
} }
} }
/// Cancellation safe as long as the AsyncWrite is cancellation safe.
async fn flush<S: AsyncWrite + Unpin>( async fn flush<S: AsyncWrite + Unpin>(
stream: &mut S, stream: &mut S,
write_buf: &mut BytesMut, write_buf: &mut BytesMut,
) -> Result<(), io::Error> { ) -> Result<(), io::Error> {
while write_buf.has_remaining() { while write_buf.has_remaining() {
let bytes_written = stream.write_buf(write_buf).await?; let bytes_written = stream.write(write_buf.chunk()).await?;
if bytes_written == 0 { if bytes_written == 0 {
return Err(io::Error::new( return Err(io::Error::new(
ErrorKind::WriteZero, ErrorKind::WriteZero,
"failed to write message", "failed to write message",
)); ));
} }
// The advanced part will be garbage collected, likely during shifting
// data left on next attempt to write to buffer when free space is not
// enough.
write_buf.advance(bytes_written);
} }
write_buf.clear();
stream.flush().await stream.flush().await
} }
/// Cancellation safe as long as the AsyncWrite is cancellation safe.
async fn shutdown<S: AsyncWrite + Unpin>( async fn shutdown<S: AsyncWrite + Unpin>(
stream: &mut S, stream: &mut S,
write_buf: &mut BytesMut, write_buf: &mut BytesMut,

View File

@@ -1,7 +1,6 @@
//! Postgres protocol messages serialization-deserialization. See //! Postgres protocol messages serialization-deserialization. See
//! <https://www.postgresql.org/docs/devel/protocol-message-formats.html> //! <https://www.postgresql.org/docs/devel/protocol-message-formats.html>
//! on message formats. //! on message formats.
#![deny(clippy::undocumented_unsafe_blocks)]
pub mod framed; pub mod framed;

View File

@@ -8,7 +8,6 @@ license.workspace = true
anyhow.workspace = true anyhow.workspace = true
async-trait.workspace = true async-trait.workspace = true
once_cell.workspace = true once_cell.workspace = true
aws-smithy-async.workspace = true
aws-smithy-http.workspace = true aws-smithy-http.workspace = true
aws-types.workspace = true aws-types.workspace = true
aws-config.workspace = true aws-config.workspace = true

View File

@@ -1,18 +1,21 @@
//! Azure Blob Storage wrapper //! Azure Blob Storage wrapper
use std::collections::HashMap;
use std::env; use std::env;
use std::num::NonZeroU32; use std::num::NonZeroU32;
use std::sync::Arc; use std::sync::Arc;
use std::{borrow::Cow, io::Cursor}; use std::{borrow::Cow, collections::HashMap, io::Cursor};
use super::REMOTE_STORAGE_PREFIX_SEPARATOR; use super::REMOTE_STORAGE_PREFIX_SEPARATOR;
use anyhow::Result; use anyhow::Result;
use azure_core::request_options::{MaxResults, Metadata, Range}; use azure_core::request_options::{MaxResults, Metadata, Range};
use azure_core::Header;
use azure_identity::DefaultAzureCredential; use azure_identity::DefaultAzureCredential;
use azure_storage::StorageCredentials; use azure_storage::StorageCredentials;
use azure_storage_blobs::prelude::ClientBuilder; use azure_storage_blobs::prelude::ClientBuilder;
use azure_storage_blobs::{blob::operations::GetBlobBuilder, prelude::ContainerClient}; use azure_storage_blobs::{
blob::operations::GetBlobBuilder,
prelude::{BlobClient, ContainerClient},
};
use futures_util::StreamExt; use futures_util::StreamExt;
use http_types::StatusCode; use http_types::StatusCode;
use tokio::io::AsyncRead; use tokio::io::AsyncRead;
@@ -20,8 +23,8 @@ use tracing::debug;
use crate::s3_bucket::RequestKind; use crate::s3_bucket::RequestKind;
use crate::{ use crate::{
AzureConfig, ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, RemotePath, AzureConfig, ConcurrencyLimiter, Download, DownloadError, RemotePath, RemoteStorage,
RemoteStorage, StorageMetadata, StorageMetadata,
}; };
pub struct AzureBlobStorage { pub struct AzureBlobStorage {
@@ -109,19 +112,16 @@ impl AzureBlobStorage {
async fn download_for_builder( async fn download_for_builder(
&self, &self,
metadata: StorageMetadata,
builder: GetBlobBuilder, builder: GetBlobBuilder,
) -> Result<Download, DownloadError> { ) -> Result<Download, DownloadError> {
let mut response = builder.into_stream(); let mut response = builder.into_stream();
let mut metadata = HashMap::new();
// TODO give proper streaming response instead of buffering into RAM // TODO give proper streaming response instead of buffering into RAM
// https://github.com/neondatabase/neon/issues/5563 // https://github.com/neondatabase/neon/issues/5563
let mut buf = Vec::new(); let mut buf = Vec::new();
while let Some(part) = response.next().await { while let Some(part) = response.next().await {
let part = part.map_err(to_download_error)?; let part = part.map_err(to_download_error)?;
if let Some(blob_meta) = part.blob.metadata {
metadata.extend(blob_meta.iter().map(|(k, v)| (k.to_owned(), v.to_owned())));
}
let data = part let data = part
.data .data
.collect() .collect()
@@ -131,9 +131,28 @@ impl AzureBlobStorage {
} }
Ok(Download { Ok(Download {
download_stream: Box::pin(Cursor::new(buf)), download_stream: Box::pin(Cursor::new(buf)),
metadata: Some(StorageMetadata(metadata)), metadata: Some(metadata),
}) })
} }
// TODO get rid of this function once we have metadata included in the response
// https://github.com/Azure/azure-sdk-for-rust/issues/1439
async fn get_metadata(
&self,
blob_client: &BlobClient,
) -> Result<StorageMetadata, DownloadError> {
let builder = blob_client.get_metadata();
let response = builder.into_future().await.map_err(to_download_error)?;
let mut map = HashMap::new();
for md in response.metadata.iter() {
map.insert(
md.name().as_str().to_string(),
md.value().as_str().to_string(),
);
}
Ok(StorageMetadata(map))
}
async fn permit(&self, kind: RequestKind) -> tokio::sync::SemaphorePermit<'_> { async fn permit(&self, kind: RequestKind) -> tokio::sync::SemaphorePermit<'_> {
self.concurrency_limiter self.concurrency_limiter
@@ -165,11 +184,10 @@ fn to_download_error(error: azure_core::Error) -> DownloadError {
#[async_trait::async_trait] #[async_trait::async_trait]
impl RemoteStorage for AzureBlobStorage { impl RemoteStorage for AzureBlobStorage {
async fn list( async fn list_prefixes(
&self, &self,
prefix: Option<&RemotePath>, prefix: Option<&RemotePath>,
mode: ListingMode, ) -> Result<Vec<RemotePath>, DownloadError> {
) -> anyhow::Result<Listing, DownloadError> {
// get the passed prefix or if it is not set use prefix_in_bucket value // get the passed prefix or if it is not set use prefix_in_bucket value
let list_prefix = prefix let list_prefix = prefix
.map(|p| self.relative_path_to_name(p)) .map(|p| self.relative_path_to_name(p))
@@ -177,19 +195,16 @@ impl RemoteStorage for AzureBlobStorage {
.map(|mut p| { .map(|mut p| {
// required to end with a separator // required to end with a separator
// otherwise request will return only the entry of a prefix // otherwise request will return only the entry of a prefix
if matches!(mode, ListingMode::WithDelimiter) if !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
&& !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR)
{
p.push(REMOTE_STORAGE_PREFIX_SEPARATOR); p.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
} }
p p
}); });
let mut builder = self.client.list_blobs(); let mut builder = self
.client
if let ListingMode::WithDelimiter = mode { .list_blobs()
builder = builder.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string()); .delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string());
}
if let Some(prefix) = list_prefix { if let Some(prefix) = list_prefix {
builder = builder.prefix(Cow::from(prefix.to_owned())); builder = builder.prefix(Cow::from(prefix.to_owned()));
@@ -200,23 +215,46 @@ impl RemoteStorage for AzureBlobStorage {
} }
let mut response = builder.into_stream(); let mut response = builder.into_stream();
let mut res = Listing::default(); let mut res = Vec::new();
while let Some(l) = response.next().await { while let Some(entry) = response.next().await {
let entry = l.map_err(to_download_error)?; let entry = entry.map_err(to_download_error)?;
let prefix_iter = entry let name_iter = entry
.blobs .blobs
.prefixes() .prefixes()
.map(|prefix| self.name_to_relative_path(&prefix.name)); .map(|prefix| self.name_to_relative_path(&prefix.name));
res.prefixes.extend(prefix_iter); res.extend(name_iter);
let blob_iter = entry
.blobs
.blobs()
.map(|k| self.name_to_relative_path(&k.name));
res.keys.extend(blob_iter);
} }
Ok(res) Ok(res)
} }
async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
let folder_name = folder
.map(|p| self.relative_path_to_name(p))
.or_else(|| self.prefix_in_container.clone());
let mut builder = self.client.list_blobs();
if let Some(folder_name) = folder_name {
builder = builder.prefix(Cow::from(folder_name.to_owned()));
}
if let Some(limit) = self.max_keys_per_list_response {
builder = builder.max_results(MaxResults::new(limit));
}
let mut response = builder.into_stream();
let mut res = Vec::new();
while let Some(l) = response.next().await {
let entry = l.map_err(anyhow::Error::new)?;
let name_iter = entry
.blobs
.blobs()
.map(|bl| self.name_to_relative_path(&bl.name));
res.extend(name_iter);
}
Ok(res)
}
async fn upload( async fn upload(
&self, &self,
mut from: impl AsyncRead + Unpin + Send + Sync + 'static, mut from: impl AsyncRead + Unpin + Send + Sync + 'static,
@@ -250,9 +288,11 @@ impl RemoteStorage for AzureBlobStorage {
let _permit = self.permit(RequestKind::Get).await; let _permit = self.permit(RequestKind::Get).await;
let blob_client = self.client.blob_client(self.relative_path_to_name(from)); let blob_client = self.client.blob_client(self.relative_path_to_name(from));
let metadata = self.get_metadata(&blob_client).await?;
let builder = blob_client.get(); let builder = blob_client.get();
self.download_for_builder(builder).await self.download_for_builder(metadata, builder).await
} }
async fn download_byte_range( async fn download_byte_range(
@@ -264,6 +304,8 @@ impl RemoteStorage for AzureBlobStorage {
let _permit = self.permit(RequestKind::Get).await; let _permit = self.permit(RequestKind::Get).await;
let blob_client = self.client.blob_client(self.relative_path_to_name(from)); let blob_client = self.client.blob_client(self.relative_path_to_name(from));
let metadata = self.get_metadata(&blob_client).await?;
let mut builder = blob_client.get(); let mut builder = blob_client.get();
if let Some(end_exclusive) = end_exclusive { if let Some(end_exclusive) = end_exclusive {
@@ -278,7 +320,7 @@ impl RemoteStorage for AzureBlobStorage {
builder = builder.range(Range::new(start_inclusive, end_exclusive)); builder = builder.range(Range::new(start_inclusive, end_exclusive));
} }
self.download_for_builder(builder).await self.download_for_builder(metadata, builder).await
} }
async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> { async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {

View File

@@ -6,15 +6,19 @@
//! * [`s3_bucket`] uses AWS S3 bucket as an external storage //! * [`s3_bucket`] uses AWS S3 bucket as an external storage
//! * [`azure_blob`] allows to use Azure Blob storage as an external storage //! * [`azure_blob`] allows to use Azure Blob storage as an external storage
//! //!
#![deny(unsafe_code)]
#![deny(clippy::undocumented_unsafe_blocks)]
mod azure_blob; mod azure_blob;
mod local_fs; mod local_fs;
mod s3_bucket; mod s3_bucket;
mod simulate_failures; mod simulate_failures;
use std::{collections::HashMap, fmt::Debug, num::NonZeroUsize, pin::Pin, sync::Arc}; use std::{
collections::HashMap,
fmt::Debug,
num::{NonZeroU32, NonZeroUsize},
pin::Pin,
sync::Arc,
};
use anyhow::{bail, Context}; use anyhow::{bail, Context};
use camino::{Utf8Path, Utf8PathBuf}; use camino::{Utf8Path, Utf8PathBuf};
@@ -30,6 +34,12 @@ pub use self::{
}; };
use s3_bucket::RequestKind; use s3_bucket::RequestKind;
/// How many different timelines can be processed simultaneously when synchronizing layers with the remote storage.
/// During regular work, pageserver produces one layer file per timeline checkpoint, with bursts of concurrency
/// during start (where local and remote timelines are compared and initial sync tasks are scheduled) and timeline attach.
/// Both cases may trigger timeline download, that might download a lot of layers. This concurrency is limited by the clients internally, if needed.
pub const DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS: usize = 50;
pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10;
/// Currently, sync happens with AWS S3, that has two limits on requests per second: /// Currently, sync happens with AWS S3, that has two limits on requests per second:
/// ~200 RPS for IAM services /// ~200 RPS for IAM services
/// <https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/UsingWithRDS.IAMDBAuth.html> /// <https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/UsingWithRDS.IAMDBAuth.html>
@@ -119,22 +129,6 @@ impl RemotePath {
} }
} }
/// We don't need callers to be able to pass arbitrary delimiters: just control
/// whether listings will use a '/' separator or not.
///
/// The WithDelimiter mode will populate `prefixes` and `keys` in the result. The
/// NoDelimiter mode will only populate `keys`.
pub enum ListingMode {
WithDelimiter,
NoDelimiter,
}
#[derive(Default)]
pub struct Listing {
pub prefixes: Vec<RemotePath>,
pub keys: Vec<RemotePath>,
}
/// Storage (potentially remote) API to manage its state. /// Storage (potentially remote) API to manage its state.
/// This storage tries to be unaware of any layered repository context, /// This storage tries to be unaware of any layered repository context,
/// providing basic CRUD operations for storage files. /// providing basic CRUD operations for storage files.
@@ -147,13 +141,8 @@ pub trait RemoteStorage: Send + Sync + 'static {
async fn list_prefixes( async fn list_prefixes(
&self, &self,
prefix: Option<&RemotePath>, prefix: Option<&RemotePath>,
) -> Result<Vec<RemotePath>, DownloadError> { ) -> Result<Vec<RemotePath>, DownloadError>;
let result = self
.list(prefix, ListingMode::WithDelimiter)
.await?
.prefixes;
Ok(result)
}
/// Lists all files in directory "recursively" /// Lists all files in directory "recursively"
/// (not really recursively, because AWS has a flat namespace) /// (not really recursively, because AWS has a flat namespace)
/// Note: This is subtely different than list_prefixes, /// Note: This is subtely different than list_prefixes,
@@ -165,16 +154,7 @@ pub trait RemoteStorage: Send + Sync + 'static {
/// whereas, /// whereas,
/// list_prefixes("foo/bar/") = ["cat", "dog"] /// list_prefixes("foo/bar/") = ["cat", "dog"]
/// See `test_real_s3.rs` for more details. /// See `test_real_s3.rs` for more details.
async fn list_files(&self, prefix: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> { async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>>;
let result = self.list(prefix, ListingMode::NoDelimiter).await?.keys;
Ok(result)
}
async fn list(
&self,
prefix: Option<&RemotePath>,
_mode: ListingMode,
) -> anyhow::Result<Listing, DownloadError>;
/// Streams the local file contents into remote into the remote storage entry. /// Streams the local file contents into remote into the remote storage entry.
async fn upload( async fn upload(
@@ -225,9 +205,6 @@ pub enum DownloadError {
BadInput(anyhow::Error), BadInput(anyhow::Error),
/// The file was not found in the remote storage. /// The file was not found in the remote storage.
NotFound, NotFound,
/// A cancellation token aborted the download, typically during
/// tenant detach or process shutdown.
Cancelled,
/// The file was found in the remote storage, but the download failed. /// The file was found in the remote storage, but the download failed.
Other(anyhow::Error), Other(anyhow::Error),
} }
@@ -238,7 +215,6 @@ impl std::fmt::Display for DownloadError {
DownloadError::BadInput(e) => { DownloadError::BadInput(e) => {
write!(f, "Failed to download a remote file due to user input: {e}") write!(f, "Failed to download a remote file due to user input: {e}")
} }
DownloadError::Cancelled => write!(f, "Cancelled, shutting down"),
DownloadError::NotFound => write!(f, "No file found for the remote object id given"), DownloadError::NotFound => write!(f, "No file found for the remote object id given"),
DownloadError::Other(e) => write!(f, "Failed to download a remote file: {e:?}"), DownloadError::Other(e) => write!(f, "Failed to download a remote file: {e:?}"),
} }
@@ -258,19 +234,6 @@ pub enum GenericRemoteStorage {
} }
impl GenericRemoteStorage { impl GenericRemoteStorage {
pub async fn list(
&self,
prefix: Option<&RemotePath>,
mode: ListingMode,
) -> anyhow::Result<Listing, DownloadError> {
match self {
Self::LocalFs(s) => s.list(prefix, mode).await,
Self::AwsS3(s) => s.list(prefix, mode).await,
Self::AzureBlob(s) => s.list(prefix, mode).await,
Self::Unreliable(s) => s.list(prefix, mode).await,
}
}
// A function for listing all the files in a "directory" // A function for listing all the files in a "directory"
// Example: // Example:
// list_files("foo/bar") = ["foo/bar/a.txt", "foo/bar/b.txt"] // list_files("foo/bar") = ["foo/bar/a.txt", "foo/bar/b.txt"]
@@ -431,6 +394,10 @@ pub struct StorageMetadata(HashMap<String, String>);
/// External backup storage configuration, enough for creating a client for that storage. /// External backup storage configuration, enough for creating a client for that storage.
#[derive(Debug, Clone, PartialEq, Eq)] #[derive(Debug, Clone, PartialEq, Eq)]
pub struct RemoteStorageConfig { pub struct RemoteStorageConfig {
/// Max allowed number of concurrent sync operations between the API user and the remote storage.
pub max_concurrent_syncs: NonZeroUsize,
/// Max allowed errors before the sync task is considered failed and evicted.
pub max_sync_errors: NonZeroU32,
/// The storage connection configuration. /// The storage connection configuration.
pub storage: RemoteStorageKind, pub storage: RemoteStorageKind,
} }
@@ -526,6 +493,18 @@ impl RemoteStorageConfig {
let use_azure = container_name.is_some() && container_region.is_some(); let use_azure = container_name.is_some() && container_region.is_some();
let max_concurrent_syncs = NonZeroUsize::new(
parse_optional_integer("max_concurrent_syncs", toml)?
.unwrap_or(DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS),
)
.context("Failed to parse 'max_concurrent_syncs' as a positive integer")?;
let max_sync_errors = NonZeroU32::new(
parse_optional_integer("max_sync_errors", toml)?
.unwrap_or(DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS),
)
.context("Failed to parse 'max_sync_errors' as a positive integer")?;
let default_concurrency_limit = if use_azure { let default_concurrency_limit = if use_azure {
DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT
} else { } else {
@@ -607,7 +586,11 @@ impl RemoteStorageConfig {
} }
}; };
Ok(Some(RemoteStorageConfig { storage })) Ok(Some(RemoteStorageConfig {
max_concurrent_syncs,
max_sync_errors,
storage,
}))
} }
} }

View File

@@ -15,7 +15,7 @@ use tokio::{
use tracing::*; use tracing::*;
use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty}; use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty};
use crate::{Download, DownloadError, Listing, ListingMode, RemotePath}; use crate::{Download, DownloadError, RemotePath};
use super::{RemoteStorage, StorageMetadata}; use super::{RemoteStorage, StorageMetadata};
@@ -75,7 +75,7 @@ impl LocalFs {
} }
#[cfg(test)] #[cfg(test)]
async fn list_all(&self) -> anyhow::Result<Vec<RemotePath>> { async fn list(&self) -> anyhow::Result<Vec<RemotePath>> {
Ok(get_all_files(&self.storage_root, true) Ok(get_all_files(&self.storage_root, true)
.await? .await?
.into_iter() .into_iter()
@@ -89,10 +89,52 @@ impl LocalFs {
}) })
.collect()) .collect())
} }
}
#[async_trait::async_trait]
impl RemoteStorage for LocalFs {
async fn list_prefixes(
&self,
prefix: Option<&RemotePath>,
) -> Result<Vec<RemotePath>, DownloadError> {
let path = match prefix {
Some(prefix) => Cow::Owned(prefix.with_base(&self.storage_root)),
None => Cow::Borrowed(&self.storage_root),
};
let prefixes_to_filter = get_all_files(path.as_ref(), false)
.await
.map_err(DownloadError::Other)?;
let mut prefixes = Vec::with_capacity(prefixes_to_filter.len());
// filter out empty directories to mirror s3 behavior.
for prefix in prefixes_to_filter {
if prefix.is_dir()
&& is_directory_empty(&prefix)
.await
.map_err(DownloadError::Other)?
{
continue;
}
prefixes.push(
prefix
.strip_prefix(&self.storage_root)
.context("Failed to strip prefix")
.and_then(RemotePath::new)
.expect(
"We list files for storage root, hence should be able to remote the prefix",
),
)
}
Ok(prefixes)
}
// recursively lists all files in a directory, // recursively lists all files in a directory,
// mirroring the `list_files` for `s3_bucket` // mirroring the `list_files` for `s3_bucket`
async fn list_recursive(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> { async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
let full_path = match folder { let full_path = match folder {
Some(folder) => folder.with_base(&self.storage_root), Some(folder) => folder.with_base(&self.storage_root),
None => self.storage_root.clone(), None => self.storage_root.clone(),
@@ -144,70 +186,6 @@ impl LocalFs {
Ok(files) Ok(files)
} }
}
#[async_trait::async_trait]
impl RemoteStorage for LocalFs {
async fn list(
&self,
prefix: Option<&RemotePath>,
mode: ListingMode,
) -> Result<Listing, DownloadError> {
let mut result = Listing::default();
if let ListingMode::NoDelimiter = mode {
let keys = self
.list_recursive(prefix)
.await
.map_err(DownloadError::Other)?;
result.keys = keys
.into_iter()
.filter(|k| {
let path = k.with_base(&self.storage_root);
!path.is_dir()
})
.collect();
return Ok(result);
}
let path = match prefix {
Some(prefix) => Cow::Owned(prefix.with_base(&self.storage_root)),
None => Cow::Borrowed(&self.storage_root),
};
let prefixes_to_filter = get_all_files(path.as_ref(), false)
.await
.map_err(DownloadError::Other)?;
// filter out empty directories to mirror s3 behavior.
for prefix in prefixes_to_filter {
if prefix.is_dir()
&& is_directory_empty(&prefix)
.await
.map_err(DownloadError::Other)?
{
continue;
}
let stripped = prefix
.strip_prefix(&self.storage_root)
.context("Failed to strip prefix")
.and_then(RemotePath::new)
.expect(
"We list files for storage root, hence should be able to remote the prefix",
);
if prefix.is_dir() {
result.prefixes.push(stripped);
} else {
result.keys.push(stripped);
}
}
Ok(result)
}
async fn upload( async fn upload(
&self, &self,
@@ -501,7 +479,7 @@ mod fs_tests {
let target_path_1 = upload_dummy_file(&storage, "upload_1", None).await?; let target_path_1 = upload_dummy_file(&storage, "upload_1", None).await?;
assert_eq!( assert_eq!(
storage.list_all().await?, storage.list().await?,
vec![target_path_1.clone()], vec![target_path_1.clone()],
"Should list a single file after first upload" "Should list a single file after first upload"
); );
@@ -689,7 +667,7 @@ mod fs_tests {
let upload_target = upload_dummy_file(&storage, upload_name, None).await?; let upload_target = upload_dummy_file(&storage, upload_name, None).await?;
storage.delete(&upload_target).await?; storage.delete(&upload_target).await?;
assert!(storage.list_all().await?.is_empty()); assert!(storage.list().await?.is_empty());
storage storage
.delete(&upload_target) .delete(&upload_target)
@@ -747,43 +725,6 @@ mod fs_tests {
Ok(()) Ok(())
} }
#[tokio::test]
async fn list() -> anyhow::Result<()> {
// No delimiter: should recursively list everything
let storage = create_storage()?;
let child = upload_dummy_file(&storage, "grandparent/parent/child", None).await?;
let uncle = upload_dummy_file(&storage, "grandparent/uncle", None).await?;
let listing = storage.list(None, ListingMode::NoDelimiter).await?;
assert!(listing.prefixes.is_empty());
assert_eq!(listing.keys, [uncle.clone(), child.clone()].to_vec());
// Delimiter: should only go one deep
let listing = storage.list(None, ListingMode::WithDelimiter).await?;
assert_eq!(
listing.prefixes,
[RemotePath::from_string("timelines").unwrap()].to_vec()
);
assert!(listing.keys.is_empty());
// Delimiter & prefix
let listing = storage
.list(
Some(&RemotePath::from_string("timelines/some_timeline/grandparent").unwrap()),
ListingMode::WithDelimiter,
)
.await?;
assert_eq!(
listing.prefixes,
[RemotePath::from_string("timelines/some_timeline/grandparent/parent").unwrap()]
.to_vec()
);
assert_eq!(listing.keys, [uncle.clone()].to_vec());
Ok(())
}
async fn upload_dummy_file( async fn upload_dummy_file(
storage: &LocalFs, storage: &LocalFs,
name: &str, name: &str,
@@ -836,7 +777,7 @@ mod fs_tests {
} }
async fn list_files_sorted(storage: &LocalFs) -> anyhow::Result<Vec<RemotePath>> { async fn list_files_sorted(storage: &LocalFs) -> anyhow::Result<Vec<RemotePath>> {
let mut files = storage.list_all().await?; let mut files = storage.list().await?;
files.sort_by(|a, b| a.0.cmp(&b.0)); files.sort_by(|a, b| a.0.cmp(&b.0));
Ok(files) Ok(files)
} }

View File

@@ -4,27 +4,23 @@
//! allowing multiple api users to independently work with the same S3 bucket, if //! allowing multiple api users to independently work with the same S3 bucket, if
//! their bucket prefixes are both specified and different. //! their bucket prefixes are both specified and different.
use std::{borrow::Cow, sync::Arc}; use std::borrow::Cow;
use anyhow::Context; use anyhow::Context;
use aws_config::{ use aws_config::{
environment::credentials::EnvironmentVariableCredentialsProvider, environment::credentials::EnvironmentVariableCredentialsProvider,
imds::credentials::ImdsCredentialsProvider, imds::credentials::ImdsCredentialsProvider, meta::credentials::CredentialsProviderChain,
meta::credentials::CredentialsProviderChain, provider_config::ProviderConfig, web_identity_token::WebIdentityTokenCredentialsProvider,
provider_config::ProviderConfig,
retry::{RetryConfigBuilder, RetryMode},
web_identity_token::WebIdentityTokenCredentialsProvider,
}; };
use aws_credential_types::cache::CredentialsCache; use aws_credential_types::cache::CredentialsCache;
use aws_sdk_s3::{ use aws_sdk_s3::{
config::{AsyncSleep, Config, Region, SharedAsyncSleep}, config::{Config, Region},
error::SdkError, error::SdkError,
operation::get_object::GetObjectError, operation::get_object::GetObjectError,
primitives::ByteStream, primitives::ByteStream,
types::{Delete, ObjectIdentifier}, types::{Delete, ObjectIdentifier},
Client, Client,
}; };
use aws_smithy_async::rt::sleep::TokioSleep;
use aws_smithy_http::body::SdkBody; use aws_smithy_http::body::SdkBody;
use hyper::Body; use hyper::Body;
use scopeguard::ScopeGuard; use scopeguard::ScopeGuard;
@@ -34,8 +30,8 @@ use tracing::debug;
use super::StorageMetadata; use super::StorageMetadata;
use crate::{ use crate::{
ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorage, ConcurrencyLimiter, Download, DownloadError, RemotePath, RemoteStorage, S3Config,
S3Config, MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR, MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR,
}; };
pub(super) mod metrics; pub(super) mod metrics;
@@ -87,23 +83,10 @@ impl S3Bucket {
.or_else("imds", ImdsCredentialsProvider::builder().build()) .or_else("imds", ImdsCredentialsProvider::builder().build())
}; };
// AWS SDK requires us to specify how the RetryConfig should sleep when it wants to back off
let sleep_impl: Arc<dyn AsyncSleep> = Arc::new(TokioSleep::new());
// We do our own retries (see [`backoff::retry`]). However, for the AWS SDK to enable rate limiting in response to throttling
// responses (e.g. 429 on too many ListObjectsv2 requests), we must provide a retry config. We set it to use at most one
// attempt, and enable 'Adaptive' mode, which causes rate limiting to be enabled.
let mut retry_config = RetryConfigBuilder::new();
retry_config
.set_max_attempts(Some(1))
.set_mode(Some(RetryMode::Adaptive));
let mut config_builder = Config::builder() let mut config_builder = Config::builder()
.region(region) .region(region)
.credentials_cache(CredentialsCache::lazy()) .credentials_cache(CredentialsCache::lazy())
.credentials_provider(credentials_provider) .credentials_provider(credentials_provider);
.sleep_impl(SharedAsyncSleep::from(sleep_impl))
.retry_config(retry_config.build());
if let Some(custom_endpoint) = aws_config.endpoint.clone() { if let Some(custom_endpoint) = aws_config.endpoint.clone() {
config_builder = config_builder config_builder = config_builder
@@ -316,13 +299,13 @@ impl<S: AsyncRead> AsyncRead for TimedDownload<S> {
#[async_trait::async_trait] #[async_trait::async_trait]
impl RemoteStorage for S3Bucket { impl RemoteStorage for S3Bucket {
async fn list( /// See the doc for `RemoteStorage::list_prefixes`
/// Note: it wont include empty "directories"
async fn list_prefixes(
&self, &self,
prefix: Option<&RemotePath>, prefix: Option<&RemotePath>,
mode: ListingMode, ) -> Result<Vec<RemotePath>, DownloadError> {
) -> Result<Listing, DownloadError> {
let kind = RequestKind::List; let kind = RequestKind::List;
let mut result = Listing::default();
// get the passed prefix or if it is not set use prefix_in_bucket value // get the passed prefix or if it is not set use prefix_in_bucket value
let list_prefix = prefix let list_prefix = prefix
@@ -331,33 +314,28 @@ impl RemoteStorage for S3Bucket {
.map(|mut p| { .map(|mut p| {
// required to end with a separator // required to end with a separator
// otherwise request will return only the entry of a prefix // otherwise request will return only the entry of a prefix
if matches!(mode, ListingMode::WithDelimiter) if !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
&& !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR)
{
p.push(REMOTE_STORAGE_PREFIX_SEPARATOR); p.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
} }
p p
}); });
let mut document_keys = Vec::new();
let mut continuation_token = None; let mut continuation_token = None;
loop { loop {
let _guard = self.permit(kind).await; let _guard = self.permit(kind).await;
let started_at = start_measuring_requests(kind); let started_at = start_measuring_requests(kind);
let mut request = self let fetch_response = self
.client .client
.list_objects_v2() .list_objects_v2()
.bucket(self.bucket_name.clone()) .bucket(self.bucket_name.clone())
.set_prefix(list_prefix.clone()) .set_prefix(list_prefix.clone())
.set_continuation_token(continuation_token) .set_continuation_token(continuation_token)
.set_max_keys(self.max_keys_per_list_response); .delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string())
.set_max_keys(self.max_keys_per_list_response)
if let ListingMode::WithDelimiter = mode {
request = request.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string());
}
let response = request
.send() .send()
.await .await
.context("Failed to list S3 prefixes") .context("Failed to list S3 prefixes")
@@ -367,35 +345,71 @@ impl RemoteStorage for S3Bucket {
metrics::BUCKET_METRICS metrics::BUCKET_METRICS
.req_seconds .req_seconds
.observe_elapsed(kind, &response, started_at); .observe_elapsed(kind, &fetch_response, started_at);
let response = response?; let fetch_response = fetch_response?;
let keys = response.contents().unwrap_or_default(); document_keys.extend(
let empty = Vec::new(); fetch_response
let prefixes = response.common_prefixes.as_ref().unwrap_or(&empty); .common_prefixes
.unwrap_or_default()
tracing::info!("list: {} prefixes, {} keys", prefixes.len(), keys.len()); .into_iter()
for object in keys {
let object_path = object.key().expect("response does not contain a key");
let remote_path = self.s3_object_to_relative_path(object_path);
result.keys.push(remote_path);
}
result.prefixes.extend(
prefixes
.iter()
.filter_map(|o| Some(self.s3_object_to_relative_path(o.prefix()?))), .filter_map(|o| Some(self.s3_object_to_relative_path(o.prefix()?))),
); );
continuation_token = match response.next_continuation_token { continuation_token = match fetch_response.next_continuation_token {
Some(new_token) => Some(new_token), Some(new_token) => Some(new_token),
None => break, None => break,
}; };
} }
Ok(result) Ok(document_keys)
}
/// See the doc for `RemoteStorage::list_files`
async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
let kind = RequestKind::List;
let folder_name = folder
.map(|p| self.relative_path_to_s3_object(p))
.or_else(|| self.prefix_in_bucket.clone());
// AWS may need to break the response into several parts
let mut continuation_token = None;
let mut all_files = vec![];
loop {
let _guard = self.permit(kind).await;
let started_at = start_measuring_requests(kind);
let response = self
.client
.list_objects_v2()
.bucket(self.bucket_name.clone())
.set_prefix(folder_name.clone())
.set_continuation_token(continuation_token)
.set_max_keys(self.max_keys_per_list_response)
.send()
.await
.context("Failed to list files in S3 bucket");
let started_at = ScopeGuard::into_inner(started_at);
metrics::BUCKET_METRICS
.req_seconds
.observe_elapsed(kind, &response, started_at);
let response = response?;
for object in response.contents().unwrap_or_default() {
let object_path = object.key().expect("response does not contain a key");
let remote_path = self.s3_object_to_relative_path(object_path);
all_files.push(remote_path);
}
match response.next_continuation_token {
Some(new_token) => continuation_token = Some(new_token),
None => break,
}
}
Ok(all_files)
} }
async fn upload( async fn upload(

View File

@@ -5,9 +5,7 @@ use std::collections::hash_map::Entry;
use std::collections::HashMap; use std::collections::HashMap;
use std::sync::Mutex; use std::sync::Mutex;
use crate::{ use crate::{Download, DownloadError, RemotePath, RemoteStorage, StorageMetadata};
Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorage, StorageMetadata,
};
pub struct UnreliableWrapper { pub struct UnreliableWrapper {
inner: crate::GenericRemoteStorage, inner: crate::GenericRemoteStorage,
@@ -97,15 +95,6 @@ impl RemoteStorage for UnreliableWrapper {
self.inner.list_files(folder).await self.inner.list_files(folder).await
} }
async fn list(
&self,
prefix: Option<&RemotePath>,
mode: ListingMode,
) -> Result<Listing, DownloadError> {
self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))?;
self.inner.list(prefix, mode).await
}
async fn upload( async fn upload(
&self, &self,
data: impl tokio::io::AsyncRead + Unpin + Send + Sync + 'static, data: impl tokio::io::AsyncRead + Unpin + Send + Sync + 'static,

View File

@@ -1,6 +1,6 @@
use std::collections::HashSet; use std::collections::HashSet;
use std::env; use std::env;
use std::num::NonZeroUsize; use std::num::{NonZeroU32, NonZeroUsize};
use std::ops::ControlFlow; use std::ops::ControlFlow;
use std::path::PathBuf; use std::path::PathBuf;
use std::sync::Arc; use std::sync::Arc;
@@ -469,6 +469,8 @@ fn create_azure_client(
let random = rand::thread_rng().gen::<u32>(); let random = rand::thread_rng().gen::<u32>();
let remote_storage_config = RemoteStorageConfig { let remote_storage_config = RemoteStorageConfig {
max_concurrent_syncs: NonZeroUsize::new(100).unwrap(),
max_sync_errors: NonZeroU32::new(5).unwrap(),
storage: RemoteStorageKind::AzureContainer(AzureConfig { storage: RemoteStorageKind::AzureContainer(AzureConfig {
container_name: remote_storage_azure_container, container_name: remote_storage_azure_container,
container_region: remote_storage_azure_region, container_region: remote_storage_azure_region,

View File

@@ -1,6 +1,6 @@
use std::collections::HashSet; use std::collections::HashSet;
use std::env; use std::env;
use std::num::NonZeroUsize; use std::num::{NonZeroU32, NonZeroUsize};
use std::ops::ControlFlow; use std::ops::ControlFlow;
use std::path::PathBuf; use std::path::PathBuf;
use std::sync::Arc; use std::sync::Arc;
@@ -396,6 +396,8 @@ fn create_s3_client(
let random = rand::thread_rng().gen::<u32>(); let random = rand::thread_rng().gen::<u32>();
let remote_storage_config = RemoteStorageConfig { let remote_storage_config = RemoteStorageConfig {
max_concurrent_syncs: NonZeroUsize::new(100).unwrap(),
max_sync_errors: NonZeroU32::new(5).unwrap(),
storage: RemoteStorageKind::AwsS3(S3Config { storage: RemoteStorageKind::AwsS3(S3Config {
bucket_name: remote_storage_s3_bucket, bucket_name: remote_storage_s3_bucket,
bucket_region: remote_storage_s3_region, bucket_region: remote_storage_s3_region,

View File

@@ -1,5 +1,3 @@
#![deny(unsafe_code)]
#![deny(clippy::undocumented_unsafe_blocks)]
use const_format::formatcp; use const_format::formatcp;
/// Public API types /// Public API types

View File

@@ -1,18 +1,23 @@
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use serde_with::{serde_as, DisplayFromStr};
use utils::{ use utils::{
id::{NodeId, TenantId, TimelineId}, id::{NodeId, TenantId, TimelineId},
lsn::Lsn, lsn::Lsn,
}; };
#[serde_as]
#[derive(Serialize, Deserialize)] #[derive(Serialize, Deserialize)]
pub struct TimelineCreateRequest { pub struct TimelineCreateRequest {
#[serde_as(as = "DisplayFromStr")]
pub tenant_id: TenantId, pub tenant_id: TenantId,
#[serde_as(as = "DisplayFromStr")]
pub timeline_id: TimelineId, pub timeline_id: TimelineId,
pub peer_ids: Option<Vec<NodeId>>, pub peer_ids: Option<Vec<NodeId>>,
pub pg_version: u32, pub pg_version: u32,
pub system_id: Option<u64>, pub system_id: Option<u64>,
pub wal_seg_size: Option<u32>, pub wal_seg_size: Option<u32>,
#[serde_as(as = "DisplayFromStr")]
pub commit_lsn: Lsn, pub commit_lsn: Lsn,
// If not passed, it is assigned to the beginning of commit_lsn segment. // If not passed, it is assigned to the beginning of commit_lsn segment.
pub local_start_lsn: Option<Lsn>, pub local_start_lsn: Option<Lsn>,
@@ -23,6 +28,7 @@ fn lsn_invalid() -> Lsn {
} }
/// Data about safekeeper's timeline, mirrors broker.proto. /// Data about safekeeper's timeline, mirrors broker.proto.
#[serde_as]
#[derive(Debug, Clone, Deserialize, Serialize)] #[derive(Debug, Clone, Deserialize, Serialize)]
pub struct SkTimelineInfo { pub struct SkTimelineInfo {
/// Term. /// Term.
@@ -30,19 +36,25 @@ pub struct SkTimelineInfo {
/// Term of the last entry. /// Term of the last entry.
pub last_log_term: Option<u64>, pub last_log_term: Option<u64>,
/// LSN of the last record. /// LSN of the last record.
#[serde_as(as = "DisplayFromStr")]
#[serde(default = "lsn_invalid")] #[serde(default = "lsn_invalid")]
pub flush_lsn: Lsn, pub flush_lsn: Lsn,
/// Up to which LSN safekeeper regards its WAL as committed. /// Up to which LSN safekeeper regards its WAL as committed.
#[serde_as(as = "DisplayFromStr")]
#[serde(default = "lsn_invalid")] #[serde(default = "lsn_invalid")]
pub commit_lsn: Lsn, pub commit_lsn: Lsn,
/// LSN up to which safekeeper has backed WAL. /// LSN up to which safekeeper has backed WAL.
#[serde_as(as = "DisplayFromStr")]
#[serde(default = "lsn_invalid")] #[serde(default = "lsn_invalid")]
pub backup_lsn: Lsn, pub backup_lsn: Lsn,
/// LSN of last checkpoint uploaded by pageserver. /// LSN of last checkpoint uploaded by pageserver.
#[serde_as(as = "DisplayFromStr")]
#[serde(default = "lsn_invalid")] #[serde(default = "lsn_invalid")]
pub remote_consistent_lsn: Lsn, pub remote_consistent_lsn: Lsn,
#[serde_as(as = "DisplayFromStr")]
#[serde(default = "lsn_invalid")] #[serde(default = "lsn_invalid")]
pub peer_horizon_lsn: Lsn, pub peer_horizon_lsn: Lsn,
#[serde_as(as = "DisplayFromStr")]
#[serde(default = "lsn_invalid")] #[serde(default = "lsn_invalid")]
pub local_start_lsn: Lsn, pub local_start_lsn: Lsn,
/// A connection string to use for WAL receiving. /// A connection string to use for WAL receiving.

View File

@@ -1,6 +1,4 @@
//! Synthetic size calculation //! Synthetic size calculation
#![deny(unsafe_code)]
#![deny(clippy::undocumented_unsafe_blocks)]
mod calculation; mod calculation;
pub mod svg; pub mod svg;

View File

@@ -32,8 +32,6 @@
//! .init(); //! .init();
//! } //! }
//! ``` //! ```
#![deny(unsafe_code)]
#![deny(clippy::undocumented_unsafe_blocks)]
use opentelemetry::sdk::Resource; use opentelemetry::sdk::Resource;
use opentelemetry::KeyValue; use opentelemetry::KeyValue;

View File

@@ -5,7 +5,6 @@ edition.workspace = true
license.workspace = true license.workspace = true
[dependencies] [dependencies]
arc-swap.workspace = true
sentry.workspace = true sentry.workspace = true
async-trait.workspace = true async-trait.workspace = true
anyhow.workspace = true anyhow.workspace = true
@@ -56,7 +55,6 @@ bytes.workspace = true
criterion.workspace = true criterion.workspace = true
hex-literal.workspace = true hex-literal.workspace = true
camino-tempfile.workspace = true camino-tempfile.workspace = true
serde_assert.workspace = true
[[bench]] [[bench]]
name = "benchmarks" name = "benchmarks"

View File

@@ -1,8 +1,7 @@
// For details about authentication see docs/authentication.md // For details about authentication see docs/authentication.md
use arc_swap::ArcSwap;
use serde; use serde;
use std::{borrow::Cow, fmt::Display, fs, sync::Arc}; use std::fs;
use anyhow::Result; use anyhow::Result;
use camino::Utf8Path; use camino::Utf8Path;
@@ -10,8 +9,9 @@ use jsonwebtoken::{
decode, encode, Algorithm, DecodingKey, EncodingKey, Header, TokenData, Validation, decode, encode, Algorithm, DecodingKey, EncodingKey, Header, TokenData, Validation,
}; };
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use serde_with::{serde_as, DisplayFromStr};
use crate::{http::error::ApiError, id::TenantId}; use crate::id::TenantId;
/// Algorithm to use. We require EdDSA. /// Algorithm to use. We require EdDSA.
const STORAGE_TOKEN_ALGORITHM: Algorithm = Algorithm::EdDSA; const STORAGE_TOKEN_ALGORITHM: Algorithm = Algorithm::EdDSA;
@@ -32,9 +32,11 @@ pub enum Scope {
} }
/// JWT payload. See docs/authentication.md for the format /// JWT payload. See docs/authentication.md for the format
#[serde_as]
#[derive(Debug, Serialize, Deserialize, Clone, PartialEq)] #[derive(Debug, Serialize, Deserialize, Clone, PartialEq)]
pub struct Claims { pub struct Claims {
#[serde(default)] #[serde(default)]
#[serde_as(as = "Option<DisplayFromStr>")]
pub tenant_id: Option<TenantId>, pub tenant_id: Option<TenantId>,
pub scope: Scope, pub scope: Scope,
} }
@@ -45,106 +47,31 @@ impl Claims {
} }
} }
pub struct SwappableJwtAuth(ArcSwap<JwtAuth>);
impl SwappableJwtAuth {
pub fn new(jwt_auth: JwtAuth) -> Self {
SwappableJwtAuth(ArcSwap::new(Arc::new(jwt_auth)))
}
pub fn swap(&self, jwt_auth: JwtAuth) {
self.0.swap(Arc::new(jwt_auth));
}
pub fn decode(&self, token: &str) -> std::result::Result<TokenData<Claims>, AuthError> {
self.0.load().decode(token)
}
}
impl std::fmt::Debug for SwappableJwtAuth {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "Swappable({:?})", self.0.load())
}
}
#[derive(Clone, PartialEq, Eq, Hash, Debug)]
pub struct AuthError(pub Cow<'static, str>);
impl Display for AuthError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}", self.0)
}
}
impl From<AuthError> for ApiError {
fn from(_value: AuthError) -> Self {
// Don't pass on the value of the AuthError as a precautionary measure.
// Being intentionally vague in public error communication hurts debugability
// but it is more secure.
ApiError::Forbidden("JWT authentication error".to_string())
}
}
pub struct JwtAuth { pub struct JwtAuth {
decoding_keys: Vec<DecodingKey>, decoding_key: DecodingKey,
validation: Validation, validation: Validation,
} }
impl JwtAuth { impl JwtAuth {
pub fn new(decoding_keys: Vec<DecodingKey>) -> Self { pub fn new(decoding_key: DecodingKey) -> Self {
let mut validation = Validation::default(); let mut validation = Validation::default();
validation.algorithms = vec![STORAGE_TOKEN_ALGORITHM]; validation.algorithms = vec![STORAGE_TOKEN_ALGORITHM];
// The default 'required_spec_claims' is 'exp'. But we don't want to require // The default 'required_spec_claims' is 'exp'. But we don't want to require
// expiration. // expiration.
validation.required_spec_claims = [].into(); validation.required_spec_claims = [].into();
Self { Self {
decoding_keys, decoding_key,
validation, validation,
} }
} }
pub fn from_key_path(key_path: &Utf8Path) -> Result<Self> { pub fn from_key_path(key_path: &Utf8Path) -> Result<Self> {
let metadata = key_path.metadata()?; let public_key = fs::read(key_path)?;
let decoding_keys = if metadata.is_dir() { Ok(Self::new(DecodingKey::from_ed_pem(&public_key)?))
let mut keys = Vec::new();
for entry in fs::read_dir(key_path)? {
let path = entry?.path();
if !path.is_file() {
// Ignore directories (don't recurse)
continue;
}
let public_key = fs::read(path)?;
keys.push(DecodingKey::from_ed_pem(&public_key)?);
}
keys
} else if metadata.is_file() {
let public_key = fs::read(key_path)?;
vec![DecodingKey::from_ed_pem(&public_key)?]
} else {
anyhow::bail!("path is neither a directory or a file")
};
if decoding_keys.is_empty() {
anyhow::bail!("Configured for JWT auth with zero decoding keys. All JWT gated requests would be rejected.");
}
Ok(Self::new(decoding_keys))
} }
/// Attempt to decode the token with the internal decoding keys. pub fn decode(&self, token: &str) -> Result<TokenData<Claims>> {
/// Ok(decode(token, &self.decoding_key, &self.validation)?)
/// The function tries the stored decoding keys in succession,
/// and returns the first yielding a successful result.
/// If there is no working decoding key, it returns the last error.
pub fn decode(&self, token: &str) -> std::result::Result<TokenData<Claims>, AuthError> {
let mut res = None;
for decoding_key in &self.decoding_keys {
res = Some(decode(token, decoding_key, &self.validation));
if let Some(Ok(res)) = res {
return Ok(res);
}
}
if let Some(res) = res {
res.map_err(|e| AuthError(Cow::Owned(e.to_string())))
} else {
Err(AuthError(Cow::Borrowed("no JWT decoding keys configured")))
}
} }
} }
@@ -184,9 +111,9 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
"#; "#;
#[test] #[test]
fn test_decode() { fn test_decode() -> Result<(), anyhow::Error> {
let expected_claims = Claims { let expected_claims = Claims {
tenant_id: Some(TenantId::from_str("3d1f7595b468230304e0b73cecbcb081").unwrap()), tenant_id: Some(TenantId::from_str("3d1f7595b468230304e0b73cecbcb081")?),
scope: Scope::Tenant, scope: Scope::Tenant,
}; };
@@ -205,24 +132,28 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
let encoded_eddsa = "eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJzY29wZSI6InRlbmFudCIsInRlbmFudF9pZCI6IjNkMWY3NTk1YjQ2ODIzMDMwNGUwYjczY2VjYmNiMDgxIiwiaXNzIjoibmVvbi5jb250cm9scGxhbmUiLCJleHAiOjE3MDkyMDA4NzksImlhdCI6MTY3ODQ0MjQ3OX0.U3eA8j-uU-JnhzeO3EDHRuXLwkAUFCPxtGHEgw6p7Ccc3YRbFs2tmCdbD9PZEXP-XsxSeBQi1FY0YPcT3NXADw"; let encoded_eddsa = "eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJzY29wZSI6InRlbmFudCIsInRlbmFudF9pZCI6IjNkMWY3NTk1YjQ2ODIzMDMwNGUwYjczY2VjYmNiMDgxIiwiaXNzIjoibmVvbi5jb250cm9scGxhbmUiLCJleHAiOjE3MDkyMDA4NzksImlhdCI6MTY3ODQ0MjQ3OX0.U3eA8j-uU-JnhzeO3EDHRuXLwkAUFCPxtGHEgw6p7Ccc3YRbFs2tmCdbD9PZEXP-XsxSeBQi1FY0YPcT3NXADw";
// Check it can be validated with the public key // Check it can be validated with the public key
let auth = JwtAuth::new(vec![DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519).unwrap()]); let auth = JwtAuth::new(DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519)?);
let claims_from_token = auth.decode(encoded_eddsa).unwrap().claims; let claims_from_token = auth.decode(encoded_eddsa)?.claims;
assert_eq!(claims_from_token, expected_claims); assert_eq!(claims_from_token, expected_claims);
Ok(())
} }
#[test] #[test]
fn test_encode() { fn test_encode() -> Result<(), anyhow::Error> {
let claims = Claims { let claims = Claims {
tenant_id: Some(TenantId::from_str("3d1f7595b468230304e0b73cecbcb081").unwrap()), tenant_id: Some(TenantId::from_str("3d1f7595b468230304e0b73cecbcb081")?),
scope: Scope::Tenant, scope: Scope::Tenant,
}; };
let encoded = encode_from_key_file(&claims, TEST_PRIV_KEY_ED25519).unwrap(); let encoded = encode_from_key_file(&claims, TEST_PRIV_KEY_ED25519)?;
// decode it back // decode it back
let auth = JwtAuth::new(vec![DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519).unwrap()]); let auth = JwtAuth::new(DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519)?);
let decoded = auth.decode(&encoded).unwrap(); let decoded = auth.decode(&encoded)?;
assert_eq!(decoded.claims, claims); assert_eq!(decoded.claims, claims);
Ok(())
} }
} }

View File

@@ -7,7 +7,7 @@ use serde::{Deserialize, Serialize};
/// ///
/// See docs/rfcs/025-generation-numbers.md for detail on how generation /// See docs/rfcs/025-generation-numbers.md for detail on how generation
/// numbers are used. /// numbers are used.
#[derive(Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)] #[derive(Copy, Clone, Eq, PartialEq, PartialOrd, Ord)]
pub enum Generation { pub enum Generation {
// Generations with this magic value will not add a suffix to S3 keys, and will not // Generations with this magic value will not add a suffix to S3 keys, and will not
// be included in persisted index_part.json. This value is only to be used // be included in persisted index_part.json. This value is only to be used

View File

@@ -1,41 +0,0 @@
/// Useful type for asserting that expected bytes match reporting the bytes more readable
/// array-syntax compatible hex bytes.
///
/// # Usage
///
/// ```
/// use utils::Hex;
///
/// let actual = serialize_something();
/// let expected = [0x68, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x77, 0x6f, 0x72, 0x6c, 0x64];
///
/// // the type implements PartialEq and on mismatch, both sides are printed in 16 wide multiline
/// // output suffixed with an array style length for easier comparisons.
/// assert_eq!(Hex(&actual), Hex(&expected));
///
/// // with `let expected = [0x68];` the error would had been:
/// // assertion `left == right` failed
/// // left: [0x68, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x77, 0x6f, 0x72, 0x6c, 0x64; 11]
/// // right: [0x68; 1]
/// # fn serialize_something() -> Vec<u8> { "hello world".as_bytes().to_vec() }
/// ```
#[derive(PartialEq)]
pub struct Hex<'a>(pub &'a [u8]);
impl std::fmt::Debug for Hex<'_> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "[")?;
for (i, c) in self.0.chunks(16).enumerate() {
if i > 0 && !c.is_empty() {
writeln!(f, ", ")?;
}
for (j, b) in c.iter().enumerate() {
if j > 0 {
write!(f, ", ")?;
}
write!(f, "0x{b:02x}")?;
}
}
write!(f, "; {}]", self.0.len())
}
}

View File

@@ -1,4 +1,4 @@
use crate::auth::{AuthError, Claims, SwappableJwtAuth}; use crate::auth::{Claims, JwtAuth};
use crate::http::error::{api_error_handler, route_error_handler, ApiError}; use crate::http::error::{api_error_handler, route_error_handler, ApiError};
use anyhow::Context; use anyhow::Context;
use hyper::header::{HeaderName, AUTHORIZATION}; use hyper::header::{HeaderName, AUTHORIZATION};
@@ -14,11 +14,6 @@ use tracing::{self, debug, info, info_span, warn, Instrument};
use std::future::Future; use std::future::Future;
use std::str::FromStr; use std::str::FromStr;
use bytes::{Bytes, BytesMut};
use std::io::Write as _;
use tokio::sync::mpsc;
use tokio_stream::wrappers::ReceiverStream;
static SERVE_METRICS_COUNT: Lazy<IntCounter> = Lazy::new(|| { static SERVE_METRICS_COUNT: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!( register_int_counter!(
"libmetrics_metric_handler_requests_total", "libmetrics_metric_handler_requests_total",
@@ -151,89 +146,94 @@ impl Drop for RequestCancelled {
} }
} }
/// An [`std::io::Write`] implementation on top of a channel sending [`bytes::Bytes`] chunks.
pub struct ChannelWriter {
buffer: BytesMut,
pub tx: mpsc::Sender<std::io::Result<Bytes>>,
written: usize,
}
impl ChannelWriter {
pub fn new(buf_len: usize, tx: mpsc::Sender<std::io::Result<Bytes>>) -> Self {
assert_ne!(buf_len, 0);
ChannelWriter {
// split about half off the buffer from the start, because we flush depending on
// capacity. first flush will come sooner than without this, but now resizes will
// have better chance of picking up the "other" half. not guaranteed of course.
buffer: BytesMut::with_capacity(buf_len).split_off(buf_len / 2),
tx,
written: 0,
}
}
pub fn flush0(&mut self) -> std::io::Result<usize> {
let n = self.buffer.len();
if n == 0 {
return Ok(0);
}
tracing::trace!(n, "flushing");
let ready = self.buffer.split().freeze();
// not ideal to call from blocking code to block_on, but we are sure that this
// operation does not spawn_blocking other tasks
let res: Result<(), ()> = tokio::runtime::Handle::current().block_on(async {
self.tx.send(Ok(ready)).await.map_err(|_| ())?;
// throttle sending to allow reuse of our buffer in `write`.
self.tx.reserve().await.map_err(|_| ())?;
// now the response task has picked up the buffer and hopefully started
// sending it to the client.
Ok(())
});
if res.is_err() {
return Err(std::io::ErrorKind::BrokenPipe.into());
}
self.written += n;
Ok(n)
}
pub fn flushed_bytes(&self) -> usize {
self.written
}
}
impl std::io::Write for ChannelWriter {
fn write(&mut self, mut buf: &[u8]) -> std::io::Result<usize> {
let remaining = self.buffer.capacity() - self.buffer.len();
let out_of_space = remaining < buf.len();
let original_len = buf.len();
if out_of_space {
let can_still_fit = buf.len() - remaining;
self.buffer.extend_from_slice(&buf[..can_still_fit]);
buf = &buf[can_still_fit..];
self.flush0()?;
}
// assume that this will often under normal operation just move the pointer back to the
// beginning of allocation, because previous split off parts are already sent and
// dropped.
self.buffer.extend_from_slice(buf);
Ok(original_len)
}
fn flush(&mut self) -> std::io::Result<()> {
self.flush0().map(|_| ())
}
}
async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> { async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
use bytes::{Bytes, BytesMut};
use std::io::Write as _;
use tokio::sync::mpsc;
use tokio_stream::wrappers::ReceiverStream;
SERVE_METRICS_COUNT.inc(); SERVE_METRICS_COUNT.inc();
/// An [`std::io::Write`] implementation on top of a channel sending [`bytes::Bytes`] chunks.
struct ChannelWriter {
buffer: BytesMut,
tx: mpsc::Sender<std::io::Result<Bytes>>,
written: usize,
}
impl ChannelWriter {
fn new(buf_len: usize, tx: mpsc::Sender<std::io::Result<Bytes>>) -> Self {
assert_ne!(buf_len, 0);
ChannelWriter {
// split about half off the buffer from the start, because we flush depending on
// capacity. first flush will come sooner than without this, but now resizes will
// have better chance of picking up the "other" half. not guaranteed of course.
buffer: BytesMut::with_capacity(buf_len).split_off(buf_len / 2),
tx,
written: 0,
}
}
fn flush0(&mut self) -> std::io::Result<usize> {
let n = self.buffer.len();
if n == 0 {
return Ok(0);
}
tracing::trace!(n, "flushing");
let ready = self.buffer.split().freeze();
// not ideal to call from blocking code to block_on, but we are sure that this
// operation does not spawn_blocking other tasks
let res: Result<(), ()> = tokio::runtime::Handle::current().block_on(async {
self.tx.send(Ok(ready)).await.map_err(|_| ())?;
// throttle sending to allow reuse of our buffer in `write`.
self.tx.reserve().await.map_err(|_| ())?;
// now the response task has picked up the buffer and hopefully started
// sending it to the client.
Ok(())
});
if res.is_err() {
return Err(std::io::ErrorKind::BrokenPipe.into());
}
self.written += n;
Ok(n)
}
fn flushed_bytes(&self) -> usize {
self.written
}
}
impl std::io::Write for ChannelWriter {
fn write(&mut self, mut buf: &[u8]) -> std::io::Result<usize> {
let remaining = self.buffer.capacity() - self.buffer.len();
let out_of_space = remaining < buf.len();
let original_len = buf.len();
if out_of_space {
let can_still_fit = buf.len() - remaining;
self.buffer.extend_from_slice(&buf[..can_still_fit]);
buf = &buf[can_still_fit..];
self.flush0()?;
}
// assume that this will often under normal operation just move the pointer back to the
// beginning of allocation, because previous split off parts are already sent and
// dropped.
self.buffer.extend_from_slice(buf);
Ok(original_len)
}
fn flush(&mut self) -> std::io::Result<()> {
self.flush0().map(|_| ())
}
}
let started_at = std::time::Instant::now(); let started_at = std::time::Instant::now();
let (tx, rx) = mpsc::channel(1); let (tx, rx) = mpsc::channel(1);
@@ -389,7 +389,7 @@ fn parse_token(header_value: &str) -> Result<&str, ApiError> {
} }
pub fn auth_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>( pub fn auth_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>(
provide_auth: fn(&Request<Body>) -> Option<&SwappableJwtAuth>, provide_auth: fn(&Request<Body>) -> Option<&JwtAuth>,
) -> Middleware<B, ApiError> { ) -> Middleware<B, ApiError> {
Middleware::pre(move |req| async move { Middleware::pre(move |req| async move {
if let Some(auth) = provide_auth(&req) { if let Some(auth) = provide_auth(&req) {
@@ -400,11 +400,9 @@ pub fn auth_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>(
})?; })?;
let token = parse_token(header_value)?; let token = parse_token(header_value)?;
let data = auth.decode(token).map_err(|err| { let data = auth
warn!("Authentication error: {err}"); .decode(token)
// Rely on From<AuthError> for ApiError impl .map_err(|_| ApiError::Unauthorized("malformed jwt token".to_string()))?;
err
})?;
req.set_context(data.claims); req.set_context(data.claims);
} }
None => { None => {
@@ -452,11 +450,12 @@ where
pub fn check_permission_with( pub fn check_permission_with(
req: &Request<Body>, req: &Request<Body>,
check_permission: impl Fn(&Claims) -> Result<(), AuthError>, check_permission: impl Fn(&Claims) -> Result<(), anyhow::Error>,
) -> Result<(), ApiError> { ) -> Result<(), ApiError> {
match req.context::<Claims>() { match req.context::<Claims>() {
Some(claims) => Ok(check_permission(&claims) Some(claims) => {
.map_err(|_err| ApiError::Forbidden("JWT authentication error".to_string()))?), Ok(check_permission(&claims).map_err(|err| ApiError::Forbidden(err.to_string()))?)
}
None => Ok(()), // claims is None because auth is disabled None => Ok(()), // claims is None because auth is disabled
} }
} }

View File

@@ -3,7 +3,7 @@ use serde::{Deserialize, Serialize};
use std::borrow::Cow; use std::borrow::Cow;
use std::error::Error as StdError; use std::error::Error as StdError;
use thiserror::Error; use thiserror::Error;
use tracing::{error, info, warn}; use tracing::{error, info};
#[derive(Debug, Error)] #[derive(Debug, Error)]
pub enum ApiError { pub enum ApiError {
@@ -118,9 +118,6 @@ pub fn api_error_handler(api_error: ApiError) -> Response<Body> {
// Print a stack trace for Internal Server errors // Print a stack trace for Internal Server errors
match api_error { match api_error {
ApiError::Forbidden(_) | ApiError::Unauthorized(_) => {
warn!("Error processing HTTP request: {api_error:#}")
}
ApiError::ResourceUnavailable(_) => info!("Error processing HTTP request: {api_error:#}"), ApiError::ResourceUnavailable(_) => info!("Error processing HTTP request: {api_error:#}"),
ApiError::NotFound(_) => info!("Error processing HTTP request: {api_error:#}"), ApiError::NotFound(_) => info!("Error processing HTTP request: {api_error:#}"),
ApiError::InternalServerError(_) => error!("Error processing HTTP request: {api_error:?}"), ApiError::InternalServerError(_) => error!("Error processing HTTP request: {api_error:?}"),

View File

@@ -3,7 +3,6 @@ use std::{fmt, str::FromStr};
use anyhow::Context; use anyhow::Context;
use hex::FromHex; use hex::FromHex;
use rand::Rng; use rand::Rng;
use serde::de::Visitor;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use thiserror::Error; use thiserror::Error;
@@ -18,74 +17,12 @@ pub enum IdError {
/// ///
/// NOTE: It (de)serializes as an array of hex bytes, so the string representation would look /// NOTE: It (de)serializes as an array of hex bytes, so the string representation would look
/// like `[173,80,132,115,129,226,72,254,170,201,135,108,199,26,228,24]`. /// like `[173,80,132,115,129,226,72,254,170,201,135,108,199,26,228,24]`.
#[derive(Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)] ///
/// Use `#[serde_as(as = "DisplayFromStr")]` to (de)serialize it as hex string instead: `ad50847381e248feaac9876cc71ae418`.
/// Check the `serde_with::serde_as` documentation for options for more complex types.
#[derive(Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, PartialOrd, Ord)]
struct Id([u8; 16]); struct Id([u8; 16]);
impl Serialize for Id {
    /// Human-readable formats (e.g. JSON) receive the hex-string form via the
    /// `Display` impl; binary formats receive the raw `[u8; 16]` array.
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        match serializer.is_human_readable() {
            true => serializer.collect_str(self),
            false => self.0.serialize(serializer),
        }
    }
}
impl<'de> Deserialize<'de> for Id {
    /// Mirror of the `Serialize` impl: hex string for human-readable formats,
    /// fixed 16-byte sequence for binary formats.
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        struct IdVisitor {
            // Only affects the `expecting` error message; the accepted inputs
            // are determined by which visit_* method the deserializer calls.
            is_human_readable_deserializer: bool,
        }
        impl<'de> Visitor<'de> for IdVisitor {
            type Value = Id;
            fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
                if self.is_human_readable_deserializer {
                    formatter.write_str("value in form of hex string")
                } else {
                    formatter.write_str("value in form of integer array([u8; 16])")
                }
            }
            // Binary path: delegate to the derived [u8; 16] deserialization by
            // re-wrapping the sequence as a deserializer.
            fn visit_seq<A>(self, seq: A) -> Result<Self::Value, A::Error>
            where
                A: serde::de::SeqAccess<'de>,
            {
                let s = serde::de::value::SeqAccessDeserializer::new(seq);
                let id: [u8; 16] = Deserialize::deserialize(s)?;
                Ok(Id::from(id))
            }
            // Human-readable path: parse the hex string via FromStr.
            fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
            where
                E: serde::de::Error,
            {
                Id::from_str(v).map_err(E::custom)
            }
        }
        if deserializer.is_human_readable() {
            deserializer.deserialize_str(IdVisitor {
                is_human_readable_deserializer: true,
            })
        } else {
            // deserialize_tuple(16, ...) matches how serde encodes [u8; 16].
            deserializer.deserialize_tuple(
                16,
                IdVisitor {
                    is_human_readable_deserializer: false,
                },
            )
        }
    }
}
impl Id { impl Id {
pub fn get_from_buf(buf: &mut impl bytes::Buf) -> Id { pub fn get_from_buf(buf: &mut impl bytes::Buf) -> Id {
let mut arr = [0u8; 16]; let mut arr = [0u8; 16];
@@ -120,8 +57,6 @@ impl Id {
chunk[0] = HEX[((b >> 4) & 0xf) as usize]; chunk[0] = HEX[((b >> 4) & 0xf) as usize];
chunk[1] = HEX[(b & 0xf) as usize]; chunk[1] = HEX[(b & 0xf) as usize];
} }
// SAFETY: vec constructed out of `HEX`, it can only be ascii
unsafe { String::from_utf8_unchecked(buf) } unsafe { String::from_utf8_unchecked(buf) }
} }
} }
@@ -373,112 +308,3 @@ impl fmt::Display for NodeId {
write!(f, "{}", self.0) write!(f, "{}", self.0)
} }
} }
#[cfg(test)]
mod tests {
    use serde_assert::{Deserializer, Serializer, Token, Tokens};
    use crate::bin_ser::BeSer;
    use super::*;
    // Binary (non-human-readable) serde: Id must encode as a 16-element tuple
    // of raw bytes, and decode back to the same value.
    #[test]
    fn test_id_serde_non_human_readable() {
        let original_id = Id([
            173, 80, 132, 115, 129, 226, 72, 254, 170, 201, 135, 108, 199, 26, 228, 24,
        ]);
        let expected_tokens = Tokens(vec![
            Token::Tuple { len: 16 },
            Token::U8(173),
            Token::U8(80),
            Token::U8(132),
            Token::U8(115),
            Token::U8(129),
            Token::U8(226),
            Token::U8(72),
            Token::U8(254),
            Token::U8(170),
            Token::U8(201),
            Token::U8(135),
            Token::U8(108),
            Token::U8(199),
            Token::U8(26),
            Token::U8(228),
            Token::U8(24),
            Token::TupleEnd,
        ]);
        let serializer = Serializer::builder().is_human_readable(false).build();
        let serialized_tokens = original_id.serialize(&serializer).unwrap();
        assert_eq!(serialized_tokens, expected_tokens);
        let mut deserializer = Deserializer::builder()
            .is_human_readable(false)
            .tokens(serialized_tokens)
            .build();
        let deserialized_id = Id::deserialize(&mut deserializer).unwrap();
        assert_eq!(deserialized_id, original_id);
    }
    // Human-readable serde: Id must round-trip through its lowercase hex string.
    #[test]
    fn test_id_serde_human_readable() {
        let original_id = Id([
            173, 80, 132, 115, 129, 226, 72, 254, 170, 201, 135, 108, 199, 26, 228, 24,
        ]);
        let expected_tokens = Tokens(vec![Token::Str(String::from(
            "ad50847381e248feaac9876cc71ae418",
        ))]);
        let serializer = Serializer::builder().is_human_readable(true).build();
        let serialized_tokens = original_id.serialize(&serializer).unwrap();
        assert_eq!(serialized_tokens, expected_tokens);
        let mut deserializer = Deserializer::builder()
            .is_human_readable(true)
            .tokens(Tokens(vec![Token::Str(String::from(
                "ad50847381e248feaac9876cc71ae418",
            ))]))
            .build();
        assert_eq!(Id::deserialize(&mut deserializer).unwrap(), original_id);
    }
    // Round-trip a 16-byte id type through the bincode helpers (`BeSer`),
    // asserting the on-wire bytes are exactly the raw array.
    macro_rules! roundtrip_type {
        ($type:ty, $expected_bytes:expr) => {{
            let expected_bytes: [u8; 16] = $expected_bytes;
            let original_id = <$type>::from(expected_bytes);
            let ser_bytes = original_id.ser().unwrap();
            assert_eq!(ser_bytes, expected_bytes);
            let des_id = <$type>::des(&ser_bytes).unwrap();
            assert_eq!(des_id, original_id);
        }};
    }
    #[test]
    fn test_id_bincode_serde() {
        let expected_bytes = [
            173, 80, 132, 115, 129, 226, 72, 254, 170, 201, 135, 108, 199, 26, 228, 24,
        ];
        roundtrip_type!(Id, expected_bytes);
    }
    #[test]
    fn test_tenant_id_bincode_serde() {
        let expected_bytes = [
            173, 80, 132, 115, 129, 226, 72, 254, 170, 201, 135, 108, 199, 26, 228, 24,
        ];
        roundtrip_type!(TenantId, expected_bytes);
    }
    #[test]
    fn test_timeline_id_bincode_serde() {
        let expected_bytes = [
            173, 80, 132, 115, 129, 226, 72, 254, 170, 201, 135, 108, 199, 26, 228, 24,
        ];
        roundtrip_type!(TimelineId, expected_bytes);
    }
}

View File

@@ -1,6 +1,5 @@
//! `utils` is intended to be a place to put code that is shared //! `utils` is intended to be a place to put code that is shared
//! between other crates in this repository. //! between other crates in this repository.
#![deny(clippy::undocumented_unsafe_blocks)]
pub mod backoff; pub mod backoff;
@@ -25,10 +24,6 @@ pub mod auth;
// utility functions and helper traits for unified unique id generation/serialization etc. // utility functions and helper traits for unified unique id generation/serialization etc.
pub mod id; pub mod id;
mod hex;
pub use hex::Hex;
// http endpoint utils // http endpoint utils
pub mod http; pub mod http;
@@ -78,11 +73,6 @@ pub mod completion;
/// Reporting utilities /// Reporting utilities
pub mod error; pub mod error;
/// async timeout helper
pub mod timeout;
pub mod sync;
/// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
/// ///
/// we have several cases: /// we have several cases:
@@ -138,21 +128,6 @@ macro_rules! project_git_version {
}; };
} }
/// This is a shortcut to embed build tag into binaries and avoid copying the same build script to all packages
///
/// Expands to `const $const_identifier: &str`, computed entirely at compile
/// time: `"build_tag-env:<BUILD_TAG>"` when the `BUILD_TAG` env var was set
/// during the build, or `"build_tag:"` when it was not.
#[macro_export]
macro_rules! project_build_tag {
    ($const_identifier:ident) => {
        // Fully-qualified `::core::...` paths keep the expansion independent of
        // whatever the call site has imported.
        const $const_identifier: &::core::primitive::str = {
            // `option_env!` reads BUILD_TAG at compile time; both match arms
            // produce the same `[&str; 2]` type so the block stays const.
            const __ARG: &[&::core::primitive::str; 2] = &match ::core::option_env!("BUILD_TAG") {
                ::core::option::Option::Some(x) => ["build_tag-env:", x],
                ::core::option::Option::None => ["build_tag:", ""],
            };
            // `concatcp!` is reached through this crate's `__const_format`
            // re-export, so callers need no direct `const_format` dependency.
            $crate::__const_format::concatcp!(__ARG[0], __ARG[1])
        };
    };
}
/// Re-export for `project_git_version` macro /// Re-export for `project_git_version` macro
#[doc(hidden)] #[doc(hidden)]
pub use const_format as __const_format; pub use const_format as __const_format;

View File

@@ -1,7 +1,7 @@
#![warn(missing_docs)] #![warn(missing_docs)]
use camino::Utf8Path; use camino::Utf8Path;
use serde::{de::Visitor, Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use std::fmt; use std::fmt;
use std::ops::{Add, AddAssign}; use std::ops::{Add, AddAssign};
use std::str::FromStr; use std::str::FromStr;
@@ -13,114 +13,10 @@ use crate::seqwait::MonotonicCounter;
pub const XLOG_BLCKSZ: u32 = 8192; pub const XLOG_BLCKSZ: u32 = 8192;
/// A Postgres LSN (Log Sequence Number), also known as an XLogRecPtr /// A Postgres LSN (Log Sequence Number), also known as an XLogRecPtr
#[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Hash)] #[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Hash, Serialize, Deserialize)]
#[serde(transparent)]
pub struct Lsn(pub u64); pub struct Lsn(pub u64);
impl Serialize for Lsn {
    /// Human-readable serializers get the `{hi_hex}/{lo_hex}` display form;
    /// binary serializers get the raw `u64`.
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        match serializer.is_human_readable() {
            true => serializer.collect_str(self),
            false => self.0.serialize(serializer),
        }
    }
}
impl<'de> Deserialize<'de> for Lsn {
    /// Mirror of the `Serialize` impl: `{hi_hex}/{lo_hex}` string for
    /// human-readable formats, plain `u64` for binary formats.
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        struct LsnVisitor {
            // Only affects the `expecting` error message; accepted inputs are
            // determined by which visit_* method gets called.
            is_human_readable_deserializer: bool,
        }
        impl<'de> Visitor<'de> for LsnVisitor {
            type Value = Lsn;
            fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
                if self.is_human_readable_deserializer {
                    formatter.write_str(
                        "value in form of hex string({upper_u32_hex}/{lower_u32_hex}) representing u64 integer",
                    )
                } else {
                    formatter.write_str("value in form of integer(u64)")
                }
            }
            // Binary path: the u64 is the LSN.
            fn visit_u64<E>(self, v: u64) -> Result<Self::Value, E>
            where
                E: serde::de::Error,
            {
                Ok(Lsn(v))
            }
            // Human-readable path: parse the "hi/lo" hex form via FromStr.
            fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
            where
                E: serde::de::Error,
            {
                Lsn::from_str(v).map_err(|e| E::custom(e))
            }
        }
        if deserializer.is_human_readable() {
            deserializer.deserialize_str(LsnVisitor {
                is_human_readable_deserializer: true,
            })
        } else {
            deserializer.deserialize_u64(LsnVisitor {
                is_human_readable_deserializer: false,
            })
        }
    }
}
/// Allows (de)serialization of an `Lsn` always as `u64`.
///
/// ### Example
///
/// ```rust
/// # use serde::{Serialize, Deserialize};
/// use utils::lsn::Lsn;
///
/// #[derive(PartialEq, Serialize, Deserialize, Debug)]
/// struct Foo {
///     #[serde(with = "utils::lsn::serde_as_u64")]
///     always_u64: Lsn,
/// }
///
/// let orig = Foo { always_u64: Lsn(1234) };
///
/// let res = serde_json::to_string(&orig).unwrap();
/// assert_eq!(res, r#"{"always_u64":1234}"#);
///
/// let foo = serde_json::from_str::<Foo>(&res).unwrap();
/// assert_eq!(foo, orig);
/// ```
///
pub mod serde_as_u64 {
    use super::Lsn;
    /// Serializes the Lsn as u64 regardless of the format's human readability.
    ///
    /// Meant to be used via `#[serde(with = "...")]` or `#[serde(serialize_with = "...")]`.
    pub fn serialize<S: serde::Serializer>(lsn: &Lsn, serializer: S) -> Result<S::Ok, S::Error> {
        serde::Serialize::serialize(&lsn.0, serializer)
    }
    /// Deserializes the Lsn as u64 regardless of the format's human readability.
    ///
    /// Meant to be used via `#[serde(with = "...")]` or `#[serde(deserialize_with = "...")]`.
    pub fn deserialize<'de, D: serde::Deserializer<'de>>(deserializer: D) -> Result<Lsn, D::Error> {
        let raw = <u64 as serde::Deserialize>::deserialize(deserializer)?;
        Ok(Lsn(raw))
    }
}
/// We tried to parse an LSN from a string, but failed /// We tried to parse an LSN from a string, but failed
#[derive(Debug, PartialEq, Eq, thiserror::Error)] #[derive(Debug, PartialEq, Eq, thiserror::Error)]
#[error("LsnParseError")] #[error("LsnParseError")]
@@ -368,13 +264,8 @@ impl MonotonicCounter<Lsn> for RecordLsn {
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use crate::bin_ser::BeSer;
use super::*; use super::*;
use serde::ser::Serialize;
use serde_assert::{Deserializer, Serializer, Token, Tokens};
#[test] #[test]
fn test_lsn_strings() { fn test_lsn_strings() {
assert_eq!("12345678/AAAA5555".parse(), Ok(Lsn(0x12345678AAAA5555))); assert_eq!("12345678/AAAA5555".parse(), Ok(Lsn(0x12345678AAAA5555)));
@@ -450,95 +341,4 @@ mod tests {
assert_eq!(lsn.fetch_max(Lsn(6000)), Lsn(5678)); assert_eq!(lsn.fetch_max(Lsn(6000)), Lsn(5678));
assert_eq!(lsn.fetch_max(Lsn(5000)), Lsn(6000)); assert_eq!(lsn.fetch_max(Lsn(5000)), Lsn(6000));
} }
#[test]
fn test_lsn_serde() {
    // NOTE(review): the `readable_`/`non_readable_` variable names below are
    // swapped relative to the is_human_readable flags they come from. The
    // assertions themselves are internally consistent, so this is naming only.
    let original_lsn = Lsn(0x0123456789abcdef);
    let expected_readable_tokens = Tokens(vec![Token::U64(0x0123456789abcdef)]);
    let expected_non_readable_tokens =
        Tokens(vec![Token::Str(String::from("1234567/89ABCDEF"))]);
    // Testing binary (is_human_readable == false) ser/de: plain u64
    let serializer = Serializer::builder().is_human_readable(false).build();
    let readable_ser_tokens = original_lsn.serialize(&serializer).unwrap();
    assert_eq!(readable_ser_tokens, expected_readable_tokens);
    let mut deserializer = Deserializer::builder()
        .is_human_readable(false)
        .tokens(readable_ser_tokens)
        .build();
    let des_lsn = Lsn::deserialize(&mut deserializer).unwrap();
    assert_eq!(des_lsn, original_lsn);
    // Testing human-readable (is_human_readable == true) ser/de: "hi/lo" hex string
    let serializer = Serializer::builder().is_human_readable(true).build();
    let non_readable_ser_tokens = original_lsn.serialize(&serializer).unwrap();
    assert_eq!(non_readable_ser_tokens, expected_non_readable_tokens);
    let mut deserializer = Deserializer::builder()
        .is_human_readable(true)
        .tokens(non_readable_ser_tokens)
        .build();
    let des_lsn = Lsn::deserialize(&mut deserializer).unwrap();
    assert_eq!(des_lsn, original_lsn);
    // Testing mismatching ser/de: tokens from one mode must fail in the other
    let serializer = Serializer::builder().is_human_readable(false).build();
    let non_readable_ser_tokens = original_lsn.serialize(&serializer).unwrap();
    let mut deserializer = Deserializer::builder()
        .is_human_readable(true)
        .tokens(non_readable_ser_tokens)
        .build();
    Lsn::deserialize(&mut deserializer).unwrap_err();
    let serializer = Serializer::builder().is_human_readable(true).build();
    let readable_ser_tokens = original_lsn.serialize(&serializer).unwrap();
    let mut deserializer = Deserializer::builder()
        .is_human_readable(false)
        .tokens(readable_ser_tokens)
        .build();
    Lsn::deserialize(&mut deserializer).unwrap_err();
}
#[test]
fn test_lsn_ensure_roundtrip() {
    // Non-human-readable serialize followed by deserialize must reproduce the value.
    let want = Lsn(0xaaaabbbb);
    let serializer = Serializer::builder().is_human_readable(false).build();
    let tokens = want.serialize(&serializer).unwrap();
    let mut deserializer = Deserializer::builder()
        .is_human_readable(false)
        .tokens(tokens)
        .build();
    let got = Lsn::deserialize(&mut deserializer).unwrap();
    assert_eq!(got, want);
}
#[test]
fn test_lsn_bincode_serde() {
    // BeSer writes the u64 most-significant byte first.
    let lsn = Lsn(0x0123456789abcdef);
    let encoded = lsn.ser().unwrap();
    assert_eq!(encoded, [0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef]);
    assert_eq!(Lsn::des(&encoded).unwrap(), lsn);
}
#[test]
fn test_lsn_bincode_ensure_roundtrip() {
    // BeSer encode/decode must be a lossless round trip.
    let want = Lsn(0x01_02_03_04_05_06_07_08);
    let encoded = want.ser().unwrap();
    assert_eq!(encoded, vec![0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08]);
    assert_eq!(Lsn::des(&encoded).unwrap(), want);
}
} }

View File

@@ -3,6 +3,7 @@ use std::time::{Duration, SystemTime};
use bytes::{Buf, BufMut, Bytes, BytesMut}; use bytes::{Buf, BufMut, Bytes, BytesMut};
use pq_proto::{read_cstr, PG_EPOCH}; use pq_proto::{read_cstr, PG_EPOCH};
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use serde_with::{serde_as, DisplayFromStr};
use tracing::{trace, warn}; use tracing::{trace, warn};
use crate::lsn::Lsn; use crate::lsn::Lsn;
@@ -14,17 +15,21 @@ use crate::lsn::Lsn;
/// ///
/// serde Serialize is used only for human readable dump to json (e.g. in /// serde Serialize is used only for human readable dump to json (e.g. in
/// safekeepers debug_dump). /// safekeepers debug_dump).
#[serde_as]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub struct PageserverFeedback { pub struct PageserverFeedback {
/// Last known size of the timeline. Used to enforce timeline size limit. /// Last known size of the timeline. Used to enforce timeline size limit.
pub current_timeline_size: u64, pub current_timeline_size: u64,
/// LSN last received and ingested by the pageserver. Controls backpressure. /// LSN last received and ingested by the pageserver. Controls backpressure.
#[serde_as(as = "DisplayFromStr")]
pub last_received_lsn: Lsn, pub last_received_lsn: Lsn,
/// LSN up to which data is persisted by the pageserver to its local disc. /// LSN up to which data is persisted by the pageserver to its local disc.
/// Controls backpressure. /// Controls backpressure.
#[serde_as(as = "DisplayFromStr")]
pub disk_consistent_lsn: Lsn, pub disk_consistent_lsn: Lsn,
/// LSN up to which data is persisted by the pageserver on s3; safekeepers /// LSN up to which data is persisted by the pageserver on s3; safekeepers
/// consider WAL before it can be removed. /// consider WAL before it can be removed.
#[serde_as(as = "DisplayFromStr")]
pub remote_consistent_lsn: Lsn, pub remote_consistent_lsn: Lsn,
// Serialize with RFC3339 format. // Serialize with RFC3339 format.
#[serde(with = "serde_systemtime")] #[serde(with = "serde_systemtime")]

View File

@@ -125,9 +125,6 @@ where
// Wake everyone with an error. // Wake everyone with an error.
let mut internal = self.internal.lock().unwrap(); let mut internal = self.internal.lock().unwrap();
// Block any future waiters from starting
internal.shutdown = true;
// This will steal the entire waiters map. // This will steal the entire waiters map.
// When we drop it all waiters will be woken. // When we drop it all waiters will be woken.
mem::take(&mut internal.waiters) mem::take(&mut internal.waiters)

View File

@@ -1,7 +1,6 @@
/// Immediately terminate the calling process without calling /// Immediately terminate the calling process without calling
/// atexit callbacks, C runtime destructors etc. We mainly use /// atexit callbacks, C runtime destructors etc. We mainly use
/// this to protect coverage data from concurrent writes. /// this to protect coverage data from concurrent writes.
pub fn exit_now(code: u8) -> ! { pub fn exit_now(code: u8) {
// SAFETY: exiting is safe, the ffi is not safe
unsafe { nix::libc::_exit(code as _) }; unsafe { nix::libc::_exit(code as _) };
} }

View File

@@ -1,3 +0,0 @@
pub mod heavier_once_cell;
pub mod gate;

View File

@@ -1,158 +0,0 @@
use std::{sync::Arc, time::Duration};
/// Gates are a concurrency helper, primarily used for implementing safe shutdown.
///
/// Users of a resource call `enter()` to acquire a GateGuard, and the owner of
/// the resource calls `close()` when they want to ensure that all holders of guards
/// have released them, and that no future guards will be issued.
pub struct Gate {
    /// Each caller of enter() takes one unit from the semaphore. In close(), we
    /// take all the units to ensure all GateGuards are destroyed.
    sem: Arc<tokio::sync::Semaphore>,
    /// For observability only: a name that will be used to log warnings if a particular
    /// gate is holding up shutdown
    name: String,
}
/// RAII guard for a [`Gate`]: as long as this exists, calls to [`Gate::close`] will
/// not complete.
///
/// Holds one unit of the gate's semaphore; dropping the guard returns it.
#[derive(Debug)]
pub struct GateGuard(tokio::sync::OwnedSemaphorePermit);
/// Observability helper: drive `fut` to completion, logging a warning with the
/// gate's `name` and the elapsed time every `warn_period` until it finishes.
async fn warn_if_stuck<Fut: std::future::Future>(
    fut: Fut,
    name: &str,
    warn_period: std::time::Duration,
) -> <Fut as std::future::Future>::Output {
    let started = std::time::Instant::now();
    let mut fut = std::pin::pin!(fut);
    loop {
        // Poll for at most one warn_period; on timeout, warn and keep waiting.
        if let Ok(output) = tokio::time::timeout(warn_period, &mut fut).await {
            return output;
        }
        tracing::warn!(
            gate = name,
            elapsed_ms = started.elapsed().as_millis(),
            "still waiting, taking longer than expected..."
        );
    }
}
/// Error returned by [`Gate::enter`] once the gate has been closed.
#[derive(Debug)]
pub enum GateError {
    GateClosed,
}
impl Gate {
    /// Total number of semaphore units: each live GateGuard holds one, and
    /// `do_close` acquires all of them at once to prove no guards remain.
    const MAX_UNITS: u32 = u32::MAX;
    /// Create an open gate; `name` appears only in log messages.
    pub fn new(name: String) -> Self {
        Self {
            sem: Arc::new(tokio::sync::Semaphore::new(Self::MAX_UNITS as usize)),
            name,
        }
    }
    /// Acquire a guard that will prevent close() calls from completing. If close()
    /// was already called, this will return an error which should be interpreted
    /// as "shutting down".
    ///
    /// This function would typically be used from e.g. request handlers. While holding
    /// the guard returned from this function, it is important to respect a CancellationToken
    /// to avoid blocking close() indefinitely: typically types that contain a Gate will
    /// also contain a CancellationToken.
    pub fn enter(&self) -> Result<GateGuard, GateError> {
        // try_acquire_owned fails once the semaphore is closed (or while a
        // pending close() holds/queues for the units); either way, map to GateClosed.
        self.sem
            .clone()
            .try_acquire_owned()
            .map(GateGuard)
            .map_err(|_| GateError::GateClosed)
    }
    /// Types with a shutdown() method and a gate should call this method at the
    /// end of shutdown, to ensure that all GateGuard holders are done.
    ///
    /// This will wait for all guards to be destroyed. For this to complete promptly, it is
    /// important that the holders of such guards are respecting a CancellationToken which has
    /// been cancelled before entering this function.
    pub async fn close(&self) {
        // Warn once per second while close is stuck on outstanding guards.
        warn_if_stuck(self.do_close(), &self.name, Duration::from_millis(1000)).await
    }
    /// Check if [`Self::close()`] has finished waiting for all [`Self::enter()`] users to finish. This
    /// is usually analoguous for "Did shutdown finish?" for types that include a Gate, whereas checking
    /// the CancellationToken on such types is analogous to "Did shutdown start?"
    pub fn close_complete(&self) -> bool {
        self.sem.is_closed()
    }
    // Wait until every outstanding GateGuard is dropped, then close the
    // semaphore so no further guards can ever be issued.
    async fn do_close(&self) {
        tracing::debug!(gate = self.name, "Closing Gate...");
        match self.sem.acquire_many(Self::MAX_UNITS).await {
            Ok(_units) => {
                // While holding all units, close the semaphore. All subsequent calls to enter() will fail.
                self.sem.close();
            }
            Err(_) => {
                // Semaphore closed: we are the only function that can do this, so it indicates a double-call.
                // This is legal. Timeline::shutdown for example is not protected from being called more than
                // once.
                tracing::debug!(gate = self.name, "Double close")
            }
        }
        tracing::debug!(gate = self.name, "Closed Gate.")
    }
}
#[cfg(test)]
mod tests {
    use futures::FutureExt;
    use super::*;
    #[tokio::test]
    async fn test_idle_gate() {
        // Having taken no gates, we should not be blocked in close
        let gate = Gate::new("test".to_string());
        gate.close().await;
        // If a guard is dropped before entering, close should not be blocked
        let gate = Gate::new("test".to_string());
        let guard = gate.enter().unwrap();
        drop(guard);
        gate.close().await;
        // Entering a closed gate fails
        gate.enter().expect_err("enter should fail after close");
    }
    #[tokio::test]
    async fn test_busy_gate() {
        let gate = Gate::new("test".to_string());
        let guard = gate.enter().unwrap();
        // now_or_never polls close() exactly once without waiting on it
        let mut close_fut = std::pin::pin!(gate.close());
        // Close should be blocked
        assert!(close_fut.as_mut().now_or_never().is_none());
        // Attempting to enter() should fail, even though close isn't done yet.
        // (close()'s pending acquire_many is queued ahead of us, so try_acquire
        // cannot succeed — tokio semaphores hand out permits fairly.)
        gate.enter()
            .expect_err("enter should fail after entering close");
        drop(guard);
        // Guard is gone, close should finish
        assert!(close_fut.as_mut().now_or_never().is_some());
        // Attempting to enter() is still forbidden
        gate.enter().expect_err("enter should fail finishing close");
    }
}

View File

@@ -1,383 +0,0 @@
use std::sync::{
atomic::{AtomicUsize, Ordering},
Arc, Mutex, MutexGuard,
};
use tokio::sync::Semaphore;
/// Custom design like [`tokio::sync::OnceCell`] but using [`OwnedSemaphorePermit`] instead of
/// `SemaphorePermit`, allowing use of `take` which does not require holding an outer mutex guard
/// for the duration of initialization.
///
/// Has no unsafe, builds upon [`tokio::sync::Semaphore`] and [`std::sync::Mutex`].
///
/// [`OwnedSemaphorePermit`]: tokio::sync::OwnedSemaphorePermit
pub struct OnceCell<T> {
    /// Guards the value and its init semaphore (see `Inner` for the state encoding).
    inner: Mutex<Inner<T>>,
    /// Number of `get_or_init` callers currently queued for the init permit;
    /// exposed via `initializer_count`.
    initializers: AtomicUsize,
}
impl<T> Default for OnceCell<T> {
/// Create new uninitialized [`OnceCell`].
fn default() -> Self {
Self {
inner: Default::default(),
initializers: AtomicUsize::new(0),
}
}
}
/// Semaphore is the current state:
/// - open semaphore means the value is `None`, not yet initialized
/// - closed semaphore means the value has been initialized
#[derive(Debug)]
struct Inner<T> {
    // Arc so that `get_or_init` can clone it and wait without holding the Mutex.
    init_semaphore: Arc<Semaphore>,
    value: Option<T>,
}
impl<T> Default for Inner<T> {
fn default() -> Self {
Self {
init_semaphore: Arc::new(Semaphore::new(1)),
value: None,
}
}
}
impl<T> OnceCell<T> {
    /// Creates an already initialized `OnceCell` with the given value.
    pub fn new(value: T) -> Self {
        // A closed semaphore encodes the "initialized" state; see `Inner`.
        let sem = Semaphore::new(1);
        sem.close();
        Self {
            inner: Mutex::new(Inner {
                init_semaphore: Arc::new(sem),
                value: Some(value),
            }),
            initializers: AtomicUsize::new(0),
        }
    }
    /// Returns a guard to an existing initialized value, or uniquely initializes the value before
    /// returning the guard.
    ///
    /// Initializing might wait on any existing [`Guard::take_and_deinit`] deinitialization.
    ///
    /// Initialization is panic-safe and cancellation-safe.
    pub async fn get_or_init<F, Fut, E>(&self, factory: F) -> Result<Guard<'_, T>, E>
    where
        F: FnOnce(InitPermit) -> Fut,
        Fut: std::future::Future<Output = Result<(T, InitPermit), E>>,
    {
        // Fast path: already initialized. Otherwise clone the semaphore so we
        // do NOT hold the mutex while waiting for the init permit.
        let sem = {
            let guard = self.inner.lock().unwrap();
            if guard.value.is_some() {
                return Ok(Guard(guard));
            }
            guard.init_semaphore.clone()
        };
        let permit = {
            // increment the count for the duration of queued
            let _guard = CountWaitingInitializers::start(self);
            sem.acquire_owned().await
        };
        match permit {
            Ok(permit) => {
                // We won the right to initialize. The permit travels through the
                // factory; if the factory errors or its future is dropped
                // (panic/cancellation), the permit is dropped and the semaphore
                // unit is released, letting another caller attempt initialization.
                let permit = InitPermit(permit);
                let (value, _permit) = factory(permit).await?;
                let guard = self.inner.lock().unwrap();
                Ok(Self::set0(value, guard))
            }
            Err(_closed) => {
                // Semaphore was closed while we waited: another task completed
                // initialization (set0 closes the semaphore).
                let guard = self.inner.lock().unwrap();
                assert!(
                    guard.value.is_some(),
                    "semaphore got closed, must be initialized"
                );
                return Ok(Guard(guard));
            }
        }
    }
    /// Assuming a permit is held after previous call to [`Guard::take_and_deinit`], it can be used
    /// to complete initializing the inner value.
    ///
    /// # Panics
    ///
    /// If the inner has already been initialized.
    pub fn set(&self, value: T, _permit: InitPermit) -> Guard<'_, T> {
        let guard = self.inner.lock().unwrap();
        // cannot assert that this permit is for self.inner.semaphore, but we can assert it cannot
        // give more permits right now.
        if guard.init_semaphore.try_acquire().is_ok() {
            drop(guard);
            panic!("permit is of wrong origin");
        }
        Self::set0(value, guard)
    }
    // Store the value and close the semaphore; a closed semaphore is what
    // signals "initialized" to concurrent and future get_or_init callers.
    fn set0(value: T, mut guard: std::sync::MutexGuard<'_, Inner<T>>) -> Guard<'_, T> {
        if guard.value.is_some() {
            drop(guard);
            unreachable!("we won permit, must not be initialized");
        }
        guard.value = Some(value);
        guard.init_semaphore.close();
        Guard(guard)
    }
    /// Returns a guard to an existing initialized value, if any.
    pub fn get(&self) -> Option<Guard<'_, T>> {
        let guard = self.inner.lock().unwrap();
        if guard.value.is_some() {
            Some(Guard(guard))
        } else {
            None
        }
    }
    /// Return the number of [`Self::get_or_init`] calls waiting for initialization to complete.
    pub fn initializer_count(&self) -> usize {
        self.initializers.load(Ordering::Relaxed)
    }
}
/// DropGuard counter for queued tasks waiting to initialize, mainly accessible for the
/// initializing task for example at the end of initialization.
///
/// Increments `OnceCell::initializers` on construction, decrements on drop, so
/// the count stays accurate even if the waiting future is cancelled.
struct CountWaitingInitializers<'a, T>(&'a OnceCell<T>);
impl<'a, T> CountWaitingInitializers<'a, T> {
    fn start(target: &'a OnceCell<T>) -> Self {
        target.initializers.fetch_add(1, Ordering::Relaxed);
        CountWaitingInitializers(target)
    }
}
impl<'a, T> Drop for CountWaitingInitializers<'a, T> {
    fn drop(&mut self) {
        self.0.initializers.fetch_sub(1, Ordering::Relaxed);
    }
}
/// Uninteresting guard object to allow short-lived access to inspect or clone the held,
/// initialized value.
///
/// Wraps the cell's mutex guard, so holding it blocks all other access to the cell.
#[derive(Debug)]
pub struct Guard<'a, T>(MutexGuard<'a, Inner<T>>);
impl<T> std::ops::Deref for Guard<'_, T> {
    type Target = T;
    fn deref(&self) -> &Self::Target {
        self.0
            .value
            .as_ref()
            .expect("guard is not created unless value has been initialized")
    }
}
impl<T> std::ops::DerefMut for Guard<'_, T> {
    fn deref_mut(&mut self) -> &mut Self::Target {
        self.0
            .value
            .as_mut()
            .expect("guard is not created unless value has been initialized")
    }
}
impl<'a, T> Guard<'a, T> {
    /// Take the current value, and a new permit for it's deinitialization.
    ///
    /// The permit will be on a semaphore part of the new internal value, and any following
    /// [`OnceCell::get_or_init`] will wait on it to complete.
    pub fn take_and_deinit(&mut self) -> (T, InitPermit) {
        // Build a fresh uninitialized Inner, grab its single permit up front,
        // then swap it into the cell — the old Inner (with the value) comes out.
        let mut swapped = Inner::default();
        let permit = swapped
            .init_semaphore
            .clone()
            .try_acquire_owned()
            .expect("we just created this");
        std::mem::swap(&mut *self.0, &mut swapped);
        swapped
            .value
            .map(|v| (v, InitPermit(permit)))
            .expect("guard is not created unless value has been initialized")
    }
}
/// Type held by OnceCell (de)initializing task.
///
/// Wraps the single unit of the cell's init semaphore; dropping it without
/// completing initialization releases the unit so another task may try.
pub struct InitPermit(tokio::sync::OwnedSemaphorePermit);
#[cfg(test)]
mod tests {
use super::*;
use std::{
convert::Infallible,
sync::atomic::{AtomicUsize, Ordering},
time::Duration,
};
// 100 tasks race through get_or_init; exactly one factory may be invoked,
// exactly one future polled, and exactly one task may see its own value win.
#[tokio::test]
async fn many_initializers() {
    #[derive(Default, Debug)]
    struct Counters {
        factory_got_to_run: AtomicUsize,
        future_polled: AtomicUsize,
        winners: AtomicUsize,
    }
    let initializers = 100;
    let cell = Arc::new(OnceCell::default());
    let counters = Arc::new(Counters::default());
    // +1 so the spawner participates in the barrier and releases all tasks at once
    let barrier = Arc::new(tokio::sync::Barrier::new(initializers + 1));
    let mut js = tokio::task::JoinSet::new();
    for i in 0..initializers {
        js.spawn({
            let cell = cell.clone();
            let counters = counters.clone();
            let barrier = barrier.clone();
            async move {
                barrier.wait().await;
                let won = {
                    let g = cell
                        .get_or_init(|permit| {
                            counters.factory_got_to_run.fetch_add(1, Ordering::Relaxed);
                            async {
                                counters.future_polled.fetch_add(1, Ordering::Relaxed);
                                Ok::<_, Infallible>((i, permit))
                            }
                        })
                        .await
                        .unwrap();
                    // The stored value equals this task's index only for the winner.
                    *g == i
                };
                if won {
                    counters.winners.fetch_add(1, Ordering::Relaxed);
                }
            }
        });
    }
    barrier.wait().await;
    while let Some(next) = js.join_next().await {
        next.expect("no panics expected");
    }
    let mut counters = Arc::try_unwrap(counters).unwrap();
    assert_eq!(*counters.factory_got_to_run.get_mut(), 1);
    assert_eq!(*counters.future_polled.get_mut(), 1);
    assert_eq!(*counters.winners.get_mut(), 1);
}
#[tokio::test(start_paused = true)]
async fn reinit_waits_for_deinit() {
// with the tokio::time paused, we will "sleep" for 1s while holding the reinitialization
let sleep_for = Duration::from_secs(1);
let initial = 42;
let reinit = 1;
let cell = Arc::new(OnceCell::new(initial));
let deinitialization_started = Arc::new(tokio::sync::Barrier::new(2));
let jh = tokio::spawn({
let cell = cell.clone();
let deinitialization_started = deinitialization_started.clone();
async move {
let (answer, _permit) = cell.get().expect("initialized to value").take_and_deinit();
assert_eq!(answer, initial);
deinitialization_started.wait().await;
tokio::time::sleep(sleep_for).await;
}
});
deinitialization_started.wait().await;
let started_at = tokio::time::Instant::now();
cell.get_or_init(|permit| async { Ok::<_, Infallible>((reinit, permit)) })
.await
.unwrap();
let elapsed = started_at.elapsed();
assert!(
elapsed >= sleep_for,
"initialization should had taken at least the time time slept with permit"
);
jh.await.unwrap();
assert_eq!(*cell.get().unwrap(), reinit);
}
#[test]
fn reinit_with_deinit_permit() {
let cell = Arc::new(OnceCell::new(42));
let (mol, permit) = cell.get().unwrap().take_and_deinit();
cell.set(5, permit);
assert_eq!(*cell.get().unwrap(), 5);
let (five, permit) = cell.get().unwrap().take_and_deinit();
assert_eq!(5, five);
cell.set(mol, permit);
assert_eq!(*cell.get().unwrap(), 42);
}
#[tokio::test]
async fn initialization_attemptable_until_ok() {
let cell = OnceCell::default();
for _ in 0..10 {
cell.get_or_init(|_permit| async { Err("whatever error") })
.await
.unwrap_err();
}
let g = cell
.get_or_init(|permit| async { Ok::<_, Infallible>(("finally success", permit)) })
.await
.unwrap();
assert_eq!(*g, "finally success");
}
#[tokio::test]
async fn initialization_is_cancellation_safe() {
let cell = OnceCell::default();
let barrier = tokio::sync::Barrier::new(2);
let initializer = cell.get_or_init(|permit| async {
barrier.wait().await;
futures::future::pending::<()>().await;
Ok::<_, Infallible>(("never reached", permit))
});
tokio::select! {
_ = initializer => { unreachable!("cannot complete; stuck in pending().await") },
_ = barrier.wait() => {}
};
// now initializer is dropped
assert!(cell.get().is_none());
let g = cell
.get_or_init(|permit| async { Ok::<_, Infallible>(("now initialized", permit)) })
.await
.unwrap();
assert_eq!(*g, "now initialized");
}
}

View File

@@ -1,37 +0,0 @@
use std::time::Duration;
use tokio_util::sync::CancellationToken;
pub enum TimeoutCancellableError {
Timeout,
Cancelled,
}
/// Wrap [`tokio::time::timeout`] with a CancellationToken.
///
/// This wrapper is appropriate for any long running operation in a task
/// that ought to respect a CancellationToken (which means most tasks).
///
/// The only time you should use a bare tokio::timeout is when the future `F`
/// itself respects a CancellationToken: otherwise, always use this wrapper
/// with your CancellationToken to ensure that your task does not hold up
/// graceful shutdown.
pub async fn timeout_cancellable<F>(
duration: Duration,
cancel: &CancellationToken,
future: F,
) -> Result<F::Output, TimeoutCancellableError>
where
F: std::future::Future,
{
tokio::select!(
r = tokio::time::timeout(duration, future) => {
r.map_err(|_| TimeoutCancellableError::Timeout)
},
_ = cancel.cancelled() => {
Err(TimeoutCancellableError::Cancelled)
}
)
}

View File

@@ -19,12 +19,13 @@ inotify.workspace = true
serde.workspace = true serde.workspace = true
serde_json.workspace = true serde_json.workspace = true
sysinfo.workspace = true sysinfo.workspace = true
tokio = { workspace = true, features = ["rt-multi-thread"] } tokio.workspace = true
tokio-postgres.workspace = true tokio-postgres.workspace = true
tokio-stream.workspace = true tokio-stream.workspace = true
tokio-util.workspace = true tokio-util.workspace = true
tracing.workspace = true tracing.workspace = true
tracing-subscriber.workspace = true tracing-subscriber.workspace = true
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
[target.'cfg(target_os = "linux")'.dependencies] [target.'cfg(target_os = "linux")'.dependencies]
cgroups-rs = "0.3.3" cgroups-rs = "0.3.3"

View File

@@ -21,6 +21,11 @@ pub struct FileCacheState {
#[derive(Debug)] #[derive(Debug)]
pub struct FileCacheConfig { pub struct FileCacheConfig {
/// Whether the file cache is *actually* stored in memory (e.g. by writing to
/// a tmpfs or shmem file). If true, the size of the file cache will be counted against the
/// memory available for the cgroup.
pub(crate) in_memory: bool,
/// The size of the file cache, in terms of the size of the resource it consumes /// The size of the file cache, in terms of the size of the resource it consumes
/// (currently: only memory) /// (currently: only memory)
/// ///
@@ -54,9 +59,22 @@ pub struct FileCacheConfig {
spread_factor: f64, spread_factor: f64,
} }
impl Default for FileCacheConfig { impl FileCacheConfig {
fn default() -> Self { pub fn default_in_memory() -> Self {
Self { Self {
in_memory: true,
// 75 %
resource_multiplier: 0.75,
// 640 MiB; (512 + 128)
min_remaining_after_cache: NonZeroU64::new(640 * MiB).unwrap(),
// ensure any increase in file cache size is split 90-10 with 10% to other memory
spread_factor: 0.1,
}
}
pub fn default_on_disk() -> Self {
Self {
in_memory: false,
resource_multiplier: 0.75, resource_multiplier: 0.75,
// 256 MiB - lower than when in memory because overcommitting is safe; if we don't have // 256 MiB - lower than when in memory because overcommitting is safe; if we don't have
// memory, the kernel will just evict from its page cache, rather than e.g. killing // memory, the kernel will just evict from its page cache, rather than e.g. killing
@@ -65,9 +83,7 @@ impl Default for FileCacheConfig {
spread_factor: 0.1, spread_factor: 0.1,
} }
} }
}
impl FileCacheConfig {
/// Make sure fields of the config are consistent. /// Make sure fields of the config are consistent.
pub fn validate(&self) -> anyhow::Result<()> { pub fn validate(&self) -> anyhow::Result<()> {
// Single field validity // Single field validity

View File

@@ -1,5 +1,3 @@
#![deny(unsafe_code)]
#![deny(clippy::undocumented_unsafe_blocks)]
#![cfg(target_os = "linux")] #![cfg(target_os = "linux")]
use anyhow::Context; use anyhow::Context;
@@ -41,6 +39,16 @@ pub struct Args {
#[arg(short, long)] #[arg(short, long)]
pub pgconnstr: Option<String>, pub pgconnstr: Option<String>,
/// Flag to signal that the Postgres file cache is on disk (i.e. not in memory aside from the
/// kernel's page cache), and therefore should not count against available memory.
//
// NB: Ideally this flag would directly refer to whether the file cache is in memory (rather
// than a roundabout way, via whether it's on disk), but in order to be backwards compatible
// during the switch away from an in-memory file cache, we had to default to the previous
// behavior.
#[arg(long)]
pub file_cache_on_disk: bool,
/// The address we should listen on for connection requests. For the /// The address we should listen on for connection requests. For the
/// agent, this is 0.0.0.0:10301. For the informant, this is 127.0.0.1:10369. /// agent, this is 0.0.0.0:10301. For the informant, this is 127.0.0.1:10369.
#[arg(short, long)] #[arg(short, long)]

View File

@@ -156,7 +156,10 @@ impl Runner {
// memory limits. // memory limits.
if let Some(connstr) = &args.pgconnstr { if let Some(connstr) = &args.pgconnstr {
info!("initializing file cache"); info!("initializing file cache");
let config = FileCacheConfig::default(); let config = match args.file_cache_on_disk {
true => FileCacheConfig::default_on_disk(),
false => FileCacheConfig::default_in_memory(),
};
let mut file_cache = FileCacheState::new(connstr, config, token.clone()) let mut file_cache = FileCacheState::new(connstr, config, token.clone())
.await .await
@@ -184,7 +187,10 @@ impl Runner {
info!("file cache size actually got set to {actual_size}") info!("file cache size actually got set to {actual_size}")
} }
file_cache_disk_size = actual_size; if args.file_cache_on_disk {
file_cache_disk_size = actual_size;
}
state.filecache = Some(file_cache); state.filecache = Some(file_cache);
} }
@@ -233,11 +239,17 @@ impl Runner {
let requested_mem = target.mem; let requested_mem = target.mem;
let usable_system_memory = requested_mem.saturating_sub(self.config.sys_buffer_bytes); let usable_system_memory = requested_mem.saturating_sub(self.config.sys_buffer_bytes);
let expected_file_cache_size = self let (expected_file_cache_size, expected_file_cache_disk_size) = self
.filecache .filecache
.as_ref() .as_ref()
.map(|file_cache| file_cache.config.calculate_cache_size(usable_system_memory)) .map(|file_cache| {
.unwrap_or(0); let size = file_cache.config.calculate_cache_size(usable_system_memory);
match file_cache.config.in_memory {
true => (size, 0),
false => (size, size),
}
})
.unwrap_or((0, 0));
if let Some(cgroup) = &self.cgroup { if let Some(cgroup) = &self.cgroup {
let (last_time, last_history) = *cgroup.watcher.borrow(); let (last_time, last_history) = *cgroup.watcher.borrow();
@@ -261,7 +273,7 @@ impl Runner {
let new_threshold = self let new_threshold = self
.config .config
.cgroup_threshold(usable_system_memory, expected_file_cache_size); .cgroup_threshold(usable_system_memory, expected_file_cache_disk_size);
let current = last_history.avg_non_reclaimable; let current = last_history.avg_non_reclaimable;
@@ -288,10 +300,13 @@ impl Runner {
.set_file_cache_size(expected_file_cache_size) .set_file_cache_size(expected_file_cache_size)
.await .await
.context("failed to set file cache size")?; .context("failed to set file cache size")?;
file_cache_disk_size = actual_usage; if !file_cache.config.in_memory {
file_cache_disk_size = actual_usage;
}
let message = format!( let message = format!(
"set file cache size to {} MiB", "set file cache size to {} MiB (in memory = {})",
bytes_to_mebibytes(actual_usage), bytes_to_mebibytes(actual_usage),
file_cache.config.in_memory,
); );
info!("downscale: {message}"); info!("downscale: {message}");
status.push(message); status.push(message);
@@ -342,7 +357,9 @@ impl Runner {
.set_file_cache_size(expected_usage) .set_file_cache_size(expected_usage)
.await .await
.context("failed to set file cache size")?; .context("failed to set file cache size")?;
file_cache_disk_size = actual_usage; if !file_cache.config.in_memory {
file_cache_disk_size = actual_usage;
}
if actual_usage != expected_usage { if actual_usage != expected_usage {
warn!( warn!(

View File

@@ -188,7 +188,6 @@ extern "C" fn recovery_download(
} }
} }
#[allow(clippy::unnecessary_cast)]
extern "C" fn wal_read( extern "C" fn wal_read(
sk: *mut Safekeeper, sk: *mut Safekeeper,
buf: *mut ::std::os::raw::c_char, buf: *mut ::std::os::raw::c_char,
@@ -422,7 +421,6 @@ impl std::fmt::Display for Level {
} }
/// Take ownership of `Vec<u8>` from StringInfoData. /// Take ownership of `Vec<u8>` from StringInfoData.
#[allow(clippy::unnecessary_cast)]
pub(crate) fn take_vec_u8(pg: &mut StringInfoData) -> Option<Vec<u8>> { pub(crate) fn take_vec_u8(pg: &mut StringInfoData) -> Option<Vec<u8>> {
if pg.data.is_null() { if pg.data.is_null() {
return None; return None;

View File

@@ -186,7 +186,7 @@ impl Wrapper {
.unwrap() .unwrap()
.into_bytes_with_nul(); .into_bytes_with_nul();
assert!(safekeepers_list_vec.len() == safekeepers_list_vec.capacity()); assert!(safekeepers_list_vec.len() == safekeepers_list_vec.capacity());
let safekeepers_list = safekeepers_list_vec.as_mut_ptr() as *mut std::ffi::c_char; let safekeepers_list = safekeepers_list_vec.as_mut_ptr() as *mut i8;
let callback_data = Box::into_raw(Box::new(api)) as *mut ::std::os::raw::c_void; let callback_data = Box::into_raw(Box::new(api)) as *mut ::std::os::raw::c_void;

View File

@@ -1,21 +1,22 @@
use utils::auth::{AuthError, Claims, Scope}; use anyhow::{bail, Result};
use utils::auth::{Claims, Scope};
use utils::id::TenantId; use utils::id::TenantId;
pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<(), AuthError> { pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<()> {
match (&claims.scope, tenant_id) { match (&claims.scope, tenant_id) {
(Scope::Tenant, None) => Err(AuthError( (Scope::Tenant, None) => {
"Attempt to access management api with tenant scope. Permission denied".into(), bail!("Attempt to access management api with tenant scope. Permission denied")
)), }
(Scope::Tenant, Some(tenant_id)) => { (Scope::Tenant, Some(tenant_id)) => {
if claims.tenant_id.unwrap() != tenant_id { if claims.tenant_id.unwrap() != tenant_id {
return Err(AuthError("Tenant id mismatch. Permission denied".into())); bail!("Tenant id mismatch. Permission denied")
} }
Ok(()) Ok(())
} }
(Scope::PageServerApi, None) => Ok(()), // access to management api for PageServerApi scope (Scope::PageServerApi, None) => Ok(()), // access to management api for PageServerApi scope
(Scope::PageServerApi, Some(_)) => Ok(()), // access to tenant api using PageServerApi scope (Scope::PageServerApi, Some(_)) => Ok(()), // access to tenant api using PageServerApi scope
(Scope::SafekeeperData, _) => Err(AuthError( (Scope::SafekeeperData, _) => {
"SafekeeperData scope makes no sense for Pageserver".into(), bail!("SafekeeperData scope makes no sense for Pageserver")
)), }
} }
} }

View File

@@ -34,15 +34,11 @@ use postgres_backend::AuthType;
use utils::logging::TracingErrorLayerEnablement; use utils::logging::TracingErrorLayerEnablement;
use utils::signals::ShutdownSignals; use utils::signals::ShutdownSignals;
use utils::{ use utils::{
auth::{JwtAuth, SwappableJwtAuth}, auth::JwtAuth, logging, project_git_version, sentry_init::init_sentry, signals::Signal,
logging, project_build_tag, project_git_version,
sentry_init::init_sentry,
signals::Signal,
tcp_listener, tcp_listener,
}; };
project_git_version!(GIT_VERSION); project_git_version!(GIT_VERSION);
project_build_tag!(BUILD_TAG);
const PID_FILE_NAME: &str = "pageserver.pid"; const PID_FILE_NAME: &str = "pageserver.pid";
@@ -262,12 +258,11 @@ fn start_pageserver(
// A changed version string indicates changed software. // A changed version string indicates changed software.
// A changed launch timestamp indicates a pageserver restart. // A changed launch timestamp indicates a pageserver restart.
info!( info!(
"version: {} launch_timestamp: {} build_tag: {}", "version: {} launch_timestamp: {}",
version(), version(),
launch_ts.to_string(), launch_ts.to_string()
BUILD_TAG,
); );
set_build_info_metric(GIT_VERSION, BUILD_TAG); set_build_info_metric(GIT_VERSION);
set_launch_timestamp_metric(launch_ts); set_launch_timestamp_metric(launch_ts);
pageserver::preinitialize_metrics(); pageserver::preinitialize_metrics();
@@ -324,12 +319,13 @@ fn start_pageserver(
let http_auth; let http_auth;
let pg_auth; let pg_auth;
if conf.http_auth_type == AuthType::NeonJWT || conf.pg_auth_type == AuthType::NeonJWT { if conf.http_auth_type == AuthType::NeonJWT || conf.pg_auth_type == AuthType::NeonJWT {
// unwrap is ok because check is performed when creating config, so path is set and exists // unwrap is ok because check is performed when creating config, so path is set and file exists
let key_path = conf.auth_validation_public_key_path.as_ref().unwrap(); let key_path = conf.auth_validation_public_key_path.as_ref().unwrap();
info!("Loading public key(s) for verifying JWT tokens from {key_path:?}"); info!(
"Loading public key for verifying JWT tokens from {:#?}",
let jwt_auth = JwtAuth::from_key_path(key_path)?; key_path
let auth: Arc<SwappableJwtAuth> = Arc::new(SwappableJwtAuth::new(jwt_auth)); );
let auth: Arc<JwtAuth> = Arc::new(JwtAuth::from_key_path(key_path)?);
http_auth = match &conf.http_auth_type { http_auth = match &conf.http_auth_type {
AuthType::Trust => None, AuthType::Trust => None,
@@ -412,7 +408,7 @@ fn start_pageserver(
// Scan the local 'tenants/' directory and start loading the tenants // Scan the local 'tenants/' directory and start loading the tenants
let deletion_queue_client = deletion_queue.new_client(); let deletion_queue_client = deletion_queue.new_client();
let tenant_manager = BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr( BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
conf, conf,
TenantSharedResources { TenantSharedResources {
broker_client: broker_client.clone(), broker_client: broker_client.clone(),
@@ -422,7 +418,6 @@ fn start_pageserver(
order, order,
shutdown_pageserver.clone(), shutdown_pageserver.clone(),
))?; ))?;
let tenant_manager = Arc::new(tenant_manager);
BACKGROUND_RUNTIME.spawn({ BACKGROUND_RUNTIME.spawn({
let init_done_rx = init_done_rx; let init_done_rx = init_done_rx;
@@ -551,7 +546,6 @@ fn start_pageserver(
let router_state = Arc::new( let router_state = Arc::new(
http::routes::State::new( http::routes::State::new(
conf, conf,
tenant_manager,
http_auth.clone(), http_auth.clone(),
remote_storage.clone(), remote_storage.clone(),
broker_client.clone(), broker_client.clone(),

View File

@@ -33,7 +33,8 @@ use crate::disk_usage_eviction_task::DiskUsageEvictionTaskConfig;
use crate::tenant::config::TenantConf; use crate::tenant::config::TenantConf;
use crate::tenant::config::TenantConfOpt; use crate::tenant::config::TenantConfOpt;
use crate::tenant::{ use crate::tenant::{
TENANTS_SEGMENT_NAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME, TENANTS_SEGMENT_NAME, TENANT_ATTACHING_MARKER_FILENAME, TENANT_DELETED_MARKER_FILE_NAME,
TIMELINES_SEGMENT_NAME,
}; };
use crate::{ use crate::{
IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TENANT_LOCATION_CONFIG_NAME, IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TENANT_LOCATION_CONFIG_NAME,
@@ -161,7 +162,7 @@ pub struct PageServerConf {
pub http_auth_type: AuthType, pub http_auth_type: AuthType,
/// authentication method for libpq connections from compute /// authentication method for libpq connections from compute
pub pg_auth_type: AuthType, pub pg_auth_type: AuthType,
/// Path to a file or directory containing public key(s) for verifying JWT tokens. /// Path to a file containing public key for verifying JWT tokens.
/// Used for both mgmt and compute auth, if enabled. /// Used for both mgmt and compute auth, if enabled.
pub auth_validation_public_key_path: Option<Utf8PathBuf>, pub auth_validation_public_key_path: Option<Utf8PathBuf>,
@@ -632,6 +633,11 @@ impl PageServerConf {
self.tenants_path().join(tenant_id.to_string()) self.tenants_path().join(tenant_id.to_string())
} }
pub fn tenant_attaching_mark_file_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
self.tenant_path(tenant_id)
.join(TENANT_ATTACHING_MARKER_FILENAME)
}
pub fn tenant_ignore_mark_file_path(&self, tenant_id: &TenantId) -> Utf8PathBuf { pub fn tenant_ignore_mark_file_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
self.tenant_path(tenant_id).join(IGNORED_TENANT_FILE_NAME) self.tenant_path(tenant_id).join(IGNORED_TENANT_FILE_NAME)
} }
@@ -1314,6 +1320,12 @@ broker_endpoint = '{broker_endpoint}'
assert_eq!( assert_eq!(
parsed_remote_storage_config, parsed_remote_storage_config,
RemoteStorageConfig { RemoteStorageConfig {
max_concurrent_syncs: NonZeroUsize::new(
remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS
)
.unwrap(),
max_sync_errors: NonZeroU32::new(remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS)
.unwrap(),
storage: RemoteStorageKind::LocalFs(local_storage_path.clone()), storage: RemoteStorageKind::LocalFs(local_storage_path.clone()),
}, },
"Remote storage config should correctly parse the local FS config and fill other storage defaults" "Remote storage config should correctly parse the local FS config and fill other storage defaults"
@@ -1374,6 +1386,8 @@ broker_endpoint = '{broker_endpoint}'
assert_eq!( assert_eq!(
parsed_remote_storage_config, parsed_remote_storage_config,
RemoteStorageConfig { RemoteStorageConfig {
max_concurrent_syncs,
max_sync_errors,
storage: RemoteStorageKind::AwsS3(S3Config { storage: RemoteStorageKind::AwsS3(S3Config {
bucket_name: bucket_name.clone(), bucket_name: bucket_name.clone(),
bucket_region: bucket_region.clone(), bucket_region: bucket_region.clone(),
@@ -1465,6 +1479,8 @@ threshold = "20m"
Some(DiskUsageEvictionTaskConfig { Some(DiskUsageEvictionTaskConfig {
max_usage_pct: Percent::new(80).unwrap(), max_usage_pct: Percent::new(80).unwrap(),
min_avail_bytes: 0, min_avail_bytes: 0,
target_avail_bytes: None,
target_usage_pct: None,
period: Duration::from_secs(10), period: Duration::from_secs(10),
#[cfg(feature = "testing")] #[cfg(feature = "testing")]
mock_statvfs: None, mock_statvfs: None,

View File

@@ -266,7 +266,7 @@ async fn calculate_synthetic_size_worker(
continue; continue;
} }
if let Ok(tenant) = mgr::get_tenant(tenant_id, true) { if let Ok(tenant) = mgr::get_tenant(tenant_id, true).await {
// TODO should we use concurrent_background_tasks_rate_limit() here, like the other background tasks? // TODO should we use concurrent_background_tasks_rate_limit() here, like the other background tasks?
// We can put in some prioritization for consumption metrics. // We can put in some prioritization for consumption metrics.
// Same for the loop that fetches computed metrics. // Same for the loop that fetches computed metrics.

View File

@@ -3,6 +3,7 @@ use anyhow::Context;
use chrono::{DateTime, Utc}; use chrono::{DateTime, Utc};
use consumption_metrics::EventType; use consumption_metrics::EventType;
use futures::stream::StreamExt; use futures::stream::StreamExt;
use serde_with::serde_as;
use std::{sync::Arc, time::SystemTime}; use std::{sync::Arc, time::SystemTime};
use utils::{ use utils::{
id::{TenantId, TimelineId}, id::{TenantId, TimelineId},
@@ -41,10 +42,13 @@ pub(super) enum Name {
/// ///
/// This is a denormalization done at the MetricsKey const methods; these should not be constructed /// This is a denormalization done at the MetricsKey const methods; these should not be constructed
/// elsewhere. /// elsewhere.
#[serde_with::serde_as]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)] #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)]
pub(crate) struct MetricsKey { pub(crate) struct MetricsKey {
#[serde_as(as = "serde_with::DisplayFromStr")]
pub(super) tenant_id: TenantId, pub(super) tenant_id: TenantId,
#[serde_as(as = "Option<serde_with::DisplayFromStr>")]
#[serde(skip_serializing_if = "Option::is_none")] #[serde(skip_serializing_if = "Option::is_none")]
pub(super) timeline_id: Option<TimelineId>, pub(super) timeline_id: Option<TimelineId>,
@@ -202,6 +206,7 @@ pub(super) async fn collect_all_metrics(
None None
} else { } else {
crate::tenant::mgr::get_tenant(id, true) crate::tenant::mgr::get_tenant(id, true)
.await
.ok() .ok()
.map(|tenant| (id, tenant)) .map(|tenant| (id, tenant))
} }

View File

@@ -1,4 +1,5 @@
use consumption_metrics::{Event, EventChunk, IdempotencyKey, CHUNK_SIZE}; use consumption_metrics::{Event, EventChunk, IdempotencyKey, CHUNK_SIZE};
use serde_with::serde_as;
use tokio_util::sync::CancellationToken; use tokio_util::sync::CancellationToken;
use tracing::Instrument; use tracing::Instrument;
@@ -6,9 +7,12 @@ use super::{metrics::Name, Cache, MetricsKey, RawMetric};
use utils::id::{TenantId, TimelineId}; use utils::id::{TenantId, TimelineId};
/// How the metrics from pageserver are identified. /// How the metrics from pageserver are identified.
#[serde_with::serde_as]
#[derive(serde::Serialize, serde::Deserialize, Debug, Clone, Copy, PartialEq)] #[derive(serde::Serialize, serde::Deserialize, Debug, Clone, Copy, PartialEq)]
struct Ids { struct Ids {
#[serde_as(as = "serde_with::DisplayFromStr")]
pub(super) tenant_id: TenantId, pub(super) tenant_id: TenantId,
#[serde_as(as = "Option<serde_with::DisplayFromStr>")]
#[serde(skip_serializing_if = "Option::is_none")] #[serde(skip_serializing_if = "Option::is_none")]
pub(super) timeline_id: Option<TimelineId>, pub(super) timeline_id: Option<TimelineId>,
} }

View File

@@ -57,10 +57,7 @@ impl ControlPlaneClient {
if let Some(jwt) = &conf.control_plane_api_token { if let Some(jwt) = &conf.control_plane_api_token {
let mut headers = hyper::HeaderMap::new(); let mut headers = hyper::HeaderMap::new();
headers.insert( headers.insert("Authorization", jwt.get_contents().parse().unwrap());
"Authorization",
format!("Bearer {}", jwt.get_contents()).parse().unwrap(),
);
client = client.default_headers(headers); client = client.default_headers(headers);
} }
@@ -147,7 +144,7 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
Ok(response Ok(response
.tenants .tenants
.into_iter() .into_iter()
.map(|t| (t.id, Generation::new(t.gen))) .map(|t| (t.id, Generation::new(t.generation)))
.collect::<HashMap<_, _>>()) .collect::<HashMap<_, _>>())
} }

View File

@@ -10,7 +10,6 @@ use crate::control_plane_client::ControlPlaneGenerationsApi;
use crate::metrics; use crate::metrics;
use crate::tenant::remote_timeline_client::remote_layer_path; use crate::tenant::remote_timeline_client::remote_layer_path;
use crate::tenant::remote_timeline_client::remote_timeline_path; use crate::tenant::remote_timeline_client::remote_timeline_path;
use crate::virtual_file::MaybeFatalIo;
use crate::virtual_file::VirtualFile; use crate::virtual_file::VirtualFile;
use anyhow::Context; use anyhow::Context;
use camino::Utf8PathBuf; use camino::Utf8PathBuf;
@@ -18,6 +17,7 @@ use hex::FromHex;
use remote_storage::{GenericRemoteStorage, RemotePath}; use remote_storage::{GenericRemoteStorage, RemotePath};
use serde::Deserialize; use serde::Deserialize;
use serde::Serialize; use serde::Serialize;
use serde_with::serde_as;
use thiserror::Error; use thiserror::Error;
use tokio; use tokio;
use tokio_util::sync::CancellationToken; use tokio_util::sync::CancellationToken;
@@ -214,6 +214,7 @@ where
/// during recovery as startup. /// during recovery as startup.
const TEMP_SUFFIX: &str = "tmp"; const TEMP_SUFFIX: &str = "tmp";
#[serde_as]
#[derive(Debug, Serialize, Deserialize)] #[derive(Debug, Serialize, Deserialize)]
struct DeletionList { struct DeletionList {
/// Serialization version, for future use /// Serialization version, for future use
@@ -242,6 +243,7 @@ struct DeletionList {
validated: bool, validated: bool,
} }
#[serde_as]
#[derive(Debug, Serialize, Deserialize)] #[derive(Debug, Serialize, Deserialize)]
struct DeletionHeader { struct DeletionHeader {
/// Serialization version, for future use /// Serialization version, for future use
@@ -269,9 +271,7 @@ impl DeletionHeader {
let temp_path = path_with_suffix_extension(&header_path, TEMP_SUFFIX); let temp_path = path_with_suffix_extension(&header_path, TEMP_SUFFIX);
VirtualFile::crashsafe_overwrite(&header_path, &temp_path, &header_bytes) VirtualFile::crashsafe_overwrite(&header_path, &temp_path, &header_bytes)
.await .await
.maybe_fatal_err("save deletion header")?; .map_err(Into::into)
Ok(())
} }
} }
@@ -360,7 +360,6 @@ impl DeletionList {
let bytes = serde_json::to_vec(self).expect("Failed to serialize deletion list"); let bytes = serde_json::to_vec(self).expect("Failed to serialize deletion list");
VirtualFile::crashsafe_overwrite(&path, &temp_path, &bytes) VirtualFile::crashsafe_overwrite(&path, &temp_path, &bytes)
.await .await
.maybe_fatal_err("save deletion list")
.map_err(Into::into) .map_err(Into::into)
} }
} }
@@ -893,6 +892,14 @@ mod test {
std::fs::create_dir_all(remote_fs_dir)?; std::fs::create_dir_all(remote_fs_dir)?;
let remote_fs_dir = harness.conf.workdir.join("remote_fs").canonicalize_utf8()?; let remote_fs_dir = harness.conf.workdir.join("remote_fs").canonicalize_utf8()?;
let storage_config = RemoteStorageConfig { let storage_config = RemoteStorageConfig {
max_concurrent_syncs: std::num::NonZeroUsize::new(
remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS,
)
.unwrap(),
max_sync_errors: std::num::NonZeroU32::new(
remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS,
)
.unwrap(),
storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()), storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()),
}; };
let storage = GenericRemoteStorage::from_config(&storage_config).unwrap(); let storage = GenericRemoteStorage::from_config(&storage_config).unwrap();

View File

@@ -55,24 +55,21 @@ impl Deleter {
/// Wrap the remote `delete_objects` with a failpoint /// Wrap the remote `delete_objects` with a failpoint
async fn remote_delete(&self) -> Result<(), anyhow::Error> { async fn remote_delete(&self) -> Result<(), anyhow::Error> {
fail::fail_point!("deletion-queue-before-execute", |_| {
info!("Skipping execution, failpoint set");
metrics::DELETION_QUEUE
.remote_errors
.with_label_values(&["failpoint"])
.inc();
Err(anyhow::anyhow!("failpoint hit"))
});
// A backoff::retry is used here for two reasons: // A backoff::retry is used here for two reasons:
// - To provide a backoff rather than busy-polling the API on errors // - To provide a backoff rather than busy-polling the API on errors
// - To absorb transient 429/503 conditions without hitting our error // - To absorb transient 429/503 conditions without hitting our error
// logging path for issues deleting objects. // logging path for issues deleting objects.
backoff::retry( backoff::retry(
|| async { || async { self.remote_storage.delete_objects(&self.accumulator).await },
fail::fail_point!("deletion-queue-before-execute", |_| {
info!("Skipping execution, failpoint set");
metrics::DELETION_QUEUE
.remote_errors
.with_label_values(&["failpoint"])
.inc();
Err(anyhow::anyhow!("failpoint: deletion-queue-before-execute"))
});
self.remote_storage.delete_objects(&self.accumulator).await
},
|_| false, |_| false,
3, 3,
10, 10,

View File

@@ -34,8 +34,6 @@ use crate::deletion_queue::TEMP_SUFFIX;
use crate::metrics; use crate::metrics;
use crate::tenant::remote_timeline_client::remote_layer_path; use crate::tenant::remote_timeline_client::remote_layer_path;
use crate::tenant::storage_layer::LayerFileName; use crate::tenant::storage_layer::LayerFileName;
use crate::virtual_file::on_fatal_io_error;
use crate::virtual_file::MaybeFatalIo;
// The number of keys in a DeletionList before we will proactively persist it // The number of keys in a DeletionList before we will proactively persist it
// (without reaching a flush deadline). This aims to deliver objects of the order // (without reaching a flush deadline). This aims to deliver objects of the order
@@ -197,7 +195,7 @@ impl ListWriter {
debug!("Deletion header {header_path} not found, first start?"); debug!("Deletion header {header_path} not found, first start?");
Ok(None) Ok(None)
} else { } else {
on_fatal_io_error(&e, "reading deletion header"); Err(anyhow::anyhow!(e))
} }
} }
} }
@@ -218,9 +216,16 @@ impl ListWriter {
self.pending.sequence = validated_sequence + 1; self.pending.sequence = validated_sequence + 1;
let deletion_directory = self.conf.deletion_prefix(); let deletion_directory = self.conf.deletion_prefix();
let mut dir = tokio::fs::read_dir(&deletion_directory) let mut dir = match tokio::fs::read_dir(&deletion_directory).await {
.await Ok(d) => d,
.fatal_err("read deletion directory"); Err(e) => {
warn!("Failed to open deletion list directory {deletion_directory}: {e:#}");
// Give up: if we can't read the deletion list directory, we probably can't
// write lists into it later, so the queue won't work.
return Err(e.into());
}
};
let list_name_pattern = let list_name_pattern =
Regex::new("(?<sequence>[a-zA-Z0-9]{16})-(?<version>[a-zA-Z0-9]{2}).list").unwrap(); Regex::new("(?<sequence>[a-zA-Z0-9]{16})-(?<version>[a-zA-Z0-9]{2}).list").unwrap();
@@ -228,7 +233,7 @@ impl ListWriter {
let temp_extension = format!(".{TEMP_SUFFIX}"); let temp_extension = format!(".{TEMP_SUFFIX}");
let header_path = self.conf.deletion_header_path(); let header_path = self.conf.deletion_header_path();
let mut seqs: Vec<u64> = Vec::new(); let mut seqs: Vec<u64> = Vec::new();
while let Some(dentry) = dir.next_entry().await.fatal_err("read deletion dentry") { while let Some(dentry) = dir.next_entry().await? {
let file_name = dentry.file_name(); let file_name = dentry.file_name();
let dentry_str = file_name.to_string_lossy(); let dentry_str = file_name.to_string_lossy();
@@ -241,9 +246,11 @@ impl ListWriter {
info!("Cleaning up temporary file {dentry_str}"); info!("Cleaning up temporary file {dentry_str}");
let absolute_path = let absolute_path =
deletion_directory.join(dentry.file_name().to_str().expect("non-Unicode path")); deletion_directory.join(dentry.file_name().to_str().expect("non-Unicode path"));
tokio::fs::remove_file(&absolute_path) if let Err(e) = tokio::fs::remove_file(&absolute_path).await {
.await // Non-fatal error: we will just leave the file behind but not
.fatal_err("delete temp file"); // try and load it.
warn!("Failed to clean up temporary file {absolute_path}: {e:#}");
}
continue; continue;
} }
@@ -283,9 +290,7 @@ impl ListWriter {
for s in seqs { for s in seqs {
let list_path = self.conf.deletion_list_path(s); let list_path = self.conf.deletion_list_path(s);
let list_bytes = tokio::fs::read(&list_path) let list_bytes = tokio::fs::read(&list_path).await?;
.await
.fatal_err("read deletion list");
let mut deletion_list = match serde_json::from_slice::<DeletionList>(&list_bytes) { let mut deletion_list = match serde_json::from_slice::<DeletionList>(&list_bytes) {
Ok(l) => l, Ok(l) => l,
@@ -344,7 +349,7 @@ impl ListWriter {
info!("Started deletion frontend worker"); info!("Started deletion frontend worker");
// Synchronous, but we only do it once per process lifetime so it's tolerable // Synchronous, but we only do it once per process lifetime so it's tolerable
if let Err(e) = create_dir_all(self.conf.deletion_prefix()) { if let Err(e) = create_dir_all(&self.conf.deletion_prefix()) {
tracing::error!( tracing::error!(
"Failed to create deletion list directory {}, deletions will not be executed ({e})", "Failed to create deletion list directory {}, deletions will not be executed ({e})",
self.conf.deletion_prefix(), self.conf.deletion_prefix(),

View File

@@ -28,7 +28,6 @@ use crate::config::PageServerConf;
use crate::control_plane_client::ControlPlaneGenerationsApi; use crate::control_plane_client::ControlPlaneGenerationsApi;
use crate::control_plane_client::RetryForeverError; use crate::control_plane_client::RetryForeverError;
use crate::metrics; use crate::metrics;
use crate::virtual_file::MaybeFatalIo;
use super::deleter::DeleterMessage; use super::deleter::DeleterMessage;
use super::DeletionHeader; use super::DeletionHeader;
@@ -288,9 +287,16 @@ where
async fn cleanup_lists(&mut self, list_paths: Vec<Utf8PathBuf>) { async fn cleanup_lists(&mut self, list_paths: Vec<Utf8PathBuf>) {
for list_path in list_paths { for list_path in list_paths {
debug!("Removing deletion list {list_path}"); debug!("Removing deletion list {list_path}");
tokio::fs::remove_file(&list_path)
.await if let Err(e) = tokio::fs::remove_file(&list_path).await {
.fatal_err("remove deletion list"); // Unexpected: we should have permissions and nothing else should
// be touching these files. We will leave the file behind. Subsequent
// pageservers will try and load it again: hopefully whatever storage
// issue (probably permissions) has been fixed by then.
tracing::error!("Failed to delete {list_path}: {e:#}");
metrics::DELETION_QUEUE.unexpected_errors.inc();
break;
}
} }
} }

View File

@@ -60,27 +60,47 @@ use utils::serde_percent::Percent;
use crate::{ use crate::{
config::PageServerConf, config::PageServerConf,
task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}, task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
tenant::{ tenant::{self, storage_layer::PersistentLayer, timeline::EvictionError, Timeline},
self,
storage_layer::{AsLayerDesc, EvictionError, Layer},
Timeline,
},
}; };
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct DiskUsageEvictionTaskConfig { pub struct DiskUsageEvictionTaskConfig {
pub max_usage_pct: Percent, pub max_usage_pct: Percent,
pub min_avail_bytes: u64, pub min_avail_bytes: u64,
// Control how far we will go when evicting: when usage exceeds max_usage_pct or min_avail_bytes,
// we will keep evicting layers until we reach the target. The resulting disk usage should look
// like a sawtooth bouncing between the upper max/min line and the lower target line.
#[serde(default)]
pub target_usage_pct: Option<Percent>,
#[serde(default)]
pub target_avail_bytes: Option<u64>,
#[serde(with = "humantime_serde")] #[serde(with = "humantime_serde")]
pub period: Duration, pub period: Duration,
#[cfg(feature = "testing")] #[cfg(feature = "testing")]
pub mock_statvfs: Option<crate::statvfs::mock::Behavior>, pub mock_statvfs: Option<crate::statvfs::mock::Behavior>,
} }
#[derive(Default)]
enum Status {
/// We are within disk limits, and not currently doing any eviction
#[default]
Idle,
/// Disk limits have been exceeded: we will evict soon
UnderPressure,
/// We are currently doing an eviction pass.
Evicting,
}
#[derive(Default)] #[derive(Default)]
pub struct State { pub struct State {
/// Exclude http requests and background task from running at the same time. /// Exclude http requests and background task from running at the same time.
mutex: tokio::sync::Mutex<()>, mutex: tokio::sync::Mutex<()>,
/// Publish the current status of eviction work, for visibility to other subsystems
/// that modify their behavior if disk pressure is high or if eviction is going on.
status: std::sync::RwLock<Status>,
} }
pub fn launch_disk_usage_global_eviction_task( pub fn launch_disk_usage_global_eviction_task(
@@ -112,7 +132,7 @@ pub fn launch_disk_usage_global_eviction_task(
_ = background_jobs_barrier.wait() => { } _ = background_jobs_barrier.wait() => { }
}; };
disk_usage_eviction_task(&state, task_config, &storage, &conf.tenants_path(), cancel) disk_usage_eviction_task(&state, task_config, storage, &conf.tenants_path(), cancel)
.await; .await;
Ok(()) Ok(())
}, },
@@ -125,7 +145,7 @@ pub fn launch_disk_usage_global_eviction_task(
async fn disk_usage_eviction_task( async fn disk_usage_eviction_task(
state: &State, state: &State,
task_config: &DiskUsageEvictionTaskConfig, task_config: &DiskUsageEvictionTaskConfig,
_storage: &GenericRemoteStorage, storage: GenericRemoteStorage,
tenants_dir: &Utf8Path, tenants_dir: &Utf8Path,
cancel: CancellationToken, cancel: CancellationToken,
) { ) {
@@ -149,8 +169,14 @@ async fn disk_usage_eviction_task(
let start = Instant::now(); let start = Instant::now();
async { async {
let res = let res = disk_usage_eviction_task_iteration(
disk_usage_eviction_task_iteration(state, task_config, tenants_dir, &cancel).await; state,
task_config,
&storage,
tenants_dir,
&cancel,
)
.await;
match res { match res {
Ok(()) => {} Ok(()) => {}
@@ -174,25 +200,34 @@ async fn disk_usage_eviction_task(
} }
pub trait Usage: Clone + Copy + std::fmt::Debug { pub trait Usage: Clone + Copy + std::fmt::Debug {
fn has_pressure(&self) -> bool; fn pressure(&self) -> f64;
fn over_pressure(&self) -> bool;
fn no_pressure(&self) -> bool;
fn add_available_bytes(&mut self, bytes: u64); fn add_available_bytes(&mut self, bytes: u64);
} }
async fn disk_usage_eviction_task_iteration( async fn disk_usage_eviction_task_iteration(
state: &State, state: &State,
task_config: &DiskUsageEvictionTaskConfig, task_config: &DiskUsageEvictionTaskConfig,
storage: &GenericRemoteStorage,
tenants_dir: &Utf8Path, tenants_dir: &Utf8Path,
cancel: &CancellationToken, cancel: &CancellationToken,
) -> anyhow::Result<()> { ) -> anyhow::Result<()> {
let usage_pre = filesystem_level_usage::get(tenants_dir, task_config) let usage_pre = filesystem_level_usage::get(tenants_dir, task_config)
.context("get filesystem-level disk usage before evictions")?; .context("get filesystem-level disk usage before evictions")?;
let res = disk_usage_eviction_task_iteration_impl(state, usage_pre, cancel).await;
if usage_pre.over_pressure() {
*state.status.write().unwrap() = Status::Evicting;
}
let res = disk_usage_eviction_task_iteration_impl(state, storage, usage_pre, cancel).await;
match res { match res {
Ok(outcome) => { Ok(outcome) => {
debug!(?outcome, "disk_usage_eviction_iteration finished"); debug!(?outcome, "disk_usage_eviction_iteration finished");
match outcome { let new_status = match outcome {
IterationOutcome::NoPressure | IterationOutcome::Cancelled => { IterationOutcome::NoPressure | IterationOutcome::Cancelled => {
// nothing to do, select statement below will handle things // nothing to do, select statement below will handle things
Status::Idle
} }
IterationOutcome::Finished(outcome) => { IterationOutcome::Finished(outcome) => {
// Verify with statvfs whether we made any real progress // Verify with statvfs whether we made any real progress
@@ -202,21 +237,30 @@ async fn disk_usage_eviction_task_iteration(
debug!(?after, "disk usage"); debug!(?after, "disk usage");
if after.has_pressure() { if after.over_pressure() {
// Don't bother doing an out-of-order iteration here now. // Don't bother doing an out-of-order iteration here now.
// In practice, the task period is set to a value in the tens-of-seconds range, // In practice, the task period is set to a value in the tens-of-seconds range,
// which will cause another iteration to happen soon enough. // which will cause another iteration to happen soon enough.
// TODO: deltas between the three different usages would be helpful, // TODO: deltas between the three different usages would be helpful,
// consider MiB, GiB, TiB // consider MiB, GiB, TiB
warn!(?outcome, ?after, "disk usage still high"); warn!(?outcome, ?after, "disk usage still high");
Status::UnderPressure
} else { } else {
info!(?outcome, ?after, "disk usage pressure relieved"); info!(?outcome, ?after, "disk usage pressure relieved");
Status::Idle
} }
} }
} };
*state.status.write().unwrap() = new_status;
} }
Err(e) => { Err(e) => {
error!("disk_usage_eviction_iteration failed: {:#}", e); error!("disk_usage_eviction_iteration failed: {:#}", e);
*state.status.write().unwrap() = if usage_pre.over_pressure() {
Status::UnderPressure
} else {
Status::Idle
};
} }
} }
@@ -270,6 +314,7 @@ struct LayerCount {
pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>( pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
state: &State, state: &State,
storage: &GenericRemoteStorage,
usage_pre: U, usage_pre: U,
cancel: &CancellationToken, cancel: &CancellationToken,
) -> anyhow::Result<IterationOutcome<U>> { ) -> anyhow::Result<IterationOutcome<U>> {
@@ -281,8 +326,10 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
debug!(?usage_pre, "disk usage"); debug!(?usage_pre, "disk usage");
if !usage_pre.has_pressure() { if !usage_pre.over_pressure() {
return Ok(IterationOutcome::NoPressure); return Ok(IterationOutcome::NoPressure);
} else {
*state.status.write().unwrap() = Status::Evicting;
} }
warn!( warn!(
@@ -326,12 +373,11 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
// If we get far enough in the list that we start to evict layers that are below // If we get far enough in the list that we start to evict layers that are below
// the tenant's min-resident-size threshold, print a warning, and memorize the disk // the tenant's min-resident-size threshold, print a warning, and memorize the disk
// usage at that point, in 'usage_planned_min_resident_size_respecting'. // usage at that point, in 'usage_planned_min_resident_size_respecting'.
let mut batched: HashMap<_, Vec<_>> = HashMap::new(); let mut batched: HashMap<_, Vec<Arc<dyn PersistentLayer>>> = HashMap::new();
let mut warned = None; let mut warned = None;
let mut usage_planned = usage_pre; let mut usage_planned = usage_pre;
let mut max_batch_size = 0;
for (i, (partition, candidate)) in candidates.into_iter().enumerate() { for (i, (partition, candidate)) in candidates.into_iter().enumerate() {
if !usage_planned.has_pressure() { if usage_planned.no_pressure() {
debug!( debug!(
no_candidates_evicted = i, no_candidates_evicted = i,
"took enough candidates for pressure to be relieved" "took enough candidates for pressure to be relieved"
@@ -346,18 +392,10 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
usage_planned.add_available_bytes(candidate.layer.layer_desc().file_size); usage_planned.add_available_bytes(candidate.layer.layer_desc().file_size);
// FIXME: batching makes no sense anymore because of no layermap locking, should just spawn batched
// tasks to evict all seen layers until we have evicted enough .entry(TimelineKey(candidate.timeline))
.or_default()
let batch = batched.entry(TimelineKey(candidate.timeline)).or_default(); .push(candidate.layer);
// semaphore will later be used to limit eviction concurrency, and we can express at
// most u32 number of permits. unlikely we would have u32::MAX layers to be evicted,
// but fail gracefully by not making batches larger.
if batch.len() < u32::MAX as usize {
batch.push(candidate.layer);
max_batch_size = max_batch_size.max(batch.len());
}
} }
let usage_planned = match warned { let usage_planned = match warned {
@@ -374,101 +412,69 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
// phase2: evict victims batched by timeline // phase2: evict victims batched by timeline
let mut js = tokio::task::JoinSet::new(); // After the loop, `usage_assumed` is the post-eviction usage,
// according to internal accounting.
// ratelimit to 1k files or any higher max batch size let mut usage_assumed = usage_pre;
let limit = Arc::new(tokio::sync::Semaphore::new(1000.max(max_batch_size))); let mut evictions_failed = LayerCount::default();
for (timeline, batch) in batched { for (timeline, batch) in batched {
let tenant_id = timeline.tenant_id; let tenant_id = timeline.tenant_id;
let timeline_id = timeline.timeline_id; let timeline_id = timeline.timeline_id;
let batch_size = let batch_size = batch.len();
u32::try_from(batch.len()).expect("batch size limited to u32::MAX during partitioning");
// I dislike naming of `available_permits` but it means current total amount of permits
// because permits can be added
assert!(batch_size as usize <= limit.available_permits());
debug!(%timeline_id, "evicting batch for timeline"); debug!(%timeline_id, "evicting batch for timeline");
let evict = { async {
let limit = limit.clone(); let results = timeline.evict_layers(storage, &batch, cancel.clone()).await;
let cancel = cancel.clone();
async move {
let mut evicted_bytes = 0;
let mut evictions_failed = LayerCount::default();
let Ok(_permit) = limit.acquire_many_owned(batch_size).await else { match results {
// semaphore closing means cancelled Err(e) => {
return (evicted_bytes, evictions_failed); warn!("failed to evict batch: {:#}", e);
}; }
Ok(results) => {
let results = timeline.evict_layers(&batch).await; assert_eq!(results.len(), batch.len());
for (result, layer) in results.into_iter().zip(batch.iter()) {
match results { let file_size = layer.layer_desc().file_size;
Ok(results) => { match result {
assert_eq!(results.len(), batch.len()); Some(Ok(())) => {
for (result, layer) in results.into_iter().zip(batch.iter()) { usage_assumed.add_available_bytes(file_size);
let file_size = layer.layer_desc().file_size; }
match result { Some(Err(EvictionError::CannotEvictRemoteLayer)) => {
Some(Ok(())) => { unreachable!("get_local_layers_for_disk_usage_eviction finds only local layers")
evicted_bytes += file_size; }
} Some(Err(EvictionError::FileNotFound)) => {
Some(Err(EvictionError::NotFound | EvictionError::Downloaded)) => { evictions_failed.file_sizes += file_size;
evictions_failed.file_sizes += file_size; evictions_failed.count += 1;
evictions_failed.count += 1; }
} Some(Err(
None => { e @ EvictionError::LayerNotFound(_)
assert!(cancel.is_cancelled()); | e @ EvictionError::StatFailed(_),
} )) => {
let e = utils::error::report_compact_sources(&e);
warn!(%layer, "failed to evict layer: {e}");
evictions_failed.file_sizes += file_size;
evictions_failed.count += 1;
}
Some(Err(EvictionError::MetadataInconsistency(detail))) => {
warn!(%layer, "failed to evict layer: {detail}");
evictions_failed.file_sizes += file_size;
evictions_failed.count += 1;
}
None => {
assert!(cancel.is_cancelled());
return;
} }
} }
} }
Err(e) => {
warn!("failed to evict batch: {:#}", e);
}
} }
(evicted_bytes, evictions_failed)
} }
} }
.instrument(tracing::info_span!("evict_batch", %tenant_id, %timeline_id, batch_size)); .instrument(tracing::info_span!("evict_batch", %tenant_id, %timeline_id, batch_size))
.await;
js.spawn(evict); if cancel.is_cancelled() {
// spwaning multiple thousands of these is essentially blocking, so give already spawned a
// chance of making progress
tokio::task::yield_now().await;
}
let join_all = async move {
// After the evictions, `usage_assumed` is the post-eviction usage,
// according to internal accounting.
let mut usage_assumed = usage_pre;
let mut evictions_failed = LayerCount::default();
while let Some(res) = js.join_next().await {
match res {
Ok((evicted_bytes, failed)) => {
usage_assumed.add_available_bytes(evicted_bytes);
evictions_failed.file_sizes += failed.file_sizes;
evictions_failed.count += failed.count;
}
Err(je) if je.is_cancelled() => unreachable!("not used"),
Err(je) if je.is_panic() => { /* already logged */ }
Err(je) => tracing::error!("unknown JoinError: {je:?}"),
}
}
(usage_assumed, evictions_failed)
};
let (usage_assumed, evictions_failed) = tokio::select! {
tuple = join_all => { tuple },
_ = cancel.cancelled() => {
// close the semaphore to stop any pending acquires
limit.close();
return Ok(IterationOutcome::Cancelled); return Ok(IterationOutcome::Cancelled);
} }
}; }
Ok(IterationOutcome::Finished(IterationOutcomeFinished { Ok(IterationOutcome::Finished(IterationOutcomeFinished {
before: usage_pre, before: usage_pre,
@@ -483,7 +489,7 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
#[derive(Clone)] #[derive(Clone)]
struct EvictionCandidate { struct EvictionCandidate {
timeline: Arc<Timeline>, timeline: Arc<Timeline>,
layer: Layer, layer: Arc<dyn PersistentLayer>,
last_activity_ts: SystemTime, last_activity_ts: SystemTime,
} }
@@ -545,7 +551,7 @@ async fn collect_eviction_candidates(
if cancel.is_cancelled() { if cancel.is_cancelled() {
return Ok(EvictionCandidates::Cancelled); return Ok(EvictionCandidates::Cancelled);
} }
let tenant = match tenant::mgr::get_tenant(*tenant_id, true) { let tenant = match tenant::mgr::get_tenant(*tenant_id, true).await {
Ok(tenant) => tenant, Ok(tenant) => tenant,
Err(e) => { Err(e) => {
// this can happen if tenant has lifecycle transition after we fetched it // this can happen if tenant has lifecycle transition after we fetched it
@@ -554,11 +560,6 @@ async fn collect_eviction_candidates(
} }
}; };
if tenant.cancel.is_cancelled() {
info!(%tenant_id, "Skipping tenant for eviction, it is shutting down");
continue;
}
// collect layers from all timelines in this tenant // collect layers from all timelines in this tenant
// //
// If one of the timelines becomes `!is_active()` during the iteration, // If one of the timelines becomes `!is_active()` during the iteration,
@@ -686,22 +687,57 @@ mod filesystem_level_usage {
} }
impl super::Usage for Usage<'_> { impl super::Usage for Usage<'_> {
fn has_pressure(&self) -> bool { /// Does the pressure exceed 1.0, i.e. has the disk usage exceeded upper bounds?
let usage_pct = ///
(100.0 * (1.0 - ((self.avail_bytes as f64) / (self.total_bytes as f64)))) as u64; /// This is the condition for starting eviction.
fn over_pressure(&self) -> bool {
self.pressure() >= 1.0
}
let pressures = [ /// Is the pressure <0, ie.. has disk usage gone below the target bound?
( ///
"min_avail_bytes", /// This is the condition for dropping out of eviction.
self.avail_bytes < self.config.min_avail_bytes, fn no_pressure(&self) -> bool {
), self.pressure() <= 0.0
( }
"max_usage_pct",
usage_pct >= self.config.max_usage_pct.get() as u64,
),
];
pressures.into_iter().any(|(_, has_pressure)| has_pressure) fn pressure(&self) -> f64 {
let max_usage = std::cmp::min(
self.total_bytes - self.config.min_avail_bytes,
(self.total_bytes as f64 * (self.config.max_usage_pct.get() as f64 / 100.0)) as u64,
);
let mut target_usage = max_usage;
if let Some(target_avail_bytes) = self.config.target_avail_bytes {
target_usage = std::cmp::min(target_usage, self.total_bytes - target_avail_bytes);
}
if let Some(target_usage_pct) = self.config.target_usage_pct {
target_usage = std::cmp::min(
target_usage,
(self.total_bytes as f64 * (target_usage_pct.get() as f64 / 100.0)) as u64,
);
};
let usage = self.total_bytes - self.avail_bytes;
eprintln!(
"pressure: {} {}, current {}",
target_usage, max_usage, usage
);
if target_usage == max_usage {
// We are configured with a zero sized range: treat anything at+beyond limit as pressure 1.0, else 0.0
if usage >= max_usage {
1.0
} else {
0.0
}
} else if usage <= target_usage {
// No pressure.
0.0
} else {
// We are above target: pressure is the ratio of how much we exceed target to the size of the gap
let range_size = (max_usage - target_usage) as f64;
(usage - target_usage) as f64 / range_size
}
} }
fn add_available_bytes(&mut self, bytes: u64) { fn add_available_bytes(&mut self, bytes: u64) {
@@ -755,6 +791,8 @@ mod filesystem_level_usage {
config: &DiskUsageEvictionTaskConfig { config: &DiskUsageEvictionTaskConfig {
max_usage_pct: Percent::new(85).unwrap(), max_usage_pct: Percent::new(85).unwrap(),
min_avail_bytes: 0, min_avail_bytes: 0,
target_avail_bytes: None,
target_usage_pct: None,
period: Duration::MAX, period: Duration::MAX,
#[cfg(feature = "testing")] #[cfg(feature = "testing")]
mock_statvfs: None, mock_statvfs: None,
@@ -763,24 +801,24 @@ mod filesystem_level_usage {
avail_bytes: 0, avail_bytes: 0,
}; };
assert!(usage.has_pressure(), "expected pressure at 100%"); assert!(usage.over_pressure(), "expected pressure at 100%");
usage.add_available_bytes(14_000); usage.add_available_bytes(14_000);
assert!(usage.has_pressure(), "expected pressure at 86%"); assert!(usage.over_pressure(), "expected pressure at 86%");
usage.add_available_bytes(999); usage.add_available_bytes(999);
assert!(usage.has_pressure(), "expected pressure at 85.001%"); assert!(usage.over_pressure(), "expected pressure at 85.001%");
usage.add_available_bytes(1); usage.add_available_bytes(1);
assert!(usage.has_pressure(), "expected pressure at precisely 85%"); assert!(usage.over_pressure(), "expected pressure at precisely 85%");
usage.add_available_bytes(1); usage.add_available_bytes(1);
assert!(!usage.has_pressure(), "no pressure at 84.999%"); assert!(!usage.over_pressure(), "no pressure at 84.999%");
usage.add_available_bytes(999); usage.add_available_bytes(999);
assert!(!usage.has_pressure(), "no pressure at 84%"); assert!(!usage.over_pressure(), "no pressure at 84%");
usage.add_available_bytes(16_000); usage.add_available_bytes(16_000);
assert!(!usage.has_pressure()); assert!(!usage.over_pressure());
} }
} }

View File

@@ -52,31 +52,6 @@ paths:
schema: schema:
type: object type: object
/v1/reload_auth_validation_keys:
post:
description: Reloads the JWT public keys from their pre-configured location on disk.
responses:
"200":
description: The reload completed successfully.
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"500":
description: Generic operation error (also hits if no keys were found)
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
/v1/tenant/{tenant_id}: /v1/tenant/{tenant_id}:
parameters: parameters:
- name: tenant_id - name: tenant_id
@@ -352,8 +327,7 @@ paths:
in: query in: query
required: true required: true
schema: schema:
type: string type: integer
format: hex
description: A LSN to get the timestamp description: A LSN to get the timestamp
responses: responses:
"200": "200":
@@ -418,19 +392,13 @@ paths:
type: string type: string
format: date-time format: date-time
description: A timestamp to get the LSN description: A timestamp to get the LSN
- name: version
in: query
required: false
schema:
type: integer
description: The version of the endpoint to use
responses: responses:
"200": "200":
description: OK description: OK
content: content:
application/json: application/json:
schema: schema:
$ref: "#/components/schemas/LsnByTimestampResponse" type: string
"400": "400":
description: Error when no tenant id found in path, no timeline id or invalid timestamp description: Error when no tenant id found in path, no timeline id or invalid timestamp
content: content:
@@ -595,17 +563,7 @@ paths:
schema: schema:
$ref: "#/components/schemas/NotFoundError" $ref: "#/components/schemas/NotFoundError"
"409": "409":
description: | description: Tenant download is already in progress
The tenant is already known to Pageserver in some way,
and hence this `/attach` call has been rejected.
Some examples of how this can happen:
- tenant was created on this pageserver
- tenant attachment was started by an earlier call to `/attach`.
Callers should poll the tenant status's `attachment_status` field,
like for status 202. See the longer description for `POST /attach`
for details.
content: content:
application/json: application/json:
schema: schema:
@@ -749,12 +707,6 @@ paths:
Errors if the tenant is absent on disk, already present in memory or fails to schedule its load. Errors if the tenant is absent on disk, already present in memory or fails to schedule its load.
Scheduling a load does not mean that the tenant would load successfully, check tenant status to ensure load correctness. Scheduling a load does not mean that the tenant would load successfully, check tenant status to ensure load correctness.
requestBody:
required: false
content:
application/json:
schema:
$ref: "#/components/schemas/TenantLoadRequest"
responses: responses:
"202": "202":
description: Tenant scheduled to load successfully description: Tenant scheduled to load successfully
@@ -1245,15 +1197,6 @@ components:
new_tenant_id: new_tenant_id:
type: string type: string
format: hex format: hex
generation:
type: integer
description: Attachment generation number.
TenantLoadRequest:
type: object
properties:
generation:
type: integer
description: Attachment generation number.
TenantAttachRequest: TenantAttachRequest:
type: object type: object
required: required:
@@ -1441,19 +1384,6 @@ components:
type: string type: string
format: hex format: hex
LsnByTimestampResponse:
type: object
required:
- lsn
- kind
properties:
lsn:
type: string
format: hex
kind:
type: string
enum: [past, present, future, nodata]
Error: Error:
type: object type: object
required: required:

View File

@@ -8,7 +8,7 @@ use std::sync::Arc;
use anyhow::{anyhow, Context, Result}; use anyhow::{anyhow, Context, Result};
use futures::TryFutureExt; use futures::TryFutureExt;
use humantime::format_rfc3339; use humantime::format_rfc3339;
use hyper::header; use hyper::header::CONTENT_TYPE;
use hyper::StatusCode; use hyper::StatusCode;
use hyper::{Body, Request, Response, Uri}; use hyper::{Body, Request, Response, Uri};
use metrics::launch_timestamp::LaunchTimestamp; use metrics::launch_timestamp::LaunchTimestamp;
@@ -20,7 +20,6 @@ use remote_storage::GenericRemoteStorage;
use tenant_size_model::{SizeResult, StorageModel}; use tenant_size_model::{SizeResult, StorageModel};
use tokio_util::sync::CancellationToken; use tokio_util::sync::CancellationToken;
use tracing::*; use tracing::*;
use utils::auth::JwtAuth;
use utils::http::endpoint::request_span; use utils::http::endpoint::request_span;
use utils::http::json::json_request_or_empty_body; use utils::http::json::json_request_or_empty_body;
use utils::http::request::{get_request_param, must_get_query_param, parse_query_param}; use utils::http::request::{get_request_param, must_get_query_param, parse_query_param};
@@ -36,8 +35,7 @@ use crate::pgdatadir_mapping::LsnForTimestamp;
use crate::task_mgr::TaskKind; use crate::task_mgr::TaskKind;
use crate::tenant::config::{LocationConf, TenantConfOpt}; use crate::tenant::config::{LocationConf, TenantConfOpt};
use crate::tenant::mgr::{ use crate::tenant::mgr::{
GetTenantError, SetNewTenantConfigError, TenantManager, TenantMapError, TenantMapInsertError, GetTenantError, SetNewTenantConfigError, TenantMapInsertError, TenantStateError,
TenantSlotError, TenantSlotUpsertError, TenantStateError,
}; };
use crate::tenant::size::ModelInputs; use crate::tenant::size::ModelInputs;
use crate::tenant::storage_layer::LayerAccessStatsReset; use crate::tenant::storage_layer::LayerAccessStatsReset;
@@ -46,7 +44,7 @@ use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError, TenantSha
use crate::{config::PageServerConf, tenant::mgr}; use crate::{config::PageServerConf, tenant::mgr};
use crate::{disk_usage_eviction_task, tenant}; use crate::{disk_usage_eviction_task, tenant};
use utils::{ use utils::{
auth::SwappableJwtAuth, auth::JwtAuth,
generation::Generation, generation::Generation,
http::{ http::{
endpoint::{self, attach_openapi_ui, auth_middleware, check_permission_with}, endpoint::{self, attach_openapi_ui, auth_middleware, check_permission_with},
@@ -64,8 +62,7 @@ use super::models::ConfigureFailpointsRequest;
pub struct State { pub struct State {
conf: &'static PageServerConf, conf: &'static PageServerConf,
tenant_manager: Arc<TenantManager>, auth: Option<Arc<JwtAuth>>,
auth: Option<Arc<SwappableJwtAuth>>,
allowlist_routes: Vec<Uri>, allowlist_routes: Vec<Uri>,
remote_storage: Option<GenericRemoteStorage>, remote_storage: Option<GenericRemoteStorage>,
broker_client: storage_broker::BrokerClientChannel, broker_client: storage_broker::BrokerClientChannel,
@@ -76,8 +73,7 @@ pub struct State {
impl State { impl State {
pub fn new( pub fn new(
conf: &'static PageServerConf, conf: &'static PageServerConf,
tenant_manager: Arc<TenantManager>, auth: Option<Arc<JwtAuth>>,
auth: Option<Arc<SwappableJwtAuth>>,
remote_storage: Option<GenericRemoteStorage>, remote_storage: Option<GenericRemoteStorage>,
broker_client: storage_broker::BrokerClientChannel, broker_client: storage_broker::BrokerClientChannel,
disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>, disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
@@ -89,7 +85,6 @@ impl State {
.collect::<Vec<_>>(); .collect::<Vec<_>>();
Ok(Self { Ok(Self {
conf, conf,
tenant_manager,
auth, auth,
allowlist_routes, allowlist_routes,
remote_storage, remote_storage,
@@ -151,59 +146,28 @@ impl From<PageReconstructError> for ApiError {
impl From<TenantMapInsertError> for ApiError { impl From<TenantMapInsertError> for ApiError {
fn from(tmie: TenantMapInsertError) -> ApiError { fn from(tmie: TenantMapInsertError) -> ApiError {
match tmie { match tmie {
TenantMapInsertError::SlotError(e) => e.into(), TenantMapInsertError::StillInitializing | TenantMapInsertError::ShuttingDown => {
TenantMapInsertError::SlotUpsertError(e) => e.into(), ApiError::ResourceUnavailable(format!("{tmie}").into())
}
TenantMapInsertError::TenantAlreadyExists(id, state) => {
ApiError::Conflict(format!("tenant {id} already exists, state: {state:?}"))
}
TenantMapInsertError::TenantExistsSecondary(id) => {
ApiError::Conflict(format!("tenant {id} already exists as secondary"))
}
TenantMapInsertError::Other(e) => ApiError::InternalServerError(e), TenantMapInsertError::Other(e) => ApiError::InternalServerError(e),
} }
} }
} }
impl From<TenantSlotError> for ApiError {
fn from(e: TenantSlotError) -> ApiError {
use TenantSlotError::*;
match e {
NotFound(tenant_id) => {
ApiError::NotFound(anyhow::anyhow!("NotFound: tenant {tenant_id}").into())
}
e @ (AlreadyExists(_, _) | Conflict(_)) => ApiError::Conflict(format!("{e}")),
InProgress => {
ApiError::ResourceUnavailable("Tenant is being modified concurrently".into())
}
MapState(e) => e.into(),
}
}
}
impl From<TenantSlotUpsertError> for ApiError {
fn from(e: TenantSlotUpsertError) -> ApiError {
use TenantSlotUpsertError::*;
match e {
InternalError(e) => ApiError::InternalServerError(anyhow::anyhow!("{e}")),
MapState(e) => e.into(),
}
}
}
impl From<TenantMapError> for ApiError {
fn from(e: TenantMapError) -> ApiError {
use TenantMapError::*;
match e {
StillInitializing | ShuttingDown => {
ApiError::ResourceUnavailable(format!("{e}").into())
}
}
}
}
impl From<TenantStateError> for ApiError { impl From<TenantStateError> for ApiError {
fn from(tse: TenantStateError) -> ApiError { fn from(tse: TenantStateError) -> ApiError {
match tse { match tse {
TenantStateError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid).into()),
TenantStateError::IsStopping(_) => { TenantStateError::IsStopping(_) => {
ApiError::ResourceUnavailable("Tenant is stopping".into()) ApiError::ResourceUnavailable("Tenant is stopping".into())
} }
TenantStateError::SlotError(e) => e.into(), _ => ApiError::InternalServerError(anyhow::Error::new(tse)),
TenantStateError::SlotUpsertError(e) => e.into(),
TenantStateError::Other(e) => ApiError::InternalServerError(anyhow!(e)),
} }
} }
} }
@@ -224,7 +188,6 @@ impl From<GetTenantError> for ApiError {
// (We can produce this variant only in `mgr::get_tenant(..., active=true)` calls). // (We can produce this variant only in `mgr::get_tenant(..., active=true)` calls).
ApiError::ResourceUnavailable("Tenant not yet active".into()) ApiError::ResourceUnavailable("Tenant not yet active".into())
} }
GetTenantError::MapState(e) => ApiError::ResourceUnavailable(format!("{e}").into()),
} }
} }
} }
@@ -279,9 +242,6 @@ impl From<crate::tenant::delete::DeleteTenantError> for ApiError {
Get(g) => ApiError::from(g), Get(g) => ApiError::from(g),
e @ AlreadyInProgress => ApiError::Conflict(e.to_string()), e @ AlreadyInProgress => ApiError::Conflict(e.to_string()),
Timeline(t) => ApiError::from(t), Timeline(t) => ApiError::from(t),
NotAttached => ApiError::NotFound(anyhow::anyhow!("Tenant is not attached").into()),
SlotError(e) => e.into(),
SlotUpsertError(e) => e.into(),
Other(o) => ApiError::InternalServerError(o), Other(o) => ApiError::InternalServerError(o),
e @ InvalidState(_) => ApiError::PreconditionFailed(e.to_string().into_boxed_str()), e @ InvalidState(_) => ApiError::PreconditionFailed(e.to_string().into_boxed_str()),
} }
@@ -303,7 +263,11 @@ async fn build_timeline_info(
// we're executing this function, we will outlive the timeline on-disk state. // we're executing this function, we will outlive the timeline on-disk state.
info.current_logical_size_non_incremental = Some( info.current_logical_size_non_incremental = Some(
timeline timeline
.get_current_logical_size_non_incremental(info.last_record_lsn, ctx) .get_current_logical_size_non_incremental(
info.last_record_lsn,
CancellationToken::new(),
ctx,
)
.await?, .await?,
); );
} }
@@ -389,32 +353,6 @@ async fn status_handler(
json_response(StatusCode::OK, StatusResponse { id: config.id }) json_response(StatusCode::OK, StatusResponse { id: config.id })
} }
async fn reload_auth_validation_keys_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
check_permission(&request, None)?;
let config = get_config(&request);
let state = get_state(&request);
let Some(shared_auth) = &state.auth else {
return json_response(StatusCode::BAD_REQUEST, ());
};
// unwrap is ok because check is performed when creating config, so path is set and exists
let key_path = config.auth_validation_public_key_path.as_ref().unwrap();
info!("Reloading public key(s) for verifying JWT tokens from {key_path:?}");
match JwtAuth::from_key_path(key_path) {
Ok(new_auth) => {
shared_auth.swap(new_auth);
json_response(StatusCode::OK, ())
}
Err(e) => {
warn!("Error reloading public keys from {key_path:?}: {e:}");
json_response(StatusCode::INTERNAL_SERVER_ERROR, ())
}
}
}
async fn timeline_create_handler( async fn timeline_create_handler(
mut request: Request<Body>, mut request: Request<Body>,
_cancel: CancellationToken, _cancel: CancellationToken,
@@ -430,7 +368,7 @@ async fn timeline_create_handler(
let state = get_state(&request); let state = get_state(&request);
async { async {
let tenant = mgr::get_tenant(tenant_id, true)?; let tenant = mgr::get_tenant(tenant_id, true).await?;
match tenant.create_timeline( match tenant.create_timeline(
new_timeline_id, new_timeline_id,
request_data.ancestor_timeline_id.map(TimelineId::from), request_data.ancestor_timeline_id.map(TimelineId::from),
@@ -458,9 +396,6 @@ async fn timeline_create_handler(
Err(e @ tenant::CreateTimelineError::AncestorNotActive) => { Err(e @ tenant::CreateTimelineError::AncestorNotActive) => {
json_response(StatusCode::SERVICE_UNAVAILABLE, HttpErrorBody::from_msg(e.to_string())) json_response(StatusCode::SERVICE_UNAVAILABLE, HttpErrorBody::from_msg(e.to_string()))
} }
Err(tenant::CreateTimelineError::ShuttingDown) => {
json_response(StatusCode::SERVICE_UNAVAILABLE,HttpErrorBody::from_msg("tenant shutting down".to_string()))
}
Err(tenant::CreateTimelineError::Other(err)) => Err(ApiError::InternalServerError(err)), Err(tenant::CreateTimelineError::Other(err)) => Err(ApiError::InternalServerError(err)),
} }
} }
@@ -480,7 +415,7 @@ async fn timeline_list_handler(
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let response_data = async { let response_data = async {
let tenant = mgr::get_tenant(tenant_id, true)?; let tenant = mgr::get_tenant(tenant_id, true).await?;
let timelines = tenant.list_timelines(); let timelines = tenant.list_timelines();
let mut response_data = Vec::with_capacity(timelines.len()); let mut response_data = Vec::with_capacity(timelines.len());
@@ -519,7 +454,7 @@ async fn timeline_detail_handler(
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let timeline_info = async { let timeline_info = async {
let tenant = mgr::get_tenant(tenant_id, true)?; let tenant = mgr::get_tenant(tenant_id, true).await?;
let timeline = tenant let timeline = tenant
.get_timeline(timeline_id, false) .get_timeline(timeline_id, false)
@@ -549,8 +484,6 @@ async fn get_lsn_by_timestamp_handler(
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?; check_permission(&request, Some(tenant_id))?;
let version: Option<u8> = parse_query_param(&request, "version")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
let timestamp_raw = must_get_query_param(&request, "timestamp")?; let timestamp_raw = must_get_query_param(&request, "timestamp")?;
let timestamp = humantime::parse_rfc3339(&timestamp_raw) let timestamp = humantime::parse_rfc3339(&timestamp_raw)
@@ -562,30 +495,13 @@ async fn get_lsn_by_timestamp_handler(
let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?; let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
let result = timeline.find_lsn_for_timestamp(timestamp_pg, &ctx).await?; let result = timeline.find_lsn_for_timestamp(timestamp_pg, &ctx).await?;
if version.unwrap_or(0) > 1 { let result = match result {
#[derive(serde::Serialize)] LsnForTimestamp::Present(lsn) => format!("{lsn}"),
struct Result { LsnForTimestamp::Future(_lsn) => "future".into(),
lsn: Lsn, LsnForTimestamp::Past(_lsn) => "past".into(),
kind: &'static str, LsnForTimestamp::NoData(_lsn) => "nodata".into(),
} };
let (lsn, kind) = match result { json_response(StatusCode::OK, result)
LsnForTimestamp::Present(lsn) => (lsn, "present"),
LsnForTimestamp::Future(lsn) => (lsn, "future"),
LsnForTimestamp::Past(lsn) => (lsn, "past"),
LsnForTimestamp::NoData(lsn) => (lsn, "nodata"),
};
json_response(StatusCode::OK, Result { lsn, kind })
} else {
// FIXME: this is a temporary crutch not to break backwards compatibility
// See https://github.com/neondatabase/neon/pull/5608
let result = match result {
LsnForTimestamp::Present(lsn) => format!("{lsn}"),
LsnForTimestamp::Future(_lsn) => "future".into(),
LsnForTimestamp::Past(_lsn) => "past".into(),
LsnForTimestamp::NoData(_lsn) => "nodata".into(),
};
json_response(StatusCode::OK, result)
}
} }
async fn get_timestamp_of_lsn_handler( async fn get_timestamp_of_lsn_handler(
@@ -775,7 +691,7 @@ async fn tenant_status(
check_permission(&request, Some(tenant_id))?; check_permission(&request, Some(tenant_id))?;
let tenant_info = async { let tenant_info = async {
let tenant = mgr::get_tenant(tenant_id, false)?; let tenant = mgr::get_tenant(tenant_id, false).await?;
// Calculate total physical size of all timelines // Calculate total physical size of all timelines
let mut current_physical_size = 0; let mut current_physical_size = 0;
@@ -838,7 +754,7 @@ async fn tenant_size_handler(
let headers = request.headers(); let headers = request.headers();
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let tenant = mgr::get_tenant(tenant_id, true)?; let tenant = mgr::get_tenant(tenant_id, true).await?;
// this can be long operation // this can be long operation
let inputs = tenant let inputs = tenant
@@ -851,10 +767,6 @@ async fn tenant_size_handler(
.map_err(ApiError::InternalServerError)?; .map_err(ApiError::InternalServerError)?;
let mut sizes = None; let mut sizes = None;
let accepts_html = headers
.get(header::ACCEPT)
.map(|v| v == "text/html")
.unwrap_or_default();
if !inputs_only.unwrap_or(false) { if !inputs_only.unwrap_or(false) {
let storage_model = inputs let storage_model = inputs
.calculate_model() .calculate_model()
@@ -862,19 +774,21 @@ async fn tenant_size_handler(
let size = storage_model.calculate(); let size = storage_model.calculate();
// If request header expects html, return html // If request header expects html, return html
if accepts_html { if headers["Accept"] == "text/html" {
return synthetic_size_html_response(inputs, storage_model, size); return synthetic_size_html_response(inputs, storage_model, size);
} }
sizes = Some(size); sizes = Some(size);
} else if accepts_html { } else if headers["Accept"] == "text/html" {
return Err(ApiError::BadRequest(anyhow!( return Err(ApiError::BadRequest(anyhow!(
"inputs_only parameter is incompatible with html output request" "inputs_only parameter is incompatible with html output request"
))); )));
} }
/// The type resides in the pageserver not to expose `ModelInputs`. /// The type resides in the pageserver not to expose `ModelInputs`.
#[serde_with::serde_as]
#[derive(serde::Serialize)] #[derive(serde::Serialize)]
struct TenantHistorySize { struct TenantHistorySize {
#[serde_as(as = "serde_with::DisplayFromStr")]
id: TenantId, id: TenantId,
/// Size is a mixture of WAL and logical size, so the unit is bytes. /// Size is a mixture of WAL and logical size, so the unit is bytes.
/// ///
@@ -1015,7 +929,7 @@ fn synthetic_size_html_response(
pub fn html_response(status: StatusCode, data: String) -> Result<Response<Body>, ApiError> { pub fn html_response(status: StatusCode, data: String) -> Result<Response<Body>, ApiError> {
let response = Response::builder() let response = Response::builder()
.status(status) .status(status)
.header(header::CONTENT_TYPE, "text/html") .header(hyper::header::CONTENT_TYPE, "text/html")
.body(Body::from(data.as_bytes().to_vec())) .body(Body::from(data.as_bytes().to_vec()))
.map_err(|e| ApiError::InternalServerError(e.into()))?; .map_err(|e| ApiError::InternalServerError(e.into()))?;
Ok(response) Ok(response)
@@ -1095,7 +1009,7 @@ async fn get_tenant_config_handler(
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?; check_permission(&request, Some(tenant_id))?;
let tenant = mgr::get_tenant(tenant_id, false)?; let tenant = mgr::get_tenant(tenant_id, false).await?;
let response = HashMap::from([ let response = HashMap::from([
( (
@@ -1154,7 +1068,7 @@ async fn put_tenant_location_config_handler(
.await .await
{ {
match e { match e {
TenantStateError::SlotError(TenantSlotError::NotFound(_)) => { TenantStateError::NotFound(_) => {
// This API is idempotent: a NotFound on a detach is fine. // This API is idempotent: a NotFound on a detach is fine.
} }
_ => return Err(e.into()), _ => return Err(e.into()),
@@ -1166,14 +1080,20 @@ async fn put_tenant_location_config_handler(
let location_conf = let location_conf =
LocationConf::try_from(&request_data.config).map_err(ApiError::BadRequest)?; LocationConf::try_from(&request_data.config).map_err(ApiError::BadRequest)?;
state mgr::upsert_location(
.tenant_manager state.conf,
.upsert_location(tenant_id, location_conf, &ctx) tenant_id,
.await location_conf,
// TODO: badrequest assumes the caller was asking for something unreasonable, but in state.broker_client.clone(),
// principle we might have hit something like concurrent API calls to the same tenant, state.remote_storage.clone(),
// which is not a 400 but a 409. state.deletion_queue_client.clone(),
.map_err(ApiError::BadRequest)?; &ctx,
)
.await
// TODO: badrequest assumes the caller was asking for something unreasonable, but in
// principle we might have hit something like concurrent API calls to the same tenant,
// which is not a 400 but a 409.
.map_err(ApiError::BadRequest)?;
json_response(StatusCode::OK, ()) json_response(StatusCode::OK, ())
} }
@@ -1186,6 +1106,7 @@ async fn handle_tenant_break(
let tenant_id: TenantId = parse_request_param(&r, "tenant_id")?; let tenant_id: TenantId = parse_request_param(&r, "tenant_id")?;
let tenant = crate::tenant::mgr::get_tenant(tenant_id, true) let tenant = crate::tenant::mgr::get_tenant(tenant_id, true)
.await
.map_err(|_| ApiError::Conflict(String::from("no active tenant found")))?; .map_err(|_| ApiError::Conflict(String::from("no active tenant found")))?;
tenant.set_broken("broken from test".to_owned()).await; tenant.set_broken("broken from test".to_owned()).await;
@@ -1258,7 +1179,7 @@ async fn timeline_compact_handler(
timeline timeline
.compact(&cancel, &ctx) .compact(&cancel, &ctx)
.await .await
.map_err(|e| ApiError::InternalServerError(e.into()))?; .map_err(ApiError::InternalServerError)?;
json_response(StatusCode::OK, ()) json_response(StatusCode::OK, ())
} }
.instrument(info_span!("manual_compaction", %tenant_id, %timeline_id)) .instrument(info_span!("manual_compaction", %tenant_id, %timeline_id))
@@ -1283,7 +1204,7 @@ async fn timeline_checkpoint_handler(
timeline timeline
.compact(&cancel, &ctx) .compact(&cancel, &ctx)
.await .await
.map_err(|e| ApiError::InternalServerError(e.into()))?; .map_err(ApiError::InternalServerError)?;
json_response(StatusCode::OK, ()) json_response(StatusCode::OK, ())
} }
@@ -1389,7 +1310,7 @@ async fn getpage_at_lsn_handler(
Result::<_, ApiError>::Ok( Result::<_, ApiError>::Ok(
Response::builder() Response::builder()
.status(StatusCode::OK) .status(StatusCode::OK)
.header(header::CONTENT_TYPE, "application/octet-stream") .header(CONTENT_TYPE, "application/octet-stream")
.body(hyper::Body::from(page)) .body(hyper::Body::from(page))
.unwrap(), .unwrap(),
) )
@@ -1490,7 +1411,7 @@ async fn active_timeline_of_active_tenant(
tenant_id: TenantId, tenant_id: TenantId,
timeline_id: TimelineId, timeline_id: TimelineId,
) -> Result<Arc<Timeline>, ApiError> { ) -> Result<Arc<Timeline>, ApiError> {
let tenant = mgr::get_tenant(tenant_id, true)?; let tenant = mgr::get_tenant(tenant_id, true).await?;
tenant tenant
.get_timeline(timeline_id, true) .get_timeline(timeline_id, true)
.map_err(|e| ApiError::NotFound(e.into())) .map_err(|e| ApiError::NotFound(e.into()))
@@ -1531,10 +1452,22 @@ async fn disk_usage_eviction_run(
} }
impl crate::disk_usage_eviction_task::Usage for Usage { impl crate::disk_usage_eviction_task::Usage for Usage {
fn has_pressure(&self) -> bool { fn over_pressure(&self) -> bool {
self.config.evict_bytes > self.freed_bytes self.config.evict_bytes > self.freed_bytes
} }
fn no_pressure(&self) -> bool {
!self.over_pressure()
}
fn pressure(&self) -> f64 {
if self.over_pressure() {
1.0
} else {
0.0
}
}
fn add_available_bytes(&mut self, bytes: u64) { fn add_available_bytes(&mut self, bytes: u64) {
self.freed_bytes += bytes; self.freed_bytes += bytes;
} }
@@ -1553,11 +1486,11 @@ async fn disk_usage_eviction_run(
let state = get_state(&r); let state = get_state(&r);
if state.remote_storage.as_ref().is_none() { let Some(storage) = state.remote_storage.clone() else {
return Err(ApiError::InternalServerError(anyhow::anyhow!( return Err(ApiError::InternalServerError(anyhow::anyhow!(
"remote storage not configured, cannot run eviction iteration" "remote storage not configured, cannot run eviction iteration"
))); )));
} };
let state = state.disk_usage_eviction_state.clone(); let state = state.disk_usage_eviction_state.clone();
@@ -1575,6 +1508,7 @@ async fn disk_usage_eviction_run(
async move { async move {
let res = crate::disk_usage_eviction_task::disk_usage_eviction_task_iteration_impl( let res = crate::disk_usage_eviction_task::disk_usage_eviction_task_iteration_impl(
&state, &state,
&storage,
usage, usage,
&child_cancel, &child_cancel,
) )
@@ -1667,8 +1601,6 @@ where
); );
match handle.await { match handle.await {
// TODO: never actually return Err from here, always Ok(...) so that we can log
// spanned errors. Call api_error_handler instead and return appropriate Body.
Ok(result) => result, Ok(result) => result,
Err(e) => { Err(e) => {
// The handler task panicked. We have a global panic handler that logs the // The handler task panicked. We have a global panic handler that logs the
@@ -1717,7 +1649,7 @@ where
pub fn make_router( pub fn make_router(
state: Arc<State>, state: Arc<State>,
launch_ts: &'static LaunchTimestamp, launch_ts: &'static LaunchTimestamp,
auth: Option<Arc<SwappableJwtAuth>>, auth: Option<Arc<JwtAuth>>,
) -> anyhow::Result<RouterBuilder<hyper::Body, ApiError>> { ) -> anyhow::Result<RouterBuilder<hyper::Body, ApiError>> {
let spec = include_bytes!("openapi_spec.yml"); let spec = include_bytes!("openapi_spec.yml");
let mut router = attach_openapi_ui(endpoint::make_router(), spec, "/swagger.yml", "/v1/doc"); let mut router = attach_openapi_ui(endpoint::make_router(), spec, "/swagger.yml", "/v1/doc");
@@ -1746,9 +1678,6 @@ pub fn make_router(
.put("/v1/failpoints", |r| { .put("/v1/failpoints", |r| {
testing_api_handler("manage failpoints", r, failpoints_handler) testing_api_handler("manage failpoints", r, failpoints_handler)
}) })
.post("/v1/reload_auth_validation_keys", |r| {
api_handler(r, reload_auth_validation_keys_handler)
})
.get("/v1/tenant", |r| api_handler(r, tenant_list_handler)) .get("/v1/tenant", |r| api_handler(r, tenant_list_handler))
.post("/v1/tenant", |r| api_handler(r, tenant_create_handler)) .post("/v1/tenant", |r| api_handler(r, tenant_create_handler))
.get("/v1/tenant/:tenant_id", |r| api_handler(r, tenant_status)) .get("/v1/tenant/:tenant_id", |r| api_handler(r, tenant_status))

View File

@@ -1,5 +1,3 @@
#![deny(clippy::undocumented_unsafe_blocks)]
mod auth; mod auth;
pub mod basebackup; pub mod basebackup;
pub mod config; pub mod config;
@@ -63,6 +61,14 @@ pub async fn shutdown_pageserver(deletion_queue: Option<DeletionQueue>, exit_cod
) )
.await; .await;
// Shut down any page service tasks.
timed(
task_mgr::shutdown_tasks(Some(TaskKind::PageRequestHandler), None, None),
"shutdown PageRequestHandlers",
Duration::from_secs(1),
)
.await;
// Shut down all the tenants. This flushes everything to disk and kills // Shut down all the tenants. This flushes everything to disk and kills
// the checkpoint and GC tasks. // the checkpoint and GC tasks.
timed( timed(
@@ -72,15 +78,6 @@ pub async fn shutdown_pageserver(deletion_queue: Option<DeletionQueue>, exit_cod
) )
.await; .await;
// Shut down any page service tasks: any in-progress work for particular timelines or tenants
// should already have been canclled via mgr::shutdown_all_tenants
timed(
task_mgr::shutdown_tasks(Some(TaskKind::PageRequestHandler), None, None),
"shutdown PageRequestHandlers",
Duration::from_secs(1),
)
.await;
// Best effort to persist any outstanding deletions, to avoid leaking objects // Best effort to persist any outstanding deletions, to avoid leaking objects
if let Some(mut deletion_queue) = deletion_queue { if let Some(mut deletion_queue) = deletion_queue {
deletion_queue.shutdown(Duration::from_secs(5)).await; deletion_queue.shutdown(Duration::from_secs(5)).await;
@@ -152,10 +149,6 @@ fn ends_with_suffix(path: &Utf8Path, suffix: &str) -> bool {
} }
} }
// FIXME: DO NOT ADD new query methods like this, which will have a next step of parsing timelineid
// from the directory name. Instead create type "UninitMark(TimelineId)" and only parse it once
// from the name.
pub fn is_uninit_mark(path: &Utf8Path) -> bool { pub fn is_uninit_mark(path: &Utf8Path) -> bool {
ends_with_suffix(path, TIMELINE_UNINIT_MARK_SUFFIX) ends_with_suffix(path, TIMELINE_UNINIT_MARK_SUFFIX)
} }

View File

@@ -962,32 +962,6 @@ static REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER: Lazy<IntCounterVec> = Lazy
.expect("failed to define a metric") .expect("failed to define a metric")
}); });
pub(crate) struct TenantManagerMetrics {
pub(crate) tenant_slots: UIntGauge,
pub(crate) tenant_slot_writes: IntCounter,
pub(crate) unexpected_errors: IntCounter,
}
pub(crate) static TENANT_MANAGER: Lazy<TenantManagerMetrics> = Lazy::new(|| {
TenantManagerMetrics {
tenant_slots: register_uint_gauge!(
"pageserver_tenant_manager_slots",
"How many slots currently exist, including all attached, secondary and in-progress operations",
)
.expect("failed to define a metric"),
tenant_slot_writes: register_int_counter!(
"pageserver_tenant_manager_slot_writes",
"Writes to a tenant slot, including all of create/attach/detach/delete"
)
.expect("failed to define a metric"),
unexpected_errors: register_int_counter!(
"pageserver_tenant_manager_unexpected_errors_total",
"Number of unexpected conditions encountered: nonzero value indicates a non-fatal bug."
)
.expect("failed to define a metric"),
}
});
pub(crate) struct DeletionQueueMetrics { pub(crate) struct DeletionQueueMetrics {
pub(crate) keys_submitted: IntCounter, pub(crate) keys_submitted: IntCounter,
pub(crate) keys_dropped: IntCounter, pub(crate) keys_dropped: IntCounter,
@@ -1225,6 +1199,15 @@ pub(crate) static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
.expect("failed to define a metric") .expect("failed to define a metric")
}); });
pub(crate) static WAL_REDO_WAIT_TIME: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"pageserver_wal_redo_wait_seconds",
"Time spent waiting for access to the Postgres WAL redo process",
redo_histogram_time_buckets!(),
)
.expect("failed to define a metric")
});
pub(crate) static WAL_REDO_RECORDS_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| { pub(crate) static WAL_REDO_RECORDS_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
register_histogram!( register_histogram!(
"pageserver_wal_redo_records_histogram", "pageserver_wal_redo_records_histogram",
@@ -1405,23 +1388,28 @@ impl TimelineMetrics {
} }
} }
pub(crate) fn record_new_file_metrics(&self, sz: u64) { pub fn record_new_file_metrics(&self, sz: u64) {
self.resident_physical_size_add(sz); self.resident_physical_size_add(sz);
self.num_persistent_files_created.inc_by(1); self.num_persistent_files_created.inc_by(1);
self.persistent_bytes_written.inc_by(sz); self.persistent_bytes_written.inc_by(sz);
} }
pub(crate) fn resident_physical_size_sub(&self, sz: u64) { pub fn resident_physical_size_sub(&self, sz: u64) {
self.resident_physical_size_gauge.sub(sz); self.resident_physical_size_gauge.sub(sz);
crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(sz); crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(sz);
} }
pub(crate) fn resident_physical_size_add(&self, sz: u64) { pub fn resident_physical_size_add(&self, sz: u64) {
self.resident_physical_size_gauge.add(sz); self.resident_physical_size_gauge.add(sz);
crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.add(sz); crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.add(sz);
} }
pub(crate) fn resident_physical_size_get(&self) -> u64 { pub fn resident_physical_size_set(&self, sz: u64) {
self.resident_physical_size_gauge.set(sz);
crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.set(sz);
}
pub fn resident_physical_size_get(&self) -> u64 {
self.resident_physical_size_gauge.get() self.resident_physical_size_gauge.get()
} }
} }
@@ -1901,9 +1889,6 @@ pub fn preinitialize_metrics() {
// Deletion queue stats // Deletion queue stats
Lazy::force(&DELETION_QUEUE); Lazy::force(&DELETION_QUEUE);
// Tenant manager stats
Lazy::force(&TENANT_MANAGER);
// countervecs // countervecs
[&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT] [&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT]
.into_iter() .into_iter()
@@ -1919,6 +1904,7 @@ pub fn preinitialize_metrics() {
&READ_NUM_FS_LAYERS, &READ_NUM_FS_LAYERS,
&WAIT_LSN_TIME, &WAIT_LSN_TIME,
&WAL_REDO_TIME, &WAL_REDO_TIME,
&WAL_REDO_WAIT_TIME,
&WAL_REDO_RECORDS_HISTOGRAM, &WAL_REDO_RECORDS_HISTOGRAM,
&WAL_REDO_BYTES_HISTOGRAM, &WAL_REDO_BYTES_HISTOGRAM,
] ]

View File

@@ -40,7 +40,7 @@ use tracing::field;
use tracing::*; use tracing::*;
use utils::id::ConnectionId; use utils::id::ConnectionId;
use utils::{ use utils::{
auth::{Claims, Scope, SwappableJwtAuth}, auth::{Claims, JwtAuth, Scope},
id::{TenantId, TimelineId}, id::{TenantId, TimelineId},
lsn::Lsn, lsn::Lsn,
simple_rcu::RcuReadGuard, simple_rcu::RcuReadGuard,
@@ -55,20 +55,16 @@ use crate::metrics;
use crate::metrics::LIVE_CONNECTIONS_COUNT; use crate::metrics::LIVE_CONNECTIONS_COUNT;
use crate::task_mgr; use crate::task_mgr;
use crate::task_mgr::TaskKind; use crate::task_mgr::TaskKind;
use crate::tenant;
use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
use crate::tenant::mgr; use crate::tenant::mgr;
use crate::tenant::mgr::get_active_tenant_with_timeout; use crate::tenant::mgr::GetTenantError;
use crate::tenant::mgr::GetActiveTenantError; use crate::tenant::{Tenant, Timeline};
use crate::tenant::Timeline;
use crate::trace::Tracer; use crate::trace::Tracer;
use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID; use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
use postgres_ffi::BLCKSZ; use postgres_ffi::BLCKSZ;
// How long we may block waiting for a [`TenantSlot::InProgress`]` and/or a [`Tenant`] which
// is not yet in state [`TenantState::Active`].
const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(5000);
/// Read the end of a tar archive. /// Read the end of a tar archive.
/// ///
/// A tar archive normally ends with two consecutive blocks of zeros, 512 bytes each. /// A tar archive normally ends with two consecutive blocks of zeros, 512 bytes each.
@@ -122,7 +118,7 @@ async fn read_tar_eof(mut reader: (impl AsyncRead + Unpin)) -> anyhow::Result<()
pub async fn libpq_listener_main( pub async fn libpq_listener_main(
conf: &'static PageServerConf, conf: &'static PageServerConf,
broker_client: storage_broker::BrokerClientChannel, broker_client: storage_broker::BrokerClientChannel,
auth: Option<Arc<SwappableJwtAuth>>, auth: Option<Arc<JwtAuth>>,
listener: TcpListener, listener: TcpListener,
auth_type: AuthType, auth_type: AuthType,
listener_ctx: RequestContext, listener_ctx: RequestContext,
@@ -190,7 +186,7 @@ pub async fn libpq_listener_main(
async fn page_service_conn_main( async fn page_service_conn_main(
conf: &'static PageServerConf, conf: &'static PageServerConf,
broker_client: storage_broker::BrokerClientChannel, broker_client: storage_broker::BrokerClientChannel,
auth: Option<Arc<SwappableJwtAuth>>, auth: Option<Arc<JwtAuth>>,
socket: tokio::net::TcpStream, socket: tokio::net::TcpStream,
auth_type: AuthType, auth_type: AuthType,
connection_ctx: RequestContext, connection_ctx: RequestContext,
@@ -218,34 +214,22 @@ async fn page_service_conn_main(
// no write timeout is used, because the kernel is assumed to error writes after some time. // no write timeout is used, because the kernel is assumed to error writes after some time.
let mut socket = tokio_io_timeout::TimeoutReader::new(socket); let mut socket = tokio_io_timeout::TimeoutReader::new(socket);
let default_timeout_ms = 10 * 60 * 1000; // 10 minutes by default // timeout should be lower, but trying out multiple days for
let socket_timeout_ms = (|| { // <https://github.com/neondatabase/neon/issues/4205>
fail::fail_point!("simulated-bad-compute-connection", |avg_timeout_ms| { socket.set_timeout(Some(std::time::Duration::from_secs(60 * 60 * 24 * 3)));
// Exponential distribution for simulating
// poor network conditions, expect about avg_timeout_ms to be around 15
// in tests
if let Some(avg_timeout_ms) = avg_timeout_ms {
let avg = avg_timeout_ms.parse::<i64>().unwrap() as f32;
let u = rand::random::<f32>();
((1.0 - u).ln() / (-avg)) as u64
} else {
default_timeout_ms
}
});
default_timeout_ms
})();
// A timeout here does not mean the client died, it can happen if it's just idle for
// a while: we will tear down this PageServerHandler and instantiate a new one if/when
// they reconnect.
socket.set_timeout(Some(std::time::Duration::from_millis(socket_timeout_ms)));
let socket = std::pin::pin!(socket); let socket = std::pin::pin!(socket);
// XXX: pgbackend.run() should take the connection_ctx, // XXX: pgbackend.run() should take the connection_ctx,
// and create a child per-query context when it invokes process_query. // and create a child per-query context when it invokes process_query.
// But it's in a shared crate, so, we store connection_ctx inside PageServerHandler // But it's in a shared crate, so, we store connection_ctx inside PageServerHandler
// and create the per-query context in process_query ourselves. // and create the per-query context in process_query ourselves.
let mut conn_handler = PageServerHandler::new(conf, broker_client, auth, connection_ctx); let mut conn_handler = PageServerHandler::new(
conf,
broker_client,
auth,
connection_ctx,
task_mgr::shutdown_token(),
);
let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?; let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?;
match pgbackend match pgbackend
@@ -271,7 +255,7 @@ async fn page_service_conn_main(
struct PageServerHandler { struct PageServerHandler {
_conf: &'static PageServerConf, _conf: &'static PageServerConf,
broker_client: storage_broker::BrokerClientChannel, broker_client: storage_broker::BrokerClientChannel,
auth: Option<Arc<SwappableJwtAuth>>, auth: Option<Arc<JwtAuth>>,
claims: Option<Claims>, claims: Option<Claims>,
/// The context created for the lifetime of the connection /// The context created for the lifetime of the connection
@@ -279,14 +263,19 @@ struct PageServerHandler {
/// For each query received over the connection, /// For each query received over the connection,
/// `process_query` creates a child context from this one. /// `process_query` creates a child context from this one.
connection_ctx: RequestContext, connection_ctx: RequestContext,
/// A token that should fire when the tenant transitions from
/// attached state, or when the pageserver is shutting down.
cancel: CancellationToken,
} }
impl PageServerHandler { impl PageServerHandler {
pub fn new( pub fn new(
conf: &'static PageServerConf, conf: &'static PageServerConf,
broker_client: storage_broker::BrokerClientChannel, broker_client: storage_broker::BrokerClientChannel,
auth: Option<Arc<SwappableJwtAuth>>, auth: Option<Arc<JwtAuth>>,
connection_ctx: RequestContext, connection_ctx: RequestContext,
cancel: CancellationToken,
) -> Self { ) -> Self {
PageServerHandler { PageServerHandler {
_conf: conf, _conf: conf,
@@ -294,6 +283,7 @@ impl PageServerHandler {
auth, auth,
claims: None, claims: None,
connection_ctx, connection_ctx,
cancel,
} }
} }
@@ -301,11 +291,7 @@ impl PageServerHandler {
/// this rather than naked flush() in order to shut down promptly. Without this, we would /// this rather than naked flush() in order to shut down promptly. Without this, we would
/// block shutdown of a tenant if a postgres client was failing to consume bytes we send /// block shutdown of a tenant if a postgres client was failing to consume bytes we send
/// in the flush. /// in the flush.
async fn flush_cancellable<IO>( async fn flush_cancellable<IO>(&self, pgb: &mut PostgresBackend<IO>) -> Result<(), QueryError>
&self,
pgb: &mut PostgresBackend<IO>,
cancel: &CancellationToken,
) -> Result<(), QueryError>
where where
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
{ {
@@ -313,7 +299,7 @@ impl PageServerHandler {
flush_r = pgb.flush() => { flush_r = pgb.flush() => {
Ok(flush_r?) Ok(flush_r?)
}, },
_ = cancel.cancelled() => { _ = self.cancel.cancelled() => {
Err(QueryError::Shutdown) Err(QueryError::Shutdown)
} }
) )
@@ -322,7 +308,6 @@ impl PageServerHandler {
fn copyin_stream<'a, IO>( fn copyin_stream<'a, IO>(
&'a self, &'a self,
pgb: &'a mut PostgresBackend<IO>, pgb: &'a mut PostgresBackend<IO>,
cancel: &'a CancellationToken,
) -> impl Stream<Item = io::Result<Bytes>> + 'a ) -> impl Stream<Item = io::Result<Bytes>> + 'a
where where
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
@@ -332,7 +317,7 @@ impl PageServerHandler {
let msg = tokio::select! { let msg = tokio::select! {
biased; biased;
_ = cancel.cancelled() => { _ = self.cancel.cancelled() => {
// We were requested to shut down. // We were requested to shut down.
let msg = "pageserver is shutting down"; let msg = "pageserver is shutting down";
let _ = pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, None)); let _ = pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, None));
@@ -372,7 +357,7 @@ impl PageServerHandler {
let query_error = QueryError::Disconnected(ConnectionError::Io(io::Error::new(io::ErrorKind::ConnectionReset, msg))); let query_error = QueryError::Disconnected(ConnectionError::Io(io::Error::new(io::ErrorKind::ConnectionReset, msg)));
// error can't happen here, ErrorResponse serialization should be always ok // error can't happen here, ErrorResponse serialization should be always ok
pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error.pg_error_code()))).map_err(|e| e.into_io_error())?; pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error.pg_error_code()))).map_err(|e| e.into_io_error())?;
self.flush_cancellable(pgb, cancel).await.map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?; self.flush_cancellable(pgb).await.map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?; Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
} }
Err(QueryError::Disconnected(ConnectionError::Io(io_error))) => { Err(QueryError::Disconnected(ConnectionError::Io(io_error))) => {
@@ -399,13 +384,12 @@ impl PageServerHandler {
{ {
debug_assert_current_span_has_tenant_and_timeline_id(); debug_assert_current_span_has_tenant_and_timeline_id();
// NOTE: pagerequests handler exits when connection is closed,
// so there is no need to reset the association
task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
// Make request tracer if needed // Make request tracer if needed
let tenant = mgr::get_active_tenant_with_timeout( let tenant = get_active_tenant_with_timeout(tenant_id, &ctx).await?;
tenant_id,
ACTIVE_TENANT_TIMEOUT,
&task_mgr::shutdown_token(),
)
.await?;
let mut tracer = if tenant.get_trace_read_requests() { let mut tracer = if tenant.get_trace_read_requests() {
let connection_id = ConnectionId::generate(); let connection_id = ConnectionId::generate();
let path = tenant let path = tenant
@@ -421,14 +405,9 @@ impl PageServerHandler {
.get_timeline(timeline_id, true) .get_timeline(timeline_id, true)
.map_err(|e| anyhow::anyhow!(e))?; .map_err(|e| anyhow::anyhow!(e))?;
// Avoid starting new requests if the timeline has already started shutting down,
// and block timeline shutdown until this request is complete, or drops out due
// to cancellation.
let _timeline_guard = timeline.gate.enter().map_err(|_| QueryError::Shutdown)?;
// switch client to COPYBOTH // switch client to COPYBOTH
pgb.write_message_noflush(&BeMessage::CopyBothResponse)?; pgb.write_message_noflush(&BeMessage::CopyBothResponse)?;
self.flush_cancellable(pgb, &timeline.cancel).await?; self.flush_cancellable(pgb).await?;
let metrics = metrics::SmgrQueryTimePerTimeline::new(&tenant_id, &timeline_id); let metrics = metrics::SmgrQueryTimePerTimeline::new(&tenant_id, &timeline_id);
@@ -436,7 +415,7 @@ impl PageServerHandler {
let msg = tokio::select! { let msg = tokio::select! {
biased; biased;
_ = timeline.cancel.cancelled() => { _ = self.cancel.cancelled() => {
// We were requested to shut down. // We were requested to shut down.
info!("shutdown request received in page handler"); info!("shutdown request received in page handler");
return Err(QueryError::Shutdown) return Err(QueryError::Shutdown)
@@ -511,24 +490,9 @@ impl PageServerHandler {
} }
}; };
if let Err(e) = &response {
// Requests may fail as soon as we are Stopping, even if the Timeline's cancellation token wasn't fired yet,
// because wait_lsn etc will drop out
// is_stopping(): [`Timeline::flush_and_shutdown`] has entered
// is_canceled(): [`Timeline::shutdown`]` has entered
if timeline.cancel.is_cancelled() || timeline.is_stopping() {
// If we fail to fulfil a request during shutdown, which may be _because_ of
// shutdown, then do not send the error to the client. Instead just drop the
// connection.
span.in_scope(|| info!("dropped response during shutdown: {e:#}"));
return Err(QueryError::Shutdown);
}
}
let response = response.unwrap_or_else(|e| { let response = response.unwrap_or_else(|e| {
// print the all details to the log with {:#}, but for the client the // print the all details to the log with {:#}, but for the client the
// error message is enough. Do not log if shutting down, as the anyhow::Error // error message is enough
// here includes cancellation which is not an error.
span.in_scope(|| error!("error reading relation or page version: {:#}", e)); span.in_scope(|| error!("error reading relation or page version: {:#}", e));
PagestreamBeMessage::Error(PagestreamErrorResponse { PagestreamBeMessage::Error(PagestreamErrorResponse {
message: e.to_string(), message: e.to_string(),
@@ -536,7 +500,7 @@ impl PageServerHandler {
}); });
pgb.write_message_noflush(&BeMessage::CopyData(&response.serialize()))?; pgb.write_message_noflush(&BeMessage::CopyData(&response.serialize()))?;
self.flush_cancellable(pgb, &timeline.cancel).await?; self.flush_cancellable(pgb).await?;
} }
Ok(()) Ok(())
} }
@@ -558,14 +522,10 @@ impl PageServerHandler {
{ {
debug_assert_current_span_has_tenant_and_timeline_id(); debug_assert_current_span_has_tenant_and_timeline_id();
task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
// Create empty timeline // Create empty timeline
info!("creating new timeline"); info!("creating new timeline");
let tenant = get_active_tenant_with_timeout( let tenant = get_active_tenant_with_timeout(tenant_id, &ctx).await?;
tenant_id,
ACTIVE_TENANT_TIMEOUT,
&task_mgr::shutdown_token(),
)
.await?;
let timeline = tenant let timeline = tenant
.create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx) .create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx)
.await?; .await?;
@@ -583,9 +543,9 @@ impl PageServerHandler {
// Import basebackup provided via CopyData // Import basebackup provided via CopyData
info!("importing basebackup"); info!("importing basebackup");
pgb.write_message_noflush(&BeMessage::CopyInResponse)?; pgb.write_message_noflush(&BeMessage::CopyInResponse)?;
self.flush_cancellable(pgb, &tenant.cancel).await?; self.flush_cancellable(pgb).await?;
let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb, &tenant.cancel))); let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb)));
timeline timeline
.import_basebackup_from_tar( .import_basebackup_from_tar(
&mut copyin_reader, &mut copyin_reader,
@@ -622,10 +582,9 @@ impl PageServerHandler {
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
{ {
debug_assert_current_span_has_tenant_and_timeline_id(); debug_assert_current_span_has_tenant_and_timeline_id();
task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
let timeline = self let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?;
.get_active_tenant_timeline(tenant_id, timeline_id)
.await?;
let last_record_lsn = timeline.get_last_record_lsn(); let last_record_lsn = timeline.get_last_record_lsn();
if last_record_lsn != start_lsn { if last_record_lsn != start_lsn {
return Err(QueryError::Other( return Err(QueryError::Other(
@@ -639,8 +598,8 @@ impl PageServerHandler {
// Import wal provided via CopyData // Import wal provided via CopyData
info!("importing wal"); info!("importing wal");
pgb.write_message_noflush(&BeMessage::CopyInResponse)?; pgb.write_message_noflush(&BeMessage::CopyInResponse)?;
self.flush_cancellable(pgb, &timeline.cancel).await?; self.flush_cancellable(pgb).await?;
let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb, &timeline.cancel))); let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb)));
import_wal_from_tar(&timeline, &mut copyin_reader, start_lsn, end_lsn, &ctx).await?; import_wal_from_tar(&timeline, &mut copyin_reader, start_lsn, end_lsn, &ctx).await?;
info!("wal import complete"); info!("wal import complete");
@@ -833,9 +792,7 @@ impl PageServerHandler {
let started = std::time::Instant::now(); let started = std::time::Instant::now();
// check that the timeline exists // check that the timeline exists
let timeline = self let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?;
.get_active_tenant_timeline(tenant_id, timeline_id)
.await?;
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
if let Some(lsn) = lsn { if let Some(lsn) = lsn {
// Backup was requested at a particular LSN. Wait for it to arrive. // Backup was requested at a particular LSN. Wait for it to arrive.
@@ -850,7 +807,7 @@ impl PageServerHandler {
// switch client to COPYOUT // switch client to COPYOUT
pgb.write_message_noflush(&BeMessage::CopyOutResponse)?; pgb.write_message_noflush(&BeMessage::CopyOutResponse)?;
self.flush_cancellable(pgb, &timeline.cancel).await?; self.flush_cancellable(pgb).await?;
// Send a tarball of the latest layer on the timeline. Compress if not // Send a tarball of the latest layer on the timeline. Compress if not
// fullbackup. TODO Compress in that case too (tests need to be updated) // fullbackup. TODO Compress in that case too (tests need to be updated)
@@ -902,7 +859,7 @@ impl PageServerHandler {
} }
pgb.write_message_noflush(&BeMessage::CopyDone)?; pgb.write_message_noflush(&BeMessage::CopyDone)?;
self.flush_cancellable(pgb, &timeline.cancel).await?; self.flush_cancellable(pgb).await?;
let basebackup_after = started let basebackup_after = started
.elapsed() .elapsed()
@@ -920,7 +877,7 @@ impl PageServerHandler {
// when accessing management api supply None as an argument // when accessing management api supply None as an argument
// when using to authorize tenant pass corresponding tenant id // when using to authorize tenant pass corresponding tenant id
fn check_permission(&self, tenant_id: Option<TenantId>) -> Result<(), QueryError> { fn check_permission(&self, tenant_id: Option<TenantId>) -> anyhow::Result<()> {
if self.auth.is_none() { if self.auth.is_none() {
// auth is set to Trust, nothing to check so just return ok // auth is set to Trust, nothing to check so just return ok
return Ok(()); return Ok(());
@@ -932,26 +889,7 @@ impl PageServerHandler {
.claims .claims
.as_ref() .as_ref()
.expect("claims presence already checked"); .expect("claims presence already checked");
check_permission(claims, tenant_id).map_err(|e| QueryError::Unauthorized(e.0)) check_permission(claims, tenant_id)
}
/// Shorthand for getting a reference to a Timeline of an Active tenant.
async fn get_active_tenant_timeline(
&self,
tenant_id: TenantId,
timeline_id: TimelineId,
) -> Result<Arc<Timeline>, GetActiveTimelineError> {
let tenant = get_active_tenant_with_timeout(
tenant_id,
ACTIVE_TENANT_TIMEOUT,
&task_mgr::shutdown_token(),
)
.await
.map_err(GetActiveTimelineError::Tenant)?;
let timeline = tenant
.get_timeline(timeline_id, true)
.map_err(|e| GetActiveTimelineError::Timeline(anyhow::anyhow!(e)))?;
Ok(timeline)
} }
} }
@@ -971,17 +909,16 @@ where
.auth .auth
.as_ref() .as_ref()
.unwrap() .unwrap()
.decode(str::from_utf8(jwt_response).context("jwt response is not UTF-8")?) .decode(str::from_utf8(jwt_response).context("jwt response is not UTF-8")?)?;
.map_err(|e| QueryError::Unauthorized(e.0))?;
if matches!(data.claims.scope, Scope::Tenant) && data.claims.tenant_id.is_none() { if matches!(data.claims.scope, Scope::Tenant) && data.claims.tenant_id.is_none() {
return Err(QueryError::Unauthorized( return Err(QueryError::Other(anyhow::anyhow!(
"jwt token scope is Tenant, but tenant id is missing".into(), "jwt token scope is Tenant, but tenant id is missing"
)); )));
} }
debug!( info!(
"jwt scope check succeeded for scope: {:#?} by tenant id: {:?}", "jwt auth succeeded for scope: {:#?} by tenant id: {:?}",
data.claims.scope, data.claims.tenant_id, data.claims.scope, data.claims.tenant_id,
); );
@@ -1003,13 +940,9 @@ where
pgb: &mut PostgresBackend<IO>, pgb: &mut PostgresBackend<IO>,
query_string: &str, query_string: &str,
) -> Result<(), QueryError> { ) -> Result<(), QueryError> {
fail::fail_point!("simulated-bad-compute-connection", |_| {
info!("Hit failpoint for bad connection");
Err(QueryError::SimulatedConnectionError)
});
let ctx = self.connection_ctx.attached_child(); let ctx = self.connection_ctx.attached_child();
debug!("process query {query_string:?}"); debug!("process query {query_string:?}");
if query_string.starts_with("pagestream ") { if query_string.starts_with("pagestream ") {
let (_, params_raw) = query_string.split_at("pagestream ".len()); let (_, params_raw) = query_string.split_at("pagestream ".len());
let params = params_raw.split(' ').collect::<Vec<_>>(); let params = params_raw.split(' ').collect::<Vec<_>>();
@@ -1115,9 +1048,7 @@ where
.record("timeline_id", field::display(timeline_id)); .record("timeline_id", field::display(timeline_id));
self.check_permission(Some(tenant_id))?; self.check_permission(Some(tenant_id))?;
let timeline = self let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?;
.get_active_tenant_timeline(tenant_id, timeline_id)
.await?;
let end_of_timeline = timeline.get_last_record_rlsn(); let end_of_timeline = timeline.get_last_record_rlsn();
@@ -1301,12 +1232,7 @@ where
self.check_permission(Some(tenant_id))?; self.check_permission(Some(tenant_id))?;
let tenant = get_active_tenant_with_timeout( let tenant = get_active_tenant_with_timeout(tenant_id, &ctx).await?;
tenant_id,
ACTIVE_TENANT_TIMEOUT,
&task_mgr::shutdown_token(),
)
.await?;
pgb.write_message_noflush(&BeMessage::RowDescription(&[ pgb.write_message_noflush(&BeMessage::RowDescription(&[
RowDescriptor::int8_col(b"checkpoint_distance"), RowDescriptor::int8_col(b"checkpoint_distance"),
RowDescriptor::int8_col(b"checkpoint_timeout"), RowDescriptor::int8_col(b"checkpoint_timeout"),
@@ -1352,16 +1278,67 @@ where
} }
} }
#[derive(thiserror::Error, Debug)]
enum GetActiveTenantError {
#[error(
"Timed out waiting {wait_time:?} for tenant active state. Latest state: {latest_state:?}"
)]
WaitForActiveTimeout {
latest_state: TenantState,
wait_time: Duration,
},
#[error(transparent)]
NotFound(GetTenantError),
#[error(transparent)]
WaitTenantActive(tenant::WaitToBecomeActiveError),
}
impl From<GetActiveTenantError> for QueryError { impl From<GetActiveTenantError> for QueryError {
fn from(e: GetActiveTenantError) -> Self { fn from(e: GetActiveTenantError) -> Self {
match e { match e {
GetActiveTenantError::WaitForActiveTimeout { .. } => QueryError::Disconnected( GetActiveTenantError::WaitForActiveTimeout { .. } => QueryError::Disconnected(
ConnectionError::Io(io::Error::new(io::ErrorKind::TimedOut, e.to_string())), ConnectionError::Io(io::Error::new(io::ErrorKind::TimedOut, e.to_string())),
), ),
GetActiveTenantError::WillNotBecomeActive(TenantState::Stopping { .. }) => { GetActiveTenantError::WaitTenantActive(e) => QueryError::Other(anyhow::Error::new(e)),
QueryError::Shutdown GetActiveTenantError::NotFound(e) => QueryError::Other(anyhow::Error::new(e)),
}
}
}
/// Get active tenant.
///
/// If the tenant is Loading, waits for it to become Active, for up to 30 s. That
/// ensures that queries don't fail immediately after pageserver startup, because
/// all tenants are still loading.
async fn get_active_tenant_with_timeout(
tenant_id: TenantId,
_ctx: &RequestContext, /* require get a context to support cancellation in the future */
) -> Result<Arc<Tenant>, GetActiveTenantError> {
let tenant = match mgr::get_tenant(tenant_id, false).await {
Ok(tenant) => tenant,
Err(e @ GetTenantError::NotFound(_)) => return Err(GetActiveTenantError::NotFound(e)),
Err(GetTenantError::NotActive(_)) => {
unreachable!("we're calling get_tenant with active_only=false")
}
Err(GetTenantError::Broken(_)) => {
unreachable!("we're calling get_tenant with active_only=false")
}
};
let wait_time = Duration::from_secs(30);
match tokio::time::timeout(wait_time, tenant.wait_to_become_active()).await {
Ok(Ok(())) => Ok(tenant),
// no .context(), the error message is good enough and some tests depend on it
Ok(Err(e)) => Err(GetActiveTenantError::WaitTenantActive(e)),
Err(_) => {
let latest_state = tenant.current_state();
if latest_state == TenantState::Active {
Ok(tenant)
} else {
Err(GetActiveTenantError::WaitForActiveTimeout {
latest_state,
wait_time,
})
} }
e => QueryError::Other(anyhow::anyhow!(e)),
} }
} }
} }
@@ -1382,3 +1359,18 @@ impl From<GetActiveTimelineError> for QueryError {
} }
} }
} }
/// Shorthand for getting a reference to a Timeline of an Active tenant.
async fn get_active_tenant_timeline(
tenant_id: TenantId,
timeline_id: TimelineId,
ctx: &RequestContext,
) -> Result<Arc<Timeline>, GetActiveTimelineError> {
let tenant = get_active_tenant_with_timeout(tenant_id, ctx)
.await
.map_err(GetActiveTimelineError::Tenant)?;
let timeline = tenant
.get_timeline(timeline_id, true)
.map_err(|e| GetActiveTimelineError::Timeline(anyhow::anyhow!(e)))?;
Ok(timeline)
}

View File

@@ -21,6 +21,7 @@ use serde::{Deserialize, Serialize};
use std::collections::{hash_map, HashMap, HashSet}; use std::collections::{hash_map, HashMap, HashSet};
use std::ops::ControlFlow; use std::ops::ControlFlow;
use std::ops::Range; use std::ops::Range;
use tokio_util::sync::CancellationToken;
use tracing::{debug, trace, warn}; use tracing::{debug, trace, warn};
use utils::{bin_ser::BeSer, lsn::Lsn}; use utils::{bin_ser::BeSer, lsn::Lsn};
@@ -43,17 +44,6 @@ pub enum CalculateLogicalSizeError {
Other(#[from] anyhow::Error), Other(#[from] anyhow::Error),
} }
impl From<PageReconstructError> for CalculateLogicalSizeError {
fn from(pre: PageReconstructError) -> Self {
match pre {
PageReconstructError::AncestorStopping(_) | PageReconstructError::Cancelled => {
Self::Cancelled
}
_ => Self::Other(pre.into()),
}
}
}
#[derive(Debug, thiserror::Error)] #[derive(Debug, thiserror::Error)]
pub enum RelationError { pub enum RelationError {
#[error("Relation Already Exists")] #[error("Relation Already Exists")]
@@ -562,8 +552,7 @@ impl Timeline {
Err(e) => Err(PageReconstructError::from(e)), Err(e) => Err(PageReconstructError::from(e)),
}, },
Err(e) => { Err(e) => {
// This is expected: historical databases do not have the key. warn!("Failed to get info about AUX files: {}", e);
debug!("Failed to get info about AUX files: {}", e);
Ok(HashMap::new()) Ok(HashMap::new())
} }
} }
@@ -577,22 +566,30 @@ impl Timeline {
pub async fn get_current_logical_size_non_incremental( pub async fn get_current_logical_size_non_incremental(
&self, &self,
lsn: Lsn, lsn: Lsn,
cancel: CancellationToken,
ctx: &RequestContext, ctx: &RequestContext,
) -> Result<u64, CalculateLogicalSizeError> { ) -> Result<u64, CalculateLogicalSizeError> {
crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id(); crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id();
// Fetch list of database dirs and iterate them // Fetch list of database dirs and iterate them
let buf = self.get(DBDIR_KEY, lsn, ctx).await?; let buf = self.get(DBDIR_KEY, lsn, ctx).await.context("read dbdir")?;
let dbdir = DbDirectory::des(&buf).context("deserialize db directory")?; let dbdir = DbDirectory::des(&buf).context("deserialize db directory")?;
let mut total_size: u64 = 0; let mut total_size: u64 = 0;
for (spcnode, dbnode) in dbdir.dbdirs.keys() { for (spcnode, dbnode) in dbdir.dbdirs.keys() {
for rel in self.list_rels(*spcnode, *dbnode, lsn, ctx).await? { for rel in self
if self.cancel.is_cancelled() { .list_rels(*spcnode, *dbnode, lsn, ctx)
.await
.context("list rels")?
{
if cancel.is_cancelled() {
return Err(CalculateLogicalSizeError::Cancelled); return Err(CalculateLogicalSizeError::Cancelled);
} }
let relsize_key = rel_size_to_key(rel); let relsize_key = rel_size_to_key(rel);
let mut buf = self.get(relsize_key, lsn, ctx).await?; let mut buf = self
.get(relsize_key, lsn, ctx)
.await
.with_context(|| format!("read relation size of {rel:?}"))?;
let relsize = buf.get_u32_le(); let relsize = buf.get_u32_le();
total_size += relsize as u64; total_size += relsize as u64;
@@ -678,9 +675,8 @@ impl Timeline {
result.add_key(CONTROLFILE_KEY); result.add_key(CONTROLFILE_KEY);
result.add_key(CHECKPOINT_KEY); result.add_key(CHECKPOINT_KEY);
if self.get(AUX_FILES_KEY, lsn, ctx).await.is_ok() { result.add_key(AUX_FILES_KEY);
result.add_key(AUX_FILES_KEY);
}
Ok(result.to_keyspace()) Ok(result.to_keyspace())
} }
@@ -1205,8 +1201,7 @@ impl<'a> DatadirModification<'a> {
let mut dir = match self.get(AUX_FILES_KEY, ctx).await { let mut dir = match self.get(AUX_FILES_KEY, ctx).await {
Ok(buf) => AuxFilesDirectory::des(&buf)?, Ok(buf) => AuxFilesDirectory::des(&buf)?,
Err(e) => { Err(e) => {
// This is expected: historical databases do not have the key. warn!("Failed to get info about AUX files: {}", e);
debug!("Failed to get info about AUX files: {}", e);
AuxFilesDirectory { AuxFilesDirectory {
files: HashMap::new(), files: HashMap::new(),
} }

View File

@@ -299,6 +299,10 @@ pub enum TaskKind {
#[derive(Default)] #[derive(Default)]
struct MutableTaskState { struct MutableTaskState {
/// Tenant and timeline that this task is associated with.
tenant_id: Option<TenantId>,
timeline_id: Option<TimelineId>,
/// Handle for waiting for the task to exit. It can be None, if the /// Handle for waiting for the task to exit. It can be None, if the
/// the task has already exited. /// the task has already exited.
join_handle: Option<JoinHandle<()>>, join_handle: Option<JoinHandle<()>>,
@@ -315,11 +319,6 @@ struct PageServerTask {
// To request task shutdown, just cancel this token. // To request task shutdown, just cancel this token.
cancel: CancellationToken, cancel: CancellationToken,
/// Tasks may optionally be launched for a particular tenant/timeline, enabling
/// later cancelling tasks for that tenant/timeline in [`shutdown_tasks`]
tenant_id: Option<TenantId>,
timeline_id: Option<TimelineId>,
mutable: Mutex<MutableTaskState>, mutable: Mutex<MutableTaskState>,
} }
@@ -345,9 +344,11 @@ where
kind, kind,
name: name.to_string(), name: name.to_string(),
cancel: cancel.clone(), cancel: cancel.clone(),
tenant_id, mutable: Mutex::new(MutableTaskState {
timeline_id, tenant_id,
mutable: Mutex::new(MutableTaskState { join_handle: None }), timeline_id,
join_handle: None,
}),
}); });
TASKS.lock().unwrap().insert(task_id, Arc::clone(&task)); TASKS.lock().unwrap().insert(task_id, Arc::clone(&task));
@@ -417,6 +418,8 @@ async fn task_finish(
let mut shutdown_process = false; let mut shutdown_process = false;
{ {
let task_mut = task.mutable.lock().unwrap();
match result { match result {
Ok(Ok(())) => { Ok(Ok(())) => {
debug!("Task '{}' exited normally", task_name); debug!("Task '{}' exited normally", task_name);
@@ -425,13 +428,13 @@ async fn task_finish(
if shutdown_process_on_error { if shutdown_process_on_error {
error!( error!(
"Shutting down: task '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}", "Shutting down: task '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}",
task_name, task.tenant_id, task.timeline_id, err task_name, task_mut.tenant_id, task_mut.timeline_id, err
); );
shutdown_process = true; shutdown_process = true;
} else { } else {
error!( error!(
"Task '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}", "Task '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}",
task_name, task.tenant_id, task.timeline_id, err task_name, task_mut.tenant_id, task_mut.timeline_id, err
); );
} }
} }
@@ -439,13 +442,13 @@ async fn task_finish(
if shutdown_process_on_error { if shutdown_process_on_error {
error!( error!(
"Shutting down: task '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}", "Shutting down: task '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}",
task_name, task.tenant_id, task.timeline_id, err task_name, task_mut.tenant_id, task_mut.timeline_id, err
); );
shutdown_process = true; shutdown_process = true;
} else { } else {
error!( error!(
"Task '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}", "Task '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}",
task_name, task.tenant_id, task.timeline_id, err task_name, task_mut.tenant_id, task_mut.timeline_id, err
); );
} }
} }
@@ -457,6 +460,17 @@ async fn task_finish(
} }
} }
// expected to be called from the task of the given id.
pub fn associate_with(tenant_id: Option<TenantId>, timeline_id: Option<TimelineId>) {
CURRENT_TASK.with(|ct| {
let mut task_mut = ct.mutable.lock().unwrap();
task_mut.tenant_id = tenant_id;
task_mut.timeline_id = timeline_id;
});
}
/// Is there a task running that matches the criteria
/// Signal and wait for tasks to shut down. /// Signal and wait for tasks to shut down.
/// ///
/// ///
@@ -479,16 +493,17 @@ pub async fn shutdown_tasks(
{ {
let tasks = TASKS.lock().unwrap(); let tasks = TASKS.lock().unwrap();
for task in tasks.values() { for task in tasks.values() {
let task_mut = task.mutable.lock().unwrap();
if (kind.is_none() || Some(task.kind) == kind) if (kind.is_none() || Some(task.kind) == kind)
&& (tenant_id.is_none() || task.tenant_id == tenant_id) && (tenant_id.is_none() || task_mut.tenant_id == tenant_id)
&& (timeline_id.is_none() || task.timeline_id == timeline_id) && (timeline_id.is_none() || task_mut.timeline_id == timeline_id)
{ {
task.cancel.cancel(); task.cancel.cancel();
victim_tasks.push(( victim_tasks.push((
Arc::clone(task), Arc::clone(task),
task.kind, task.kind,
task.tenant_id, task_mut.tenant_id,
task.timeline_id, task_mut.timeline_id,
)); ));
} }
} }

File diff suppressed because it is too large Load Diff

View File

@@ -327,7 +327,7 @@ mod tests {
let mut sz: u16 = rng.gen(); let mut sz: u16 = rng.gen();
// Make 50% of the arrays small // Make 50% of the arrays small
if rng.gen() { if rng.gen() {
sz &= 63; sz |= 63;
} }
random_array(sz.into()) random_array(sz.into())
}) })

View File

@@ -3,10 +3,10 @@ use std::sync::Arc;
use anyhow::Context; use anyhow::Context;
use camino::{Utf8Path, Utf8PathBuf}; use camino::{Utf8Path, Utf8PathBuf};
use pageserver_api::models::TenantState; use pageserver_api::models::TenantState;
use remote_storage::{GenericRemoteStorage, RemotePath}; use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
use tokio::sync::OwnedMutexGuard; use tokio::sync::OwnedMutexGuard;
use tokio_util::sync::CancellationToken; use tokio_util::sync::CancellationToken;
use tracing::{error, instrument, warn, Instrument, Span}; use tracing::{error, info, instrument, warn, Instrument, Span};
use utils::{ use utils::{
backoff, completion, crashsafe, fs_ext, backoff, completion, crashsafe, fs_ext,
@@ -21,33 +21,26 @@ use crate::{
}; };
use super::{ use super::{
mgr::{GetTenantError, TenantSlotError, TenantSlotUpsertError, TenantsMap}, mgr::{GetTenantError, TenantsMap},
remote_timeline_client::{FAILED_REMOTE_OP_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD}, remote_timeline_client::{FAILED_REMOTE_OP_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD},
span, span,
timeline::delete::DeleteTimelineFlow, timeline::delete::DeleteTimelineFlow,
tree_sort_timelines, DeleteTimelineError, Tenant, TenantPreload, tree_sort_timelines, DeleteTimelineError, Tenant,
}; };
const SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS: u32 = 3;
#[derive(Debug, thiserror::Error)] #[derive(Debug, thiserror::Error)]
pub(crate) enum DeleteTenantError { pub(crate) enum DeleteTenantError {
#[error("GetTenant {0}")] #[error("GetTenant {0}")]
Get(#[from] GetTenantError), Get(#[from] GetTenantError),
#[error("Tenant not attached")]
NotAttached,
#[error("Invalid state {0}. Expected Active or Broken")] #[error("Invalid state {0}. Expected Active or Broken")]
InvalidState(TenantState), InvalidState(TenantState),
#[error("Tenant deletion is already in progress")] #[error("Tenant deletion is already in progress")]
AlreadyInProgress, AlreadyInProgress,
#[error("Tenant map slot error {0}")]
SlotError(#[from] TenantSlotError),
#[error("Tenant map slot upsert error {0}")]
SlotUpsertError(#[from] TenantSlotUpsertError),
#[error("Timeline {0}")] #[error("Timeline {0}")]
Timeline(#[from] DeleteTimelineError), Timeline(#[from] DeleteTimelineError),
@@ -67,7 +60,7 @@ fn remote_tenant_delete_mark_path(
.context("Failed to strip workdir prefix") .context("Failed to strip workdir prefix")
.and_then(RemotePath::new) .and_then(RemotePath::new)
.context("tenant path")?; .context("tenant path")?;
Ok(tenant_remote_path.join(Utf8Path::new("timelines/deleted"))) Ok(tenant_remote_path.join(Utf8Path::new("deleted")))
} }
async fn create_remote_delete_mark( async fn create_remote_delete_mark(
@@ -157,8 +150,7 @@ async fn ensure_timelines_dir_empty(timelines_path: &Utf8Path) -> Result<(), Del
// Assert timelines dir is empty. // Assert timelines dir is empty.
if !fs_ext::is_directory_empty(timelines_path).await? { if !fs_ext::is_directory_empty(timelines_path).await? {
// Display first 10 items in directory // Display first 10 items in directory
let list = fs_ext::list_dir(timelines_path).await.context("list_dir")?; let list = &fs_ext::list_dir(timelines_path).await.context("list_dir")?[..10];
let list = &list.into_iter().take(10).collect::<Vec<_>>();
return Err(DeleteTenantError::Other(anyhow::anyhow!( return Err(DeleteTenantError::Other(anyhow::anyhow!(
"Timelines directory is not empty after all timelines deletion: {list:?}" "Timelines directory is not empty after all timelines deletion: {list:?}"
))); )));
@@ -247,6 +239,32 @@ async fn cleanup_remaining_fs_traces(
Ok(()) Ok(())
} }
pub(crate) async fn remote_delete_mark_exists(
conf: &PageServerConf,
tenant_id: &TenantId,
remote_storage: &GenericRemoteStorage,
) -> anyhow::Result<bool> {
// If remote storage is there we rely on it
let remote_mark_path = remote_tenant_delete_mark_path(conf, tenant_id).context("path")?;
let result = backoff::retry(
|| async { remote_storage.download(&remote_mark_path).await },
|e| matches!(e, DownloadError::NotFound),
SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS,
SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS,
"fetch_tenant_deletion_mark",
// TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
)
.await;
match result {
Ok(_) => Ok(true),
Err(DownloadError::NotFound) => Ok(false),
Err(e) => Err(anyhow::anyhow!(e)).context("remote_delete_mark_exists")?,
}
}
/// Orchestrates tenant shut down of all tasks, removes its in-memory structures, /// Orchestrates tenant shut down of all tasks, removes its in-memory structures,
/// and deletes its data from both disk and s3. /// and deletes its data from both disk and s3.
/// The sequence of steps: /// The sequence of steps:
@@ -258,9 +276,10 @@ async fn cleanup_remaining_fs_traces(
/// 6. Remove remote mark /// 6. Remove remote mark
/// 7. Cleanup remaining fs traces, tenant dir, config, timelines dir, local delete mark /// 7. Cleanup remaining fs traces, tenant dir, config, timelines dir, local delete mark
/// It is resumable from any step in case a crash/restart occurs. /// It is resumable from any step in case a crash/restart occurs.
/// There are two entrypoints to the process: /// There are three entrypoints to the process:
/// 1. [`DeleteTenantFlow::run`] this is the main one called by a management api handler. /// 1. [`DeleteTenantFlow::run`] this is the main one called by a management api handler.
/// 2. [`DeleteTenantFlow::resume_from_attach`] is called when deletion is resumed tenant is found to be deleted during attach process. /// 2. [`DeleteTenantFlow::resume_from_load`] is called during restarts when local or remote deletion marks are still there.
/// 3. [`DeleteTenantFlow::resume_from_attach`] is called when deletion is resumed tenant is found to be deleted during attach process.
/// Note the only other place that messes around timeline delete mark is the `Tenant::spawn_load` function. /// Note the only other place that messes around timeline delete mark is the `Tenant::spawn_load` function.
#[derive(Default)] #[derive(Default)]
pub enum DeleteTenantFlow { pub enum DeleteTenantFlow {
@@ -282,12 +301,12 @@ impl DeleteTenantFlow {
pub(crate) async fn run( pub(crate) async fn run(
conf: &'static PageServerConf, conf: &'static PageServerConf,
remote_storage: Option<GenericRemoteStorage>, remote_storage: Option<GenericRemoteStorage>,
tenants: &'static std::sync::RwLock<TenantsMap>, tenants: &'static tokio::sync::RwLock<TenantsMap>,
tenant: Arc<Tenant>, tenant_id: TenantId,
) -> Result<(), DeleteTenantError> { ) -> Result<(), DeleteTenantError> {
span::debug_assert_current_span_has_tenant_id(); span::debug_assert_current_span_has_tenant_id();
let mut guard = Self::prepare(&tenant).await?; let (tenant, mut guard) = Self::prepare(tenants, tenant_id).await?;
if let Err(e) = Self::run_inner(&mut guard, conf, remote_storage.as_ref(), &tenant).await { if let Err(e) = Self::run_inner(&mut guard, conf, remote_storage.as_ref(), &tenant).await {
tenant.set_broken(format!("{e:#}")).await; tenant.set_broken(format!("{e:#}")).await;
@@ -359,7 +378,7 @@ impl DeleteTenantFlow {
pub(crate) async fn should_resume_deletion( pub(crate) async fn should_resume_deletion(
conf: &'static PageServerConf, conf: &'static PageServerConf,
remote_mark_exists: bool, remote_storage: Option<&GenericRemoteStorage>,
tenant: &Tenant, tenant: &Tenant,
) -> Result<Option<DeletionGuard>, DeleteTenantError> { ) -> Result<Option<DeletionGuard>, DeleteTenantError> {
let acquire = |t: &Tenant| { let acquire = |t: &Tenant| {
@@ -370,25 +389,66 @@ impl DeleteTenantFlow {
) )
}; };
if remote_mark_exists {
return Ok(acquire(tenant));
}
let tenant_id = tenant.tenant_id; let tenant_id = tenant.tenant_id;
// Check local mark first, if its there there is no need to go to s3 to check whether remote one exists. // Check local mark first, if its there there is no need to go to s3 to check whether remote one exists.
if conf.tenant_deleted_mark_file_path(&tenant_id).exists() { if conf.tenant_deleted_mark_file_path(&tenant_id).exists() {
return Ok(acquire(tenant));
}
let remote_storage = match remote_storage {
Some(remote_storage) => remote_storage,
None => return Ok(None),
};
if remote_delete_mark_exists(conf, &tenant_id, remote_storage).await? {
Ok(acquire(tenant)) Ok(acquire(tenant))
} else { } else {
Ok(None) Ok(None)
} }
} }
pub(crate) async fn resume_from_load(
guard: DeletionGuard,
tenant: &Arc<Tenant>,
init_order: Option<&InitializationOrder>,
tenants: &'static tokio::sync::RwLock<TenantsMap>,
ctx: &RequestContext,
) -> Result<(), DeleteTenantError> {
let (_, progress) = completion::channel();
tenant
.set_stopping(progress, true, false)
.await
.expect("cant be stopping or broken");
// Do not consume valuable resources during the load phase, continue deletion once init phase is complete.
let background_jobs_can_start = init_order.as_ref().map(|x| &x.background_jobs_can_start);
if let Some(background) = background_jobs_can_start {
info!("waiting for backgound jobs barrier");
background.clone().wait().await;
info!("ready for backgound jobs barrier");
}
// Tenant may not be loadable if we fail late in cleanup_remaining_fs_traces (e g remove timelines dir)
let timelines_path = tenant.conf.timelines_path(&tenant.tenant_id);
if timelines_path.exists() {
tenant.load(init_order, None, ctx).await.context("load")?;
}
Self::background(
guard,
tenant.conf,
tenant.remote_storage.clone(),
tenants,
tenant,
)
.await
}
pub(crate) async fn resume_from_attach( pub(crate) async fn resume_from_attach(
guard: DeletionGuard, guard: DeletionGuard,
tenant: &Arc<Tenant>, tenant: &Arc<Tenant>,
preload: Option<TenantPreload>, tenants: &'static tokio::sync::RwLock<TenantsMap>,
tenants: &'static std::sync::RwLock<TenantsMap>,
init_order: Option<InitializationOrder>,
ctx: &RequestContext, ctx: &RequestContext,
) -> Result<(), DeleteTenantError> { ) -> Result<(), DeleteTenantError> {
let (_, progress) = completion::channel(); let (_, progress) = completion::channel();
@@ -399,7 +459,7 @@ impl DeleteTenantFlow {
.expect("cant be stopping or broken"); .expect("cant be stopping or broken");
tenant tenant
.attach(init_order, preload, ctx) .attach(ctx, super::AttachMarkerMode::Expect)
.await .await
.context("attach")?; .context("attach")?;
@@ -414,8 +474,15 @@ impl DeleteTenantFlow {
} }
async fn prepare( async fn prepare(
tenant: &Arc<Tenant>, tenants: &tokio::sync::RwLock<TenantsMap>,
) -> Result<tokio::sync::OwnedMutexGuard<Self>, DeleteTenantError> { tenant_id: TenantId,
) -> Result<(Arc<Tenant>, tokio::sync::OwnedMutexGuard<Self>), DeleteTenantError> {
let m = tenants.read().await;
let tenant = m
.get(&tenant_id)
.ok_or(GetTenantError::NotFound(tenant_id))?;
// FIXME: unsure about active only. Our init jobs may not be cancellable properly, // FIXME: unsure about active only. Our init jobs may not be cancellable properly,
// so at least for now allow deletions only for active tenants. TODO recheck // so at least for now allow deletions only for active tenants. TODO recheck
// Broken and Stopping is needed for retries. // Broken and Stopping is needed for retries.
@@ -449,14 +516,14 @@ impl DeleteTenantFlow {
))); )));
} }
Ok(guard) Ok((Arc::clone(tenant), guard))
} }
fn schedule_background( fn schedule_background(
guard: OwnedMutexGuard<Self>, guard: OwnedMutexGuard<Self>,
conf: &'static PageServerConf, conf: &'static PageServerConf,
remote_storage: Option<GenericRemoteStorage>, remote_storage: Option<GenericRemoteStorage>,
tenants: &'static std::sync::RwLock<TenantsMap>, tenants: &'static tokio::sync::RwLock<TenantsMap>,
tenant: Arc<Tenant>, tenant: Arc<Tenant>,
) { ) {
let tenant_id = tenant.tenant_id; let tenant_id = tenant.tenant_id;
@@ -489,7 +556,7 @@ impl DeleteTenantFlow {
mut guard: OwnedMutexGuard<Self>, mut guard: OwnedMutexGuard<Self>,
conf: &PageServerConf, conf: &PageServerConf,
remote_storage: Option<GenericRemoteStorage>, remote_storage: Option<GenericRemoteStorage>,
tenants: &'static std::sync::RwLock<TenantsMap>, tenants: &'static tokio::sync::RwLock<TenantsMap>,
tenant: &Arc<Tenant>, tenant: &Arc<Tenant>,
) -> Result<(), DeleteTenantError> { ) -> Result<(), DeleteTenantError> {
// Tree sort timelines, schedule delete for them. Mention retries from the console side. // Tree sort timelines, schedule delete for them. Mention retries from the console side.
@@ -537,18 +604,10 @@ impl DeleteTenantFlow {
.await .await
.context("cleanup_remaining_fs_traces")?; .context("cleanup_remaining_fs_traces")?;
{ let mut locked = tenants.write().await;
let mut locked = tenants.write().unwrap(); if locked.remove(&tenant.tenant_id).is_none() {
if locked.remove(&tenant.tenant_id).is_none() { warn!("Tenant got removed from tenants map during deletion");
warn!("Tenant got removed from tenants map during deletion"); };
};
// FIXME: we should not be modifying this from outside of mgr.rs.
// This will go away when we simplify deletion (https://github.com/neondatabase/neon/issues/5080)
crate::metrics::TENANT_MANAGER
.tenant_slots
.set(locked.len() as u64);
}
*guard = Self::Finished; *guard = Self::Finished;

File diff suppressed because it is too large Load Diff

View File

@@ -639,10 +639,147 @@ impl LayerMap {
} }
println!("historic_layers:"); println!("historic_layers:");
for desc in self.iter_historic_layers() { for layer in self.iter_historic_layers() {
desc.dump(); layer.dump(verbose, ctx)?;
} }
println!("End dump LayerMap"); println!("End dump LayerMap");
Ok(()) Ok(())
} }
} }
#[cfg(test)]
mod tests {
use super::LayerMap;
use crate::tenant::storage_layer::LayerFileName;
use std::str::FromStr;
use std::sync::Arc;
mod l0_delta_layers_updated {
use crate::tenant::{
storage_layer::{AsLayerDesc, PersistentLayerDesc},
timeline::layer_manager::LayerFileManager,
};
use super::*;
struct LayerObject(PersistentLayerDesc);
impl AsLayerDesc for LayerObject {
fn layer_desc(&self) -> &PersistentLayerDesc {
&self.0
}
}
impl LayerObject {
fn new(desc: PersistentLayerDesc) -> Self {
LayerObject(desc)
}
}
type TestLayerFileManager = LayerFileManager<LayerObject>;
#[test]
fn for_full_range_delta() {
// l0_delta_layers are used by compaction, and should observe all buffered updates
l0_delta_layers_updated_scenario(
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000053423C21-0000000053424D69",
true
)
}
#[test]
fn for_non_full_range_delta() {
// has minimal uncovered areas compared to l0_delta_layers_updated_on_insert_replace_remove_for_full_range_delta
l0_delta_layers_updated_scenario(
"000000000000000000000000000000000001-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFE__0000000053423C21-0000000053424D69",
// because not full range
false
)
}
#[test]
fn for_image() {
l0_delta_layers_updated_scenario(
"000000000000000000000000000000000000-000000000000000000000000000000010000__0000000053424D69",
// code only checks if it is a full range layer, doesn't care about images, which must
// mean we should in practice never have full range images
false
)
}
#[test]
fn replacing_missing_l0_is_notfound() {
// original impl had an oversight, and L0 was an anyhow::Error. anyhow::Error should
// however only happen for precondition failures.
let layer = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000053423C21-0000000053424D69";
let layer = LayerFileName::from_str(layer).unwrap();
let layer = PersistentLayerDesc::from(layer);
// same skeletan construction; see scenario below
let not_found = Arc::new(LayerObject::new(layer.clone()));
let new_version = Arc::new(LayerObject::new(layer));
// after the immutable storage state refactor, the replace operation
// will not use layer map any more. We keep it here for consistency in test cases
// and can remove it in the future.
let _map = LayerMap::default();
let mut mapping = TestLayerFileManager::new();
mapping
.replace_and_verify(not_found, new_version)
.unwrap_err();
}
fn l0_delta_layers_updated_scenario(layer_name: &str, expected_l0: bool) {
let name = LayerFileName::from_str(layer_name).unwrap();
let skeleton = PersistentLayerDesc::from(name);
let remote = Arc::new(LayerObject::new(skeleton.clone()));
let downloaded = Arc::new(LayerObject::new(skeleton));
let mut map = LayerMap::default();
let mut mapping = LayerFileManager::new();
// two disjoint Arcs in different lifecycle phases. even if it seems they must be the
// same layer, we use LayerMap::compare_arced_layers as the identity of layers.
assert_eq!(remote.layer_desc(), downloaded.layer_desc());
let expected_in_counts = (1, usize::from(expected_l0));
map.batch_update()
.insert_historic(remote.layer_desc().clone());
mapping.insert(remote.clone());
assert_eq!(
count_layer_in(&map, remote.layer_desc()),
expected_in_counts
);
mapping
.replace_and_verify(remote, downloaded.clone())
.expect("name derived attributes are the same");
assert_eq!(
count_layer_in(&map, downloaded.layer_desc()),
expected_in_counts
);
map.batch_update().remove_historic(downloaded.layer_desc());
assert_eq!(count_layer_in(&map, downloaded.layer_desc()), (0, 0));
}
fn count_layer_in(map: &LayerMap, layer: &PersistentLayerDesc) -> (usize, usize) {
let historic = map
.iter_historic_layers()
.filter(|x| x.key() == layer.key())
.count();
let l0s = map
.get_level0_deltas()
.expect("why does this return a result");
let l0 = l0s.iter().filter(|x| x.key() == layer.key()).count();
(historic, l0)
}
}
}

View File

@@ -406,123 +406,4 @@ mod tests {
METADATA_OLD_FORMAT_VERSION, METADATA_FORMAT_VERSION METADATA_OLD_FORMAT_VERSION, METADATA_FORMAT_VERSION
); );
} }
#[test]
fn test_metadata_bincode_serde() {
let original_metadata = TimelineMetadata::new(
Lsn(0x200),
Some(Lsn(0x100)),
Some(TIMELINE_ID),
Lsn(0),
Lsn(0),
Lsn(0),
// Any version will do here, so use the default
crate::DEFAULT_PG_VERSION,
);
let metadata_bytes = original_metadata
.to_bytes()
.expect("Cannot create bytes array from metadata");
let metadata_bincode_be_bytes = original_metadata
.ser()
.expect("Cannot serialize the metadata");
// 8 bytes for the length of the vector
assert_eq!(metadata_bincode_be_bytes.len(), 8 + metadata_bytes.len());
let expected_bincode_bytes = {
let mut temp = vec![];
let len_bytes = metadata_bytes.len().to_be_bytes();
temp.extend_from_slice(&len_bytes);
temp.extend_from_slice(&metadata_bytes);
temp
};
assert_eq!(metadata_bincode_be_bytes, expected_bincode_bytes);
let deserialized_metadata = TimelineMetadata::des(&metadata_bincode_be_bytes).unwrap();
// Deserialized metadata has the metadata header, which is different from the serialized one.
// Reference: TimelineMetaData::to_bytes()
let expected_metadata = {
let mut temp_metadata = original_metadata;
let body_bytes = temp_metadata
.body
.ser()
.expect("Cannot serialize the metadata body");
let metadata_size = METADATA_HDR_SIZE + body_bytes.len();
let hdr = TimelineMetadataHeader {
size: metadata_size as u16,
format_version: METADATA_FORMAT_VERSION,
checksum: crc32c::crc32c(&body_bytes),
};
temp_metadata.hdr = hdr;
temp_metadata
};
assert_eq!(deserialized_metadata, expected_metadata);
}
#[test]
fn test_metadata_bincode_serde_ensure_roundtrip() {
let original_metadata = TimelineMetadata::new(
Lsn(0x200),
Some(Lsn(0x100)),
Some(TIMELINE_ID),
Lsn(0),
Lsn(0),
Lsn(0),
// Any version will do here, so use the default
crate::DEFAULT_PG_VERSION,
);
let expected_bytes = vec![
/* bincode length encoding bytes */
0, 0, 0, 0, 0, 0, 2, 0, // 8 bytes for the length of the serialized vector
/* TimelineMetadataHeader */
4, 37, 101, 34, 0, 70, 0, 4, // checksum, size, format_version (4 + 2 + 2)
/* TimelineMetadataBodyV2 */
0, 0, 0, 0, 0, 0, 2, 0, // disk_consistent_lsn (8 bytes)
1, 0, 0, 0, 0, 0, 0, 1, 0, // prev_record_lsn (9 bytes)
1, 17, 34, 51, 68, 85, 102, 119, 136, 17, 34, 51, 68, 85, 102, 119,
136, // ancestor_timeline (17 bytes)
0, 0, 0, 0, 0, 0, 0, 0, // ancestor_lsn (8 bytes)
0, 0, 0, 0, 0, 0, 0, 0, // latest_gc_cutoff_lsn (8 bytes)
0, 0, 0, 0, 0, 0, 0, 0, // initdb_lsn (8 bytes)
0, 0, 0, 15, // pg_version (4 bytes)
/* padding bytes */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
];
let metadata_ser_bytes = original_metadata.ser().unwrap();
assert_eq!(metadata_ser_bytes, expected_bytes);
let expected_metadata = {
let mut temp_metadata = original_metadata;
let body_bytes = temp_metadata
.body
.ser()
.expect("Cannot serialize the metadata body");
let metadata_size = METADATA_HDR_SIZE + body_bytes.len();
let hdr = TimelineMetadataHeader {
size: metadata_size as u16,
format_version: METADATA_FORMAT_VERSION,
checksum: crc32c::crc32c(&body_bytes),
};
temp_metadata.hdr = hdr;
temp_metadata
};
let des_metadata = TimelineMetadata::des(&metadata_ser_bytes).unwrap();
assert_eq!(des_metadata, expected_metadata);
}
} }

File diff suppressed because it is too large Load Diff

View File

@@ -57,7 +57,8 @@ pub fn par_fsync(paths: &[Utf8PathBuf]) -> io::Result<()> {
fsync_in_thread_pool(paths) fsync_in_thread_pool(paths)
} }
/// Parallel fsync asynchronously. /// Parallel fsync asynchronously. If number of files are less than PARALLEL_PATH_THRESHOLD, fsync is done in the current
/// execution thread. Otherwise, we will spawn_blocking and run it in tokio.
pub async fn par_fsync_async(paths: &[Utf8PathBuf]) -> io::Result<()> { pub async fn par_fsync_async(paths: &[Utf8PathBuf]) -> io::Result<()> {
const MAX_CONCURRENT_FSYNC: usize = 64; const MAX_CONCURRENT_FSYNC: usize = 64;
let mut next = paths.iter().peekable(); let mut next = paths.iter().peekable();

View File

@@ -167,15 +167,39 @@
//! - download their remote [`IndexPart`]s //! - download their remote [`IndexPart`]s
//! - create `Timeline` struct and a `RemoteTimelineClient` //! - create `Timeline` struct and a `RemoteTimelineClient`
//! - initialize the client's upload queue with its `IndexPart` //! - initialize the client's upload queue with its `IndexPart`
//! - create [`RemoteLayer`](super::storage_layer::RemoteLayer) instances
//! for layers that are referenced by `IndexPart` but not present locally
//! - schedule uploads for layers that are only present locally. //! - schedule uploads for layers that are only present locally.
//! - if the remote `IndexPart`'s metadata was newer than the metadata in
//! the local filesystem, write the remote metadata to the local filesystem
//! - After the above is done for each timeline, open the tenant for business by //! - After the above is done for each timeline, open the tenant for business by
//! transitioning it from `TenantState::Attaching` to `TenantState::Active` state. //! transitioning it from `TenantState::Attaching` to `TenantState::Active` state.
//! This starts the timelines' WAL-receivers and the tenant's GC & Compaction loops. //! This starts the timelines' WAL-receivers and the tenant's GC & Compaction loops.
//! //!
//! We keep track of the fact that a client is in `Attaching` state in a marker
//! file on the local disk. This is critical because, when we restart the pageserver,
//! we do not want to do the `List timelines` step for each tenant that has already
//! been successfully attached (for performance & cost reasons).
//! Instead, for a tenant without the attach marker file, we assume that the
//! local state is in sync or ahead of the remote state. This includes the list
//! of all of the tenant's timelines, which is particularly critical to be up-to-date:
//! if there's a timeline on the remote that the pageserver doesn't know about,
//! the GC will not consider its branch point, leading to data loss.
//! So, for a tenant with the attach marker file, we know that we do not yet have
//! persisted all the remote timeline's metadata files locally. To exclude the
//! risk above, we re-run the procedure for such tenants
//!
//! # Operating Without Remote Storage //! # Operating Without Remote Storage
//! //!
//! If no remote storage configuration is provided, the [`RemoteTimelineClient`] is //! If no remote storage configuration is provided, the [`RemoteTimelineClient`] is
//! not created and the uploads are skipped. //! not created and the uploads are skipped.
//! Theoretically, it should be ok to remove and re-add remote storage configuration to
//! the pageserver config at any time, since it doesn't make a difference to
//! [`Timeline::load_layer_map`].
//! Of course, the remote timeline dir must not change while we have de-configured
//! remote storage, i.e., the pageserver must remain the owner of the given prefix
//! in remote storage.
//! But note that we don't test any of this right now.
//! //!
//! [`Tenant::timeline_init_and_sync`]: super::Tenant::timeline_init_and_sync //! [`Tenant::timeline_init_and_sync`]: super::Tenant::timeline_init_and_sync
//! [`Timeline::load_layer_map`]: super::Timeline::load_layer_map //! [`Timeline::load_layer_map`]: super::Timeline::load_layer_map
@@ -187,7 +211,8 @@ mod upload;
use anyhow::Context; use anyhow::Context;
use camino::Utf8Path; use camino::Utf8Path;
use chrono::{NaiveDateTime, Utc}; use chrono::{NaiveDateTime, Utc};
// re-export these
pub use download::{is_temp_download_file, list_remote_timelines};
use scopeguard::ScopeGuard; use scopeguard::ScopeGuard;
use tokio_util::sync::CancellationToken; use tokio_util::sync::CancellationToken;
use utils::backoff::{ use utils::backoff::{
@@ -212,7 +237,7 @@ use crate::metrics::{
}; };
use crate::task_mgr::shutdown_token; use crate::task_mgr::shutdown_token;
use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
use crate::tenant::storage_layer::AsLayerDesc; use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
use crate::tenant::upload_queue::Delete; use crate::tenant::upload_queue::Delete;
use crate::tenant::TIMELINES_SEGMENT_NAME; use crate::tenant::TIMELINES_SEGMENT_NAME;
use crate::{ use crate::{
@@ -230,13 +255,10 @@ use utils::id::{TenantId, TimelineId};
use self::index::IndexPart; use self::index::IndexPart;
use super::storage_layer::{Layer, LayerFileName, ResidentLayer}; use super::storage_layer::LayerFileName;
use super::upload_queue::SetDeletedFlagProgress; use super::upload_queue::SetDeletedFlagProgress;
use super::Generation; use super::Generation;
pub(crate) use download::{is_temp_download_file, list_remote_timelines};
pub(crate) use index::LayerFileMetadata;
// Occasional network issues and such can cause remote operations to fail, and // Occasional network issues and such can cause remote operations to fail, and
// that's expected. If a download fails, we log it at info-level, and retry. // that's expected. If a download fails, we log it at info-level, and retry.
// But after FAILED_DOWNLOAD_WARN_THRESHOLD retries, we start to log it at WARN // But after FAILED_DOWNLOAD_WARN_THRESHOLD retries, we start to log it at WARN
@@ -446,10 +468,7 @@ impl RemoteTimelineClient {
// //
/// Download index file /// Download index file
pub async fn download_index_file( pub async fn download_index_file(&self) -> Result<MaybeDeletedIndexPart, DownloadError> {
&self,
cancel: CancellationToken,
) -> Result<MaybeDeletedIndexPart, DownloadError> {
let _unfinished_gauge_guard = self.metrics.call_begin( let _unfinished_gauge_guard = self.metrics.call_begin(
&RemoteOpFileKind::Index, &RemoteOpFileKind::Index,
&RemoteOpKind::Download, &RemoteOpKind::Download,
@@ -463,7 +482,6 @@ impl RemoteTimelineClient {
&self.tenant_id, &self.tenant_id,
&self.timeline_id, &self.timeline_id,
self.generation, self.generation,
cancel,
) )
.measure_remote_op( .measure_remote_op(
self.tenant_id, self.tenant_id,
@@ -609,203 +627,101 @@ impl RemoteTimelineClient {
/// ///
/// Launch an upload operation in the background. /// Launch an upload operation in the background.
/// ///
pub(crate) fn schedule_layer_file_upload( pub fn schedule_layer_file_upload(
self: &Arc<Self>, self: &Arc<Self>,
layer: ResidentLayer, layer_file_name: &LayerFileName,
layer_metadata: &LayerFileMetadata,
) -> anyhow::Result<()> { ) -> anyhow::Result<()> {
let mut guard = self.upload_queue.lock().unwrap(); let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?; let upload_queue = guard.initialized_mut()?;
self.schedule_layer_file_upload0(upload_queue, layer);
self.launch_queued_tasks(upload_queue);
Ok(())
}
fn schedule_layer_file_upload0(
self: &Arc<Self>,
upload_queue: &mut UploadQueueInitialized,
layer: ResidentLayer,
) {
let metadata = layer.metadata();
upload_queue upload_queue
.latest_files .latest_files
.insert(layer.layer_desc().filename(), metadata.clone()); .insert(layer_file_name.clone(), layer_metadata.clone());
upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1; upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
info!("scheduled layer file upload {layer}"); let op = UploadOp::UploadLayer(layer_file_name.clone(), layer_metadata.clone());
let op = UploadOp::UploadLayer(layer, metadata);
self.calls_unfinished_metric_begin(&op); self.calls_unfinished_metric_begin(&op);
upload_queue.queued_operations.push_back(op); upload_queue.queued_operations.push_back(op);
info!("scheduled layer file upload {layer_file_name}");
// Launch the task immediately, if possible
self.launch_queued_tasks(upload_queue);
Ok(())
} }
/// Launch a delete operation in the background. /// Launch a delete operation in the background.
/// ///
/// The operation does not modify local filesystem state. /// The operation does not modify local state but assumes the local files have already been
/// deleted, and is used to mirror those changes to remote.
/// ///
/// Note: This schedules an index file upload before the deletions. The /// Note: This schedules an index file upload before the deletions. The
/// deletion won't actually be performed, until all previously scheduled /// deletion won't actually be performed, until any previously scheduled
/// upload operations, and the index file upload, have completed /// upload operations, and the index file upload, have completed
/// successfully. /// successfully.
pub fn schedule_layer_file_deletion( pub fn schedule_layer_file_deletion(
self: &Arc<Self>, self: &Arc<Self>,
names: &[LayerFileName], names: Vec<LayerFileName>,
) -> anyhow::Result<()> { ) -> anyhow::Result<()> {
let mut guard = self.upload_queue.lock().unwrap(); let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?; let upload_queue = guard.initialized_mut()?;
let with_generations =
self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names.iter().cloned());
self.schedule_deletion_of_unlinked0(upload_queue, with_generations);
// Launch the tasks immediately, if possible
self.launch_queued_tasks(upload_queue);
Ok(())
}
/// Unlinks the layer files from `index_part.json` but does not yet schedule deletion for the
/// layer files, leaving them dangling.
///
/// The files will be leaked in remote storage unless [`Self::schedule_deletion_of_unlinked`]
/// is invoked on them.
pub(crate) fn schedule_gc_update(self: &Arc<Self>, gc_layers: &[Layer]) -> anyhow::Result<()> {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;
// just forget the return value; after uploading the next index_part.json, we can consider
// the layer files as "dangling". this is fine, at worst case we create work for the
// scrubber.
let names = gc_layers.iter().map(|x| x.layer_desc().filename());
self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names);
self.launch_queued_tasks(upload_queue);
Ok(())
}
/// Update the remote index file, removing the to-be-deleted files from the index,
/// allowing scheduling of actual deletions later.
fn schedule_unlinking_of_layers_from_index_part0<I>(
self: &Arc<Self>,
upload_queue: &mut UploadQueueInitialized,
names: I,
) -> Vec<(LayerFileName, Generation)>
where
I: IntoIterator<Item = LayerFileName>,
{
// Deleting layers doesn't affect the values stored in TimelineMetadata, // Deleting layers doesn't affect the values stored in TimelineMetadata,
// so we don't need update it. Just serialize it. // so we don't need update it. Just serialize it.
let metadata = upload_queue.latest_metadata.clone(); let metadata = upload_queue.latest_metadata.clone();
// Decorate our list of names with each name's generation, dropping // Update the remote index file, removing the to-be-deleted files from the index,
// names that are unexpectedly missing from our metadata. // before deleting the actual files.
let with_generations: Vec<_> = names //
.into_iter() // Once we start removing files from upload_queue.latest_files, there's
.filter_map(|name| { // no going back! Otherwise, some of the files would already be removed
let meta = upload_queue.latest_files.remove(&name); // from latest_files, but not yet scheduled for deletion. Use a closure
// to syntactically forbid ? or bail! calls here.
let no_bail_here = || {
// Decorate our list of names with each name's generation, dropping
// makes that are unexpectedly missing from our metadata.
let with_generations: Vec<_> = names
.into_iter()
.filter_map(|name| {
// Remove from latest_files, learning the file's remote generation in the process
let meta = upload_queue.latest_files.remove(&name);
if let Some(meta) = meta { if let Some(meta) = meta {
upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1; upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
Some((name, meta.generation)) Some((name, meta.generation))
} else { } else {
// This can only happen if we forgot to to schedule the file upload // This can only happen if we forgot to to schedule the file upload
// before scheduling the delete. Log it because it is a rare/strange // before scheduling the delete. Log it because it is a rare/strange
// situation, and in case something is misbehaving, we'd like to know which // situation, and in case something is misbehaving, we'd like to know which
// layers experienced this. // layers experienced this.
info!("Deleting layer {name} not found in latest_files list, never uploaded?"); info!(
None "Deleting layer {name} not found in latest_files list, never uploaded?"
} );
}) None
.collect(); }
})
.collect();
#[cfg(feature = "testing")] if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 {
for (name, gen) in &with_generations { self.schedule_index_upload(upload_queue, metadata);
if let Some(unexpected) = upload_queue.dangling_files.insert(name.to_owned(), *gen) {
if &unexpected == gen {
tracing::error!("{name} was unlinked twice with same generation");
} else {
tracing::error!("{name} was unlinked twice with different generations {gen:?} and {unexpected:?}");
}
} }
}
// after unlinking files from the upload_queue.latest_files we must always schedule an for (name, gen) in &with_generations {
// index_part update, because that needs to be uploaded before we can actually delete the info!("scheduling deletion of layer {}{}", name, gen.get_suffix());
// files.
if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 {
self.schedule_index_upload(upload_queue, metadata);
}
with_generations
}
/// Schedules deletion for layer files which have previously been unlinked from the
/// `index_part.json` with [`Self::schedule_gc_update`] or [`Self::schedule_compaction_update`].
pub(crate) fn schedule_deletion_of_unlinked(
self: &Arc<Self>,
layers: Vec<(LayerFileName, Generation)>,
) -> anyhow::Result<()> {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;
self.schedule_deletion_of_unlinked0(upload_queue, layers);
self.launch_queued_tasks(upload_queue);
Ok(())
}
fn schedule_deletion_of_unlinked0(
self: &Arc<Self>,
upload_queue: &mut UploadQueueInitialized,
with_generations: Vec<(LayerFileName, Generation)>,
) {
for (name, gen) in &with_generations {
info!("scheduling deletion of layer {}{}", name, gen.get_suffix());
}
#[cfg(feature = "testing")]
for (name, gen) in &with_generations {
match upload_queue.dangling_files.remove(name) {
Some(same) if &same == gen => { /* expected */ }
Some(other) => {
tracing::error!("{name} was unlinked with {other:?} but deleted with {gen:?}");
}
None => {
tracing::error!("{name} was unlinked but was not dangling");
}
} }
}
// schedule the actual deletions // schedule the actual deletions
let op = UploadOp::Delete(Delete { let op = UploadOp::Delete(Delete {
layers: with_generations, layers: with_generations,
}); });
self.calls_unfinished_metric_begin(&op); self.calls_unfinished_metric_begin(&op);
upload_queue.queued_operations.push_back(op); upload_queue.queued_operations.push_back(op);
}
/// Schedules a compaction update to the remote `index_part.json`.
///
/// `compacted_from` represent the L0 names which have been `compacted_to` L1 layers.
pub(crate) fn schedule_compaction_update(
self: &Arc<Self>,
compacted_from: &[Layer],
compacted_to: &[ResidentLayer],
) -> anyhow::Result<()> {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;
for layer in compacted_to {
self.schedule_layer_file_upload0(upload_queue, layer.clone());
}
let names = compacted_from.iter().map(|x| x.layer_desc().filename());
self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names);
self.launch_queued_tasks(upload_queue);
// Launch the tasks immediately, if possible
self.launch_queued_tasks(upload_queue);
};
no_bail_here();
Ok(()) Ok(())
} }
@@ -1177,12 +1093,16 @@ impl RemoteTimelineClient {
} }
let upload_result: anyhow::Result<()> = match &task.op { let upload_result: anyhow::Result<()> = match &task.op {
UploadOp::UploadLayer(ref layer, ref layer_metadata) => { UploadOp::UploadLayer(ref layer_file_name, ref layer_metadata) => {
let path = layer.local_path(); let path = self
.conf
.timeline_path(&self.tenant_id, &self.timeline_id)
.join(layer_file_name.file_name());
upload::upload_timeline_layer( upload::upload_timeline_layer(
self.conf, self.conf,
&self.storage_impl, &self.storage_impl,
path, &path,
layer_metadata, layer_metadata,
self.generation, self.generation,
) )
@@ -1456,8 +1376,6 @@ impl RemoteTimelineClient {
num_inprogress_deletions: 0, num_inprogress_deletions: 0,
inprogress_tasks: HashMap::default(), inprogress_tasks: HashMap::default(),
queued_operations: VecDeque::default(), queued_operations: VecDeque::default(),
#[cfg(feature = "testing")]
dangling_files: HashMap::default(),
}; };
let upload_queue = std::mem::replace( let upload_queue = std::mem::replace(
@@ -1501,6 +1419,13 @@ impl RemoteTimelineClient {
} }
} }
} }
pub(crate) fn get_layer_metadata(
&self,
name: &LayerFileName,
) -> anyhow::Result<Option<LayerFileMetadata>> {
self.upload_queue.lock().unwrap().get_layer_metadata(name)
}
} }
pub fn remote_timelines_path(tenant_id: &TenantId) -> RemotePath { pub fn remote_timelines_path(tenant_id: &TenantId) -> RemotePath {
@@ -1542,7 +1467,7 @@ pub fn remote_index_path(
} }
/// Given the key of an index, parse out the generation part of the name /// Given the key of an index, parse out the generation part of the name
pub fn parse_remote_index_path(path: RemotePath) -> Option<Generation> { pub(crate) fn parse_remote_index_path(path: RemotePath) -> Option<Generation> {
let file_name = match path.get_path().file_name() { let file_name = match path.get_path().file_name() {
Some(f) => f, Some(f) => f,
None => { None => {
@@ -1588,7 +1513,6 @@ mod tests {
context::RequestContext, context::RequestContext,
tenant::{ tenant::{
harness::{TenantHarness, TIMELINE_ID}, harness::{TenantHarness, TIMELINE_ID},
storage_layer::Layer,
Generation, Tenant, Timeline, Generation, Tenant, Timeline,
}, },
DEFAULT_PG_VERSION, DEFAULT_PG_VERSION,
@@ -1731,11 +1655,7 @@ mod tests {
let client = timeline.remote_client.as_ref().unwrap(); let client = timeline.remote_client.as_ref().unwrap();
// Download back the index.json, and check that the list of files is correct // Download back the index.json, and check that the list of files is correct
let initial_index_part = match client let initial_index_part = match client.download_index_file().await.unwrap() {
.download_index_file(CancellationToken::new())
.await
.unwrap()
{
MaybeDeletedIndexPart::IndexPart(index_part) => index_part, MaybeDeletedIndexPart::IndexPart(index_part) => index_part,
MaybeDeletedIndexPart::Deleted(_) => panic!("unexpectedly got deleted index part"), MaybeDeletedIndexPart::Deleted(_) => panic!("unexpectedly got deleted index part"),
}; };
@@ -1761,29 +1681,32 @@ mod tests {
let generation = harness.generation; let generation = harness.generation;
// Create a couple of dummy files, schedule upload for them // Create a couple of dummy files, schedule upload for them
let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
let layer_file_name_2: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D9-00000000016B5A52".parse().unwrap();
let layer_file_name_3: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59DA-00000000016B5A53".parse().unwrap();
let content_1 = dummy_contents("foo");
let content_2 = dummy_contents("bar");
let content_3 = dummy_contents("baz");
let layers = [ for (filename, content) in [
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), dummy_contents("foo")), (&layer_file_name_1, &content_1),
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D9-00000000016B5A52".parse().unwrap(), dummy_contents("bar")), (&layer_file_name_2, &content_2),
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59DA-00000000016B5A53".parse().unwrap(), dummy_contents("baz")) (&layer_file_name_3, &content_3),
] ] {
.into_iter() std::fs::write(timeline_path.join(filename.file_name()), content).unwrap();
.map(|(name, contents): (LayerFileName, Vec<u8>)| { }
std::fs::write(timeline_path.join(name.file_name()), &contents).unwrap();
Layer::for_resident(
harness.conf,
&timeline,
name,
LayerFileMetadata::new(contents.len() as u64, generation),
)
}).collect::<Vec<_>>();
client client
.schedule_layer_file_upload(layers[0].clone()) .schedule_layer_file_upload(
&layer_file_name_1,
&LayerFileMetadata::new(content_1.len() as u64, generation),
)
.unwrap(); .unwrap();
client client
.schedule_layer_file_upload(layers[1].clone()) .schedule_layer_file_upload(
&layer_file_name_2,
&LayerFileMetadata::new(content_2.len() as u64, generation),
)
.unwrap(); .unwrap();
// Check that they are started immediately, not queued // Check that they are started immediately, not queued
@@ -1824,11 +1747,7 @@ mod tests {
} }
// Download back the index.json, and check that the list of files is correct // Download back the index.json, and check that the list of files is correct
let index_part = match client let index_part = match client.download_index_file().await.unwrap() {
.download_index_file(CancellationToken::new())
.await
.unwrap()
{
MaybeDeletedIndexPart::IndexPart(index_part) => index_part, MaybeDeletedIndexPart::IndexPart(index_part) => index_part,
MaybeDeletedIndexPart::Deleted(_) => panic!("unexpectedly got deleted index part"), MaybeDeletedIndexPart::Deleted(_) => panic!("unexpectedly got deleted index part"),
}; };
@@ -1841,42 +1760,38 @@ mod tests {
.collect(), .collect(),
&[ &[
&initial_layer.file_name(), &initial_layer.file_name(),
&layers[0].layer_desc().filename().file_name(), &layer_file_name_1.file_name(),
&layers[1].layer_desc().filename().file_name(), &layer_file_name_2.file_name(),
], ],
); );
assert_eq!(index_part.metadata, metadata); assert_eq!(index_part.metadata, metadata);
// Schedule upload and then a deletion. Check that the deletion is queued // Schedule upload and then a deletion. Check that the deletion is queued
client client
.schedule_layer_file_upload(layers[2].clone()) .schedule_layer_file_upload(
&layer_file_name_3,
&LayerFileMetadata::new(content_3.len() as u64, generation),
)
.unwrap(); .unwrap();
// this is no longer consistent with how deletion works with Layer::drop, but in this test
// keep using schedule_layer_file_deletion because we don't have a way to wait for the
// spawn_blocking started by the drop.
client client
.schedule_layer_file_deletion(&[layers[0].layer_desc().filename()]) .schedule_layer_file_deletion([layer_file_name_1.clone()].to_vec())
.unwrap(); .unwrap();
{ {
let mut guard = client.upload_queue.lock().unwrap(); let mut guard = client.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut().unwrap(); let upload_queue = guard.initialized_mut().unwrap();
// Deletion schedules upload of the index file, and the file deletion itself // Deletion schedules upload of the index file, and the file deletion itself
assert_eq!(upload_queue.queued_operations.len(), 2); assert!(upload_queue.queued_operations.len() == 2);
assert_eq!(upload_queue.inprogress_tasks.len(), 1); assert!(upload_queue.inprogress_tasks.len() == 1);
assert_eq!(upload_queue.num_inprogress_layer_uploads, 1); assert!(upload_queue.num_inprogress_layer_uploads == 1);
assert_eq!(upload_queue.num_inprogress_deletions, 0); assert!(upload_queue.num_inprogress_deletions == 0);
assert_eq!( assert!(upload_queue.latest_files_changes_since_metadata_upload_scheduled == 0);
upload_queue.latest_files_changes_since_metadata_upload_scheduled,
0
);
} }
assert_remote_files( assert_remote_files(
&[ &[
&initial_layer.file_name(), &initial_layer.file_name(),
&layers[0].layer_desc().filename().file_name(), &layer_file_name_1.file_name(),
&layers[1].layer_desc().filename().file_name(), &layer_file_name_2.file_name(),
"index_part.json", "index_part.json",
], ],
&remote_timeline_dir, &remote_timeline_dir,
@@ -1890,8 +1805,8 @@ mod tests {
assert_remote_files( assert_remote_files(
&[ &[
&initial_layer.file_name(), &initial_layer.file_name(),
&layers[1].layer_desc().filename().file_name(), &layer_file_name_2.file_name(),
&layers[2].layer_desc().filename().file_name(), &layer_file_name_3.file_name(),
"index_part.json", "index_part.json",
], ],
&remote_timeline_dir, &remote_timeline_dir,
@@ -1920,13 +1835,6 @@ mod tests {
) )
.unwrap(); .unwrap();
let layer_file_1 = Layer::for_resident(
harness.conf,
&timeline,
layer_file_name_1.clone(),
LayerFileMetadata::new(content_1.len() as u64, harness.generation),
);
#[derive(Debug, PartialEq, Clone, Copy)] #[derive(Debug, PartialEq, Clone, Copy)]
struct BytesStartedFinished { struct BytesStartedFinished {
started: Option<usize>, started: Option<usize>,
@@ -1962,7 +1870,10 @@ mod tests {
let actual_a = get_bytes_started_stopped(); let actual_a = get_bytes_started_stopped();
client client
.schedule_layer_file_upload(layer_file_1.clone()) .schedule_layer_file_upload(
&layer_file_name_1,
&LayerFileMetadata::new(content_1.len() as u64, harness.generation),
)
.unwrap(); .unwrap();
let actual_b = get_bytes_started_stopped(); let actual_b = get_bytes_started_stopped();
@@ -2027,7 +1938,7 @@ mod tests {
let client = test_state.build_client(get_generation); let client = test_state.build_client(get_generation);
let download_r = client let download_r = client
.download_index_file(CancellationToken::new()) .download_index_file()
.await .await
.expect("download should always succeed"); .expect("download should always succeed");
assert!(matches!(download_r, MaybeDeletedIndexPart::IndexPart(_))); assert!(matches!(download_r, MaybeDeletedIndexPart::IndexPart(_)));

View File

@@ -18,8 +18,8 @@ use crate::config::PageServerConf;
use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path}; use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path};
use crate::tenant::storage_layer::LayerFileName; use crate::tenant::storage_layer::LayerFileName;
use crate::tenant::timeline::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::timeline::span::debug_assert_current_span_has_tenant_and_timeline_id;
use crate::tenant::Generation; use crate::tenant::{Generation, TENANT_DELETED_MARKER_FILE_NAME};
use remote_storage::{DownloadError, GenericRemoteStorage, ListingMode}; use remote_storage::{DownloadError, GenericRemoteStorage};
use utils::crashsafe::path_with_suffix_extension; use utils::crashsafe::path_with_suffix_extension;
use utils::id::{TenantId, TimelineId}; use utils::id::{TenantId, TimelineId};
@@ -170,43 +170,53 @@ pub fn is_temp_download_file(path: &Utf8Path) -> bool {
pub async fn list_remote_timelines( pub async fn list_remote_timelines(
storage: &GenericRemoteStorage, storage: &GenericRemoteStorage,
tenant_id: TenantId, tenant_id: TenantId,
cancel: CancellationToken, ) -> anyhow::Result<HashSet<TimelineId>> {
) -> anyhow::Result<(HashSet<TimelineId>, HashSet<String>)> {
let remote_path = remote_timelines_path(&tenant_id); let remote_path = remote_timelines_path(&tenant_id);
fail::fail_point!("storage-sync-list-remote-timelines", |_| { fail::fail_point!("storage-sync-list-remote-timelines", |_| {
anyhow::bail!("storage-sync-list-remote-timelines"); anyhow::bail!("storage-sync-list-remote-timelines");
}); });
let listing = download_retry_forever( let timelines = download_retry(
|| storage.list(Some(&remote_path), ListingMode::WithDelimiter), || storage.list_prefixes(Some(&remote_path)),
&format!("list timelines for {tenant_id}"), &format!("list prefixes for {tenant_id}"),
cancel,
) )
.await?; .await?;
let mut timeline_ids = HashSet::new(); if timelines.is_empty() {
let mut other_prefixes = HashSet::new(); anyhow::bail!("no timelines found on the remote storage")
}
let mut timeline_ids = HashSet::new();
for timeline_remote_storage_key in timelines {
if timeline_remote_storage_key.object_name() == Some(TENANT_DELETED_MARKER_FILE_NAME) {
// A `deleted` key within `timelines/` is a marker file, not a timeline. Ignore it.
// This code will be removed in https://github.com/neondatabase/neon/pull/5580
continue;
}
for timeline_remote_storage_key in listing.prefixes {
let object_name = timeline_remote_storage_key.object_name().ok_or_else(|| { let object_name = timeline_remote_storage_key.object_name().ok_or_else(|| {
anyhow::anyhow!("failed to get timeline id for remote tenant {tenant_id}") anyhow::anyhow!("failed to get timeline id for remote tenant {tenant_id}")
})?; })?;
match object_name.parse::<TimelineId>() { let timeline_id: TimelineId = object_name
Ok(t) => timeline_ids.insert(t), .parse()
Err(_) => other_prefixes.insert(object_name.to_string()), .with_context(|| format!("parse object name into timeline id '{object_name}'"))?;
};
// list_prefixes is assumed to return unique names. Ensure this here.
// NB: it's safer to bail out than warn-log this because the pageserver
// needs to absolutely know about _all_ timelines that exist, so that
// GC knows all the branchpoints. If we skipped over a timeline instead,
// GC could delete a layer that's still needed by that timeline.
anyhow::ensure!(
!timeline_ids.contains(&timeline_id),
"list_prefixes contains duplicate timeline id {timeline_id}"
);
timeline_ids.insert(timeline_id);
} }
for key in listing.keys { Ok(timeline_ids)
let object_name = key
.object_name()
.ok_or_else(|| anyhow::anyhow!("object name for key {key}"))?;
other_prefixes.insert(object_name.to_string());
}
Ok((timeline_ids, other_prefixes))
} }
async fn do_download_index_part( async fn do_download_index_part(
@@ -214,11 +224,10 @@ async fn do_download_index_part(
tenant_id: &TenantId, tenant_id: &TenantId,
timeline_id: &TimelineId, timeline_id: &TimelineId,
index_generation: Generation, index_generation: Generation,
cancel: CancellationToken,
) -> Result<IndexPart, DownloadError> { ) -> Result<IndexPart, DownloadError> {
let remote_path = remote_index_path(tenant_id, timeline_id, index_generation); let remote_path = remote_index_path(tenant_id, timeline_id, index_generation);
let index_part_bytes = download_retry_forever( let index_part_bytes = download_retry(
|| async { || async {
let mut index_part_download = storage.download(&remote_path).await?; let mut index_part_download = storage.download(&remote_path).await?;
@@ -233,7 +242,6 @@ async fn do_download_index_part(
Ok(index_part_bytes) Ok(index_part_bytes)
}, },
&format!("download {remote_path:?}"), &format!("download {remote_path:?}"),
cancel,
) )
.await?; .await?;
@@ -255,28 +263,19 @@ pub(super) async fn download_index_part(
tenant_id: &TenantId, tenant_id: &TenantId,
timeline_id: &TimelineId, timeline_id: &TimelineId,
my_generation: Generation, my_generation: Generation,
cancel: CancellationToken,
) -> Result<IndexPart, DownloadError> { ) -> Result<IndexPart, DownloadError> {
debug_assert_current_span_has_tenant_and_timeline_id(); debug_assert_current_span_has_tenant_and_timeline_id();
if my_generation.is_none() { if my_generation.is_none() {
// Operating without generations: just fetch the generation-less path // Operating without generations: just fetch the generation-less path
return do_download_index_part(storage, tenant_id, timeline_id, my_generation, cancel) return do_download_index_part(storage, tenant_id, timeline_id, my_generation).await;
.await;
} }
// Stale case: If we were intentionally attached in a stale generation, there may already be a remote // Stale case: If we were intentionally attached in a stale generation, there may already be a remote
// index in our generation. // index in our generation.
// //
// This is an optimization to avoid doing the listing for the general case below. // This is an optimization to avoid doing the listing for the general case below.
let res = do_download_index_part( let res = do_download_index_part(storage, tenant_id, timeline_id, my_generation).await;
storage,
tenant_id,
timeline_id,
my_generation,
cancel.clone(),
)
.await;
match res { match res {
Ok(index_part) => { Ok(index_part) => {
tracing::debug!( tracing::debug!(
@@ -296,14 +295,8 @@ pub(super) async fn download_index_part(
// we want to find the most recent index from a previous generation. // we want to find the most recent index from a previous generation.
// //
// This is an optimization to avoid doing the listing for the general case below. // This is an optimization to avoid doing the listing for the general case below.
let res = do_download_index_part( let res =
storage, do_download_index_part(storage, tenant_id, timeline_id, my_generation.previous()).await;
tenant_id,
timeline_id,
my_generation.previous(),
cancel.clone(),
)
.await;
match res { match res {
Ok(index_part) => { Ok(index_part) => {
tracing::debug!("Found index_part from previous generation"); tracing::debug!("Found index_part from previous generation");
@@ -347,14 +340,13 @@ pub(super) async fn download_index_part(
match max_previous_generation { match max_previous_generation {
Some(g) => { Some(g) => {
tracing::debug!("Found index_part in generation {g:?}"); tracing::debug!("Found index_part in generation {g:?}");
do_download_index_part(storage, tenant_id, timeline_id, g, cancel).await do_download_index_part(storage, tenant_id, timeline_id, g).await
} }
None => { None => {
// Migration from legacy pre-generation state: we have a generation but no prior // Migration from legacy pre-generation state: we have a generation but no prior
// attached pageservers did. Try to load from a no-generation path. // attached pageservers did. Try to load from a no-generation path.
tracing::info!("No index_part.json* found"); tracing::info!("No index_part.json* found");
do_download_index_part(storage, tenant_id, timeline_id, Generation::none(), cancel) do_download_index_part(storage, tenant_id, timeline_id, Generation::none()).await
.await
} }
} }
} }
@@ -384,23 +376,3 @@ where
) )
.await .await
} }
async fn download_retry_forever<T, O, F>(
op: O,
description: &str,
cancel: CancellationToken,
) -> Result<T, DownloadError>
where
O: FnMut() -> F,
F: Future<Output = Result<T, DownloadError>>,
{
backoff::retry(
op,
|e| matches!(e, DownloadError::BadInput(_) | DownloadError::NotFound),
FAILED_DOWNLOAD_WARN_THRESHOLD,
u32::MAX,
description,
backoff::Cancel::new(cancel, || DownloadError::Cancelled),
)
.await
}

Some files were not shown because too many files have changed in this diff Show More