Compare commits


3 Commits

Author SHA1 Message Date
John Spray
550eadc1d5 pageserver: cargo fmt
This is in a separate commit to make the previous ones more readable.
2023-08-03 14:12:46 +01:00
John Spray
db0deb8457 pageserver: respect task_mgr cancellation in metrics task
This previously relied on seeing a channel close when the
Tenant was destroyed: the task ran beyond Tenant::shutdown,
whereas the idea of that shutdown function is that all
per-tenant background tasks are joined by the time it completes.

Instead, stop this task as soon as background tasks for the
Tenant are cancelled, making the behavior of shutdown() much
more obvious and bringing the task into line with how we
do shutdown in other background tasks.
2023-08-03 10:01:12 +01:00
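A minimal sketch of the pattern this commit describes, using `tokio_util::sync::CancellationToken` directly. The pageserver's actual `task_mgr` API differs; the names here are illustrative only.

```rust
use tokio_util::sync::CancellationToken;

// A background loop that exits as soon as the per-tenant cancellation
// token fires, instead of waiting to observe a channel close.
async fn metrics_task(cancel: CancellationToken) {
    let mut ticker = tokio::time::interval(std::time::Duration::from_secs(60));
    loop {
        tokio::select! {
            // Stopping here is what lets Tenant::shutdown join the task
            // promptly once background tasks are cancelled.
            _ = cancel.cancelled() => break,
            _ = ticker.tick() => {
                // collect and publish metrics here
            }
        }
    }
}
```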
John Spray
a7ad080961 pageserver: remove bare tokio::spawn
Commit ddb9c2fe9 added this task, which is launched
with tokio::spawn rather than task_mgr::spawn, and is
not wrapped in a tracing Instrumented<>.

While the task doesn't overtly do any logging, for
consistency we should spawn all our tasks via the wrapper: in
future this task might be extended to e.g. emit log lines that
we would expect to have the proper tracing spans.
2023-08-03 09:53:38 +01:00
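A hypothetical sketch of the distinction this commit cares about: a bare `tokio::spawn` runs its future with no tracing span attached, while a wrapper can instrument it so that any log lines added later are attributed correctly. `spawn_instrumented` below is illustrative, not the real `task_mgr::spawn` signature.

```rust
use tracing::{info_span, Instrument};

// Wrap the future in a span before spawning; a bare tokio::spawn(fut)
// would run it without any tenant context on its log lines.
fn spawn_instrumented<F>(tenant_id: &str, fut: F)
where
    F: std::future::Future<Output = ()> + Send + 'static,
{
    tokio::spawn(fut.instrument(info_span!("background_task", %tenant_id)));
}
```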
65 changed files with 1111 additions and 1821 deletions

View File

@@ -794,7 +794,7 @@ jobs:
run:
shell: sh -eu {0}
env:
VM_BUILDER_VERSION: v0.15.0-alpha1
VM_BUILDER_VERSION: v0.13.1
steps:
- name: Checkout

Cargo.lock generated
View File

@@ -3253,7 +3253,6 @@ dependencies = [
"metrics",
"once_cell",
"pin-project-lite",
"scopeguard",
"serde",
"serde_json",
"tempfile",

View File

@@ -29,13 +29,13 @@ See developer documentation in [SUMMARY.md](/docs/SUMMARY.md) for more informati
```bash
apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libseccomp-dev \
libssl-dev clang pkg-config libpq-dev cmake postgresql-client protobuf-compiler \
libcurl4-openssl-dev openssl python-poetry
libcurl4-openssl-dev
```
* On Fedora, these packages are needed:
```bash
dnf install flex bison readline-devel zlib-devel openssl-devel \
libseccomp-devel perl clang cmake postgresql postgresql-contrib protobuf-compiler \
protobuf-devel libcurl-devel openssl poetry
protobuf-devel libcurl-devel
```
* On Arch based systems, these packages are needed:
```bash
@@ -235,13 +235,6 @@ CARGO_BUILD_FLAGS="--features=testing" make
./scripts/pytest
```
By default, this runs both debug and release modes, and all supported postgres versions. When
testing locally, it is convenient to run just one set of permutations, like this:
```sh
DEFAULT_PG_VERSION=15 BUILD_TYPE=release ./scripts/pytest
```
## Documentation
[docs](/docs) Contains a top-level overview of all available markdown documentation.

View File

@@ -20,7 +20,6 @@ tokio = { workspace = true, features = ["sync", "fs", "io-util"] }
tokio-util.workspace = true
toml_edit.workspace = true
tracing.workspace = true
scopeguard.workspace = true
metrics.workspace = true
utils.workspace = true
pin-project-lite.workspace = true

View File

@@ -10,7 +10,6 @@ use anyhow::Context;
use aws_config::{
environment::credentials::EnvironmentVariableCredentialsProvider,
imds::credentials::ImdsCredentialsProvider, meta::credentials::CredentialsProviderChain,
provider_config::ProviderConfig, web_identity_token::WebIdentityTokenCredentialsProvider,
};
use aws_credential_types::cache::CredentialsCache;
use aws_sdk_s3::{
@@ -23,7 +22,6 @@ use aws_sdk_s3::{
};
use aws_smithy_http::body::SdkBody;
use hyper::Body;
use scopeguard::ScopeGuard;
use tokio::{
io::{self, AsyncRead},
sync::Semaphore,
@@ -38,9 +36,82 @@ use crate::{
const MAX_DELETE_OBJECTS_REQUEST_SIZE: usize = 1000;
pub(super) mod metrics;
pub(super) mod metrics {
use metrics::{register_int_counter_vec, IntCounterVec};
use once_cell::sync::Lazy;
use self::metrics::{AttemptOutcome, RequestKind};
static S3_REQUESTS_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"remote_storage_s3_requests_count",
"Number of s3 requests of particular type",
&["request_type"],
)
.expect("failed to define a metric")
});
static S3_REQUESTS_FAIL_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"remote_storage_s3_failures_count",
"Number of failed s3 requests of particular type",
&["request_type"],
)
.expect("failed to define a metric")
});
pub fn inc_get_object() {
S3_REQUESTS_COUNT.with_label_values(&["get_object"]).inc();
}
pub fn inc_get_object_fail() {
S3_REQUESTS_FAIL_COUNT
.with_label_values(&["get_object"])
.inc();
}
pub fn inc_put_object() {
S3_REQUESTS_COUNT.with_label_values(&["put_object"]).inc();
}
pub fn inc_put_object_fail() {
S3_REQUESTS_FAIL_COUNT
.with_label_values(&["put_object"])
.inc();
}
pub fn inc_delete_object() {
S3_REQUESTS_COUNT
.with_label_values(&["delete_object"])
.inc();
}
pub fn inc_delete_objects(count: u64) {
S3_REQUESTS_COUNT
.with_label_values(&["delete_object"])
.inc_by(count);
}
pub fn inc_delete_object_fail() {
S3_REQUESTS_FAIL_COUNT
.with_label_values(&["delete_object"])
.inc();
}
pub fn inc_delete_objects_fail(count: u64) {
S3_REQUESTS_FAIL_COUNT
.with_label_values(&["delete_object"])
.inc_by(count);
}
pub fn inc_list_objects() {
S3_REQUESTS_COUNT.with_label_values(&["list_objects"]).inc();
}
pub fn inc_list_objects_fail() {
S3_REQUESTS_FAIL_COUNT
.with_label_values(&["list_objects"])
.inc();
}
}
/// AWS S3 storage.
pub struct S3Bucket {
@@ -68,29 +139,18 @@ impl S3Bucket {
aws_config.bucket_name
);
let region = Some(Region::new(aws_config.bucket_region.clone()));
let credentials_provider = {
// uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"
CredentialsProviderChain::first_try(
"env",
EnvironmentVariableCredentialsProvider::new(),
)
// uses "AWS_WEB_IDENTITY_TOKEN_FILE", "AWS_ROLE_ARN", "AWS_ROLE_SESSION_NAME"
// needed to access remote extensions bucket
.or_else("token", {
let provider_conf = ProviderConfig::without_region().with_region(region.clone());
WebIdentityTokenCredentialsProvider::builder()
.configure(&provider_conf)
.build()
})
// uses imds v2
.or_else("imds", ImdsCredentialsProvider::builder().build())
};
let mut config_builder = Config::builder()
.region(region)
.region(Region::new(aws_config.bucket_region.clone()))
.credentials_cache(CredentialsCache::lazy())
.credentials_provider(credentials_provider);
@@ -153,46 +213,17 @@ impl S3Bucket {
}
}
async fn permit(&self, kind: RequestKind) -> tokio::sync::SemaphorePermit<'_> {
let started_at = start_counting_cancelled_wait(kind);
let permit = self
.concurrency_limiter
.acquire()
.await
.expect("semaphore is never closed");
let started_at = ScopeGuard::into_inner(started_at);
metrics::BUCKET_METRICS
.wait_seconds
.observe_elapsed(kind, started_at);
permit
}
async fn owned_permit(&self, kind: RequestKind) -> tokio::sync::OwnedSemaphorePermit {
let started_at = start_counting_cancelled_wait(kind);
async fn download_object(&self, request: GetObjectRequest) -> Result<Download, DownloadError> {
let permit = self
.concurrency_limiter
.clone()
.acquire_owned()
.await
.expect("semaphore is never closed");
let started_at = ScopeGuard::into_inner(started_at);
metrics::BUCKET_METRICS
.wait_seconds
.observe_elapsed(kind, started_at);
permit
}
async fn download_object(&self, request: GetObjectRequest) -> Result<Download, DownloadError> {
let kind = RequestKind::Get;
let permit = self.owned_permit(kind).await;
.context("Concurrency limiter semaphore got closed during S3 download")
.map_err(DownloadError::Other)?;
metrics::inc_get_object();
let started_at = start_measuring_requests(kind);
let get_object = self
.client
.get_object()
@@ -202,34 +233,26 @@ impl S3Bucket {
.send()
.await;
let started_at = ScopeGuard::into_inner(started_at);
if get_object.is_err() {
metrics::inc_get_object_fail();
metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
kind,
AttemptOutcome::Err,
started_at,
);
}
match get_object {
Ok(object_output) => {
let metadata = object_output.metadata().cloned().map(StorageMetadata);
Ok(Download {
metadata,
download_stream: Box::pin(io::BufReader::new(TimedDownload::new(
started_at,
RatelimitedAsyncRead::new(permit, object_output.body.into_async_read()),
download_stream: Box::pin(io::BufReader::new(RatelimitedAsyncRead::new(
permit,
object_output.body.into_async_read(),
))),
})
}
Err(SdkError::ServiceError(e)) if matches!(e.err(), GetObjectError::NoSuchKey(_)) => {
Err(DownloadError::NotFound)
}
Err(e) => Err(DownloadError::Other(
anyhow::Error::new(e).context("download s3 object"),
)),
Err(e) => {
metrics::inc_get_object_fail();
Err(DownloadError::Other(anyhow::anyhow!(
"Failed to download S3 object: {e}"
)))
}
}
}
}
@@ -260,54 +283,6 @@ impl<S: AsyncRead> AsyncRead for RatelimitedAsyncRead<S> {
}
}
pin_project_lite::pin_project! {
/// Times and tracks the outcome of the request.
struct TimedDownload<S> {
started_at: std::time::Instant,
outcome: metrics::AttemptOutcome,
#[pin]
inner: S
}
impl<S> PinnedDrop for TimedDownload<S> {
fn drop(mut this: Pin<&mut Self>) {
metrics::BUCKET_METRICS.req_seconds.observe_elapsed(RequestKind::Get, this.outcome, this.started_at);
}
}
}
impl<S: AsyncRead> TimedDownload<S> {
fn new(started_at: std::time::Instant, inner: S) -> Self {
TimedDownload {
started_at,
outcome: metrics::AttemptOutcome::Cancelled,
inner,
}
}
}
impl<S: AsyncRead> AsyncRead for TimedDownload<S> {
fn poll_read(
self: std::pin::Pin<&mut Self>,
cx: &mut std::task::Context<'_>,
buf: &mut io::ReadBuf<'_>,
) -> std::task::Poll<std::io::Result<()>> {
let this = self.project();
let before = buf.filled().len();
let read = std::task::ready!(this.inner.poll_read(cx, buf));
let read_eof = buf.filled().len() == before;
match read {
Ok(()) if read_eof => *this.outcome = AttemptOutcome::Ok,
Ok(()) => { /* still in progress */ }
Err(_) => *this.outcome = AttemptOutcome::Err,
}
std::task::Poll::Ready(read)
}
}
#[async_trait::async_trait]
impl RemoteStorage for S3Bucket {
/// See the doc for `RemoteStorage::list_prefixes`
@@ -316,8 +291,6 @@ impl RemoteStorage for S3Bucket {
&self,
prefix: Option<&RemotePath>,
) -> Result<Vec<RemotePath>, DownloadError> {
let kind = RequestKind::List;
// get the passed prefix or if it is not set use prefix_in_bucket value
let list_prefix = prefix
.map(|p| self.relative_path_to_s3_object(p))
@@ -334,11 +307,15 @@ impl RemoteStorage for S3Bucket {
let mut document_keys = Vec::new();
let mut continuation_token = None;
loop {
let _guard = self.permit(kind).await;
let _guard = self
.concurrency_limiter
.acquire()
.await
.context("Concurrency limiter semaphore got closed during S3 list")
.map_err(DownloadError::Other)?;
metrics::inc_list_objects();
let started_at = start_measuring_requests(kind);
let fetch_response = self
.client
@@ -355,15 +332,7 @@ impl RemoteStorage for S3Bucket {
e
})
.context("Failed to list S3 prefixes")
.map_err(DownloadError::Other);
let started_at = ScopeGuard::into_inner(started_at);
metrics::BUCKET_METRICS
.req_seconds
.observe_elapsed(kind, &fetch_response, started_at);
let fetch_response = fetch_response?;
.map_err(DownloadError::Other)?;
document_keys.extend(
fetch_response
@@ -373,10 +342,10 @@ impl RemoteStorage for S3Bucket {
.filter_map(|o| Some(self.s3_object_to_relative_path(o.prefix()?))),
);
continuation_token = match fetch_response.next_continuation_token {
Some(new_token) => Some(new_token),
match fetch_response.next_continuation_token {
Some(new_token) => continuation_token = Some(new_token),
None => break,
};
}
}
Ok(document_keys)
@@ -384,8 +353,6 @@ impl RemoteStorage for S3Bucket {
/// See the doc for `RemoteStorage::list_files`
async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
let kind = RequestKind::List;
let folder_name = folder
.map(|p| self.relative_path_to_s3_object(p))
.or_else(|| self.prefix_in_bucket.clone());
@@ -394,9 +361,12 @@ impl RemoteStorage for S3Bucket {
let mut continuation_token = None;
let mut all_files = vec![];
loop {
let _guard = self.permit(kind).await;
let _guard = self
.concurrency_limiter
.acquire()
.await
.context("Concurrency limiter semaphore got closed during S3 list_files")?;
metrics::inc_list_objects();
let started_at = start_measuring_requests(kind);
let response = self
.client
@@ -411,14 +381,7 @@ impl RemoteStorage for S3Bucket {
metrics::inc_list_objects_fail();
e
})
.context("Failed to list files in S3 bucket");
let started_at = ScopeGuard::into_inner(started_at);
metrics::BUCKET_METRICS
.req_seconds
.observe_elapsed(kind, &response, started_at);
let response = response?;
.context("Failed to list files in S3 bucket")?;
for object in response.contents().unwrap_or_default() {
let object_path = object.key().expect("response does not contain a key");
@@ -440,17 +403,18 @@ impl RemoteStorage for S3Bucket {
to: &RemotePath,
metadata: Option<StorageMetadata>,
) -> anyhow::Result<()> {
let kind = RequestKind::Put;
let _guard = self.permit(kind).await;
let _guard = self
.concurrency_limiter
.acquire()
.await
.context("Concurrency limiter semaphore got closed during S3 upload")?;
metrics::inc_put_object();
let started_at = start_measuring_requests(kind);
let body = Body::wrap_stream(ReaderStream::new(from));
let bytes_stream = ByteStream::new(SdkBody::from(body));
let res = self
.client
self.client
.put_object()
.bucket(self.bucket_name.clone())
.key(self.relative_path_to_s3_object(to))
@@ -462,15 +426,7 @@ impl RemoteStorage for S3Bucket {
.map_err(|e| {
metrics::inc_put_object_fail();
e
});
let started_at = ScopeGuard::into_inner(started_at);
metrics::BUCKET_METRICS
.req_seconds
.observe_elapsed(kind, &res, started_at);
res?;
})?;
Ok(())
}
@@ -507,8 +463,11 @@ impl RemoteStorage for S3Bucket {
.await
}
async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
let kind = RequestKind::Delete;
let _guard = self.permit(kind).await;
let _guard = self
.concurrency_limiter
.acquire()
.await
.context("Concurrency limiter semaphore got closed during S3 delete")?;
let mut delete_objects = Vec::with_capacity(paths.len());
for path in paths {
@@ -520,7 +479,6 @@ impl RemoteStorage for S3Bucket {
for chunk in delete_objects.chunks(MAX_DELETE_OBJECTS_REQUEST_SIZE) {
metrics::inc_delete_objects(chunk.len() as u64);
let started_at = start_measuring_requests(kind);
let resp = self
.client
@@ -530,11 +488,6 @@ impl RemoteStorage for S3Bucket {
.send()
.await;
let started_at = ScopeGuard::into_inner(started_at);
metrics::BUCKET_METRICS
.req_seconds
.observe_elapsed(kind, &resp, started_at);
match resp {
Ok(resp) => {
if let Some(errors) = resp.errors {
@@ -555,14 +508,15 @@ impl RemoteStorage for S3Bucket {
}
async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
let kind = RequestKind::Delete;
let _guard = self.permit(kind).await;
let _guard = self
.concurrency_limiter
.acquire()
.await
.context("Concurrency limiter semaphore got closed during S3 delete")?;
metrics::inc_delete_object();
let started_at = start_measuring_requests(kind);
let res = self
.client
self.client
.delete_object()
.bucket(self.bucket_name.clone())
.key(self.relative_path_to_s3_object(path))
@@ -571,41 +525,11 @@ impl RemoteStorage for S3Bucket {
.map_err(|e| {
metrics::inc_delete_object_fail();
e
});
let started_at = ScopeGuard::into_inner(started_at);
metrics::BUCKET_METRICS
.req_seconds
.observe_elapsed(kind, &res, started_at);
res?;
})?;
Ok(())
}
}
/// On drop (cancellation) count towards [`metrics::BucketMetrics::cancelled_waits`].
fn start_counting_cancelled_wait(
kind: RequestKind,
) -> ScopeGuard<std::time::Instant, impl FnOnce(std::time::Instant), scopeguard::OnSuccess> {
scopeguard::guard_on_success(std::time::Instant::now(), move |_| {
metrics::BUCKET_METRICS.cancelled_waits.get(kind).inc()
})
}
/// On drop (cancellation) add time to [`metrics::BucketMetrics::req_seconds`].
fn start_measuring_requests(
kind: RequestKind,
) -> ScopeGuard<std::time::Instant, impl FnOnce(std::time::Instant), scopeguard::OnSuccess> {
scopeguard::guard_on_success(std::time::Instant::now(), move |started_at| {
metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
kind,
AttemptOutcome::Cancelled,
started_at,
)
})
}
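// Editor's sketch (not part of the diff): the scopeguard idiom above relies
// on cancellation dropping the future without unwinding, so the `OnSuccess`
// closure fires on drop unless the guard was defused first. With hypothetical
// helpers `record_cancellation`/`record_duration`, the shape is:
//
//     let t = scopeguard::guard_on_success(std::time::Instant::now(), |t| {
//         record_cancellation(t.elapsed()); // runs only if dropped before defusing
//     });
//     do_request().await;                            // future may be dropped here
//     let t = scopeguard::ScopeGuard::into_inner(t); // defuse on completion
//     record_duration(t.elapsed());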
#[cfg(test)]
mod tests {
use std::num::NonZeroUsize;

View File

@@ -1,243 +0,0 @@
use metrics::{register_histogram_vec, register_int_counter_vec, Histogram, IntCounter};
use once_cell::sync::Lazy;
pub(super) static BUCKET_METRICS: Lazy<BucketMetrics> = Lazy::new(Default::default);
#[derive(Clone, Copy, Debug)]
pub(super) enum RequestKind {
Get = 0,
Put = 1,
Delete = 2,
List = 3,
}
use RequestKind::*;
impl RequestKind {
const fn as_str(&self) -> &'static str {
match self {
Get => "get_object",
Put => "put_object",
Delete => "delete_object",
List => "list_objects",
}
}
const fn as_index(&self) -> usize {
*self as usize
}
}
pub(super) struct RequestTyped<C>([C; 4]);
impl<C> RequestTyped<C> {
pub(super) fn get(&self, kind: RequestKind) -> &C {
&self.0[kind.as_index()]
}
fn build_with(mut f: impl FnMut(RequestKind) -> C) -> Self {
use RequestKind::*;
let mut it = [Get, Put, Delete, List].into_iter();
let arr = std::array::from_fn::<C, 4, _>(|index| {
let next = it.next().unwrap();
assert_eq!(index, next.as_index());
f(next)
});
if let Some(next) = it.next() {
panic!("unexpected {next:?}");
}
RequestTyped(arr)
}
}
impl RequestTyped<Histogram> {
pub(super) fn observe_elapsed(&self, kind: RequestKind, started_at: std::time::Instant) {
self.get(kind).observe(started_at.elapsed().as_secs_f64())
}
}
pub(super) struct PassFailCancelledRequestTyped<C> {
success: RequestTyped<C>,
fail: RequestTyped<C>,
cancelled: RequestTyped<C>,
}
#[derive(Debug, Clone, Copy)]
pub(super) enum AttemptOutcome {
Ok,
Err,
Cancelled,
}
impl<T, E> From<&Result<T, E>> for AttemptOutcome {
fn from(value: &Result<T, E>) -> Self {
match value {
Ok(_) => AttemptOutcome::Ok,
Err(_) => AttemptOutcome::Err,
}
}
}
impl AttemptOutcome {
pub(super) fn as_str(&self) -> &'static str {
match self {
AttemptOutcome::Ok => "ok",
AttemptOutcome::Err => "err",
AttemptOutcome::Cancelled => "cancelled",
}
}
}
impl<C> PassFailCancelledRequestTyped<C> {
pub(super) fn get(&self, kind: RequestKind, outcome: AttemptOutcome) -> &C {
let target = match outcome {
AttemptOutcome::Ok => &self.success,
AttemptOutcome::Err => &self.fail,
AttemptOutcome::Cancelled => &self.cancelled,
};
target.get(kind)
}
fn build_with(mut f: impl FnMut(RequestKind, AttemptOutcome) -> C) -> Self {
let success = RequestTyped::build_with(|kind| f(kind, AttemptOutcome::Ok));
let fail = RequestTyped::build_with(|kind| f(kind, AttemptOutcome::Err));
let cancelled = RequestTyped::build_with(|kind| f(kind, AttemptOutcome::Cancelled));
PassFailCancelledRequestTyped {
success,
fail,
cancelled,
}
}
}
impl PassFailCancelledRequestTyped<Histogram> {
pub(super) fn observe_elapsed(
&self,
kind: RequestKind,
outcome: impl Into<AttemptOutcome>,
started_at: std::time::Instant,
) {
self.get(kind, outcome.into())
.observe(started_at.elapsed().as_secs_f64())
}
}
pub(super) struct BucketMetrics {
/// Total requests attempted
// TODO: remove after next release and migrate dashboards to `sum by (result) (remote_storage_s3_requests_count)`
requests: RequestTyped<IntCounter>,
/// Subset of attempted requests failed
// TODO: remove after next release and migrate dashboards to `remote_storage_s3_requests_count{result="err"}`
failed: RequestTyped<IntCounter>,
pub(super) req_seconds: PassFailCancelledRequestTyped<Histogram>,
pub(super) wait_seconds: RequestTyped<Histogram>,
/// Track how many semaphore awaits were cancelled per request type.
///
/// This is in case cancellations are happening more than expected.
pub(super) cancelled_waits: RequestTyped<IntCounter>,
}
impl Default for BucketMetrics {
fn default() -> Self {
let requests = register_int_counter_vec!(
"remote_storage_s3_requests_count",
"Number of s3 requests of particular type",
&["request_type"],
)
.expect("failed to define a metric");
let requests =
RequestTyped::build_with(|kind| requests.with_label_values(&[kind.as_str()]));
let failed = register_int_counter_vec!(
"remote_storage_s3_failures_count",
"Number of failed s3 requests of particular type",
&["request_type"],
)
.expect("failed to define a metric");
let failed = RequestTyped::build_with(|kind| failed.with_label_values(&[kind.as_str()]));
let buckets = [0.01, 0.10, 0.5, 1.0, 5.0, 10.0, 50.0, 100.0];
let req_seconds = register_histogram_vec!(
"remote_storage_s3_request_seconds",
"Seconds to complete a request",
&["request_type", "result"],
buckets.to_vec(),
)
.unwrap();
let req_seconds = PassFailCancelledRequestTyped::build_with(|kind, outcome| {
req_seconds.with_label_values(&[kind.as_str(), outcome.as_str()])
});
let wait_seconds = register_histogram_vec!(
"remote_storage_s3_wait_seconds",
"Seconds rate limited",
&["request_type"],
buckets.to_vec(),
)
.unwrap();
let wait_seconds =
RequestTyped::build_with(|kind| wait_seconds.with_label_values(&[kind.as_str()]));
let cancelled_waits = register_int_counter_vec!(
"remote_storage_s3_cancelled_waits_total",
"Times a semaphore wait has been cancelled per request type",
&["request_type"],
)
.unwrap();
let cancelled_waits =
RequestTyped::build_with(|kind| cancelled_waits.with_label_values(&[kind.as_str()]));
Self {
requests,
failed,
req_seconds,
wait_seconds,
cancelled_waits,
}
}
}
pub fn inc_get_object() {
BUCKET_METRICS.requests.get(Get).inc()
}
pub fn inc_get_object_fail() {
BUCKET_METRICS.failed.get(Get).inc()
}
pub fn inc_put_object() {
BUCKET_METRICS.requests.get(Put).inc()
}
pub fn inc_put_object_fail() {
BUCKET_METRICS.failed.get(Put).inc()
}
pub fn inc_delete_object() {
BUCKET_METRICS.requests.get(Delete).inc()
}
pub fn inc_delete_objects(count: u64) {
BUCKET_METRICS.requests.get(Delete).inc_by(count)
}
pub fn inc_delete_object_fail() {
BUCKET_METRICS.failed.get(Delete).inc()
}
pub fn inc_delete_objects_fail(count: u64) {
BUCKET_METRICS.failed.get(Delete).inc_by(count)
}
pub fn inc_list_objects() {
BUCKET_METRICS.requests.get(List).inc()
}
pub fn inc_list_objects_fail() {
BUCKET_METRICS.failed.get(List).inc()
}

View File

@@ -23,7 +23,6 @@
//! <https://grafana.com/tutorials/build-a-panel-plugin/>
use anyhow::Result;
use pageserver::repository::Key;
use pageserver::METADATA_FILE_NAME;
use std::cmp::Ordering;
use std::io::{self, BufRead};
use std::path::PathBuf;
@@ -72,10 +71,6 @@ pub fn main() -> Result<()> {
let line = PathBuf::from_str(&line).unwrap();
let filename = line.file_name().unwrap();
let filename = filename.to_str().unwrap();
if filename == METADATA_FILE_NAME {
// Don't try and parse "metadata" like a key-lsn range
continue;
}
let range = parse_filename(filename);
ranges.push(range);
}

View File

@@ -107,25 +107,23 @@ async fn get_holes(path: &Path, max_holes: usize) -> Result<Vec<Hole>> {
// min-heap (reserve space for one more element added before eviction)
let mut heap: BinaryHeap<Hole> = BinaryHeap::with_capacity(max_holes + 1);
let mut prev_key: Option<Key> = None;
tree_reader
.visit(
&[0u8; DELTA_KEY_SIZE],
VisitDirection::Forwards,
|key, _value| {
let curr = Key::from_slice(&key[..KEY_SIZE]);
if let Some(prev) = prev_key {
if curr.to_i128() - prev.to_i128() >= MIN_HOLE_LENGTH {
heap.push(Hole(prev..curr));
if heap.len() > max_holes {
heap.pop(); // remove smallest hole
}
tree_reader.visit(
&[0u8; DELTA_KEY_SIZE],
VisitDirection::Forwards,
|key, _value| {
let curr = Key::from_slice(&key[..KEY_SIZE]);
if let Some(prev) = prev_key {
if curr.to_i128() - prev.to_i128() >= MIN_HOLE_LENGTH {
heap.push(Hole(prev..curr));
if heap.len() > max_holes {
heap.pop(); // remove smallest hole
}
}
prev_key = Some(curr.next());
true
},
)
.await?;
}
prev_key = Some(curr.next());
true
},
)?;
let mut holes = heap.into_vec();
holes.sort_by_key(|hole| hole.0.start);
Ok(holes)

View File

@@ -59,17 +59,15 @@ async fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
);
// TODO(chi): dedup w/ `delta_layer.rs` by exposing the API.
let mut all = vec![];
tree_reader
.visit(
&[0u8; DELTA_KEY_SIZE],
VisitDirection::Forwards,
|key, value_offset| {
let curr = Key::from_slice(&key[..KEY_SIZE]);
all.push((curr, BlobRef(value_offset)));
true
},
)
.await?;
tree_reader.visit(
&[0u8; DELTA_KEY_SIZE],
VisitDirection::Forwards,
|key, value_offset| {
let curr = Key::from_slice(&key[..KEY_SIZE]);
all.push((curr, BlobRef(value_offset)));
true
},
)?;
let cursor = BlockCursor::new(&file);
for (k, v) in all {
let value = cursor.read_blob(v.pos())?;

View File

@@ -9,10 +9,8 @@ use clap::{Arg, ArgAction, Command};
use fail::FailScenario;
use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp};
use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
use pageserver::task_mgr::WALRECEIVER_RUNTIME;
use remote_storage::GenericRemoteStorage;
use tokio::time::Instant;
use tracing::*;
use metrics::set_build_info_metric;
@@ -40,6 +38,8 @@ const PID_FILE_NAME: &str = "pageserver.pid";
const FEATURES: &[&str] = &[
#[cfg(feature = "testing")]
"testing",
#[cfg(feature = "fail/failpoints")]
"fail/failpoints",
];
fn version() -> String {
@@ -226,19 +226,6 @@ fn start_pageserver(
launch_ts: &'static LaunchTimestamp,
conf: &'static PageServerConf,
) -> anyhow::Result<()> {
// Monotonic time for later calculating startup duration
let started_startup_at = Instant::now();
let startup_checkpoint = move |phase: &str, human_phase: &str| {
let elapsed = started_startup_at.elapsed();
let secs = elapsed.as_secs_f64();
STARTUP_DURATION.with_label_values(&[phase]).set(secs);
info!(
elapsed_ms = elapsed.as_millis(),
"{human_phase} ({secs:.3}s since start)"
)
};
// Print version and launch timestamp to the log,
// and expose them as prometheus metrics.
// A changed version string indicates changed software.
@@ -348,11 +335,6 @@ fn start_pageserver(
// Set up remote storage client
let remote_storage = create_remote_storage_client(conf)?;
// Up to this point no significant I/O has been done: this should have been fast. Record
// duration prior to starting I/O intensive phase of startup.
startup_checkpoint("initial", "Starting loading tenants");
STARTUP_IS_LOADING.set(1);
// Startup staging or optimizing:
//
// We want to minimize downtime for `page_service` connections, and trying not to overload
@@ -378,6 +360,7 @@ fn start_pageserver(
};
// Scan the local 'tenants/' directory and start loading the tenants
let init_started_at = std::time::Instant::now();
let shutdown_pageserver = tokio_util::sync::CancellationToken::new();
BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
@@ -395,13 +378,18 @@ fn start_pageserver(
let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial load completed"));
init_done_rx.wait().await;
startup_checkpoint("initial_tenant_load", "Initial load completed");
STARTUP_IS_LOADING.set(0);
// initial logical sizes can now start, as they were waiting on init_done_rx.
scopeguard::ScopeGuard::into_inner(guard);
let init_done = std::time::Instant::now();
let elapsed = init_done - init_started_at;
tracing::info!(
elapsed_millis = elapsed.as_millis(),
"Initial load completed"
);
let mut init_sizes_done = std::pin::pin!(init_logical_size_done_rx.wait());
let timeout = conf.background_task_maximum_delay;
@@ -410,7 +398,12 @@ fn start_pageserver(
let init_sizes_done = match tokio::time::timeout(timeout, &mut init_sizes_done).await {
Ok(_) => {
startup_checkpoint("initial_logical_sizes", "Initial logical sizes completed");
let now = std::time::Instant::now();
tracing::info!(
from_init_done_millis = (now - init_done).as_millis(),
from_init_millis = (now - init_started_at).as_millis(),
"Initial logical sizes completed"
);
None
}
Err(_) => {
@@ -426,7 +419,6 @@ fn start_pageserver(
// allow background jobs to start
drop(background_jobs_can_start);
startup_checkpoint("background_jobs_can_start", "Starting background jobs");
if let Some(init_sizes_done) = init_sizes_done {
// ending up here is not a bug; at the latest logical sizes will be queried by
@@ -436,11 +428,14 @@ fn start_pageserver(
scopeguard::ScopeGuard::into_inner(guard);
startup_checkpoint("initial_logical_sizes", "Initial logical sizes completed after timeout (background jobs already started)");
let now = std::time::Instant::now();
tracing::info!(
from_init_done_millis = (now - init_done).as_millis(),
from_init_millis = (now - init_started_at).as_millis(),
"Initial logical sizes completed after timeout (background jobs already started)"
);
}
startup_checkpoint("complete", "Startup complete");
};
async move {

View File

@@ -7,7 +7,7 @@ pub mod disk_usage_eviction_task;
pub mod http;
pub mod import_datadir;
pub mod keyspace;
pub mod metrics;
pub(crate) mod metrics;
pub mod page_cache;
pub mod page_service;
pub mod pgdatadir_mapping;
@@ -47,50 +47,24 @@ pub use crate::metrics::preinitialize_metrics;
#[tracing::instrument]
pub async fn shutdown_pageserver(exit_code: i32) {
use std::time::Duration;
// Shut down the libpq endpoint task. This prevents new connections from
// being accepted.
timed(
task_mgr::shutdown_tasks(Some(TaskKind::LibpqEndpointListener), None, None),
"shutdown LibpqEndpointListener",
Duration::from_secs(1),
)
.await;
task_mgr::shutdown_tasks(Some(TaskKind::LibpqEndpointListener), None, None).await;
// Shut down any page service tasks.
timed(
task_mgr::shutdown_tasks(Some(TaskKind::PageRequestHandler), None, None),
"shutdown PageRequestHandlers",
Duration::from_secs(1),
)
.await;
task_mgr::shutdown_tasks(Some(TaskKind::PageRequestHandler), None, None).await;
// Shut down all the tenants. This flushes everything to disk and kills
// the checkpoint and GC tasks.
timed(
tenant::mgr::shutdown_all_tenants(),
"shutdown all tenants",
Duration::from_secs(5),
)
.await;
tenant::mgr::shutdown_all_tenants().await;
// Shut down the HTTP endpoint last, so that you can still check the server's
// status while it's shutting down.
// FIXME: We should probably stop accepting commands like attach/detach earlier.
timed(
task_mgr::shutdown_tasks(Some(TaskKind::HttpEndpointListener), None, None),
"shutdown http",
Duration::from_secs(1),
)
.await;
task_mgr::shutdown_tasks(Some(TaskKind::HttpEndpointListener), None, None).await;
// There should be nothing left, but let's be sure
timed(
task_mgr::shutdown_tasks(None, None, None),
"shutdown leftovers",
Duration::from_secs(1),
)
.await;
task_mgr::shutdown_tasks(None, None, None).await;
info!("Shut down successfully completed");
std::process::exit(exit_code);
}
@@ -198,45 +172,6 @@ pub struct InitializationOrder {
pub background_jobs_can_start: utils::completion::Barrier,
}
/// Time the future with a warning when it exceeds a threshold.
async fn timed<Fut: std::future::Future>(
fut: Fut,
name: &str,
warn_at: std::time::Duration,
) -> <Fut as std::future::Future>::Output {
let started = std::time::Instant::now();
let mut fut = std::pin::pin!(fut);
match tokio::time::timeout(warn_at, &mut fut).await {
Ok(ret) => {
tracing::info!(
task = name,
elapsed_ms = started.elapsed().as_millis(),
"completed"
);
ret
}
Err(_) => {
tracing::info!(
task = name,
elapsed_ms = started.elapsed().as_millis(),
"still waiting, taking longer than expected..."
);
let ret = fut.await;
tracing::warn!(
task = name,
elapsed_ms = started.elapsed().as_millis(),
"completed, took longer than expected"
);
ret
}
}
}
#[cfg(test)]
mod backoff_defaults_tests {
use super::*;
@@ -267,36 +202,3 @@ mod backoff_defaults_tests {
);
}
}
#[cfg(test)]
mod timed_tests {
use super::timed;
use std::time::Duration;
#[tokio::test]
async fn timed_completes_when_inner_future_completes() {
// A future that completes on time should have its result returned
let r1 = timed(
async move {
tokio::time::sleep(Duration::from_millis(10)).await;
123
},
"test 1",
Duration::from_millis(50),
)
.await;
assert_eq!(r1, 123);
// A future that completes too slowly should also have its result returned
let r1 = timed(
async move {
tokio::time::sleep(Duration::from_millis(50)).await;
456
},
"test 1",
Duration::from_millis(10),
)
.await;
assert_eq!(r1, 456);
}
}

View File

@@ -1,9 +1,9 @@
use metrics::metric_vec_duration::DurationResultObserver;
use metrics::{
register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec,
register_int_counter, register_int_counter_vec, register_int_gauge, register_int_gauge_vec,
register_uint_gauge, register_uint_gauge_vec, Counter, CounterVec, GaugeVec, Histogram,
HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
register_counter_vec, register_histogram, register_histogram_vec, register_int_counter,
register_int_counter_vec, register_int_gauge, register_int_gauge_vec, register_uint_gauge,
register_uint_gauge_vec, Counter, CounterVec, Histogram, HistogramVec, IntCounter,
IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
};
use once_cell::sync::Lazy;
use strum::VariantNames;
@@ -394,35 +394,6 @@ pub(crate) static UNEXPECTED_ONDEMAND_DOWNLOADS: Lazy<IntCounter> = Lazy::new(||
.expect("failed to define a metric")
});
/// How long did we take to start up? Broken down by labels to describe
/// different phases of startup.
pub static STARTUP_DURATION: Lazy<GaugeVec> = Lazy::new(|| {
register_gauge_vec!(
"pageserver_startup_duration_seconds",
"Time taken by phases of pageserver startup, in seconds",
&["phase"]
)
.expect("Failed to register pageserver_startup_duration_seconds metric")
});
pub static STARTUP_IS_LOADING: Lazy<UIntGauge> = Lazy::new(|| {
register_uint_gauge!(
"pageserver_startup_is_loading",
"1 while in initial startup load of tenants, 0 at other times"
)
.expect("Failed to register pageserver_startup_is_loading")
});
/// How long did tenants take to go from construction to active state?
pub(crate) static TENANT_ACTIVATION: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"pageserver_tenant_activation_seconds",
"Time taken by tenants to activate, in seconds",
CRITICAL_OP_BUCKETS.into()
)
.expect("Failed to register pageserver_tenant_activation_seconds metric")
});
/// Each `Timeline`'s [`EVICTIONS_WITH_LOW_RESIDENCE_DURATION`] metric.
#[derive(Debug)]
pub struct EvictionsWithLowResidenceDuration {

View File

@@ -56,7 +56,6 @@ use crate::config::PageServerConf;
use crate::context::{DownloadBehavior, RequestContext};
use crate::import_datadir;
use crate::is_uninit_mark;
use crate::metrics::TENANT_ACTIVATION;
use crate::metrics::{remove_tenant_metrics, TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC};
use crate::repository::GcResult;
use crate::task_mgr;
@@ -600,7 +599,10 @@ impl Tenant {
debug!("successfully downloaded index part for timeline {timeline_id}");
match index_part {
MaybeDeletedIndexPart::IndexPart(index_part) => {
timeline_ancestors.insert(timeline_id, index_part.metadata.clone());
timeline_ancestors.insert(
timeline_id,
index_part.parse_metadata().context("parse_metadata")?,
);
remote_index_and_client.insert(timeline_id, (index_part, client));
}
MaybeDeletedIndexPart::Deleted(_) => {
@@ -1125,7 +1127,10 @@ impl Tenant {
}
};
let remote_metadata = index_part.metadata.clone();
let remote_metadata = index_part
.parse_metadata()
.context("parse_metadata")
.map_err(LoadLocalTimelineError::Load)?;
(
Some(RemoteStartupData {
index_part,
@@ -1634,8 +1639,6 @@ impl Tenant {
post_state = <&'static str>::from(&*current_state),
"activation attempt finished"
);
TENANT_ACTIVATION.observe(elapsed.as_secs_f64());
});
}
}
@@ -2073,57 +2076,74 @@ impl Tenant {
) -> Tenant {
let (state, mut rx) = watch::channel(state);
tokio::spawn(async move {
let tid = tenant_id.to_string();
task_mgr::spawn(
task_mgr::BACKGROUND_RUNTIME.handle(),
TaskKind::MetricsCollection,
Some(tenant_id),
None,
&format!("state metrics collector for tenant {tenant_id}"),
false,
async move {
let cancel = task_mgr::shutdown_token();
fn inspect_state(state: &TenantState) -> ([&'static str; 1], bool) {
([state.into()], matches!(state, TenantState::Broken { .. }))
}
let tid = tenant_id.to_string();
let mut tuple = inspect_state(&rx.borrow_and_update());
let is_broken = tuple.1;
let mut counted_broken = if !is_broken {
// the tenant might be ignored and reloaded, so first remove any previous set
// element. it most likely has already been scraped, as these are manual operations
// right now. most likely we will add it back very soon.
drop(crate::metrics::BROKEN_TENANTS_SET.remove_label_values(&[&tid]));
false
} else {
// add the id to the set right away, there should not be any updates on the channel
// after
crate::metrics::BROKEN_TENANTS_SET
.with_label_values(&[&tid])
.set(1);
true
};
loop {
let labels = &tuple.0;
let current = TENANT_STATE_METRIC.with_label_values(labels);
current.inc();
if rx.changed().await.is_err() {
// tenant has been dropped; decrement the counter because a tenant with that
// state is no longer in tenant map, but allow any broken set item to exist
// still.
current.dec();
break;
fn inspect_state(state: &TenantState) -> ([&'static str; 1], bool) {
([state.into()], matches!(state, TenantState::Broken { .. }))
}
current.dec();
tuple = inspect_state(&rx.borrow_and_update());
let mut tuple = inspect_state(&rx.borrow_and_update());
let is_broken = tuple.1;
if is_broken && !counted_broken {
counted_broken = true;
// insert the tenant_id (back) into the set
let mut counted_broken = if !is_broken {
// the tenant might be ignored and reloaded, so first remove any previous set
// element. it most likely has already been scraped, as these are manual operations
// right now. most likely we will add it back very soon.
drop(crate::metrics::BROKEN_TENANTS_SET.remove_label_values(&[&tid]));
false
} else {
// add the id to the set right away, there should not be any updates on the channel
// after
crate::metrics::BROKEN_TENANTS_SET
.with_label_values(&[&tid])
.inc();
.set(1);
true
};
loop {
let labels = &tuple.0;
let current = TENANT_STATE_METRIC.with_label_values(labels);
current.inc();
let changed = tokio::select! {
changed = rx.changed() => {changed},
_ = cancel.cancelled() => {return Ok(())}
};
if changed.is_err() {
// tenant has been dropped; decrement the counter because a tenant with that
// state is no longer in tenant map, but allow any broken set item to exist
// still.
current.dec();
break;
}
current.dec();
tuple = inspect_state(&rx.borrow_and_update());
let is_broken = tuple.1;
if is_broken && !counted_broken {
counted_broken = true;
// insert the tenant_id (back) into the set
crate::metrics::BROKEN_TENANTS_SET
.with_label_values(&[&tid])
.inc();
}
}
Ok(())
}
});
.instrument(info_span!("state_metrics", tenant_id = %tenant_id)),
);
Tenant {
tenant_id,

View File

@@ -230,15 +230,14 @@ where
///
/// Read the value for given key. Returns the value, or None if it doesn't exist.
///
pub async fn get(&self, search_key: &[u8; L]) -> Result<Option<u64>> {
pub fn get(&self, search_key: &[u8; L]) -> Result<Option<u64>> {
let mut result: Option<u64> = None;
self.visit(search_key, VisitDirection::Forwards, |key, value| {
if key == search_key {
result = Some(value);
}
false
})
.await?;
})?;
Ok(result)
}
@@ -247,7 +246,7 @@ where
/// will be called for every key >= 'search_key' (or <= 'search_key', if scanning
/// backwards)
///
pub async fn visit<V>(
pub fn visit<V>(
&self,
search_key: &[u8; L],
dir: VisitDirection,
@@ -270,9 +269,23 @@ where
V: FnMut(&[u8], u64) -> bool,
{
// Locate the node.
let node_buf = self.reader.read_blk(self.start_blk + node_blknum)?;
let blk = self.reader.read_blk(self.start_blk + node_blknum)?;
let node = OnDiskNode::deparse(node_buf.as_ref())?;
// Search all entries on this node
self.search_node(blk.as_ref(), search_key, dir, visitor)
}
fn search_node<V>(
&self,
node_buf: &[u8],
search_key: &[u8; L],
dir: VisitDirection,
visitor: &mut V,
) -> Result<bool>
where
V: FnMut(&[u8], u64) -> bool,
{
let node = OnDiskNode::deparse(node_buf)?;
let prefix_len = node.prefix_len as usize;
let suffix_len = node.suffix_len as usize;
@@ -769,12 +782,12 @@ mod tests {
// Test the `get` function on all the keys.
for (key, val) in all_data.iter() {
assert_eq!(reader.get(key).await?, Some(*val));
assert_eq!(reader.get(key)?, Some(*val));
}
// And on some keys that don't exist
assert_eq!(reader.get(b"aaaaaa").await?, None);
assert_eq!(reader.get(b"zzzzzz").await?, None);
assert_eq!(reader.get(b"xaaabx").await?, None);
assert_eq!(reader.get(b"aaaaaa")?, None);
assert_eq!(reader.get(b"zzzzzz")?, None);
assert_eq!(reader.get(b"xaaabx")?, None);
// Test search with `visit` function
let search_key = b"xabaaa";
@@ -785,12 +798,10 @@ mod tests {
.collect();
let mut data = Vec::new();
reader
.visit(search_key, VisitDirection::Forwards, |key, value| {
data.push((key.to_vec(), value));
true
})
.await?;
reader.visit(search_key, VisitDirection::Forwards, |key, value| {
data.push((key.to_vec(), value));
true
})?;
assert_eq!(data, expected);
// Test a backwards scan
@@ -801,20 +812,16 @@ mod tests {
.collect();
expected.reverse();
let mut data = Vec::new();
reader
.visit(search_key, VisitDirection::Backwards, |key, value| {
data.push((key.to_vec(), value));
true
})
.await?;
reader.visit(search_key, VisitDirection::Backwards, |key, value| {
data.push((key.to_vec(), value));
true
})?;
assert_eq!(data, expected);
// Backward scan where nothing matches
reader
.visit(b"aaaaaa", VisitDirection::Backwards, |key, value| {
panic!("found unexpected key {}: {}", hex::encode(key), value);
})
.await?;
reader.visit(b"aaaaaa", VisitDirection::Backwards, |key, value| {
panic!("found unexpected key {}: {}", hex::encode(key), value);
})?;
// Full scan
let expected: Vec<(Vec<u8>, u64)> = all_data
@@ -822,12 +829,10 @@ mod tests {
.map(|(key, value)| (key.to_vec(), *value))
.collect();
let mut data = Vec::new();
reader
.visit(&[0u8; 6], VisitDirection::Forwards, |key, value| {
data.push((key.to_vec(), value));
true
})
.await?;
reader.visit(&[0u8; 6], VisitDirection::Forwards, |key, value| {
data.push((key.to_vec(), value));
true
})?;
assert_eq!(data, expected);
Ok(())
@@ -875,15 +880,13 @@ mod tests {
for search_key_int in 0..(NUM_KEYS * 2 + 10) {
let search_key = u64::to_be_bytes(search_key_int);
assert_eq!(
reader.get(&search_key).await?,
reader.get(&search_key)?,
all_data.get(&search_key_int).cloned()
);
// Test a forward scan starting with this key
result.lock().unwrap().clear();
reader
.visit(&search_key, VisitDirection::Forwards, take_ten)
.await?;
reader.visit(&search_key, VisitDirection::Forwards, take_ten)?;
let expected = all_data
.range(search_key_int..)
.take(10)
@@ -893,9 +896,7 @@ mod tests {
// And a backwards scan
result.lock().unwrap().clear();
reader
.visit(&search_key, VisitDirection::Backwards, take_ten)
.await?;
reader.visit(&search_key, VisitDirection::Backwards, take_ten)?;
let expected = all_data
.range(..=search_key_int)
.rev()
@@ -909,9 +910,7 @@ mod tests {
let search_key = u64::to_be_bytes(0);
limit.store(usize::MAX, Ordering::Relaxed);
result.lock().unwrap().clear();
reader
.visit(&search_key, VisitDirection::Forwards, take_ten)
.await?;
reader.visit(&search_key, VisitDirection::Forwards, take_ten)?;
let expected = all_data
.iter()
.map(|(&key, &val)| (key, val))
@@ -922,9 +921,7 @@ mod tests {
let search_key = u64::to_be_bytes(u64::MAX);
limit.store(usize::MAX, Ordering::Relaxed);
result.lock().unwrap().clear();
reader
.visit(&search_key, VisitDirection::Backwards, take_ten)
.await?;
reader.visit(&search_key, VisitDirection::Backwards, take_ten)?;
let expected = all_data
.iter()
.rev()
@@ -935,8 +932,8 @@ mod tests {
Ok(())
}
#[tokio::test]
async fn random_data() -> Result<()> {
#[test]
fn random_data() -> Result<()> {
// Generate random keys with exponential distribution, to
// exercise the prefix compression
const NUM_KEYS: usize = 100000;
@@ -963,23 +960,19 @@ mod tests {
// Test get() operation on all the keys
for (&key, &val) in all_data.iter() {
let search_key = u128::to_be_bytes(key);
assert_eq!(reader.get(&search_key).await?, Some(val));
assert_eq!(reader.get(&search_key)?, Some(val));
}
// Test get() operations on random keys, most of which will not exist
for _ in 0..100000 {
let key_int = rand::thread_rng().gen::<u128>();
let search_key = u128::to_be_bytes(key_int);
assert!(reader.get(&search_key).await? == all_data.get(&key_int).cloned());
assert!(reader.get(&search_key)? == all_data.get(&key_int).cloned());
}
// Test boundary cases
assert!(
reader.get(&u128::to_be_bytes(u128::MIN)).await? == all_data.get(&u128::MIN).cloned()
);
assert!(
reader.get(&u128::to_be_bytes(u128::MAX)).await? == all_data.get(&u128::MAX).cloned()
);
assert!(reader.get(&u128::to_be_bytes(u128::MIN))? == all_data.get(&u128::MIN).cloned());
assert!(reader.get(&u128::to_be_bytes(u128::MAX))? == all_data.get(&u128::MAX).cloned());
Ok(())
}
@@ -1021,17 +1014,15 @@ mod tests {
// Test get() operation on all the keys
for (key, val) in disk_btree_test_data::TEST_DATA {
assert_eq!(reader.get(&key).await?, Some(val));
assert_eq!(reader.get(&key)?, Some(val));
}
// Test full scan
let mut count = 0;
reader
.visit(&[0u8; 26], VisitDirection::Forwards, |_key, _value| {
count += 1;
true
})
.await?;
reader.visit(&[0u8; 26], VisitDirection::Forwards, |_key, _value| {
count += 1;
true
})?;
assert_eq!(count, disk_btree_test_data::TEST_DATA.len());
reader.dump().await?;

View File

@@ -266,17 +266,11 @@ impl Drop for EphemeralFile {
// unlink the file
let res = std::fs::remove_file(&self.file.path);
if let Err(e) = res {
if e.kind() != std::io::ErrorKind::NotFound {
// just never log the not found errors, we cannot do anything for them; on detach
// the tenant directory is already gone.
//
// not found files might also be related to https://github.com/neondatabase/neon/issues/2442
error!(
"could not remove ephemeral file '{}': {}",
self.file.path.display(),
e
);
}
warn!(
"could not remove ephemeral file '{}': {}",
self.file.path.display(),
e
);
}
}
}

View File

@@ -12,7 +12,7 @@ use std::fs::{File, OpenOptions};
use std::io::{self, Write};
use anyhow::{bail, ensure, Context};
use serde::{de::Error, Deserialize, Serialize, Serializer};
use serde::{Deserialize, Serialize};
use thiserror::Error;
use tracing::info_span;
use utils::bin_ser::SerializeError;
@@ -232,28 +232,6 @@ impl TimelineMetadata {
}
}
impl<'de> Deserialize<'de> for TimelineMetadata {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: serde::Deserializer<'de>,
{
let bytes = Vec::<u8>::deserialize(deserializer)?;
Self::from_bytes(bytes.as_slice()).map_err(|e| D::Error::custom(format!("{}", e)))
}
}
impl Serialize for TimelineMetadata {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: Serializer,
{
let bytes = self
.to_bytes()
.map_err(|e| serde::ser::Error::custom(format!("{}", e)))?;
bytes.serialize(serializer)
}
}
/// Save timeline metadata to file
pub fn save_metadata(
conf: &'static PageServerConf,

View File

@@ -266,77 +266,71 @@ async fn shutdown_all_tenants0(tenants: &tokio::sync::RwLock<TenantsMap>) {
}
};
let started_at = std::time::Instant::now();
let mut join_set = JoinSet::new();
for (tenant_id, tenant) in tenants_to_shut_down {
join_set.spawn(
async move {
let freeze_and_flush = true;
// ordering shouldn't matter for this, either we store true right away or never
let ordering = std::sync::atomic::Ordering::Relaxed;
let joined_other = std::sync::atomic::AtomicBool::new(false);
let res = {
let (_guard, shutdown_progress) = completion::channel();
tenant.shutdown(shutdown_progress, freeze_and_flush).await
let mut shutdown = std::pin::pin!(async {
let freeze_and_flush = true;
let res = {
let (_guard, shutdown_progress) = completion::channel();
tenant.shutdown(shutdown_progress, freeze_and_flush).await
};
if let Err(other_progress) = res {
// join the another shutdown in progress
joined_other.store(true, ordering);
other_progress.wait().await;
}
});
// in practice we might not have a lot time to go, since systemd is going to
// SIGKILL us at 10s, but we can try. delete tenant might take a while, so put out
// a warning.
let warning = std::time::Duration::from_secs(5);
let mut warning = std::pin::pin!(tokio::time::sleep(warning));
tokio::select! {
_ = &mut shutdown => {},
_ = &mut warning => {
let joined_other = joined_other.load(ordering);
warn!(%joined_other, "waiting for the shutdown to complete");
shutdown.await;
}
};
if let Err(other_progress) = res {
// join the another shutdown in progress
other_progress.wait().await;
}
// we cannot afford per tenant logging here, because if s3 is degraded, we are
// going to log too many lines
debug!("tenant successfully stopped");
}
.instrument(info_span!("shutdown", %tenant_id)),
);
}
let total = join_set.len();
let mut panicked = 0;
let mut buffering = true;
const BUFFER_FOR: std::time::Duration = std::time::Duration::from_millis(500);
let mut buffered = std::pin::pin!(tokio::time::sleep(BUFFER_FOR));
while !join_set.is_empty() {
tokio::select! {
Some(joined) = join_set.join_next() => {
match joined {
Ok(()) => {}
Err(join_error) if join_error.is_cancelled() => {
unreachable!("we are not cancelling any of the futures");
}
Err(join_error) if join_error.is_panic() => {
// cannot really do anything, as this panic is likely a bug
panicked += 1;
}
Err(join_error) => {
warn!("unknown kind of JoinError: {join_error}");
}
}
if !buffering {
// buffer so that every 500ms since the first update (or starting) we'll log
// how far away we are; this is because we will get SIGKILL'd at 10s, and we
// are not able to log *then*.
buffering = true;
buffered.as_mut().reset(tokio::time::Instant::now() + BUFFER_FOR);
}
},
_ = &mut buffered, if buffering => {
buffering = false;
info!(remaining = join_set.len(), total, elapsed_ms = started_at.elapsed().as_millis(), "waiting for tenants to shutdown");
while let Some(res) = join_set.join_next().await {
match res {
Ok(()) => {}
Err(join_error) if join_error.is_cancelled() => {
unreachable!("we are not cancelling any of the futures");
}
Err(join_error) if join_error.is_panic() => {
// cannot really do anything, as this panic is likely a bug
panicked += 1;
}
Err(join_error) => {
warn!("unknown kind of JoinError: {join_error}");
}
}
}
if panicked > 0 {
warn!(
panicked,
total, "observed panicks while shutting down tenants"
);
warn!(panicked, "observed panicks while shutting down tenants");
}
// caller will log how long we took
}
pub async fn create_tenant(

View File

@@ -534,7 +534,8 @@ impl RemoteTimelineClient {
// ahead of what's _actually_ on the remote during index upload.
upload_queue.latest_metadata = metadata.clone();
self.schedule_index_upload(upload_queue, upload_queue.latest_metadata.clone());
let metadata_bytes = upload_queue.latest_metadata.to_bytes()?;
self.schedule_index_upload(upload_queue, metadata_bytes);
Ok(())
}
@@ -554,7 +555,8 @@ impl RemoteTimelineClient {
let upload_queue = guard.initialized_mut()?;
if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 {
self.schedule_index_upload(upload_queue, upload_queue.latest_metadata.clone());
let metadata_bytes = upload_queue.latest_metadata.to_bytes()?;
self.schedule_index_upload(upload_queue, metadata_bytes);
}
Ok(())
@@ -564,7 +566,7 @@ impl RemoteTimelineClient {
fn schedule_index_upload(
self: &Arc<Self>,
upload_queue: &mut UploadQueueInitialized,
metadata: TimelineMetadata,
metadata_bytes: Vec<u8>,
) {
info!(
"scheduling metadata upload with {} files ({} changed)",
@@ -574,7 +576,11 @@ impl RemoteTimelineClient {
let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn();
let index_part = IndexPart::new(upload_queue.latest_files.clone(), metadata);
let index_part = IndexPart::new(
upload_queue.latest_files.clone(),
disk_consistent_lsn,
metadata_bytes,
);
let op = UploadOp::UploadMetadata(index_part, disk_consistent_lsn);
self.calls_unfinished_metric_begin(&op);
upload_queue.queued_operations.push_back(op);
@@ -629,7 +635,7 @@ impl RemoteTimelineClient {
// Deleting layers doesn't affect the values stored in TimelineMetadata,
// so we don't need update it. Just serialize it.
let metadata = upload_queue.latest_metadata.clone();
let metadata_bytes = upload_queue.latest_metadata.to_bytes()?;
// Update the remote index file, removing the to-be-deleted files from the index,
// before deleting the actual files.
@@ -645,7 +651,7 @@ impl RemoteTimelineClient {
}
if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 {
self.schedule_index_upload(upload_queue, metadata);
self.schedule_index_upload(upload_queue, metadata_bytes);
}
// schedule the actual deletions
@@ -1536,17 +1542,14 @@ mod tests {
};
assert_file_list(
&index_part
.layer_metadata
.keys()
.map(|f| f.to_owned())
.collect(),
&index_part.timeline_layers,
&[
&layer_file_name_1.file_name(),
&layer_file_name_2.file_name(),
],
);
assert_eq!(index_part.metadata, metadata);
let downloaded_metadata = index_part.parse_metadata()?;
assert_eq!(downloaded_metadata, metadata);
// Schedule upload and then a deletion. Check that the deletion is queued
let content_baz = dummy_contents("baz");

View File

@@ -259,19 +259,13 @@ pub(super) async fn download_index_part(
)
.await?;
let decode_result = serde_json::from_slice::<IndexPart>(&index_part_bytes)
let index_part: IndexPart = serde_json::from_slice(&index_part_bytes)
.with_context(|| {
format!("Failed to deserialize index part file into file {index_part_path:?}")
})
.map_err(DownloadError::Other);
.map_err(DownloadError::Other)?;
// Peek at the result, and log the original bytes if they failed to decode
if decode_result.is_err() {
let index_str = String::from_utf8_lossy(index_part_bytes.as_slice());
warn!("Corrupt index bytes: {index_str}");
}
decode_result
Ok(index_part)
}
///

View File

@@ -5,15 +5,16 @@
use std::collections::{HashMap, HashSet};
use chrono::NaiveDateTime;
use serde::ser::SerializeStruct;
use serde::{Deserialize, Serialize, Serializer};
use serde_with::serde_as;
use serde::{Deserialize, Serialize};
use serde_with::{serde_as, DisplayFromStr};
use utils::bin_ser::SerializeError;
use crate::tenant::metadata::TimelineMetadata;
use crate::tenant::storage_layer::LayerFileName;
use crate::tenant::upload_queue::UploadQueueInitialized;
use utils::lsn::Lsn;
/// Metadata gathered for each of the layer files.
///
/// Fields have to be `Option`s because remote [`IndexPart`]'s can be from different version, which
@@ -50,82 +51,33 @@ impl LayerFileMetadata {
///
/// This type needs to be backwards and forwards compatible. When changing the fields,
/// remember to add a test case for the changed version.
#[derive(Debug, PartialEq, Eq, Clone)]
#[serde_as]
#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
pub struct IndexPart {
/// Debugging aid describing the version of this type.
#[serde(default)]
version: usize,
#[serde(default)]
#[serde(skip_serializing_if = "Option::is_none")]
pub deleted_at: Option<NaiveDateTime>,
/// Layer names, which are stored on the remote storage.
///
/// Additional metadata can might exist in `layer_metadata`.
pub timeline_layers: HashSet<LayerFileName>,
/// Per layer file name metadata, which can be present for a present or missing layer file.
///
/// Older versions of `IndexPart` will not have this property or have only a part of metadata
/// that latest version stores.
pub layer_metadata: HashMap<LayerFileName, IndexLayerMetadata>,
pub metadata: TimelineMetadata,
}
impl<'de> Deserialize<'de> for IndexPart {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: serde::Deserializer<'de>,
{
// Declaring a struct is simpler that implementing a Visitor to handle decoding
// a JSON struct while ignoring fields we don't care about.
#[serde_as]
#[derive(Deserialize)]
struct SerializedIndexPart {
#[serde(default)]
version: usize,
#[serde(default)]
deleted_at: Option<NaiveDateTime>,
layer_metadata: HashMap<LayerFileName, IndexLayerMetadata>,
metadata_bytes: TimelineMetadata,
}
let inner = SerializedIndexPart::deserialize(deserializer)?;
Ok(IndexPart {
version: inner.version,
deleted_at: inner.deleted_at,
layer_metadata: inner.layer_metadata,
metadata: inner.metadata_bytes,
})
}
}
impl Serialize for IndexPart {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: Serializer,
{
let mut state = serializer.serialize_struct("IndexPart", 6)?;
state.serialize_field("version", &(self.version as u32))?;
// Forward compat: write out this field only so that v2 readers can read
// the v3 structure. This could be written more efficiently but this forward
// compat code will go away in the near future.
let timeline_layers: HashSet<LayerFileName> =
self.layer_metadata.keys().map(|k| k.to_owned()).collect();
state.serialize_field("timeline_layers", &timeline_layers)?;
state.serialize_field("deleted_at", &self.deleted_at)?;
state.serialize_field("layer_metadata", &self.layer_metadata)?;
let metadata_bytes = self.metadata.to_bytes().map_err(|e| {
serde::ser::Error::custom(format!("Unserializable IndexPart metadata: {e}"))
})?;
state.serialize_field("metadata_bytes", &metadata_bytes)?;
// This field is written out for convenience of human readers, but is
// not read back in deserialization
state.serialize_field(
"disk_consistent_lsn",
&format!("{}", self.metadata.disk_consistent_lsn()),
)?;
state.end()
}
// 'disk_consistent_lsn' is a copy of the 'disk_consistent_lsn' in the metadata.
// It's duplicated here for convenience.
#[serde_as(as = "DisplayFromStr")]
pub disk_consistent_lsn: Lsn,
metadata_bytes: Vec<u8>,
}
impl IndexPart {
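For context, the struct above leans on three serde features to stay tolerant across versions: `#[serde(default)]` fills in missing fields when reading older files, `skip_serializing_if` omits `None` values on write, and `serde_with`'s `DisplayFromStr` stores the LSN in its human-readable form. A minimal sketch of the same combination, with a plain `u64` standing in for `Lsn`:

```rust
use serde::{Deserialize, Serialize};
use serde_with::{serde_as, DisplayFromStr};

#[serde_as]
#[derive(Serialize, Deserialize)]
struct Record {
    // Missing in older JSON? Falls back to Default (0).
    #[serde(default)]
    version: usize,
    // Omitted from the output entirely while it is None.
    #[serde(default)]
    #[serde(skip_serializing_if = "Option::is_none")]
    deleted_at: Option<String>,
    // Serialized as the string "42" (via Display), parsed back via FromStr.
    #[serde_as(as = "DisplayFromStr")]
    lsn: u64,
}

fn main() {
    let record = Record { version: 1, deleted_at: None, lsn: 42 };
    let json = serde_json::to_string(&record).unwrap();
    assert_eq!(json, r#"{"version":1,"lsn":"42"}"#);
    assert_eq!(serde_json::from_str::<Record>(&json).unwrap().lsn, 42);
}
```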
@@ -138,30 +90,44 @@ impl IndexPart {
pub fn new(
layers_and_metadata: HashMap<LayerFileName, LayerFileMetadata>,
metadata: TimelineMetadata,
disk_consistent_lsn: Lsn,
metadata_bytes: Vec<u8>,
) -> Self {
let mut timeline_layers = HashSet::with_capacity(layers_and_metadata.len());
let mut layer_metadata = HashMap::with_capacity(layers_and_metadata.len());
for (remote_name, metadata) in layers_and_metadata {
layer_metadata.insert(remote_name.to_owned(), IndexLayerMetadata::from(metadata));
for (remote_name, metadata) in &layers_and_metadata {
timeline_layers.insert(remote_name.to_owned());
let metadata = IndexLayerMetadata::from(metadata);
layer_metadata.insert(remote_name.to_owned(), metadata);
}
Self {
version: Self::LATEST_VERSION,
timeline_layers,
layer_metadata,
metadata,
disk_consistent_lsn,
metadata_bytes,
deleted_at: None,
}
}
pub fn parse_metadata(&self) -> anyhow::Result<TimelineMetadata> {
TimelineMetadata::from_bytes(&self.metadata_bytes)
}
}
impl TryFrom<&UploadQueueInitialized> for IndexPart {
type Error = SerializeError;
fn try_from(upload_queue: &UploadQueueInitialized) -> Result<Self, Self::Error> {
let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn();
let metadata_bytes = upload_queue.latest_metadata.to_bytes()?;
Ok(Self::new(
upload_queue.latest_files.clone(),
upload_queue.latest_metadata.clone(),
disk_consistent_lsn,
metadata_bytes,
))
}
}
@@ -172,8 +138,8 @@ pub struct IndexLayerMetadata {
pub(super) file_size: u64,
}
impl From<LayerFileMetadata> for IndexLayerMetadata {
fn from(other: LayerFileMetadata) -> Self {
impl From<&'_ LayerFileMetadata> for IndexLayerMetadata {
fn from(other: &'_ LayerFileMetadata) -> Self {
IndexLayerMetadata {
file_size: other.file_size,
}
@@ -186,48 +152,21 @@ mod tests {
#[test]
fn v1_indexpart_is_parsed() {
let metadata_bytes: Vec<u8> = [
113, 11, 159, 210, 0, 54, 0, 4, 0, 0, 0, 0, 1, 105, 96, 232, 1, 0, 0, 0, 0, 1, 105, 96,
112, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 105, 96, 112, 0, 0, 0, 0, 1, 105, 96,
112, 0, 0, 0, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
]
.to_vec();
let metadata_bytes_str = serde_json::to_string(&metadata_bytes).unwrap();
let example = format!(
r#"{{
let example = r#"{
"version":1,
"timeline_layers":[
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9",
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51"
],
"layer_metadata":{{
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": {{ "file_size": 25600000 }},
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": {{ "file_size": 9007199254741001 }}
}},
"timeline_layers":["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"],
"layer_metadata":{
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 },
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 }
},
"disk_consistent_lsn":"0/16960E8",
"metadata_bytes":{metadata_bytes_str}
}}"#
);
"metadata_bytes":[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
}"#;
let expected = IndexPart {
// note: this is not verified, could be anything, but exists for humans debugging... could be the git version instead?
version: 1,
timeline_layers: HashSet::from(["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap()]),
layer_metadata: HashMap::from([
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
file_size: 25600000,
@@ -238,61 +177,71 @@ mod tests {
file_size: 9007199254741001,
})
]),
metadata: TimelineMetadata::from_bytes(&metadata_bytes).unwrap(),
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
metadata_bytes: [113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
deleted_at: None,
};
let part = serde_json::from_str::<IndexPart>(&example).unwrap();
let part = serde_json::from_str::<IndexPart>(example).unwrap();
assert_eq!(part, expected);
}
#[test]
fn v1_indexpart_is_parsed_with_optional_missing_layers() {
let example = r#"{
"version":1,
"timeline_layers":["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"],
"missing_layers":["This shouldn't fail deserialization"],
"layer_metadata":{
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 },
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 }
},
"disk_consistent_lsn":"0/16960E8",
"metadata_bytes":[112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
}"#;
let expected = IndexPart {
// note: this is not verified, could be anything, but exists for humans debugging... could be the git version instead?
version: 1,
timeline_layers: HashSet::from(["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap()]),
layer_metadata: HashMap::from([
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
file_size: 25600000,
}),
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
// serde_json should always parse this, but tools like jq might treat it as a
// double, for example, since it exceeds 2^53.
file_size: 9007199254741001,
})
]),
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
metadata_bytes: [112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
deleted_at: None,
};
let part = serde_json::from_str::<IndexPart>(example).unwrap();
assert_eq!(part, expected);
}
#[test]
fn v2_indexpart_is_parsed_with_deleted_at() {
let metadata_bytes: Vec<u8> = [
136, 151, 49, 208, 0, 70, 0, 4, 0, 0, 0, 0, 2, 83, 38, 72, 1, 0, 0, 0, 0, 2, 83, 38,
32, 1, 87, 198, 240, 135, 97, 119, 45, 125, 38, 29, 155, 161, 140, 141, 255, 210, 0, 0,
0, 0, 2, 83, 38, 72, 0, 0, 0, 0, 1, 73, 240, 192, 0, 0, 0, 0, 1, 73, 240, 192, 0, 0, 0,
15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
]
.to_vec();
let metadata_bytes_str = serde_json::to_string(&metadata_bytes).unwrap();
let example = format!(
r#"{{
let example = r#"{
"version":2,
"timeline_layers":[
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9",
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51"
],
"timeline_layers":["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"],
"missing_layers":["This shouldn't fail deserialization"],
"layer_metadata":{{
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": {{ "file_size": 25600000 }},
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": {{ "file_size": 9007199254741001 }}
}},
"layer_metadata":{
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 },
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 }
},
"disk_consistent_lsn":"0/16960E8",
"metadata_bytes":{metadata_bytes_str},
"metadata_bytes":[112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
"deleted_at": "2023-07-31T09:00:00.123"
}}"#
);
}"#;
let expected = IndexPart {
// note: this is not verified, could be anything, but exists for humans debugging... could be the git version instead?
version: 2,
timeline_layers: HashSet::from(["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap()]),
layer_metadata: HashMap::from([
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
file_size: 25600000,
@@ -303,131 +252,58 @@ mod tests {
file_size: 9007199254741001,
})
]),
metadata: TimelineMetadata::from_bytes(&metadata_bytes).unwrap(),
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
metadata_bytes: [112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
deleted_at: Some(chrono::NaiveDateTime::parse_from_str(
"2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap())
};
let part = serde_json::from_str::<IndexPart>(&example).unwrap();
assert_eq!(part, expected);
// Validate that when we write out, we are writing the same v2 format that older pageservers
// will understand
let reserialized = serde_json::to_string(&part).unwrap();
// We do not expect exact symmetry, but the reserialized version should include the legacy fields that
// v2 requires, and not be limited to just the fields that are in the runtime IndexPart
assert!(reserialized.contains("layer_metadata"));
assert!(reserialized.contains("disk_consistent_lsn"));
// The missing_layers attribute is not required
assert!(!reserialized.contains("missing_layers"));
}
#[test]
fn v3_indexpart_is_parsed() {
let metadata_bytes: Vec<u8> = [
136, 151, 49, 208, 0, 70, 0, 4, 0, 0, 0, 0, 2, 83, 38, 72, 1, 0, 0, 0, 0, 2, 83, 38,
32, 1, 87, 198, 240, 135, 97, 119, 45, 125, 38, 29, 155, 161, 140, 141, 255, 210, 0, 0,
0, 0, 2, 83, 38, 72, 0, 0, 0, 0, 1, 73, 240, 192, 0, 0, 0, 0, 1, 73, 240, 192, 0, 0, 0,
15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
]
.to_vec();
let metadata_bytes_str = serde_json::to_string(&metadata_bytes).unwrap();
let example = format!(
r#"{{
"version":3,
"layer_metadata":{{
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": {{ "file_size": 25600000 }},
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": {{ "file_size": 9007199254741001 }}
}},
"disk_consistent_lsn":"0/16960E8",
"metadata_bytes":{metadata_bytes_str},
"deleted_at": "2023-07-31T09:00:00.123"
}}"#
);
let expected = IndexPart {
version: 3,
layer_metadata: HashMap::from([
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
file_size: 25600000,
}),
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
// serde_json should always parse this, but tools like jq might treat it as a
// double, for example, since it exceeds 2^53.
file_size: 9007199254741001,
})
]),
metadata: TimelineMetadata::from_bytes(metadata_bytes.as_slice()).unwrap(),
deleted_at: Some(chrono::NaiveDateTime::parse_from_str(
"2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap())
};
let part = serde_json::from_str::<IndexPart>(&example).unwrap();
let part = serde_json::from_str::<IndexPart>(example).unwrap();
assert_eq!(part, expected);
}
#[test]
fn empty_layers_are_parsed() {
let metadata_bytes: Vec<u8> = [
136, 151, 49, 208, 0, 70, 0, 4, 0, 0, 0, 0, 2, 83, 38, 72, 1, 0, 0, 0, 0, 2, 83, 38,
32, 1, 87, 198, 240, 135, 97, 119, 45, 125, 38, 29, 155, 161, 140, 141, 255, 210, 0, 0,
0, 0, 2, 83, 38, 72, 0, 0, 0, 0, 1, 73, 240, 192, 0, 0, 0, 0, 1, 73, 240, 192, 0, 0, 0,
15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
]
.to_vec();
let metadata_bytes_str = serde_json::to_string(&metadata_bytes).unwrap();
let empty_layers_json = format!(
r#"{{
let empty_layers_json = r#"{
"version":1,
"timeline_layers":[],
"layer_metadata":{{}},
"layer_metadata":{},
"disk_consistent_lsn":"0/2532648",
"metadata_bytes":{metadata_bytes_str}
}}"#
);
"metadata_bytes":[136,151,49,208,0,70,0,4,0,0,0,0,2,83,38,72,1,0,0,0,0,2,83,38,32,1,87,198,240,135,97,119,45,125,38,29,155,161,140,141,255,210,0,0,0,0,2,83,38,72,0,0,0,0,1,73,240,192,0,0,0,0,1,73,240,192,0,0,0,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
}"#;
let expected = IndexPart {
version: 1,
timeline_layers: HashSet::new(),
layer_metadata: HashMap::new(),
metadata: TimelineMetadata::from_bytes(&metadata_bytes).unwrap(),
disk_consistent_lsn: "0/2532648".parse::<Lsn>().unwrap(),
metadata_bytes: [
136, 151, 49, 208, 0, 70, 0, 4, 0, 0, 0, 0, 2, 83, 38, 72, 1, 0, 0, 0, 0, 2, 83,
38, 32, 1, 87, 198, 240, 135, 97, 119, 45, 125, 38, 29, 155, 161, 140, 141, 255,
210, 0, 0, 0, 0, 2, 83, 38, 72, 0, 0, 0, 0, 1, 73, 240, 192, 0, 0, 0, 0, 1, 73,
240, 192, 0, 0, 0, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0,
]
.to_vec(),
deleted_at: None,
};
let empty_layers_parsed =
serde_json::from_str::<IndexPart>(empty_layers_json.as_str()).unwrap();
let empty_layers_parsed = serde_json::from_str::<IndexPart>(empty_layers_json).unwrap();
assert_eq!(empty_layers_parsed, expected);
}

View File

@@ -41,6 +41,7 @@ use crate::virtual_file::VirtualFile;
use crate::{walrecord, TEMP_FILE_SUFFIX};
use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
use anyhow::{bail, ensure, Context, Result};
use once_cell::sync::OnceCell;
use pageserver_api::models::{HistoricLayerInfo, LayerAccessKind};
use rand::{distributions::Alphanumeric, Rng};
use serde::{Deserialize, Serialize};
@@ -51,7 +52,6 @@ use std::ops::Range;
use std::os::unix::fs::FileExt;
use std::path::{Path, PathBuf};
use std::sync::Arc;
use tokio::sync::OnceCell;
use tracing::*;
use utils::{
@@ -242,7 +242,7 @@ impl Layer for DeltaLayer {
return Ok(());
}
let inner = self.load(LayerAccessKind::Dump, ctx).await?;
let inner = self.load(LayerAccessKind::Dump, ctx)?;
println!(
"index_start_blk: {}, root {}",
@@ -281,24 +281,22 @@ impl Layer for DeltaLayer {
Ok(desc)
};
tree_reader
.visit(
&[0u8; DELTA_KEY_SIZE],
VisitDirection::Forwards,
|delta_key, val| {
let blob_ref = BlobRef(val);
let key = DeltaKey::extract_key_from_buf(delta_key);
let lsn = DeltaKey::extract_lsn_from_buf(delta_key);
tree_reader.visit(
&[0u8; DELTA_KEY_SIZE],
VisitDirection::Forwards,
|delta_key, val| {
let blob_ref = BlobRef(val);
let key = DeltaKey::extract_key_from_buf(delta_key);
let lsn = DeltaKey::extract_lsn_from_buf(delta_key);
let desc = match dump_blob(blob_ref) {
Ok(desc) => desc,
Err(err) => format!("ERROR: {}", err),
};
println!(" key {} at {}: {}", key, lsn, desc);
true
},
)
.await?;
let desc = match dump_blob(blob_ref) {
Ok(desc) => desc,
Err(err) => format!("ERROR: {}", err),
};
println!(" key {} at {}: {}", key, lsn, desc);
true
},
)?;
Ok(())
}
@@ -317,9 +315,7 @@ impl Layer for DeltaLayer {
{
// Open the file and lock the metadata in memory
let inner = self
.load(LayerAccessKind::GetValueReconstructData, ctx)
.await?;
let inner = self.load(LayerAccessKind::GetValueReconstructData, ctx)?;
// Scan the page versions backwards, starting from `lsn`.
let file = &inner.file;
@@ -332,21 +328,19 @@ impl Layer for DeltaLayer {
let mut offsets: Vec<(Lsn, u64)> = Vec::new();
tree_reader
.visit(&search_key.0, VisitDirection::Backwards, |key, value| {
let blob_ref = BlobRef(value);
if key[..KEY_SIZE] != search_key.0[..KEY_SIZE] {
return false;
}
let entry_lsn = DeltaKey::extract_lsn_from_buf(key);
if entry_lsn < lsn_range.start {
return false;
}
offsets.push((entry_lsn, blob_ref.pos()));
tree_reader.visit(&search_key.0, VisitDirection::Backwards, |key, value| {
let blob_ref = BlobRef(value);
if key[..KEY_SIZE] != search_key.0[..KEY_SIZE] {
return false;
}
let entry_lsn = DeltaKey::extract_lsn_from_buf(key);
if entry_lsn < lsn_range.start {
return false;
}
offsets.push((entry_lsn, blob_ref.pos()));
!blob_ref.will_init()
})
.await?;
!blob_ref.will_init()
})?;
// Ok, 'offsets' now contains the offsets of all the entries we need to read
let cursor = file.block_cursor();
@@ -499,7 +493,7 @@ impl DeltaLayer {
/// Open the underlying file and read the metadata into memory, if it's
/// not loaded already.
///
async fn load(
fn load(
&self,
access_kind: LayerAccessKind,
ctx: &RequestContext,
@@ -509,11 +503,10 @@ impl DeltaLayer {
// Quick exit if already loaded
self.inner
.get_or_try_init(|| self.load_inner())
.await
.with_context(|| format!("Failed to load delta layer {}", self.path().display()))
}
async fn load_inner(&self) -> Result<Arc<DeltaLayerInner>> {
fn load_inner(&self) -> Result<Arc<DeltaLayerInner>> {
let path = self.path();
let file = VirtualFile::open(&path)
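For reference, `get_or_try_init` runs its closure at most once and caches the result, so only the first `load` call pays the I/O cost. A small sketch of the pattern with `once_cell`'s sync cell; `Inner` and `read_from_disk` are hypothetical stand-ins:

```rust
use anyhow::Context;
use once_cell::sync::OnceCell;

struct Inner {
    contents: String,
}

struct Layer {
    path: std::path::PathBuf,
    inner: OnceCell<Inner>,
}

impl Layer {
    // The first caller performs the load; later callers get the cached reference.
    fn load(&self) -> anyhow::Result<&Inner> {
        self.inner
            .get_or_try_init(|| self.read_from_disk())
            .with_context(|| format!("Failed to load layer {}", self.path.display()))
    }

    fn read_from_disk(&self) -> anyhow::Result<Inner> {
        let contents = std::fs::read_to_string(&self.path)?;
        Ok(Inner { contents })
    }
}
```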
@@ -574,7 +567,7 @@ impl DeltaLayer {
file_size,
),
access_stats,
inner: OnceCell::new(),
inner: once_cell::sync::OnceCell::new(),
}
}
@@ -601,7 +594,7 @@ impl DeltaLayer {
metadata.len(),
),
access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
inner: OnceCell::new(),
inner: once_cell::sync::OnceCell::new(),
})
}
@@ -621,25 +614,19 @@ impl DeltaLayer {
/// Obtains all keys and value references stored in the layer
///
/// The value can be obtained via the [`ValueRef::load`] function.
pub async fn load_val_refs(&self, ctx: &RequestContext) -> Result<Vec<(Key, Lsn, ValueRef)>> {
pub fn load_val_refs(&self, ctx: &RequestContext) -> Result<Vec<(Key, Lsn, ValueRef)>> {
let inner = self
.load(LayerAccessKind::KeyIter, ctx)
.await
.context("load delta layer")?;
DeltaLayerInner::load_val_refs(inner)
.await
.context("Layer index is corrupted")
DeltaLayerInner::load_val_refs(inner).context("Layer index is corrupted")
}
/// Loads all keys stored in the layer. Returns key, lsn and value size.
pub async fn load_keys(&self, ctx: &RequestContext) -> Result<Vec<(Key, Lsn, u64)>> {
pub fn load_keys(&self, ctx: &RequestContext) -> Result<Vec<(Key, Lsn, u64)>> {
let inner = self
.load(LayerAccessKind::KeyIter, ctx)
.await
.context("load delta layer keys")?;
DeltaLayerInner::load_keys(inner)
.await
.context("Layer index is corrupted")
inner.load_keys().context("Layer index is corrupted")
}
}
@@ -789,7 +776,7 @@ impl DeltaLayerWriterInner {
metadata.len(),
),
access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
inner: OnceCell::new(),
inner: once_cell::sync::OnceCell::new(),
};
// fsync the file
@@ -912,7 +899,7 @@ impl Drop for DeltaLayerWriter {
}
impl DeltaLayerInner {
async fn load_val_refs(this: &Arc<DeltaLayerInner>) -> Result<Vec<(Key, Lsn, ValueRef)>> {
fn load_val_refs(this: &Arc<DeltaLayerInner>) -> Result<Vec<(Key, Lsn, ValueRef)>> {
let file = &this.file;
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
this.index_start_blk,
@@ -921,25 +908,23 @@ impl DeltaLayerInner {
);
let mut all_offsets = Vec::<(Key, Lsn, ValueRef)>::new();
tree_reader
.visit(
&[0u8; DELTA_KEY_SIZE],
VisitDirection::Forwards,
|key, value| {
let delta_key = DeltaKey::from_slice(key);
let val_ref = ValueRef {
blob_ref: BlobRef(value),
reader: BlockCursor::new(Adapter(this.clone())),
};
all_offsets.push((delta_key.key(), delta_key.lsn(), val_ref));
true
},
)
.await?;
tree_reader.visit(
&[0u8; DELTA_KEY_SIZE],
VisitDirection::Forwards,
|key, value| {
let delta_key = DeltaKey::from_slice(key);
let val_ref = ValueRef {
blob_ref: BlobRef(value),
reader: BlockCursor::new(Adapter(this.clone())),
};
all_offsets.push((delta_key.key(), delta_key.lsn(), val_ref));
true
},
)?;
Ok(all_offsets)
}
async fn load_keys(&self) -> Result<Vec<(Key, Lsn, u64)>> {
fn load_keys(&self) -> Result<Vec<(Key, Lsn, u64)>> {
let file = &self.file;
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
self.index_start_blk,
@@ -948,28 +933,26 @@ impl DeltaLayerInner {
);
let mut all_keys: Vec<(Key, Lsn, u64)> = Vec::new();
tree_reader
.visit(
&[0u8; DELTA_KEY_SIZE],
VisitDirection::Forwards,
|key, value| {
let delta_key = DeltaKey::from_slice(key);
let pos = BlobRef(value).pos();
if let Some(last) = all_keys.last_mut() {
if last.0 == delta_key.key() {
return true;
} else {
// subtract the offset of this key's first blob from the offset of the new key's blob
// to get the total size of the values associated with this key
let first_pos = last.2;
last.2 = pos - first_pos;
}
tree_reader.visit(
&[0u8; DELTA_KEY_SIZE],
VisitDirection::Forwards,
|key, value| {
let delta_key = DeltaKey::from_slice(key);
let pos = BlobRef(value).pos();
if let Some(last) = all_keys.last_mut() {
if last.0 == delta_key.key() {
return true;
} else {
// subtract the offset of this key's first blob from the offset of the new key's blob
// to get the total size of the values associated with this key
let first_pos = last.2;
last.2 = pos - first_pos;
}
all_keys.push((delta_key.key(), delta_key.lsn(), pos));
true
},
)
.await?;
}
all_keys.push((delta_key.key(), delta_key.lsn(), pos));
true
},
)?;
if let Some(last) = all_keys.last_mut() {
// Last key occupies all space till end of layer
last.2 = std::fs::metadata(&file.file.path)?.len() - last.2;

View File

@@ -38,6 +38,7 @@ use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
use anyhow::{bail, ensure, Context, Result};
use bytes::Bytes;
use hex;
use once_cell::sync::OnceCell;
use pageserver_api::models::{HistoricLayerInfo, LayerAccessKind};
use rand::{distributions::Alphanumeric, Rng};
use serde::{Deserialize, Serialize};
@@ -47,7 +48,6 @@ use std::io::{Seek, SeekFrom};
use std::ops::Range;
use std::os::unix::prelude::FileExt;
use std::path::{Path, PathBuf};
use tokio::sync::OnceCell;
use tracing::*;
use utils::{
@@ -168,19 +168,17 @@ impl Layer for ImageLayer {
return Ok(());
}
let inner = self.load(LayerAccessKind::Dump, ctx).await?;
let inner = self.load(LayerAccessKind::Dump, ctx)?;
let file = &inner.file;
let tree_reader =
DiskBtreeReader::<_, KEY_SIZE>::new(inner.index_start_blk, inner.index_root_blk, file);
tree_reader.dump().await?;
tree_reader
.visit(&[0u8; KEY_SIZE], VisitDirection::Forwards, |key, value| {
println!("key: {} offset {}", hex::encode(key), value);
true
})
.await?;
tree_reader.visit(&[0u8; KEY_SIZE], VisitDirection::Forwards, |key, value| {
println!("key: {} offset {}", hex::encode(key), value);
true
})?;
Ok(())
}
@@ -197,16 +195,14 @@ impl Layer for ImageLayer {
assert!(lsn_range.start >= self.lsn);
assert!(lsn_range.end >= self.lsn);
let inner = self
.load(LayerAccessKind::GetValueReconstructData, ctx)
.await?;
let inner = self.load(LayerAccessKind::GetValueReconstructData, ctx)?;
let file = &inner.file;
let tree_reader = DiskBtreeReader::new(inner.index_start_blk, inner.index_root_blk, file);
let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
key.write_to_byte_slice(&mut keybuf);
if let Some(offset) = tree_reader.get(&keybuf).await? {
if let Some(offset) = tree_reader.get(&keybuf)? {
let blob = file.block_cursor().read_blob(offset).with_context(|| {
format!(
"failed to read value from data file {} at offset {}",
@@ -316,11 +312,7 @@ impl ImageLayer {
/// Open the underlying file and read the metadata into memory, if it's
/// not loaded already.
///
async fn load(
&self,
access_kind: LayerAccessKind,
ctx: &RequestContext,
) -> Result<&ImageLayerInner> {
fn load(&self, access_kind: LayerAccessKind, ctx: &RequestContext) -> Result<&ImageLayerInner> {
self.access_stats
.record_access(access_kind, ctx.task_kind());
loop {
@@ -329,12 +321,11 @@ impl ImageLayer {
}
self.inner
.get_or_try_init(|| self.load_inner())
.await
.with_context(|| format!("Failed to load image layer {}", self.path().display()))?;
}
}
async fn load_inner(&self) -> Result<ImageLayerInner> {
fn load_inner(&self) -> Result<ImageLayerInner> {
let path = self.path();
// Open the file if it's not open already.

View File

@@ -73,13 +73,17 @@ pub fn start_background_loops(
///
async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
let wait_duration = Duration::from_secs(2);
info!("starting");
TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
async {
let ctx = RequestContext::todo_child(TaskKind::Compaction, DownloadBehavior::Download);
let mut first = true;
loop {
trace!("waking up");
tokio::select! {
_ = cancel.cancelled() => {
info!("received cancellation request");
return;
},
tenant_wait_result = wait_for_active_tenant(&tenant) => match tenant_wait_result {
@@ -99,6 +103,11 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
}
}
if cancel.is_cancelled() {
info!("received cancellation request");
break;
}
let started_at = Instant::now();
let sleep_duration = if period == Duration::ZERO {
@@ -122,12 +131,15 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
.await
.is_ok()
{
info!("received cancellation request during idling");
break;
}
}
}
.await;
TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc();
trace!("compaction loop stopped.");
}
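Both background loops follow the same shape: a `tokio::select!` race between the cancellation token and the work, plus an explicit `is_cancelled` check after long-running steps so shutdown stays prompt. A stripped-down sketch of that shape, assuming the tokio and tokio-util crates; `do_iteration` is a hypothetical stand-in for the compaction/GC body:

```rust
use std::time::Duration;
use tokio_util::sync::CancellationToken;

async fn background_loop(cancel: CancellationToken) {
    loop {
        // Race one round of work against cancellation.
        tokio::select! {
            _ = cancel.cancelled() => {
                println!("received cancellation request");
                return;
            }
            _ = do_iteration() => {}
        }
        // Bail out before idling if we were cancelled mid-iteration.
        if cancel.is_cancelled() {
            return;
        }
        tokio::time::sleep(Duration::from_secs(2)).await;
    }
}

async fn do_iteration() {
    // one round of compaction / GC work would run here
}
```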
///
@@ -135,6 +147,7 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
///
async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
let wait_duration = Duration::from_secs(2);
info!("starting");
TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
async {
// GC might require downloading, to find the cutoff LSN that corresponds to the
@@ -143,8 +156,11 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
RequestContext::todo_child(TaskKind::GarbageCollector, DownloadBehavior::Download);
let mut first = true;
loop {
trace!("waking up");
tokio::select! {
_ = cancel.cancelled() => {
info!("received cancellation request");
return;
},
tenant_wait_result = wait_for_active_tenant(&tenant) => match tenant_wait_result {
@@ -189,12 +205,14 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
.await
.is_ok()
{
info!("received cancellation request during idling");
break;
}
}
}
.await;
TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc();
trace!("GC loop stopped.");
}
async fn wait_for_active_tenant(tenant: &Arc<Tenant>) -> ControlFlow<()> {
@@ -219,6 +237,7 @@ async fn wait_for_active_tenant(tenant: &Arc<Tenant>) -> ControlFlow<()> {
}
}
Err(_sender_dropped_error) => {
info!("Tenant dropped the state updates sender, quitting waiting for tenant and the task loop");
return ControlFlow::Break(());
}
}

View File

@@ -19,7 +19,6 @@ use pageserver_api::models::{
use remote_storage::GenericRemoteStorage;
use serde_with::serde_as;
use storage_broker::BrokerClientChannel;
use tokio::runtime::Handle;
use tokio::sync::{oneshot, watch, TryAcquireError};
use tokio_util::sync::CancellationToken;
use tracing::*;
@@ -1722,7 +1721,7 @@ impl Timeline {
let mut corrupted_local_layers = Vec::new();
let mut added_remote_layers = Vec::new();
for remote_layer_name in index_part.layer_metadata.keys() {
for remote_layer_name in &index_part.timeline_layers {
let local_layer = local_only_layers.remove(remote_layer_name);
let remote_layer_metadata = index_part
@@ -1877,7 +1876,7 @@ impl Timeline {
Some(index_part) => {
info!(
"initializing upload queue from remote index with {} layer files",
index_part.layer_metadata.len()
index_part.timeline_layers.len()
);
remote_client.init_upload_queue(index_part)?;
self.create_remote_layers(index_part, local_layers, disk_consistent_lsn)
@@ -3519,41 +3518,16 @@ impl Timeline {
// min-heap (reserve space for one more element added before eviction)
let mut heap: BinaryHeap<Hole> = BinaryHeap::with_capacity(max_holes + 1);
let mut prev: Option<Key> = None;
let mut all_value_refs = Vec::new();
for l in deltas_to_compact.iter() {
// TODO: replace this with an await once we fully go async
all_value_refs.extend(
Handle::current().block_on(
l.clone()
.downcast_delta_layer()
.expect("delta layer")
.load_val_refs(ctx),
)?,
);
}
// The current stdlib sorting implementation is designed so that it is
// particularly fast when the slice is made up of sorted sub-ranges.
all_value_refs.sort_by_key(|(key, lsn, _value_ref)| (*key, *lsn));
let mut all_keys = Vec::new();
for l in deltas_to_compact.iter() {
// TODO: replace this with an await once we fully go async
all_keys.extend(
Handle::current().block_on(
l.clone()
.downcast_delta_layer()
.expect("delta layer")
.load_keys(ctx),
)?,
);
}
// The current stdlib sorting implementation is designed so that it is
// particularly fast when the slice is made up of sorted sub-ranges.
all_keys.sort_by_key(|(key, lsn, _size)| (*key, *lsn));
for (next_key, _next_lsn, _size) in all_keys.iter() {
let next_key = *next_key;
for (next_key, _next_lsn, _size) in itertools::process_results(
deltas_to_compact.iter().map(|l| -> Result<_> {
Ok(l.clone()
.downcast_delta_layer()
.expect("delta layer")
.load_keys(ctx)?
.into_iter())
}),
|iter_iter| iter_iter.kmerge_by(|a, b| a.0 < b.0),
)? {
if let Some(prev_key) = prev {
// just first fast filter
if next_key.to_i128() - prev_key.to_i128() >= min_hole_range {
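The heap above is the standard bounded top-k idiom: push every candidate hole, and once the heap holds more than `max_holes` entries, pop the smallest so only the largest gaps survive. A sketch of the idiom with plain integers standing in for `Hole`:

```rust
use std::cmp::Reverse;
use std::collections::BinaryHeap;

// Keep only the `k` largest values seen, in O(n log k) time and O(k) space.
fn k_largest(values: impl IntoIterator<Item = i128>, k: usize) -> Vec<i128> {
    // Reverse turns Rust's max-heap into a min-heap, so the root is the
    // smallest retained value and is the one evicted on overflow.
    let mut heap: BinaryHeap<Reverse<i128>> = BinaryHeap::with_capacity(k + 1);
    for v in values {
        heap.push(Reverse(v));
        if heap.len() > k {
            heap.pop();
        }
    }
    heap.into_iter().map(|Reverse(v)| v).collect()
}

fn main() {
    let mut largest = k_largest([3, 15, 7, 42, 1], 2);
    largest.sort();
    assert_eq!(largest, vec![15, 42]);
}
```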
@@ -3586,10 +3560,40 @@ impl Timeline {
// This iterator walks through all key-value pairs from all the layers
// we're compacting, in key, LSN order.
let all_values_iter = all_value_refs.into_iter();
let all_values_iter = itertools::process_results(
deltas_to_compact.iter().map(|l| -> Result<_> {
Ok(l.clone()
.downcast_delta_layer()
.expect("delta layer")
.load_val_refs(ctx)?
.into_iter())
}),
|iter_iter| {
iter_iter.kmerge_by(|a, b| {
let (a_key, a_lsn, _) = a;
let (b_key, b_lsn, _) = b;
(a_key, a_lsn) < (b_key, b_lsn)
})
},
)?;
// This iterator walks through all keys and is needed to calculate the size used by each key
let mut all_keys_iter = all_keys.into_iter();
let mut all_keys_iter = itertools::process_results(
deltas_to_compact.iter().map(|l| -> Result<_> {
Ok(l.clone()
.downcast_delta_layer()
.expect("delta layer")
.load_keys(ctx)?
.into_iter())
}),
|iter_iter| {
iter_iter.kmerge_by(|a, b| {
let (a_key, a_lsn, _) = a;
let (b_key, b_lsn, _) = b;
(a_key, a_lsn) < (b_key, b_lsn)
})
},
)?;
stats.prepare_iterators_micros = stats.read_lock_drop_micros.till_now();
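In this rewrite, `itertools::process_results` lets the fallible `load_keys`/`load_val_refs` calls short-circuit with an error, while `kmerge_by` lazily merges the per-layer iterators, each already sorted, into one globally ordered stream. A toy version of the same combination:

```rust
use itertools::Itertools;

fn main() {
    // Each "layer" yields (key, lsn)-like pairs in sorted order, but loading may fail.
    let layers: Vec<Result<Vec<(u32, u32)>, &'static str>> = vec![
        Ok(vec![(1, 10), (3, 30)]),
        Ok(vec![(2, 20), (4, 40)]),
    ];

    let merged = itertools::process_results(
        layers.into_iter().map(|res| res.map(|v| v.into_iter())),
        // kmerge_by keeps the output globally sorted as long as each input is sorted.
        |iters| iters.kmerge_by(|a, b| a.0 < b.0).collect::<Vec<_>>(),
    );

    assert_eq!(merged, Ok(vec![(1, 10), (2, 20), (3, 30), (4, 40)]));
}
```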

View File

@@ -78,6 +78,9 @@ impl Timeline {
#[instrument(skip_all, fields(tenant_id = %self.tenant_id, timeline_id = %self.timeline_id))]
async fn eviction_task(self: Arc<Self>, cancel: CancellationToken) {
scopeguard::defer! {
info!("eviction task finishing");
}
use crate::tenant::tasks::random_init_delay;
{
let policy = self.get_eviction_policy();
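`scopeguard::defer!` registers a block that runs when the enclosing scope exits, whether by normal return, `?`, or panic, which makes it a reliable place for the final "finishing" log line. A tiny sketch:

```rust
fn main() {
    scopeguard::defer! {
        // Runs on every exit path out of main, including panics.
        println!("task finishing");
    }
    println!("doing work");
} // prints "doing work", then "task finishing"
```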

View File

@@ -38,10 +38,7 @@ use utils::{
lsn::Lsn,
};
use super::{
walreceiver_connection::WalConnectionStatus, walreceiver_connection::WalReceiverError,
TaskEvent, TaskHandle,
};
use super::{walreceiver_connection::WalConnectionStatus, TaskEvent, TaskHandle};
/// Attempts to subscribe for timeline updates, pushed by safekeepers into the broker.
/// Based on the updates, decides whether to start, keep or stop a WAL receiver task.
@@ -422,19 +419,13 @@ impl ConnectionManagerState {
match res {
Ok(()) => Ok(()),
Err(e) => {
match e {
WalReceiverError::SuccessfulCompletion(msg) => {
info!("walreceiver connection handling ended with success: {msg}");
Ok(())
}
WalReceiverError::ExpectedSafekeeperError(e) => {
info!("walreceiver connection handling ended: {e}");
Ok(())
}
WalReceiverError::Other(e) => {
// give out an error to have task_mgr give it a really verbose logging
Err(e).context("walreceiver connection handling failure")
}
use super::walreceiver_connection::ExpectedError;
if e.is_expected() {
info!("walreceiver connection handling ended: {e:#}");
Ok(())
} else {
// give out an error to have task_mgr give it a really verbose logging
Err(e).context("walreceiver connection handling failure")
}
}
}

View File

@@ -8,14 +8,14 @@ use std::{
time::{Duration, SystemTime},
};
use anyhow::{anyhow, Context};
use anyhow::{bail, ensure, Context};
use bytes::BytesMut;
use chrono::{NaiveDateTime, Utc};
use fail::fail_point;
use futures::StreamExt;
use postgres::{error::SqlState, SimpleQueryMessage, SimpleQueryRow};
use postgres_ffi::v14::xlog_utils::normalize_lsn;
use postgres_ffi::WAL_SEGMENT_SIZE;
use postgres_ffi::{v14::xlog_utils::normalize_lsn, waldecoder::WalDecodeError};
use postgres_protocol::message::backend::ReplicationMessage;
use postgres_types::PgLsn;
use tokio::{select, sync::watch, time};
@@ -60,50 +60,6 @@ pub(super) struct WalConnectionStatus {
pub node: NodeId,
}
pub(super) enum WalReceiverError {
/// An error of a type that does not indicate an issue, e.g. a connection closing
ExpectedSafekeeperError(postgres::Error),
/// An "error" message that carries a SUCCESSFUL_COMPLETION status code. Carries
/// the message part of the original postgres error
SuccessfulCompletion(String),
/// Generic error
Other(anyhow::Error),
}
impl From<tokio_postgres::Error> for WalReceiverError {
fn from(err: tokio_postgres::Error) -> Self {
if let Some(dberror) = err.as_db_error().filter(|db_error| {
db_error.code() == &SqlState::SUCCESSFUL_COMPLETION
&& db_error.message().contains("ending streaming")
}) {
// Strip the outer DbError, which carries a misleading "error" severity
Self::SuccessfulCompletion(dberror.message().to_string())
} else if err.is_closed()
|| err
.source()
.and_then(|source| source.downcast_ref::<std::io::Error>())
.map(is_expected_io_error)
.unwrap_or(false)
{
Self::ExpectedSafekeeperError(err)
} else {
Self::Other(anyhow::Error::new(err))
}
}
}
impl From<anyhow::Error> for WalReceiverError {
fn from(err: anyhow::Error) -> Self {
Self::Other(err)
}
}
impl From<WalDecodeError> for WalReceiverError {
fn from(err: WalDecodeError) -> Self {
Self::Other(anyhow::Error::new(err))
}
}
/// Open a connection to the given safekeeper and receive WAL, sending back progress
/// messages as we go.
pub(super) async fn handle_walreceiver_connection(
@@ -114,7 +70,7 @@ pub(super) async fn handle_walreceiver_connection(
connect_timeout: Duration,
ctx: RequestContext,
node: NodeId,
) -> Result<(), WalReceiverError> {
) -> anyhow::Result<()> {
debug_assert_current_span_has_tenant_and_timeline_id();
WALRECEIVER_STARTED_CONNECTIONS.inc();
@@ -174,15 +130,11 @@ pub(super) async fn handle_walreceiver_connection(
connection_result = connection => match connection_result {
Ok(()) => debug!("Walreceiver db connection closed"),
Err(connection_error) => {
match WalReceiverError::from(connection_error) {
WalReceiverError::ExpectedSafekeeperError(_) => {
// silence, because most likely we've already exited the outer call
// with a similar error.
},
WalReceiverError::SuccessfulCompletion(_) => {}
WalReceiverError::Other(err) => {
warn!("Connection aborted: {err:#}")
}
if connection_error.is_expected() {
// silence, because most likely we've already exited the outer call
// with a similar error.
} else {
warn!("Connection aborted: {connection_error:#}")
}
}
},
@@ -228,7 +180,7 @@ pub(super) async fn handle_walreceiver_connection(
let mut startpoint = last_rec_lsn;
if startpoint == Lsn(0) {
return Err(WalReceiverError::Other(anyhow!("No previous WAL position")));
bail!("No previous WAL position");
}
// There might be some padding after the last full record, skip it.
@@ -310,9 +262,7 @@ pub(super) async fn handle_walreceiver_connection(
// It is important to deal with the aligned records as lsn in getPage@LSN is
// aligned and can be several bytes bigger. Without this alignment we are
// at risk of hitting a deadlock.
if !lsn.is_aligned() {
return Err(WalReceiverError::Other(anyhow!("LSN not aligned")));
}
ensure!(lsn.is_aligned());
walingest
.ingest_record(recdata, lsn, &mut modification, &mut decoded, &ctx)
@@ -469,3 +419,51 @@ async fn identify_system(client: &mut Client) -> anyhow::Result<IdentifySystem>
Err(IdentifyError.into())
}
}
/// Trait to avoid reporting walreceiver-specific expected ("normal", "ok") errors.
pub(super) trait ExpectedError {
/// Test whether this error is an expected ("ok") error.
///
/// We don't want to report connectivity problems as real errors to the connection manager because
/// 1. they happen frequently enough to make server logs hard to read, and
/// 2. the connection manager can retry another safekeeper.
///
/// If this function returns `true`, it's such an error.
/// The caller should log it at info level and then report to the connection manager that we're done handling this connection.
/// The connection manager will then handle reconnections.
///
/// If this function returns `false`, the error should be propagated and the connection manager
/// will log the error at ERROR level.
fn is_expected(&self) -> bool;
}
impl ExpectedError for postgres::Error {
fn is_expected(&self) -> bool {
self.is_closed()
|| self
.source()
.and_then(|source| source.downcast_ref::<std::io::Error>())
.map(is_expected_io_error)
.unwrap_or(false)
|| self
.as_db_error()
.filter(|db_error| {
db_error.code() == &SqlState::SUCCESSFUL_COMPLETION
&& db_error.message().contains("ending streaming")
})
.is_some()
}
}
impl ExpectedError for anyhow::Error {
fn is_expected(&self) -> bool {
let head = self.downcast_ref::<postgres::Error>();
let tail = self
.chain()
.filter_map(|e| e.downcast_ref::<postgres::Error>());
// check if self or any of the chained/sourced errors are expected
head.into_iter().chain(tail).any(|e| e.is_expected())
}
}
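The `anyhow::Error` impl works because `chain()` walks the error together with all of its sources, so an expected `postgres::Error` is still found after it has been wrapped with extra context. A self-contained sketch of the same chain walk, with `std::io::Error` standing in for `postgres::Error`:

```rust
use std::io::{Error as IoError, ErrorKind};

// Walk the whole chain looking for a specific wrapped error type.
fn is_expected(err: &anyhow::Error) -> bool {
    err.chain()
        .filter_map(|e| e.downcast_ref::<IoError>())
        .any(|io| io.kind() == ErrorKind::ConnectionReset)
}

fn main() {
    let io = IoError::new(ErrorKind::ConnectionReset, "connection reset by peer");
    let wrapped = anyhow::Error::new(io).context("walreceiver connection handling failure");
    assert!(is_expected(&wrapped));
}
```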

View File

@@ -140,24 +140,36 @@ impl UploadQueue {
}
}
let mut files = HashMap::with_capacity(index_part.layer_metadata.len());
for (layer_name, layer_metadata) in &index_part.layer_metadata {
files.insert(
layer_name.to_owned(),
LayerFileMetadata::from(layer_metadata),
);
let mut files = HashMap::with_capacity(index_part.timeline_layers.len());
for layer_name in &index_part.timeline_layers {
match index_part
.layer_metadata
.get(layer_name)
.map(LayerFileMetadata::from)
{
Some(layer_metadata) => {
files.insert(layer_name.to_owned(), layer_metadata);
}
None => {
anyhow::bail!(
"No remote layer metadata found for layer {}",
layer_name.file_name()
);
}
}
}
let index_part_metadata = index_part.parse_metadata()?;
info!(
"initializing upload queue with remote index_part.disk_consistent_lsn: {}",
index_part.metadata.disk_consistent_lsn()
index_part_metadata.disk_consistent_lsn()
);
let state = UploadQueueInitialized {
latest_files: files,
latest_files_changes_since_metadata_upload_scheduled: 0,
latest_metadata: index_part.metadata.clone(),
last_uploaded_consistent_lsn: index_part.metadata.disk_consistent_lsn(),
latest_metadata: index_part_metadata.clone(),
last_uploaded_consistent_lsn: index_part_metadata.disk_consistent_lsn(),
// what follows are boring default initializations
task_counter: 0,
num_inprogress_layer_uploads: 0,

View File

@@ -172,7 +172,7 @@ lfc_change_limit_hook(int newval, void *extra)
{
lfc_desc = BasicOpenFile(lfc_path, O_RDWR|O_CREAT);
if (lfc_desc < 0) {
elog(WARNING, "Failed to open file cache %s: %m, disabling file cache", lfc_path);
elog(LOG, "Failed to open file cache %s: %m", lfc_path);
lfc_size_limit = 0; /* disable file cache */
return;
}
@@ -557,7 +557,7 @@ lfc_write(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
Assert(victim->access_count == 0);
entry->offset = victim->offset; /* grab victim's chunk */
hash_search(lfc_hash, &victim->key, HASH_REMOVE, NULL);
elog(DEBUG2, "Swap file cache page");
elog(LOG, "Swap file cache page");
}
else
{
@@ -574,7 +574,7 @@ lfc_write(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
{
lfc_desc = BasicOpenFile(lfc_path, O_RDWR|O_CREAT);
if (lfc_desc < 0) {
elog(WARNING, "Failed to open file cache %s: %m, disabling file cache", lfc_path);
elog(LOG, "Failed to open file cache %s: %m", lfc_path);
lfc_size_limit = 0; /* disable file cache */
}
}
@@ -583,7 +583,7 @@ lfc_write(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
rc = pwrite(lfc_desc, buffer, BLCKSZ, ((off_t)entry->offset*BLOCKS_PER_CHUNK + chunk_offs)*BLCKSZ);
if (rc != BLCKSZ)
{
elog(WARNING, "Failed to write file cache: %m, disabling file cache");
elog(INFO, "Failed to write file cache: %m");
lfc_size_limit = 0; /* disable file cache */
}
}

View File

@@ -10,7 +10,7 @@ use crate::{
stream::PqStream,
};
use tokio::io::{AsyncRead, AsyncWrite};
use tracing::{error, info, warn};
use tracing::info;
pub(super) async fn authenticate(
api: &impl console::Api,
@@ -55,17 +55,11 @@ pub(super) async fn authenticate(
let mut num_retries = 0;
let mut node = loop {
let wake_res = api.wake_compute(extra, creds).await;
match handle_try_wake(wake_res, num_retries) {
Err(e) => {
error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node");
return Err(e.into());
}
Ok(ControlFlow::Continue(e)) => {
warn!(error = ?e, num_retries, retriable = true, "couldn't wake compute node");
num_retries += 1;
}
Ok(ControlFlow::Break(n)) => break n,
match handle_try_wake(wake_res, num_retries)? {
ControlFlow::Continue(_) => num_retries += 1,
ControlFlow::Break(n) => break n,
}
info!(num_retries, "retrying wake compute");
};
if let Some(keys) = scram_keys {
use tokio_postgres::config::AuthKeys;
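`ControlFlow` keeps the retry loop declarative: hard failures propagate through `?`, `Continue` means try again, and `Break` carries the woken node out of the loop. A standalone sketch of the shape; `try_wake` is a hypothetical stand-in for `handle_try_wake`:

```rust
use std::ops::ControlFlow;

fn try_wake(attempt: u32) -> Result<ControlFlow<&'static str, ()>, &'static str> {
    match attempt {
        0 | 1 => Ok(ControlFlow::Continue(())), // transient failure: retry
        2 => Ok(ControlFlow::Break("node-42")), // success: stop retrying
        _ => Err("permanent failure"),          // hard error: bubble up via ?
    }
}

fn wake_with_retries() -> Result<&'static str, &'static str> {
    let mut num_retries = 0;
    let node = loop {
        match try_wake(num_retries)? {
            ControlFlow::Continue(()) => num_retries += 1,
            ControlFlow::Break(node) => break node,
        }
    };
    Ok(node)
}

fn main() {
    assert_eq!(wake_with_retries(), Ok("node-42"));
}
```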

View File

@@ -230,8 +230,7 @@ pub struct PostgresConnection {
}
impl ConnCfg {
/// Connect to a corresponding compute node.
pub async fn connect(
async fn do_connect(
&self,
allow_self_signed_compute: bool,
timeout: Duration,
@@ -271,6 +270,20 @@ impl ConnCfg {
Ok(connection)
}
/// Connect to a corresponding compute node.
pub async fn connect(
&self,
allow_self_signed_compute: bool,
timeout: Duration,
) -> Result<PostgresConnection, ConnectionError> {
self.do_connect(allow_self_signed_compute, timeout)
.inspect_err(|err| {
// Immediately log the error we have at our disposal.
error!("couldn't connect to compute node: {err}");
})
.await
}
}
/// Retrieve `options` from a startup message, dropping all proxy-specific flags.
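The `do_connect`/`connect` split above exists so the error is logged exactly once at the boundary; `TryFutureExt::inspect_err` peeks at the failure without consuming it. A minimal sketch of that pattern, assuming the futures and tokio crates:

```rust
use futures::TryFutureExt;

async fn do_connect() -> Result<(), std::io::Error> {
    Err(std::io::Error::new(
        std::io::ErrorKind::ConnectionRefused,
        "compute node refused connection",
    ))
}

async fn connect() -> Result<(), std::io::Error> {
    do_connect()
        // Log at the boundary, then let the error propagate unchanged.
        .inspect_err(|err| eprintln!("couldn't connect to compute node: {err}"))
        .await
}

#[tokio::main]
async fn main() {
    assert!(connect().await.is_err());
}
```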

View File

@@ -23,7 +23,7 @@ use tokio::{
time,
};
use tokio_util::sync::CancellationToken;
use tracing::{error, info, info_span, warn, Instrument};
use tracing::{error, info, warn};
use utils::measured_stream::MeasuredStream;
/// Number of times we should retry the `/proxy_wake_compute` http request.
@@ -101,20 +101,21 @@ pub async fn task_main(
tokio::select! {
accept_result = listener.accept() => {
let (socket, peer_addr) = accept_result?;
info!("accepted postgres client connection from {peer_addr}");
let session_id = uuid::Uuid::new_v4();
let cancel_map = Arc::clone(&cancel_map);
connections.spawn(
async move {
info!("accepted postgres client connection");
info!("spawned a task for {peer_addr}");
socket
.set_nodelay(true)
.context("failed to set socket option")?;
handle_client(config, &cancel_map, session_id, socket, ClientMode::Tcp).await
handle_client(config, &cancel_map, session_id, socket, ClientMode::Tcp)
.await
}
.instrument(info_span!("handle_client", ?session_id, %peer_addr))
.unwrap_or_else(move |e| {
// Acknowledge that the task has finished with an error.
error!(?session_id, "per-client task finished with an error: {e:#}");
@@ -182,6 +183,7 @@ impl ClientMode {
}
}
#[tracing::instrument(fields(session_id = ?session_id), skip_all)]
pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
config: &'static ProxyConfig,
cancel_map: &CancelMap,
@@ -423,17 +425,11 @@ where
auth::BackendType::Test(x) => x.wake_compute(),
};
match handle_try_wake(wake_res, num_retries) {
Err(e) => {
error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node");
return Err(e.into());
}
match handle_try_wake(wake_res, num_retries)? {
// failed to wake up but we can continue to retry
Ok(ControlFlow::Continue(e)) => {
warn!(error = ?e, num_retries, retriable = true, "couldn't wake compute node");
}
ControlFlow::Continue(_) => {}
// successfully woke up a compute node and can break the wakeup loop
Ok(ControlFlow::Break(mut node_info)) => {
ControlFlow::Break(mut node_info) => {
node_info.config.reuse_password(&config);
mechanism.update_connect_config(&mut node_info.config);
break node_info;
@@ -444,6 +440,7 @@ where
num_retries += 1;
time::sleep(wait_duration).await;
info!(num_retries, "retrying wake compute");
};
// now that we have a new node, try to connect to it repeatedly.
@@ -454,12 +451,10 @@ where
match mechanism.connect_once(&node_info, CONNECT_TIMEOUT).await {
Ok(res) => return Ok(res),
Err(e) => {
let retriable = e.should_retry(num_retries);
if !retriable {
error!(error = ?e, num_retries, retriable, "couldn't connect to compute node");
error!(error = ?e, "could not connect to compute node");
if !e.should_retry(num_retries) {
return Err(e.into());
}
warn!(error = ?e, num_retries, retriable, "couldn't connect to compute node");
}
}
@@ -467,6 +462,7 @@ where
num_retries += 1;
time::sleep(wait_duration).await;
info!(num_retries, "retrying connect_once");
}
}
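
Editor's note: the hunk above tightens the connect-retry loop so the error is logged once per attempt and only surfaces after should_retry() gives up, with a sleep and a "retrying connect_once" log line between attempts. A Python paraphrase of that control flow (an illustration only; every name below is a placeholder for the Rust items in the diff):

    import time

    wait_duration = 0.1  # placeholder; the proxy computes its own wait between retries
    MAX_RETRIES = 10     # placeholder retry budget

    def should_retry(err: Exception, num_retries: int) -> bool:
        # stands in for ConnectionError::should_retry(num_retries) in the hunk
        return num_retries < MAX_RETRIES

    attempts = 0

    def connect_once() -> None:
        # stands in for mechanism.connect_once(&node_info, CONNECT_TIMEOUT);
        # fails twice, then succeeds, to exercise the retry path
        global attempts
        attempts += 1
        if attempts < 3:
            raise ConnectionError("compute node not ready")

    num_retries = 0
    while True:
        try:
            connect_once()
            break
        except ConnectionError as e:
            if not should_retry(e, num_retries):
                raise
        num_retries += 1
        time.sleep(wait_duration)
        print(f"retrying connect_once, num_retries={num_retries}")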

View File

@@ -79,10 +79,6 @@ struct Args {
/// Listen http endpoint for management and metrics in the form host:port.
#[arg(long, default_value = DEFAULT_HTTP_LISTEN_ADDR)]
listen_http: String,
/// Advertised endpoint for receiving/sending WAL in the form host:port. If not
/// specified, listen_pg is used to advertise instead.
#[arg(long, default_value = None)]
advertise_pg: Option<String>,
/// Availability zone of the safekeeper.
#[arg(long)]
availability_zone: Option<String>,
@@ -189,7 +185,6 @@ async fn main() -> anyhow::Result<()> {
listen_pg_addr: args.listen_pg,
listen_pg_addr_tenant_only: args.listen_pg_tenant_only,
listen_http_addr: args.listen_http,
advertise_pg_addr: args.advertise_pg,
availability_zone: args.availability_zone,
no_sync: args.no_sync,
broker_endpoint: args.broker_endpoint,

View File

@@ -55,7 +55,6 @@ pub struct SafeKeeperConf {
pub listen_pg_addr: String,
pub listen_pg_addr_tenant_only: Option<String>,
pub listen_http_addr: String,
pub advertise_pg_addr: Option<String>,
pub availability_zone: Option<String>,
pub no_sync: bool,
pub broker_endpoint: Uri,
@@ -89,7 +88,6 @@ impl SafeKeeperConf {
listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(),
listen_pg_addr_tenant_only: None,
listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(),
advertise_pg_addr: None,
availability_zone: None,
remote_storage: None,
my_id: NodeId(0),

View File

@@ -568,9 +568,6 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> WalSender<'_, IO> {
{
if self.tli.should_walsender_stop(remote_consistent_lsn).await {
// Terminate if there is nothing more to send.
// Note that "ending streaming" part of the string is used by
// pageserver to identify WalReceiverError::SuccessfulCompletion,
// do not change this string without updating pageserver.
return Err(CopyStreamHandlerEnd::ServerInitiated(format!(
"ending streaming to {:?} at {}, receiver is caughtup and there is no computes",
self.appname, self.start_pos,

View File

@@ -237,10 +237,7 @@ impl SharedState {
commit_lsn: self.sk.inmem.commit_lsn.0,
remote_consistent_lsn: remote_consistent_lsn.0,
peer_horizon_lsn: self.sk.inmem.peer_horizon_lsn.0,
safekeeper_connstr: conf
.advertise_pg_addr
.to_owned()
.unwrap_or(conf.listen_pg_addr.clone()),
safekeeper_connstr: conf.listen_pg_addr.clone(),
backup_lsn: self.sk.inmem.backup_lsn.0,
local_start_lsn: self.sk.state.local_start_lsn.0,
availability_zone: conf.availability_zone.clone(),

View File

@@ -1,60 +0,0 @@
import subprocess
import time
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Optional
from fixtures.log_helper import log
@dataclass
class NeonBroker:
"""An object managing storage_broker instance"""
logfile: Path
port: int
neon_binpath: Path
handle: Optional[subprocess.Popen[Any]] = None # handle of running daemon
def listen_addr(self):
return f"127.0.0.1:{self.port}"
def client_url(self):
return f"http://{self.listen_addr()}"
def check_status(self):
return True # TODO
def try_start(self):
if self.handle is not None:
log.debug(f"storage_broker is already running on port {self.port}")
return
listen_addr = self.listen_addr()
log.info(f'starting storage_broker to listen for incoming connections at "{listen_addr}"')
with open(self.logfile, "wb") as logfile:
args = [
str(self.neon_binpath / "storage_broker"),
f"--listen-addr={listen_addr}",
]
self.handle = subprocess.Popen(args, stdout=logfile, stderr=logfile)
# wait for start
started_at = time.time()
while True:
try:
self.check_status()
except Exception as e:
elapsed = time.time() - started_at
if elapsed > 5:
raise RuntimeError(
f"timed out waiting {elapsed:.0f}s for storage_broker start: {e}"
) from e
time.sleep(0.5)
else:
break # success
def stop(self):
if self.handle is not None:
self.handle.terminate()
self.handle.wait()

View File

@@ -2,11 +2,13 @@ from __future__ import annotations
import abc
import asyncio
import enum
import filecmp
import json
import os
import re
import shutil
import socket
import subprocess
import tempfile
import textwrap
@@ -15,11 +17,12 @@ import uuid
from contextlib import closing, contextmanager
from dataclasses import dataclass, field
from datetime import datetime
from enum import Flag, auto
from functools import cached_property
from itertools import chain, product
from pathlib import Path
from types import TracebackType
from typing import Any, Dict, Iterator, List, Optional, Tuple, Type, cast
from typing import Any, Dict, Iterator, List, Optional, Tuple, Type, Union, cast
from urllib.parse import urlparse
import asyncpg
@@ -39,21 +42,10 @@ from psycopg2.extensions import cursor as PgCursor
from psycopg2.extensions import make_dsn, parse_dsn
from typing_extensions import Literal
from fixtures.broker import NeonBroker
from fixtures.log_helper import log
from fixtures.pageserver.http import PageserverHttpClient
from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload
from fixtures.pg_version import PgVersion
from fixtures.port_distributor import PortDistributor
from fixtures.remote_storage import (
LocalFsStorage,
MockS3Server,
RemoteStorage,
RemoteStorageKind,
RemoteStorageUsers,
S3Storage,
remote_storage_to_toml_inline_table,
)
from fixtures.types import Lsn, TenantId, TimelineId
from fixtures.utils import (
ATTACHMENT_NAME_REGEX,
@@ -86,6 +78,19 @@ DEFAULT_OUTPUT_DIR: str = "test_output"
DEFAULT_BRANCH_NAME: str = "main"
BASE_PORT: int = 15000
WORKER_PORT_NUM: int = 1000
def pytest_configure(config: Config):
"""
Check that we do not overflow the available port range.
"""
numprocesses = config.getoption("numprocesses")
if (
numprocesses is not None and BASE_PORT + numprocesses * WORKER_PORT_NUM > 32768
): # do not use ephemeral ports
raise Exception("Too many workers configured. Cannot distribute ports for services.")
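
Editor's note: the check above enforces the fixtures' port-partitioning scheme: each pytest-xdist worker owns a disjoint block of WORKER_PORT_NUM ports starting at BASE_PORT, and the topmost block must stay below the ephemeral-port floor of 32768. A minimal sketch of the arithmetic (worker_port_range and max_workers are hypothetical helpers, not part of the fixtures):

    BASE_PORT = 15000
    WORKER_PORT_NUM = 1000

    def worker_port_range(worker_seq_no: int) -> range:
        # Worker n owns a contiguous block, disjoint from every other worker's.
        start = BASE_PORT + worker_seq_no * WORKER_PORT_NUM
        return range(start, start + WORKER_PORT_NUM)

    def max_workers() -> int:
        # Largest worker count that still passes the pytest_configure() check.
        return (32768 - BASE_PORT) // WORKER_PORT_NUM

    assert worker_port_range(0) == range(15000, 16000)
    assert worker_port_range(1) == range(16000, 17000)
    assert max_workers() == 17  # 15000 + 17 * 1000 = 32000 <= 32768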
@pytest.fixture(scope="session")
@@ -187,11 +192,6 @@ def shareable_scope(fixture_name: str, config: Config) -> Literal["session", "fu
return scope
@pytest.fixture(scope="session")
def worker_port_num():
return (32768 - BASE_PORT) // int(os.environ.get("PYTEST_XDIST_WORKER_COUNT", "1"))
@pytest.fixture(scope="session")
def worker_seq_no(worker_id: str) -> int:
# worker_id is a pytest-xdist fixture
@@ -204,10 +204,10 @@ def worker_seq_no(worker_id: str) -> int:
@pytest.fixture(scope="session")
def worker_base_port(worker_seq_no: int, worker_port_num: int) -> int:
# so we divide ports in ranges of ports
def worker_base_port(worker_seq_no: int) -> int:
# so we divide ports in ranges of 100 ports
# so workers have disjoint set of ports for services
return BASE_PORT + worker_seq_no * worker_port_num
return BASE_PORT + worker_seq_no * WORKER_PORT_NUM
def get_dir_size(path: str) -> int:
@@ -220,9 +220,80 @@ def get_dir_size(path: str) -> int:
return totalbytes
def can_bind(host: str, port: int) -> bool:
"""
Check whether a host:port is available to bind for listening
Inspired by the can_bind() perl function used in Postgres tests, in
vendor/postgres-v14/src/test/perl/PostgresNode.pm
"""
with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
# TODO: The pageserver and safekeepers don't use SO_REUSEADDR at the
# moment. If that changes, we should start using SO_REUSEADDR here
# too, to allow reusing ports more quickly.
# See https://github.com/neondatabase/neon/issues/801
# sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
try:
sock.bind((host, port))
sock.listen()
return True
except socket.error:
log.info(f"Port {port} is in use, skipping")
return False
finally:
sock.close()
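
Editor's note: a one-line usage sketch of can_bind() (illustration only; the port range is a placeholder) — probe ports until one is free, which is exactly what PortDistributor.get_port() below does:

    port = next(p for p in range(15000, 16000) if can_bind("localhost", p))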
class PortDistributor:
def __init__(self, base_port: int, port_number: int):
self.iterator = iter(range(base_port, base_port + port_number))
self.port_map: Dict[int, int] = {}
def get_port(self) -> int:
for port in self.iterator:
if can_bind("localhost", port):
return port
raise RuntimeError(
"port range configured for test is exhausted, consider enlarging the range"
)
def replace_with_new_port(self, value: Union[int, str]) -> Union[int, str]:
"""
Returns a new port for a port number in a string (like "localhost:1234") or int.
Replacements are memorised, so a substitution for the same port is always the same.
"""
# TODO: replace with structural pattern matching for Python >= 3.10
if isinstance(value, int):
return self._replace_port_int(value)
if isinstance(value, str):
return self._replace_port_str(value)
raise TypeError(f"unsupported type {type(value)} of {value=}")
def _replace_port_int(self, value: int) -> int:
known_port = self.port_map.get(value)
if known_port is None:
known_port = self.port_map[value] = self.get_port()
return known_port
def _replace_port_str(self, value: str) -> str:
# Use regex to find port in a string
# urllib.parse.urlparse produces inconvenient results for cases without scheme like "localhost:5432"
# See https://bugs.python.org/issue27657
ports = re.findall(r":(\d+)(?:/|$)", value)
assert len(ports) == 1, f"can't find port in {value}"
port_int = int(ports[0])
return value.replace(f":{port_int}", f":{self._replace_port_int(port_int)}")
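
Editor's note: a short usage sketch of the class above (illustration; the base port values are placeholders). replace_with_new_port() memoizes substitutions in port_map, so rewriting the same original port twice yields the same result:

    dist = PortDistributor(base_port=15000, port_number=1000)

    pg_port = dist.get_port()    # first bindable port in [15000, 16000)
    http_port = dist.get_port()  # next bindable port, never equal to pg_port

    # Works on bare ints and on "host:port" strings; memoization keeps the
    # two spellings of the same original port consistent.
    assert dist.replace_with_new_port("localhost:5432") == (
        f"localhost:{dist.replace_with_new_port(5432)}"
    )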
@pytest.fixture(scope="session")
def port_distributor(worker_base_port: int, worker_port_num: int) -> PortDistributor:
return PortDistributor(base_port=worker_base_port, port_number=worker_port_num)
def port_distributor(worker_base_port: int) -> PortDistributor:
return PortDistributor(base_port=worker_base_port, port_number=WORKER_PORT_NUM)
@pytest.fixture(scope="session")
@@ -393,6 +464,140 @@ class AuthKeys:
return self.generate_token(scope="tenant", tenant_id=str(tenant_id))
class MockS3Server:
"""
Starts a mock S3 server for testing on the given port; errors if the server fails to start or exits prematurely.
Relies on `poetry` and the `moto` server being installed, since that is how the tests are run.
Also provides a set of methods to derive the connection properties from, and a method to kill the underlying server.
"""
def __init__(
self,
port: int,
):
self.port = port
# XXX: do not use `shell=True` or add `exec ` to the command here otherwise.
# We use `self.subprocess.kill()` to shut down the server, which would not "just" work in Linux
# if a process is started from the shell process.
self.subprocess = subprocess.Popen(["poetry", "run", "moto_server", "s3", f"-p{port}"])
error = None
try:
return_code = self.subprocess.poll()
if return_code is not None:
error = f"expected mock s3 server to run but it exited with code {return_code}. stdout: '{self.subprocess.stdout}', stderr: '{self.subprocess.stderr}'"
except Exception as e:
error = f"expected mock s3 server to start but it failed with exception: {e}. stdout: '{self.subprocess.stdout}', stderr: '{self.subprocess.stderr}'"
if error is not None:
log.error(error)
self.kill()
raise RuntimeError("failed to start s3 mock server")
def endpoint(self) -> str:
return f"http://127.0.0.1:{self.port}"
def region(self) -> str:
return "us-east-1"
def access_key(self) -> str:
return "test"
def secret_key(self) -> str:
return "test"
def kill(self):
self.subprocess.kill()
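
Editor's note: the accessor methods above plug straight into an S3 client; a minimal sketch assuming boto3 is available in the test environment (the port is a placeholder — real tests take one from the port distributor):

    import boto3  # assumption: installed alongside the test suite

    server = MockS3Server(port=15099)  # placeholder port
    s3 = boto3.client(
        "s3",
        endpoint_url=server.endpoint(),
        region_name=server.region(),
        aws_access_key_id=server.access_key(),
        aws_secret_access_key=server.secret_key(),
    )
    s3.create_bucket(Bucket="test-bucket")  # served by the moto subprocess
    server.kill()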
@enum.unique
class RemoteStorageKind(str, enum.Enum):
LOCAL_FS = "local_fs"
MOCK_S3 = "mock_s3"
REAL_S3 = "real_s3"
# Pass to tests that are generic to remote storage
# to ensure the tests pass with or without the remote storage
NOOP = "noop"
def available_remote_storages() -> List[RemoteStorageKind]:
remote_storages = [RemoteStorageKind.LOCAL_FS, RemoteStorageKind.MOCK_S3]
if os.getenv("ENABLE_REAL_S3_REMOTE_STORAGE") is not None:
remote_storages.append(RemoteStorageKind.REAL_S3)
log.info("Enabling real s3 storage for tests")
else:
log.info("Using mock implementations to test remote storage")
return remote_storages
def available_s3_storages() -> List[RemoteStorageKind]:
remote_storages = [RemoteStorageKind.MOCK_S3]
if os.getenv("ENABLE_REAL_S3_REMOTE_STORAGE") is not None:
remote_storages.append(RemoteStorageKind.REAL_S3)
log.info("Enabling real s3 storage for tests")
else:
log.info("Using mock implementations to test remote storage")
return remote_storages
@dataclass
class LocalFsStorage:
root: Path
@dataclass
class S3Storage:
bucket_name: str
bucket_region: str
access_key: str
secret_key: str
endpoint: Optional[str] = None
prefix_in_bucket: Optional[str] = ""
def access_env_vars(self) -> Dict[str, str]:
return {
"AWS_ACCESS_KEY_ID": self.access_key,
"AWS_SECRET_ACCESS_KEY": self.secret_key,
}
def to_string(self) -> str:
return json.dumps(
{
"bucket": self.bucket_name,
"region": self.bucket_region,
"endpoint": self.endpoint,
"prefix": self.prefix_in_bucket,
}
)
RemoteStorage = Union[LocalFsStorage, S3Storage]
# serialize as toml inline table
def remote_storage_to_toml_inline_table(remote_storage: RemoteStorage) -> str:
if isinstance(remote_storage, LocalFsStorage):
remote_storage_config = f"local_path='{remote_storage.root}'"
elif isinstance(remote_storage, S3Storage):
remote_storage_config = f"bucket_name='{remote_storage.bucket_name}',\
bucket_region='{remote_storage.bucket_region}'"
if remote_storage.prefix_in_bucket is not None:
remote_storage_config += f",prefix_in_bucket='{remote_storage.prefix_in_bucket}'"
if remote_storage.endpoint is not None:
remote_storage_config += f",endpoint='{remote_storage.endpoint}'"
else:
raise Exception("invalid remote storage type")
return f"{{{remote_storage_config}}}"
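
Editor's note: worked examples of the serializer above (paths and bucket names are placeholders). The backslash continuation in the S3 f-string keeps the source indentation inside the serialized value, which TOML tolerates as inline-table whitespace:

    print(remote_storage_to_toml_inline_table(LocalFsStorage(root=Path("/tmp/remote"))))
    # -> {local_path='/tmp/remote'}

    s3 = S3Storage(
        bucket_name="neon-test",
        bucket_region="us-east-1",
        access_key="test",
        secret_key="test",
        prefix_in_bucket="ci/run-1",
    )
    print(remote_storage_to_toml_inline_table(s3))
    # -> {bucket_name='neon-test', bucket_region='us-east-1',prefix_in_bucket='ci/run-1'}
    #    (modulo the continuation whitespace after the first comma)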
class RemoteStorageUsers(Flag):
PAGESERVER = auto()
SAFEKEEPER = auto()
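
Editor's note: since RemoteStorageUsers is a Flag, members combine bitwise, so a single value can enable remote storage for several services at once (illustration):

    users = RemoteStorageUsers.PAGESERVER | RemoteStorageUsers.SAFEKEEPER
    assert RemoteStorageUsers.PAGESERVER in users
    assert RemoteStorageUsers.SAFEKEEPER in users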
class NeonEnvBuilder:
"""
Builder object to create a Neon runtime environment
@@ -468,7 +673,7 @@ class NeonEnvBuilder:
# Prepare the default branch to start the postgres on later.
# Pageserver itself does not create tenants and timelines, until started first and asked via HTTP API.
log.debug(
log.info(
f"Services started, creating initial tenant {env.initial_tenant} and its initial timeline"
)
initial_tenant, initial_timeline = env.neon_cli.create_tenant(
@@ -1390,11 +1595,7 @@ class NeonCli(AbstractNeonCli):
if endpoint_id is not None:
args.append(endpoint_id)
s3_env_vars = None
if self.env.remote_storage is not None and isinstance(self.env.remote_storage, S3Storage):
s3_env_vars = self.env.remote_storage.access_env_vars()
res = self.raw_cli(args, extra_env_vars=s3_env_vars)
res = self.raw_cli(args)
res.check_returncode()
return res
@@ -1486,6 +1687,7 @@ class NeonPageserver(PgProtocol):
# FIXME: replication patch for tokio_postgres regards any but CopyDone/CopyData message in CopyBoth stream as unexpected
".*Connection aborted: unexpected message from server*",
".*kill_and_wait_impl.*: wait successful.*",
".*: db error:.*ending streaming to Some.*",
".*query handler for 'pagestream.*failed: Broken pipe.*", # pageserver notices compute shut down
".*query handler for 'pagestream.*failed: Connection reset by peer.*", # pageserver notices compute shut down
# safekeeper connection can fail with this, in the window between timeline creation
@@ -1495,6 +1697,9 @@ class NeonPageserver(PgProtocol):
".*Error processing HTTP request: Forbidden",
# intentional failpoints
".*failpoint ",
# FIXME: there is a race condition between GC and detach, see
# https://github.com/neondatabase/neon/issues/2442
".*could not remove ephemeral file.*No such file or directory.*",
# FIXME: These need investigation
".*manual_gc.*is_shutdown_requested\\(\\) called in an unexpected task or thread.*",
".*tenant_list: timeline is not found in remote index while it is present in the tenants registry.*",
@@ -1517,7 +1722,6 @@ class NeonPageserver(PgProtocol):
".*Compaction failed, retrying in .*: queue is in state Stopped.*",
# Pageserver timeline deletion should be polled until it gets 404, so ignore it globally
".*Error processing HTTP request: NotFound: Timeline .* was not found",
".*took more than expected to complete.*",
]
def start(
@@ -2691,6 +2895,59 @@ class SafekeeperHttpClient(requests.Session):
return metrics
@dataclass
class NeonBroker:
"""An object managing storage_broker instance"""
logfile: Path
port: int
neon_binpath: Path
handle: Optional[subprocess.Popen[Any]] = None # handle of running daemon
def listen_addr(self):
return f"127.0.0.1:{self.port}"
def client_url(self):
return f"http://{self.listen_addr()}"
def check_status(self):
return True # TODO
def try_start(self):
if self.handle is not None:
log.debug(f"storage_broker is already running on port {self.port}")
return
listen_addr = self.listen_addr()
log.info(f'starting storage_broker to listen for incoming connections at "{listen_addr}"')
with open(self.logfile, "wb") as logfile:
args = [
str(self.neon_binpath / "storage_broker"),
f"--listen-addr={listen_addr}",
]
self.handle = subprocess.Popen(args, stdout=logfile, stderr=logfile)
# wait for start
started_at = time.time()
while True:
try:
self.check_status()
except Exception as e:
elapsed = time.time() - started_at
if elapsed > 5:
raise RuntimeError(
f"timed out waiting {elapsed:.0f}s for storage_broker start: {e}"
) from e
time.sleep(0.5)
else:
break # success
def stop(self):
if self.handle is not None:
self.handle.terminate()
self.handle.wait()
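
Editor's note: a usage sketch for the broker object above (illustration; the paths and port are placeholders — real tests take them from neon_binpath and the port distributor):

    broker = NeonBroker(
        logfile=Path("/tmp/test_output/storage_broker.log"),
        port=15003,
        neon_binpath=Path("target/debug"),
    )
    broker.try_start()          # spawns storage_broker and polls until it responds
    print(broker.client_url())  # -> http://127.0.0.1:15003
    broker.stop()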
def get_test_output_dir(request: FixtureRequest, top_output_dir: Path) -> Path:
"""Compute the working directory for an individual test."""
test_name = request.node.name

View File

@@ -1,77 +0,0 @@
import re
import socket
from contextlib import closing
from typing import Dict, Union
from fixtures.log_helper import log
def can_bind(host: str, port: int) -> bool:
"""
Check whether a host:port is available to bind for listening
Inspired by the can_bind() perl function used in Postgres tests, in
vendor/postgres-v14/src/test/perl/PostgresNode.pm
"""
with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
# TODO: The pageserver and safekeepers don't use SO_REUSEADDR at the
# moment. If that changes, we should start using SO_REUSEADDR here
# too, to allow reusing ports more quickly.
# See https://github.com/neondatabase/neon/issues/801
# sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
try:
sock.bind((host, port))
sock.listen()
return True
except socket.error:
log.info(f"Port {port} is in use, skipping")
return False
finally:
sock.close()
class PortDistributor:
def __init__(self, base_port: int, port_number: int):
self.iterator = iter(range(base_port, base_port + port_number))
self.port_map: Dict[int, int] = {}
def get_port(self) -> int:
for port in self.iterator:
if can_bind("localhost", port):
return port
raise RuntimeError(
"port range configured for test is exhausted, consider enlarging the range"
)
def replace_with_new_port(self, value: Union[int, str]) -> Union[int, str]:
"""
Returns a new port for a port number in a string (like "localhost:1234") or int.
Replacements are memorised, so a substitution for the same port is always the same.
"""
# TODO: replace with structural pattern matching for Python >= 3.10
if isinstance(value, int):
return self._replace_port_int(value)
if isinstance(value, str):
return self._replace_port_str(value)
raise TypeError(f"unsupported type {type(value)} of {value=}")
def _replace_port_int(self, value: int) -> int:
known_port = self.port_map.get(value)
if known_port is None:
known_port = self.port_map[value] = self.get_port()
return known_port
def _replace_port_str(self, value: str) -> str:
# Use regex to find port in a string
# urllib.parse.urlparse produces inconvenient results for cases without scheme like "localhost:5432"
# See https://bugs.python.org/issue27657
ports = re.findall(r":(\d+)(?:/|$)", value)
assert len(ports) == 1, f"can't find port in {value}"
port_int = int(ports[0])
return value.replace(f":{port_int}", f":{self._replace_port_int(port_int)}")

View File

@@ -1,143 +0,0 @@
import enum
import json
import os
import subprocess
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Optional, Union
from fixtures.log_helper import log
class MockS3Server:
"""
Starts a mock S3 server for testing on the given port; errors if the server fails to start or exits prematurely.
Relies on `poetry` and the `moto` server being installed, since that is how the tests are run.
Also provides a set of methods to derive the connection properties from, and a method to kill the underlying server.
"""
def __init__(
self,
port: int,
):
self.port = port
# XXX: do not use `shell=True` or add `exec ` to the command here otherwise.
# We use `self.subprocess.kill()` to shut down the server, which would not "just" work in Linux
# if a process is started from the shell process.
self.subprocess = subprocess.Popen(["poetry", "run", "moto_server", "s3", f"-p{port}"])
error = None
try:
return_code = self.subprocess.poll()
if return_code is not None:
error = f"expected mock s3 server to run but it exited with code {return_code}. stdout: '{self.subprocess.stdout}', stderr: '{self.subprocess.stderr}'"
except Exception as e:
error = f"expected mock s3 server to start but it failed with exception: {e}. stdout: '{self.subprocess.stdout}', stderr: '{self.subprocess.stderr}'"
if error is not None:
log.error(error)
self.kill()
raise RuntimeError("failed to start s3 mock server")
def endpoint(self) -> str:
return f"http://127.0.0.1:{self.port}"
def region(self) -> str:
return "us-east-1"
def access_key(self) -> str:
return "test"
def secret_key(self) -> str:
return "test"
def kill(self):
self.subprocess.kill()
@enum.unique
class RemoteStorageKind(str, enum.Enum):
LOCAL_FS = "local_fs"
MOCK_S3 = "mock_s3"
REAL_S3 = "real_s3"
# Pass to tests that are generic to remote storage
# to ensure the tests pass with or without the remote storage
NOOP = "noop"
def available_remote_storages() -> List[RemoteStorageKind]:
remote_storages = [RemoteStorageKind.LOCAL_FS, RemoteStorageKind.MOCK_S3]
if os.getenv("ENABLE_REAL_S3_REMOTE_STORAGE") is not None:
remote_storages.append(RemoteStorageKind.REAL_S3)
log.info("Enabling real s3 storage for tests")
else:
log.info("Using mock implementations to test remote storage")
return remote_storages
def available_s3_storages() -> List[RemoteStorageKind]:
remote_storages = [RemoteStorageKind.MOCK_S3]
if os.getenv("ENABLE_REAL_S3_REMOTE_STORAGE") is not None:
remote_storages.append(RemoteStorageKind.REAL_S3)
log.info("Enabling real s3 storage for tests")
else:
log.info("Using mock implementations to test remote storage")
return remote_storages
@dataclass
class LocalFsStorage:
root: Path
@dataclass
class S3Storage:
bucket_name: str
bucket_region: str
access_key: str
secret_key: str
endpoint: Optional[str] = None
prefix_in_bucket: Optional[str] = ""
def access_env_vars(self) -> Dict[str, str]:
return {
"AWS_ACCESS_KEY_ID": self.access_key,
"AWS_SECRET_ACCESS_KEY": self.secret_key,
}
def to_string(self) -> str:
return json.dumps(
{
"bucket": self.bucket_name,
"region": self.bucket_region,
"endpoint": self.endpoint,
"prefix": self.prefix_in_bucket,
}
)
RemoteStorage = Union[LocalFsStorage, S3Storage]
# serialize as toml inline table
def remote_storage_to_toml_inline_table(remote_storage: RemoteStorage) -> str:
if isinstance(remote_storage, LocalFsStorage):
remote_storage_config = f"local_path='{remote_storage.root}'"
elif isinstance(remote_storage, S3Storage):
remote_storage_config = f"bucket_name='{remote_storage.bucket_name}',\
bucket_region='{remote_storage.bucket_region}'"
if remote_storage.prefix_in_bucket is not None:
remote_storage_config += f",prefix_in_bucket='{remote_storage.prefix_in_bucket}'"
if remote_storage.endpoint is not None:
remote_storage_config += f",endpoint='{remote_storage.endpoint}'"
else:
raise Exception("invalid remote storage type")
return f"{{{remote_storage_config}}}"
class RemoteStorageUsers(enum.Flag):
PAGESERVER = enum.auto()
SAFEKEEPER = enum.auto()

View File

@@ -3,11 +3,12 @@ from typing import Generator, Optional
import pytest
from fixtures.neon_fixtures import (
LocalFsStorage,
NeonEnv,
NeonEnvBuilder,
RemoteStorageKind,
)
from fixtures.pageserver.http import PageserverApiException, TenantConfig
from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind
from fixtures.types import TenantId
from fixtures.utils import wait_until

View File

@@ -13,6 +13,7 @@ from fixtures.neon_fixtures import (
NeonCli,
NeonEnvBuilder,
PgBin,
PortDistributor,
)
from fixtures.pageserver.http import PageserverHttpClient
from fixtures.pageserver.utils import (
@@ -21,7 +22,6 @@ from fixtures.pageserver.utils import (
wait_for_upload,
)
from fixtures.pg_version import PgVersion
from fixtures.port_distributor import PortDistributor
from fixtures.types import Lsn
from pytest import FixtureRequest
@@ -337,7 +337,6 @@ def check_neon_works(
config.pg_version = pg_version
config.initial_tenant = snapshot_config["default_tenant_id"]
config.pg_distrib_dir = pg_distrib_dir
config.remote_storage = None
# Use the "target" binaries to launch the storage nodes
config_target = config

View File

@@ -7,14 +7,15 @@ import pytest
import toml
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
LocalFsStorage,
NeonEnv,
NeonEnvBuilder,
PgBin,
RemoteStorageKind,
wait_for_last_flush_lsn,
)
from fixtures.pageserver.http import PageserverHttpClient
from fixtures.pageserver.utils import wait_for_upload_queue_empty
from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind
from fixtures.types import Lsn, TenantId, TimelineId
from fixtures.utils import wait_until

View File

@@ -8,9 +8,10 @@ import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
NeonEnvBuilder,
RemoteStorageKind,
available_s3_storages,
)
from fixtures.pg_version import PgVersion
from fixtures.remote_storage import RemoteStorageKind, available_s3_storages
# Cleaning up downloaded files is important for local tests

View File

@@ -33,4 +33,4 @@ def test_duplicate_layers(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
time.sleep(10) # let compaction to be performed
assert env.pageserver.log_contains("compact-level0-phase1-return-same")
pg_bin.run_capture(["pgbench", "-P1", "-N", "-c5", "-T200", "-Mprepared", connstr])
pg_bin.run_capture(["pgbench", "-P1", "-N", "-c5", "-T500", "-Mprepared", connstr])

View File

@@ -5,9 +5,9 @@ from fixtures.log_helper import log
from fixtures.neon_fixtures import (
NeonEnvBuilder,
PgBin,
PortDistributor,
VanillaPostgres,
)
from fixtures.port_distributor import PortDistributor
from fixtures.types import Lsn, TimelineId
from fixtures.utils import query_scalar, subprocess_capture

View File

@@ -8,9 +8,9 @@ from fixtures.neon_fixtures import (
Endpoint,
NeonEnv,
NeonEnvBuilder,
RemoteStorageKind,
wait_for_last_flush_lsn,
)
from fixtures.remote_storage import RemoteStorageKind
from fixtures.types import TenantId, TimelineId
from fixtures.utils import query_scalar

View File

@@ -4,10 +4,10 @@ import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
NeonEnvBuilder,
RemoteStorageKind,
wait_for_last_flush_lsn,
)
from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload
from fixtures.remote_storage import RemoteStorageKind
from fixtures.types import Lsn, TenantId, TimelineId
from fixtures.utils import query_scalar

View File

@@ -13,11 +13,11 @@ from fixtures.neon_fixtures import (
PSQL,
NeonEnvBuilder,
NeonProxy,
PortDistributor,
RemoteStorageKind,
VanillaPostgres,
wait_for_last_flush_lsn,
)
from fixtures.port_distributor import PortDistributor
from fixtures.remote_storage import RemoteStorageKind
from fixtures.types import TenantId, TimelineId
from fixtures.utils import query_scalar
from pytest_httpserver import HTTPServer

View File

@@ -1,5 +1,4 @@
from fixtures.neon_fixtures import NeonEnvBuilder
from fixtures.port_distributor import PortDistributor
from fixtures.neon_fixtures import NeonEnvBuilder, PortDistributor
# Test that neon cli is able to start and stop all processes with the user defaults.

View File

@@ -10,6 +10,8 @@ import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
NeonEnvBuilder,
RemoteStorageKind,
available_remote_storages,
last_flush_lsn_upload,
wait_for_last_flush_lsn,
)
@@ -21,7 +23,6 @@ from fixtures.pageserver.utils import (
wait_for_upload_queue_empty,
wait_until_tenant_state,
)
from fixtures.remote_storage import RemoteStorageKind, available_remote_storages
from fixtures.types import Lsn
from fixtures.utils import query_scalar, wait_until

View File

@@ -12,7 +12,6 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder):
env.neon_cli.create_branch("test_pageserver_restart")
endpoint = env.endpoints.create_start("test_pageserver_restart")
pageserver_http = env.pageserver.http_client()
pg_conn = endpoint.connect()
cur = pg_conn.cursor()
@@ -53,11 +52,8 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder):
# pageserver does if a compute node connects and sends a request for the tenant
# while it's still in Loading state. (It waits for the loading to finish, and then
# processes the request.)
tenant_load_delay_ms = 5000
env.pageserver.stop()
env.pageserver.start(
extra_env_vars={"FAILPOINTS": f"before-loading-tenant=return({tenant_load_delay_ms})"}
)
env.pageserver.start(extra_env_vars={"FAILPOINTS": "before-loading-tenant=return(5000)"})
# Check that it's in Loading state
client = env.pageserver.http_client()
@@ -69,41 +65,6 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder):
cur.execute("SELECT count(*) FROM foo")
assert cur.fetchone() == (100000,)
# Validate startup time metrics
metrics = pageserver_http.get_metrics()
# Expectation callbacks: arg t is sample value, arg p is the previous phase's sample value
expectations = {
"initial": lambda t, p: True, # make no assumptions about the initial time point, it could be 0 in theory
# Initial tenant load should reflect the delay we injected
"initial_tenant_load": lambda t, p: t >= (tenant_load_delay_ms / 1000.0) and t >= p,
# Subsequent steps should occur in expected order
"initial_logical_sizes": lambda t, p: t > 0 and t >= p,
"background_jobs_can_start": lambda t, p: t > 0 and t >= p,
"complete": lambda t, p: t > 0 and t >= p,
}
prev_value = None
for sample in metrics.query_all("pageserver_startup_duration_seconds"):
labels = dict(sample.labels)
phase = labels["phase"]
log.info(f"metric {phase}={sample.value}")
assert phase in expectations, f"Unexpected phase {phase}"
assert expectations[phase](
sample.value, prev_value
), f"Unexpected value for {phase}: {sample.value}"
prev_value = sample.value
# Startup is complete, this metric should exist but be zero
assert metrics.query_one("pageserver_startup_is_loading").value == 0
# This histogram should have been populated, although we aren't specific about exactly
# which bucket values: just nonzero
assert any(
bucket.value > 0
for bucket in metrics.query_all("pageserver_tenant_activation_seconds_bucket")
)
# Test that repeatedly kills and restarts the page server, while the
# safekeeper and compute node keep running.

View File

@@ -12,7 +12,10 @@ from typing import Dict, List, Optional, Tuple
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
LocalFsStorage,
NeonEnvBuilder,
RemoteStorageKind,
available_remote_storages,
wait_for_last_flush_lsn,
)
from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
@@ -23,11 +26,6 @@ from fixtures.pageserver.utils import (
wait_until_tenant_active,
wait_until_tenant_state,
)
from fixtures.remote_storage import (
LocalFsStorage,
RemoteStorageKind,
available_remote_storages,
)
from fixtures.types import Lsn, TenantId, TimelineId
from fixtures.utils import print_gc_result, query_scalar, wait_until
from requests import ReadTimeout

View File

@@ -6,8 +6,7 @@ from typing import Optional, Type
import backoff
from fixtures.log_helper import log
from fixtures.neon_fixtures import PgProtocol, VanillaPostgres
from fixtures.port_distributor import PortDistributor
from fixtures.neon_fixtures import PgProtocol, PortDistributor, VanillaPostgres
def generate_tls_cert(cn, certout, keyout):

View File

@@ -4,10 +4,11 @@ from contextlib import closing
import psycopg2.extras
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
LocalFsStorage,
NeonEnvBuilder,
RemoteStorageKind,
)
from fixtures.pageserver.utils import assert_tenant_state, wait_for_upload
from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind
from fixtures.types import Lsn
from fixtures.utils import wait_until

View File

@@ -11,6 +11,8 @@ from fixtures.neon_fixtures import (
Endpoint,
NeonEnv,
NeonEnvBuilder,
RemoteStorageKind,
available_remote_storages,
)
from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
from fixtures.pageserver.utils import (
@@ -18,7 +20,6 @@ from fixtures.pageserver.utils import (
wait_for_upload,
wait_until_tenant_state,
)
from fixtures.remote_storage import RemoteStorageKind, available_remote_storages
from fixtures.types import Lsn, TenantId, TimelineId
from fixtures.utils import query_scalar, wait_until
from prometheus_client.samples import Sample

View File

@@ -7,12 +7,15 @@ from pathlib import Path
from typing import Any, Dict, Optional, Tuple
import pytest
from fixtures.broker import NeonBroker
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
Endpoint,
NeonBroker,
NeonEnv,
NeonEnvBuilder,
PortDistributor,
RemoteStorageKind,
available_remote_storages,
)
from fixtures.pageserver.http import PageserverHttpClient
from fixtures.pageserver.utils import (
@@ -21,8 +24,6 @@ from fixtures.pageserver.utils import (
wait_for_last_record_lsn,
wait_for_upload,
)
from fixtures.port_distributor import PortDistributor
from fixtures.remote_storage import RemoteStorageKind, available_remote_storages
from fixtures.types import Lsn, TenantId, TimelineId
from fixtures.utils import (
query_scalar,

View File

@@ -18,9 +18,10 @@ from fixtures.metrics import (
from fixtures.neon_fixtures import (
NeonEnv,
NeonEnvBuilder,
RemoteStorageKind,
available_remote_storages,
)
from fixtures.pageserver.utils import timeline_delete_wait_completed
from fixtures.remote_storage import RemoteStorageKind, available_remote_storages
from fixtures.types import Lsn, TenantId, TimelineId
from fixtures.utils import wait_until
from prometheus_client.samples import Sample

View File

@@ -16,8 +16,11 @@ import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
Endpoint,
LocalFsStorage,
NeonEnv,
NeonEnvBuilder,
RemoteStorageKind,
available_remote_storages,
last_flush_lsn_upload,
)
from fixtures.pageserver.utils import (
@@ -25,11 +28,6 @@ from fixtures.pageserver.utils import (
wait_for_last_record_lsn,
wait_for_upload,
)
from fixtures.remote_storage import (
LocalFsStorage,
RemoteStorageKind,
available_remote_storages,
)
from fixtures.types import Lsn, TenantId, TimelineId
from fixtures.utils import query_scalar, wait_until

View File

@@ -6,10 +6,10 @@ from fixtures.log_helper import log
from fixtures.neon_fixtures import (
NeonEnvBuilder,
PgBin,
RemoteStorageKind,
last_flush_lsn_upload,
)
from fixtures.pageserver.http import LayerMapInfo
from fixtures.remote_storage import RemoteStorageKind
from fixtures.types import TimelineId
from pytest_httpserver import HTTPServer

View File

@@ -13,6 +13,9 @@ from fixtures.neon_fixtures import (
NeonEnv,
NeonEnvBuilder,
PgBin,
RemoteStorageKind,
S3Storage,
available_remote_storages,
last_flush_lsn_upload,
wait_for_last_flush_lsn,
)
@@ -25,11 +28,6 @@ from fixtures.pageserver.utils import (
wait_until_tenant_active,
wait_until_timeline_state,
)
from fixtures.remote_storage import (
RemoteStorageKind,
S3Storage,
available_remote_storages,
)
from fixtures.types import Lsn, TenantId, TimelineId
from fixtures.utils import query_scalar, wait_until
@@ -272,23 +270,6 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
wait_timeline_detail_404(
ps_http, env.initial_tenant, timeline_id, iterations=iterations
)
if failpoint == "timeline-delete-after-index-delete":
m = ps_http.get_metrics()
assert (
m.query_one(
"remote_storage_s3_request_seconds_count",
filter={"request_type": "get_object", "result": "err"},
).value
== 1
)
assert (
m.query_one(
"remote_storage_s3_request_seconds_count",
filter={"request_type": "get_object", "result": "ok"},
).value
== 1
)
elif check is Check.RETRY_WITHOUT_RESTART:
# this should succeed
# this also checks that delete can be retried even when timeline is in Broken state

View File

@@ -16,6 +16,8 @@ from fixtures.neon_fixtures import (
NeonEnv,
NeonEnvBuilder,
PgBin,
PortDistributor,
RemoteStorageKind,
VanillaPostgres,
wait_for_last_flush_lsn,
)
@@ -27,8 +29,6 @@ from fixtures.pageserver.utils import (
wait_until_tenant_active,
)
from fixtures.pg_version import PgVersion
from fixtures.port_distributor import PortDistributor
from fixtures.remote_storage import RemoteStorageKind
from fixtures.types import TenantId, TimelineId
from fixtures.utils import get_timeline_dir_size, wait_until

View File

@@ -15,18 +15,22 @@ from typing import Any, List, Optional
import psycopg2
import pytest
from fixtures.broker import NeonBroker
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
Endpoint,
NeonBroker,
NeonEnv,
NeonEnvBuilder,
NeonPageserver,
PgBin,
PgProtocol,
PortDistributor,
RemoteStorageKind,
RemoteStorageUsers,
Safekeeper,
SafekeeperHttpClient,
SafekeeperPort,
available_remote_storages,
)
from fixtures.pageserver.utils import (
timeline_delete_wait_completed,
@@ -34,12 +38,6 @@ from fixtures.pageserver.utils import (
wait_for_upload,
)
from fixtures.pg_version import PgVersion
from fixtures.port_distributor import PortDistributor
from fixtures.remote_storage import (
RemoteStorageKind,
RemoteStorageUsers,
available_remote_storages,
)
from fixtures.types import Lsn, TenantId, TimelineId
from fixtures.utils import get_dir_size, query_scalar, start_in_background

View File

@@ -245,7 +245,7 @@ def test_restarts_frequent_checkpoints(neon_env_builder: NeonEnvBuilder):
# we try to simulate large (flush_lsn - truncate_lsn) lag, to test that WAL segments
# are not removed before broadcasted to all safekeepers, with the help of replication slot
asyncio.run(
run_restarts_under_load(env, endpoint, env.safekeepers, period_time=15, iterations=4)
run_restarts_under_load(env, endpoint, env.safekeepers, period_time=15, iterations=5)
)

View File

@@ -5,9 +5,9 @@ import pytest
from fixtures.neon_fixtures import (
NeonEnvBuilder,
PgBin,
PortDistributor,
VanillaPostgres,
)
from fixtures.port_distributor import PortDistributor
from fixtures.types import TenantId, TimelineId