Compare commits

12 Commits

Author           SHA1        Message                                                  Date
Bojan Serafimov  aa9baddd3d  Fix unrelated test                                       2023-06-09 10:27:54 -04:00
Bojan Serafimov  4756dcd0cc  fmt                                                      2023-06-09 09:57:06 -04:00
Bojan Serafimov  ab23e28768  revert test changes                                      2023-06-09 09:50:05 -04:00
Bojan Serafimov  341563261a  Store compute spec id inside basebackup                  2023-06-09 00:39:06 -04:00
Bojan Serafimov  b836013721  Cleanup pg_stat_statements                               2023-06-08 22:52:52 -04:00
Bojan Serafimov  44ad006eb3  Merge branch 'main' into startup-no-config               2023-06-08 18:12:15 -04:00
Bojan Serafimov  aff94b54c8  more roles, dbs                                          2023-06-08 12:47:59 -04:00
Bojan Serafimov  1adb38bb82  Merge branch 'new-startup-test' into startup-no-config   2023-06-08 12:41:14 -04:00
Bojan Serafimov  1baecdc27a  comments                                                 2023-06-08 12:33:19 -04:00
Bojan Serafimov  eceda63379  Do two iterations                                        2023-06-08 12:30:47 -04:00
Bojan Serafimov  881bfc4da8  WIP                                                      2023-06-08 10:21:07 -04:00
Bojan Serafimov  eda4f86588  Add startup test                                         2023-06-06 16:45:16 -04:00
65 changed files with 1905 additions and 3928 deletions

View File

@@ -67,7 +67,7 @@ RUN apt update && \
RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar.gz -O SFCGAL.tar.gz && \
echo "4e39b3b2adada6254a7bdba6d297bb28e1a9835a9f879b74f37e2dab70203232 SFCGAL.tar.gz" | sha256sum --check && \
mkdir sfcgal-src && cd sfcgal-src && tar xvzf ../SFCGAL.tar.gz --strip-components=1 -C . && \
cmake -DCMAKE_BUILD_TYPE=Release . && make -j $(getconf _NPROCESSORS_ONLN) && \
cmake . && make -j $(getconf _NPROCESSORS_ONLN) && \
DESTDIR=/sfcgal make install -j $(getconf _NPROCESSORS_ONLN) && \
make clean && cp -R /sfcgal/* /
@@ -95,7 +95,7 @@ RUN wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouti
mkdir pgrouting-src && cd pgrouting-src && tar xvzf ../pgrouting.tar.gz --strip-components=1 -C . && \
mkdir build && \
cd build && \
cmake -DCMAKE_BUILD_TYPE=Release .. && \
cmake .. && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrouting.control
@@ -355,7 +355,7 @@ RUN apt-get update && \
wget https://github.com/timescale/timescaledb/archive/refs/tags/2.10.1.tar.gz -O timescaledb.tar.gz && \
echo "6fca72a6ed0f6d32d2b3523951ede73dc5f9b0077b38450a029a5f411fdb8c73 timescaledb.tar.gz" | sha256sum --check && \
mkdir timescaledb-src && cd timescaledb-src && tar xvzf ../timescaledb.tar.gz --strip-components=1 -C . && \
./bootstrap -DSEND_TELEMETRY_DEFAULT:BOOL=OFF -DUSE_TELEMETRY:BOOL=OFF -DAPACHE_ONLY:BOOL=ON -DCMAKE_BUILD_TYPE=Release && \
./bootstrap -DSEND_TELEMETRY_DEFAULT:BOOL=OFF -DUSE_TELEMETRY:BOOL=OFF -DAPACHE_ONLY:BOOL=ON && \
cd build && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make install -j $(getconf _NPROCESSORS_ONLN) && \
@@ -410,7 +410,7 @@ RUN apt-get update && \
mkdir kq_imcx-src && cd kq_imcx-src && tar xvzf ../kq_imcx.tar.gz --strip-components=1 -C . && \
mkdir build && \
cd build && \
cmake -DCMAKE_BUILD_TYPE=Release .. && \
cmake .. && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/kq_imcx.control
@@ -432,54 +432,6 @@ RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.5.2.tar.gz -O
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_cron.control
#########################################################################################
#
# Layer "rdkit-pg-build"
# compile rdkit extension
#
#########################################################################################
FROM build-deps AS rdkit-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN apt-get update && \
apt-get install -y \
cmake \
libboost-iostreams1.74-dev \
libboost-regex1.74-dev \
libboost-serialization1.74-dev \
libboost-system1.74-dev \
libeigen3-dev \
libfreetype6-dev
ENV PATH "/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH"
RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_1.tar.gz -O rdkit.tar.gz && \
echo "db346afbd0ba52c843926a2a62f8a38c7b774ffab37eaf382d789a824f21996c rdkit.tar.gz" | sha256sum --check && \
mkdir rdkit-src && cd rdkit-src && tar xvzf ../rdkit.tar.gz --strip-components=1 -C . && \
cmake \
-D RDK_BUILD_CAIRO_SUPPORT=OFF \
-D RDK_BUILD_INCHI_SUPPORT=ON \
-D RDK_BUILD_AVALON_SUPPORT=ON \
-D RDK_BUILD_PYTHON_WRAPPERS=OFF \
-D RDK_BUILD_DESCRIPTORS3D=OFF \
-D RDK_BUILD_FREESASA_SUPPORT=OFF \
-D RDK_BUILD_COORDGEN_SUPPORT=ON \
-D RDK_BUILD_MOLINTERCHANGE_SUPPORT=OFF \
-D RDK_BUILD_YAEHMOP_SUPPORT=OFF \
-D RDK_BUILD_STRUCTCHECKER_SUPPORT=OFF \
-D RDK_USE_URF=OFF \
-D RDK_BUILD_PGSQL=ON \
-D RDK_PGSQL_STATIC=ON \
-D PostgreSQL_CONFIG=pg_config \
-D PostgreSQL_INCLUDE_DIR=`pg_config --includedir` \
-D PostgreSQL_TYPE_INCLUDE_DIR=`pg_config --includedir-server` \
-D PostgreSQL_LIBRARY_DIR=`pg_config --libdir` \
-D RDK_INSTALL_INTREE=OFF \
-D CMAKE_BUILD_TYPE=Release \
. && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/rdkit.control
#########################################################################################
#
# Layer "rust extensions"
@@ -612,7 +564,6 @@ COPY --from=pg-hint-plan-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=kq-imcx-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-cron-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-pgx-ulid-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=rdkit-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY pgxn/ pgxn/
RUN make -j $(getconf _NPROCESSORS_ONLN) \
@@ -686,19 +637,14 @@ COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-deb
# libgeos, libgdal, libsfcgal1, libproj and libprotobuf-c1 for PostGIS
# libxml2, libxslt1.1 for xml2
# libzstd1 for zstd
# libboost*, libfreetype6, and zlib1g for rdkit
RUN apt update && \
apt install --no-install-recommends -y \
gdb \
locales \
libicu67 \
liblz4-1 \
libreadline8 \
libboost-iostreams1.74.0 \
libboost-regex1.74.0 \
libboost-serialization1.74.0 \
libboost-system1.74.0 \
libossp-uuid16 \
libfreetype6 \
libgeos-c1v5 \
libgdal28 \
libproj19 \
@@ -708,9 +654,7 @@ RUN apt update && \
libxslt1.1 \
libzstd1 \
libcurl4-openssl-dev \
locales \
procps \
zlib1g && \
procps && \
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8

View File

@@ -370,6 +370,11 @@ impl ComputeNode {
// 'Close' connection
drop(client);
info!(
"finished configuration of compute for project {}",
spec.cluster.cluster_id.as_deref().unwrap_or("None")
);
Ok(())
}
@@ -422,23 +427,57 @@ impl ComputeNode {
#[instrument(skip(self))]
pub fn start_compute(&self) -> Result<std::process::Child> {
let compute_state = self.state.lock().unwrap().clone();
let pspec = compute_state.pspec.as_ref().expect("spec must be set");
let spec = compute_state.pspec.as_ref().expect("spec must be set");
info!(
"starting compute for project {}, operation {}, tenant {}, timeline {}",
pspec.spec.cluster.cluster_id.as_deref().unwrap_or("None"),
pspec.spec.operation_uuid.as_deref().unwrap_or("None"),
pspec.tenant_id,
pspec.timeline_id,
spec.spec.cluster.cluster_id.as_deref().unwrap_or("None"),
spec.spec.operation_uuid.as_deref().unwrap_or("None"),
spec.tenant_id,
spec.timeline_id,
);
self.prepare_pgdata(&compute_state)?;
let start_time = Utc::now();
let pg = self.start_postgres(pspec.storage_auth_token.clone())?;
let pg = self.start_postgres(spec.storage_auth_token.clone())?;
if pspec.spec.mode == ComputeMode::Primary && !pspec.spec.skip_pg_catalog_updates {
self.apply_config(&compute_state)?;
// Maybe apply the spec
if spec.spec.mode == ComputeMode::Primary {
let spec = &compute_state.pspec.as_ref().expect("spec must be set").spec;
// Get spec_id or make it up by hashing
//
// TODO Make spec_id required so there would be no need to hash.
let spec_id = spec.operation_uuid.clone().unwrap_or_else(|| {
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};
// HACK Exclude postgresql.conf because it doesn't need
// to be applied like the other fields in the spec
let mut spec_no_conf = spec.clone();
spec_no_conf.cluster.postgresql_conf = None;
let json = serde_json::to_vec(&spec_no_conf).unwrap();
let mut hasher = DefaultHasher::new();
json.hash(&mut hasher);
let hash = hasher.finish();
format!("{:x}", hash)
});
// Get current spec_id
let path = Path::new(&self.pgdata).join("neon_compute_spec_id.txt");
let current_spec_id = std::fs::read_to_string(path).ok();
// Respec if needed
if current_spec_id == Some(spec_id.clone()) {
info!("no need to respec");
} else {
info!("respeccing {:?} {:?}", current_spec_id, &spec_id);
self.apply_config(&compute_state)?;
self.cache_spec_id(&compute_state, spec_id)?;
}
}
let startup_end_time = Utc::now();
@@ -457,14 +496,31 @@ impl ComputeNode {
}
self.set_status(ComputeStatus::Running);
info!(
"finished configuration of compute for project {}",
pspec.spec.cluster.cluster_id.as_deref().unwrap_or("None")
);
Ok(pg)
}
fn cache_spec_id(&self, compute_state: &ComputeState, spec_id: String) -> anyhow::Result<()> {
let spec = &compute_state.pspec.as_ref().expect("spec must be set");
let cmd = format!(
"set_compute_spec_id {} {} {}",
spec.tenant_id, spec.timeline_id, spec_id,
);
let mut config = postgres::Config::from_str(&spec.pageserver_connstr)?;
// Use the storage auth token from the config file, if given.
// Note: this overrides any password set in the connection string.
if let Some(storage_auth_token) = &spec.storage_auth_token {
info!("Got storage auth token from spec file");
config.password(storage_auth_token);
} else {
info!("Storage auth token not set");
}
let mut client = config.connect(NoTls)?;
client.simple_query(&cmd)?;
Ok(())
}
// Look for core dumps and collect backtraces.
//
// EKS worker nodes have following core dump settings:
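
Aside: the respec decision in start_compute reduces to comparing the spec id cached inside PGDATA (delivered via the basebackup as neon_compute_spec_id.txt) against the freshly derived one. A minimal restatement of that check, factored into a standalone helper purely for illustration (the helper name is invented; the file name comes from the diff):

    use std::path::Path;

    /// Sketch only: decide whether apply_config() needs to run again.
    /// Returns true when nothing is cached (first start) or when the
    /// cached id differs from the newly computed one.
    fn needs_respec(pgdata: &str, new_spec_id: &str) -> bool {
        let path = Path::new(pgdata).join("neon_compute_spec_id.txt");
        match std::fs::read_to_string(path) {
            Ok(current) => current != new_spec_id, // ids differ: spec changed, reapply
            Err(_) => true,                        // nothing cached: apply and cache
        }
    }

On a first start nothing is cached, so the spec is always applied and then cached for the next restart.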

View File

@@ -308,8 +308,7 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
let mut env =
LocalEnv::parse_config(&toml_file).context("Failed to create neon configuration")?;
let force = init_match.get_flag("force");
env.init(pg_version, force)
env.init(pg_version)
.context("Failed to initialize neon repository")?;
// Initialize pageserver, create initial tenant and timeline.
@@ -1014,13 +1013,6 @@ fn cli() -> Command {
.help("If set, the node will be a hot replica on the specified timeline")
.required(false);
let force_arg = Arg::new("force")
.value_parser(value_parser!(bool))
.long("force")
.action(ArgAction::SetTrue)
.help("Force initialization even if the repository is not empty")
.required(false);
Command::new("Neon CLI")
.arg_required_else_help(true)
.version(GIT_VERSION)
@@ -1036,7 +1028,6 @@ fn cli() -> Command {
.value_name("config"),
)
.arg(pg_version_arg.clone())
.arg(force_arg)
)
.subcommand(
Command::new("timeline")

View File

@@ -450,7 +450,6 @@ impl Endpoint {
// Create spec file
let spec = ComputeSpec {
skip_pg_catalog_updates: false,
format_version: 1.0,
operation_uuid: None,
cluster: Cluster {

View File

@@ -364,7 +364,7 @@ impl LocalEnv {
//
// Initialize a new Neon repository
//
pub fn init(&mut self, pg_version: u32, force: bool) -> anyhow::Result<()> {
pub fn init(&mut self, pg_version: u32) -> anyhow::Result<()> {
// check if config already exists
let base_path = &self.base_data_dir;
ensure!(
@@ -372,29 +372,11 @@ impl LocalEnv {
"repository base path is missing"
);
if base_path.exists() {
if force {
println!("removing all contents of '{}'", base_path.display());
// instead of directly calling `remove_dir_all`, we keep the original dir but removing
// all contents inside. This helps if the developer symbol links another directory (i.e.,
// S3 local SSD) to the `.neon` base directory.
for entry in std::fs::read_dir(base_path)? {
let entry = entry?;
let path = entry.path();
if path.is_dir() {
fs::remove_dir_all(&path)?;
} else {
fs::remove_file(&path)?;
}
}
} else {
bail!(
"directory '{}' already exists. Perhaps already initialized? (Hint: use --force to remove all contents)",
base_path.display()
);
}
}
ensure!(
!base_path.exists(),
"directory '{}' already exists. Perhaps already initialized?",
base_path.display()
);
if !self.pg_bin_dir(pg_version)?.join("postgres").exists() {
bail!(
"Can't find postgres binary at {}",
@@ -410,7 +392,7 @@ impl LocalEnv {
}
}
fs::create_dir_all(base_path)?;
fs::create_dir(base_path)?;
// Generate keypair for JWT.
//

View File

@@ -27,12 +27,6 @@ pub struct ComputeSpec {
pub cluster: Cluster,
pub delta_operations: Option<Vec<DeltaOp>>,
/// An optional hint that can be passed to speed up startup time if we know
/// that no pg catalog mutations (like role creation, database creation,
/// extension creation) need to be done on the actual database to start.
#[serde(default)] // Default false
pub skip_pg_catalog_updates: bool,
// Information needed to connect to the storage layer.
//
// `tenant_id`, `timeline_id` and `pageserver_connstring` are always needed.

View File

@@ -111,8 +111,6 @@ pub trait RemoteStorage: Send + Sync + 'static {
) -> Result<Download, DownloadError>;
async fn delete(&self, path: &RemotePath) -> anyhow::Result<()>;
async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()>;
}
pub struct Download {
@@ -225,14 +223,6 @@ impl GenericRemoteStorage {
Self::Unreliable(s) => s.delete(path).await,
}
}
pub async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
match self {
Self::LocalFs(s) => s.delete_objects(paths).await,
Self::AwsS3(s) => s.delete_objects(paths).await,
Self::Unreliable(s) => s.delete_objects(paths).await,
}
}
}
impl GenericRemoteStorage {

View File

@@ -320,13 +320,6 @@ impl RemoteStorage for LocalFs {
.await
.map_err(|e| anyhow::anyhow!(e))?)
}
async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
for path in paths {
self.delete(path).await?
}
Ok(())
}
}
fn storage_metadata_path(original_path: &Path) -> PathBuf {

View File

@@ -17,7 +17,6 @@ use aws_sdk_s3::{
error::SdkError,
operation::get_object::GetObjectError,
primitives::ByteStream,
types::{Delete, ObjectIdentifier},
Client,
};
use aws_smithy_http::body::SdkBody;
@@ -82,24 +81,12 @@ pub(super) mod metrics {
.inc();
}
pub fn inc_delete_objects(count: u64) {
S3_REQUESTS_COUNT
.with_label_values(&["delete_object"])
.inc_by(count);
}
pub fn inc_delete_object_fail() {
S3_REQUESTS_FAIL_COUNT
.with_label_values(&["delete_object"])
.inc();
}
pub fn inc_delete_objects_fail(count: u64) {
S3_REQUESTS_FAIL_COUNT
.with_label_values(&["delete_object"])
.inc_by(count);
}
pub fn inc_list_objects() {
S3_REQUESTS_COUNT.with_label_values(&["list_objects"]).inc();
}
@@ -409,34 +396,6 @@ impl RemoteStorage for S3Bucket {
})
.await
}
async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
let _guard = self
.concurrency_limiter
.acquire()
.await
.context("Concurrency limiter semaphore got closed during S3 delete")?;
let mut delete_objects = Vec::with_capacity(paths.len());
for path in paths {
let obj_id = ObjectIdentifier::builder()
.set_key(Some(self.relative_path_to_s3_object(path)))
.build();
delete_objects.push(obj_id);
}
metrics::inc_delete_objects(paths.len() as u64);
self.client
.delete_objects()
.bucket(self.bucket_name.clone())
.delete(Delete::builder().set_objects(Some(delete_objects)).build())
.send()
.await
.map_err(|e| {
metrics::inc_delete_objects_fail(paths.len() as u64);
e
})?;
Ok(())
}
async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
let _guard = self

View File

@@ -119,11 +119,4 @@ impl RemoteStorage for UnreliableWrapper {
self.attempt(RemoteOp::Delete(path.clone()))?;
self.inner.delete(path).await
}
async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
for path in paths {
self.delete(path).await?
}
Ok(())
}
}
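
With delete_objects removed from the RemoteStorage trait and all three backends, a caller that previously removed several remote paths in one call would presumably fall back to sequential single deletes. A hedged sketch of that fallback (the helper name is invented; delete is the trait method this diff keeps):

    use remote_storage::{GenericRemoteStorage, RemotePath};

    /// Sketch only: sequential replacement for the removed delete_objects().
    /// Stops at the first failure, like the old LocalFs implementation did.
    async fn delete_all(
        storage: &GenericRemoteStorage,
        paths: &[RemotePath],
    ) -> anyhow::Result<()> {
        for path in paths {
            storage.delete(path).await?;
        }
        Ok(())
    }

Note the trade-off: on S3 this issues one request per path instead of the single batched DeleteObjects call the removed code used.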

View File

@@ -107,37 +107,6 @@ async fn s3_delete_non_exising_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result
Ok(())
}
#[test_context(MaybeEnabledS3)]
#[tokio::test]
async fn s3_delete_objects_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()> {
let ctx = match ctx {
MaybeEnabledS3::Enabled(ctx) => ctx,
MaybeEnabledS3::Disabled => return Ok(()),
};
let path1 = RemotePath::new(&PathBuf::from(format!("{}/path1", ctx.base_prefix,)))
.with_context(|| "RemotePath conversion")?;
let path2 = RemotePath::new(&PathBuf::from(format!("{}/path2", ctx.base_prefix,)))
.with_context(|| "RemotePath conversion")?;
let data1 = "remote blob data1".as_bytes();
let data1_len = data1.len();
let data2 = "remote blob data2".as_bytes();
let data2_len = data2.len();
ctx.client
.upload(std::io::Cursor::new(data1), data1_len, &path1, None)
.await?;
ctx.client
.upload(std::io::Cursor::new(data2), data2_len, &path2, None)
.await?;
ctx.client.delete_objects(&[path1, path2]).await?;
Ok(())
}
fn ensure_logging_ready() {
LOGGING_DONE.get_or_init(|| {
utils::logging::init(

View File

@@ -1,18 +1,19 @@
use crate::auth::{Claims, JwtAuth};
use crate::http::error::{api_error_handler, route_error_handler, ApiError};
use anyhow::Context;
use anyhow::{anyhow, Context};
use hyper::header::{HeaderName, AUTHORIZATION};
use hyper::http::HeaderValue;
use hyper::Method;
use hyper::{header::CONTENT_TYPE, Body, Request, Response};
use hyper::{header::CONTENT_TYPE, Body, Request, Response, Server};
use metrics::{register_int_counter, Encoder, IntCounter, TextEncoder};
use once_cell::sync::Lazy;
use routerify::ext::RequestExt;
use routerify::{Middleware, RequestInfo, Router, RouterBuilder};
use routerify::{Middleware, RequestInfo, Router, RouterBuilder, RouterService};
use tokio::task::JoinError;
use tracing::{self, debug, info, info_span, warn, Instrument};
use std::future::Future;
use std::net::TcpListener;
use std::str::FromStr;
static SERVE_METRICS_COUNT: Lazy<IntCounter> = Lazy::new(|| {
@@ -347,6 +348,40 @@ pub fn check_permission_with(
}
}
///
/// Start listening for HTTP requests on given socket.
///
/// 'shutdown_future' can be used to stop. If the Future becomes
/// ready, we stop listening for new requests, and the function returns.
///
pub fn serve_thread_main<S>(
router_builder: RouterBuilder<hyper::Body, ApiError>,
listener: TcpListener,
shutdown_future: S,
) -> anyhow::Result<()>
where
S: Future<Output = ()> + Send + Sync,
{
info!("Starting an HTTP endpoint at {}", listener.local_addr()?);
// Create a Service from the router above to handle incoming requests.
let service = RouterService::new(router_builder.build().map_err(|err| anyhow!(err))?).unwrap();
// Enter a single-threaded tokio runtime bound to the current thread
let runtime = tokio::runtime::Builder::new_current_thread()
.enable_all()
.build()?;
let _guard = runtime.enter();
let server = Server::from_tcp(listener)?
.serve(service)
.with_graceful_shutdown(shutdown_future);
runtime.block_on(server)?;
Ok(())
}
#[cfg(test)]
mod tests {
use super::*;
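
A minimal wiring sketch for the reinstated serve_thread_main, assuming it lives at utils::http::endpoint and that a tokio oneshot channel drives shutdown (the bind address and the spawn_http wrapper are illustrative, not from the diff):

    use std::net::TcpListener;
    use routerify::RouterBuilder;
    use utils::http::endpoint::serve_thread_main; // assumed module path
    use utils::http::error::ApiError;

    /// Sketch only: run the HTTP endpoint on a dedicated thread and
    /// stop it by resolving the shutdown future.
    fn spawn_http(router_builder: RouterBuilder<hyper::Body, ApiError>) -> anyhow::Result<()> {
        let listener = TcpListener::bind("127.0.0.1:9898")?;
        let (shutdown_tx, shutdown_rx) = tokio::sync::oneshot::channel::<()>();
        let handle = std::thread::spawn(move || {
            serve_thread_main(router_builder, listener, async {
                let _ = shutdown_rx.await; // becomes ready once shutdown_tx fires
            })
        });
        let _ = shutdown_tx.send(()); // illustration only: request shutdown immediately
        handle.join().expect("http thread panicked")
    }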

View File

@@ -1,23 +1,22 @@
use pageserver::keyspace::{KeyPartitioning, KeySpace};
use pageserver::repository::Key;
use pageserver::tenant::layer_map::LayerMap;
use pageserver::tenant::storage_layer::{tests::LayerDescriptor, Layer, LayerFileName};
use pageserver::tenant::storage_layer::{PersistentLayer, PersistentLayerDesc};
use pageserver::tenant::storage_layer::{Layer, LayerDescriptor, LayerFileName};
use rand::prelude::{SeedableRng, SliceRandom, StdRng};
use std::cmp::{max, min};
use std::fs::File;
use std::io::{BufRead, BufReader};
use std::path::PathBuf;
use std::str::FromStr;
use std::sync::Arc;
use std::time::Instant;
use utils::id::{TenantId, TimelineId};
use utils::lsn::Lsn;
use criterion::{black_box, criterion_group, criterion_main, Criterion};
fn build_layer_map(filename_dump: PathBuf) -> LayerMap {
let mut layer_map = LayerMap::default();
fn build_layer_map(filename_dump: PathBuf) -> LayerMap<LayerDescriptor> {
let mut layer_map = LayerMap::<LayerDescriptor>::default();
let mut min_lsn = Lsn(u64::MAX);
let mut max_lsn = Lsn(0);
@@ -34,7 +33,7 @@ fn build_layer_map(filename_dump: PathBuf) -> LayerMap {
min_lsn = min(min_lsn, lsn_range.start);
max_lsn = max(max_lsn, Lsn(lsn_range.end.0 - 1));
updates.insert_historic(layer.layer_desc().clone());
updates.insert_historic(layer.get_persistent_layer_desc(), Arc::new(layer));
}
println!("min: {min_lsn}, max: {max_lsn}");
@@ -44,7 +43,7 @@ fn build_layer_map(filename_dump: PathBuf) -> LayerMap {
}
/// Construct a layer map query pattern for benchmarks
fn uniform_query_pattern(layer_map: &LayerMap) -> Vec<(Key, Lsn)> {
fn uniform_query_pattern(layer_map: &LayerMap<LayerDescriptor>) -> Vec<(Key, Lsn)> {
// For each image layer we query one of the pages contained, at LSN right
// before the image layer was created. This gives us a somewhat uniform
// coverage of both the lsn and key space because image layers have
@@ -70,7 +69,7 @@ fn uniform_query_pattern(layer_map: &LayerMap) -> Vec<(Key, Lsn)> {
// Construct a partitioning for testing get_difficulty map when we
// don't have an exact result of `collect_keyspace` to work with.
fn uniform_key_partitioning(layer_map: &LayerMap, _lsn: Lsn) -> KeyPartitioning {
fn uniform_key_partitioning(layer_map: &LayerMap<LayerDescriptor>, _lsn: Lsn) -> KeyPartitioning {
let mut parts = Vec::new();
// We add a partition boundary at the start of each image layer,
@@ -210,15 +209,13 @@ fn bench_sequential(c: &mut Criterion) {
for i in 0..100_000 {
let i32 = (i as u32) % 100;
let zero = Key::from_hex("000000000000000000000000000000000000").unwrap();
let layer = LayerDescriptor::from(PersistentLayerDesc::new_img(
TenantId::generate(),
TimelineId::generate(),
zero.add(10 * i32)..zero.add(10 * i32 + 1),
Lsn(i),
false,
0,
));
updates.insert_historic(layer.layer_desc().clone());
let layer = LayerDescriptor {
key: zero.add(10 * i32)..zero.add(10 * i32 + 1),
lsn: Lsn(i)..Lsn(i + 1),
is_incremental: false,
short_id: format!("Layer {}", i),
};
updates.insert_historic(layer.get_persistent_layer_desc(), Arc::new(layer));
}
updates.flush();
println!("Finished layer map init in {:?}", now.elapsed());

View File

@@ -117,8 +117,7 @@ pub fn main() -> Result<()> {
let mut lsn_diff = (lsn_end - lsn_start) as f32;
let mut fill = Fill::None;
let mut ymargin = 0.05 * lsn_diff; // Height-dependent margin to disambiguate overlapping deltas
let xmargin = 0.05; // Height-dependent margin to disambiguate overlapping deltas
let mut margin = 0.05 * lsn_diff; // Height-dependent margin to disambiguate overlapping deltas
let mut lsn_offset = 0.0;
// Fill in and thicken rectangle if it's an
@@ -129,7 +128,7 @@ pub fn main() -> Result<()> {
num_images += 1;
lsn_diff = 0.3;
lsn_offset = -lsn_diff / 2.0;
ymargin = 0.05;
margin = 0.05;
fill = Fill::Color(rgb(0, 0, 0));
}
Ordering::Greater => panic!("Invalid lsn range {}-{}", lsn_start, lsn_end),
@@ -138,10 +137,10 @@ pub fn main() -> Result<()> {
println!(
" {}",
rectangle(
key_start as f32 + stretch * xmargin,
stretch * (lsn_max as f32 - (lsn_end as f32 - ymargin - lsn_offset)),
key_diff as f32 - stretch * 2.0 * xmargin,
stretch * (lsn_diff - 2.0 * ymargin)
key_start as f32 + stretch * margin,
stretch * (lsn_max as f32 - (lsn_end as f32 - margin - lsn_offset)),
key_diff as f32 - stretch * 2.0 * margin,
stretch * (lsn_diff - 2.0 * margin)
)
.fill(fill)
.stroke(Stroke::Color(rgb(0, 0, 0), 0.1))

View File

@@ -417,6 +417,16 @@ where
// Also send zenith.signal file with extra bootstrap data.
//
async fn add_pgcontrol_file(&mut self) -> anyhow::Result<()> {
// Add neon_compute_spec_id.txt
if let Some(spec_id) = &self.timeline.compute_spec_id.lock().await.clone() {
self.ar
.append(
&new_tar_header("neon_compute_spec_id.txt", spec_id.len() as u64)?,
spec_id.as_bytes(),
)
.await?;
}
// add zenith.signal file
let mut zenith_signal = String::new();
if self.prev_record_lsn == Lsn(0) {

View File

@@ -516,7 +516,7 @@ async fn collect_eviction_candidates(
if !tl.is_active() {
continue;
}
let info = tl.get_local_layers_for_disk_usage_eviction().await;
let info = tl.get_local_layers_for_disk_usage_eviction();
debug!(tenant_id=%tl.tenant_id, timeline_id=%tl.timeline_id, "timeline resident layers count: {}", info.resident_layers.len());
tenant_candidates.extend(
info.resident_layers

View File

@@ -215,7 +215,7 @@ async fn build_timeline_info(
) -> anyhow::Result<TimelineInfo> {
crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id();
let mut info = build_timeline_info_common(timeline, ctx).await?;
let mut info = build_timeline_info_common(timeline, ctx)?;
if include_non_incremental_logical_size {
// XXX we should be using spawn_ondemand_logical_size_calculation here.
// Otherwise, if someone deletes the timeline / detaches the tenant while
@@ -233,7 +233,7 @@ async fn build_timeline_info(
Ok(info)
}
async fn build_timeline_info_common(
fn build_timeline_info_common(
timeline: &Arc<Timeline>,
ctx: &RequestContext,
) -> anyhow::Result<TimelineInfo> {
@@ -264,7 +264,7 @@ async fn build_timeline_info_common(
None
}
};
let current_physical_size = Some(timeline.layer_size_sum().await);
let current_physical_size = Some(timeline.layer_size_sum());
let state = timeline.current_state();
let remote_consistent_lsn = timeline.get_remote_consistent_lsn().unwrap_or(Lsn(0));
@@ -330,7 +330,6 @@ async fn timeline_create_handler(
Ok(Some(new_timeline)) => {
// Created. Construct a TimelineInfo for it.
let timeline_info = build_timeline_info_common(&new_timeline, &ctx)
.await
.map_err(ApiError::InternalServerError)?;
json_response(StatusCode::CREATED, timeline_info)
}
@@ -592,7 +591,7 @@ async fn tenant_status(
// Calculate total physical size of all timelines
let mut current_physical_size = 0;
for timeline in tenant.list_timelines().iter() {
current_physical_size += timeline.layer_size_sum().await;
current_physical_size += timeline.layer_size_sum();
}
let state = tenant.current_state();
@@ -702,7 +701,7 @@ async fn layer_map_info_handler(
check_permission(&request, Some(tenant_id))?;
let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
let layer_map_info = timeline.layer_map_info(reset).await;
let layer_map_info = timeline.layer_map_info(reset);
json_response(StatusCode::OK, layer_map_info)
}

View File

@@ -75,12 +75,12 @@ pub async fn import_timeline_from_postgres_datadir(
{
pg_control = Some(control_file);
}
modification.flush().await?;
modification.flush()?;
}
}
// We're done importing all the data files.
modification.commit().await?;
modification.commit()?;
// We expect the Postgres server to be shut down cleanly.
let pg_control = pg_control.context("pg_control file not found")?;
@@ -359,7 +359,7 @@ pub async fn import_basebackup_from_tar(
// We found the pg_control file.
pg_control = Some(res);
}
modification.flush().await?;
modification.flush()?;
}
tokio_tar::EntryType::Directory => {
debug!("directory {:?}", file_path);
@@ -377,7 +377,7 @@ pub async fn import_basebackup_from_tar(
// sanity check: ensure that pg_control is loaded
let _pg_control = pg_control.context("pg_control file not found")?;
modification.commit().await?;
modification.commit()?;
Ok(())
}
@@ -594,7 +594,7 @@ async fn import_file(
// zenith.signal is not necessarily the last file, that we handle
// but it is ok to call `finish_write()`, because final `modification.commit()`
// will update lsn once more to the final one.
let writer = modification.tline.writer().await;
let writer = modification.tline.writer();
writer.finish_write(prev_lsn);
debug!("imported zenith signal {}", prev_lsn);

View File

@@ -53,33 +53,6 @@ pub enum StorageTimeOperation {
CreateTenant,
}
pub static NUM_TIERS: Lazy<IntGaugeVec> = Lazy::new(|| {
register_int_gauge_vec!(
"pageserver_storage_tiers_num",
"Number of sorted runs",
&["tenant_id", "timeline_id"],
)
.expect("failed to define a metric")
});
pub static NUM_COMPACTIONS: Lazy<IntGaugeVec> = Lazy::new(|| {
register_int_gauge_vec!(
"pageserver_storage_compaction_num",
"Number of ongoing compactions",
&["tenant_id", "timeline_id"],
)
.expect("failed to define a metric")
});
pub static STORAGE_PHYSICAL_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
register_int_gauge_vec!(
"pageserver_storage_physical_size_sum",
"Physical size of different types of storage files",
&["type", "tenant_id", "timeline_id"],
)
.expect("failed to define a metric")
});
pub static STORAGE_TIME_SUM_PER_TIMELINE: Lazy<CounterVec> = Lazy::new(|| {
register_counter_vec!(
"pageserver_storage_operations_seconds_sum",
@@ -419,8 +392,6 @@ const STORAGE_IO_TIME_OPERATIONS: &[&str] = &[
const STORAGE_IO_SIZE_OPERATIONS: &[&str] = &["read", "write"];
pub const STORAGE_PHYSICAL_SIZE_FILE_TYPE: &[&str] = &["image", "delta", "partial-image"];
pub static STORAGE_IO_TIME: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
"pageserver_io_operations_seconds",
@@ -802,8 +773,6 @@ pub struct TimelineMetrics {
pub persistent_bytes_written: IntCounter,
pub evictions: IntCounter,
pub evictions_with_low_residence_duration: std::sync::RwLock<EvictionsWithLowResidenceDuration>,
pub num_tiers: IntGauge,
pub num_compactions: IntGauge,
}
impl TimelineMetrics {
@@ -869,12 +838,6 @@ impl TimelineMetrics {
.unwrap();
let evictions_with_low_residence_duration =
evictions_with_low_residence_duration_builder.build(&tenant_id, &timeline_id);
let num_tiers = NUM_TIERS
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
.unwrap();
let num_compactions = NUM_COMPACTIONS
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
.unwrap();
TimelineMetrics {
tenant_id,
@@ -901,8 +864,6 @@ impl TimelineMetrics {
evictions_with_low_residence_duration,
),
read_num_fs_layers,
num_tiers,
num_compactions,
}
}
}
@@ -923,7 +884,6 @@ impl Drop for TimelineMetrics {
let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, timeline_id]);
let _ = EVICTIONS.remove_label_values(&[tenant_id, timeline_id]);
let _ = READ_NUM_FS_LAYERS.remove_label_values(&[tenant_id, timeline_id]);
let _ = STORAGE_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
self.evictions_with_low_residence_duration
.write()
@@ -946,9 +906,6 @@ impl Drop for TimelineMetrics {
for op in SMGR_QUERY_TIME_OPERATIONS {
let _ = SMGR_QUERY_TIME.remove_label_values(&[op, tenant_id, timeline_id]);
}
for ty in STORAGE_PHYSICAL_SIZE_FILE_TYPE {
let _ = STORAGE_PHYSICAL_SIZE.remove_label_values(&[ty, tenant_id, timeline_id]);
}
}
}

View File

@@ -915,6 +915,27 @@ where
self.handle_basebackup_request(pgb, tenant_id, timeline_id, lsn, None, false, ctx)
.await?;
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
} else if query_string.starts_with("set_compute_spec_id ") {
let (_, params_raw) = query_string.split_at("set_compute_spec_id ".len());
let params = params_raw.split_whitespace().collect::<Vec<_>>();
if params.len() != 3 {
return Err(QueryError::Other(anyhow::anyhow!(
"invalid param number for set_compute_spec_id command"
)));
}
let tenant_id = TenantId::from_str(params[0])
.with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
let timeline_id = TimelineId::from_str(params[1])
.with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
let spec_id = params[2].to_string();
self.check_permission(Some(tenant_id))?;
let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?;
*timeline.compute_spec_id.lock().await = Some(spec_id);
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
}
// return pair of prev_lsn and last_lsn
else if query_string.starts_with("get_last_record_rlsn ") {
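
Since the page service speaks the libpq simple-query protocol, the handler above can be exercised round-trip with an ordinary client; a sketch using the postgres crate, mirroring what compute_ctl's cache_spec_id does (the connection string is a placeholder):

    use postgres::{Config, NoTls};
    use std::str::FromStr;

    /// Sketch only: push a spec id to the pageserver's page service.
    fn set_spec_id(connstr: &str, tenant: &str, timeline: &str, spec_id: &str) -> anyhow::Result<()> {
        let mut client = Config::from_str(connstr)?.connect(NoTls)?;
        // The handler expects exactly three whitespace-separated params.
        client.simple_query(&format!("set_compute_spec_id {tenant} {timeline} {spec_id}"))?;
        Ok(())
    }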

View File

@@ -699,20 +699,6 @@ impl<'a> DatadirModification<'a> {
Ok(())
}
#[cfg(test)]
pub fn init_empty_test_timeline(&mut self) -> anyhow::Result<()> {
self.init_empty()?;
self.put_control_file(bytes::Bytes::from_static(
b"control_file contents do not matter",
))
.context("put_control_file")?;
self.put_checkpoint(bytes::Bytes::from_static(
b"checkpoint_file contents do not matter",
))
.context("put_checkpoint_file")?;
Ok(())
}
/// Put a new page version that can be constructed from a WAL record
///
/// NOTE: this will *not* implicitly extend the relation, if the page is beyond the
@@ -1122,7 +1108,7 @@ impl<'a> DatadirModification<'a> {
/// retains all the metadata, but data pages are flushed. That's again OK
/// for bulk import, where you are just loading data pages and won't try to
/// modify the same pages twice.
pub async fn flush(&mut self) -> anyhow::Result<()> {
pub fn flush(&mut self) -> anyhow::Result<()> {
// Unless we have accumulated a decent amount of changes, it's not worth it
// to scan through the pending_updates list.
let pending_nblocks = self.pending_nblocks;
@@ -1130,20 +1116,19 @@ impl<'a> DatadirModification<'a> {
return Ok(());
}
let writer = self.tline.writer().await;
let writer = self.tline.writer();
// Flush relation and SLRU data blocks, keep metadata.
let mut retained_pending_updates = HashMap::new();
for (key, value) in self.pending_updates.drain() {
if is_rel_block_key(key) || is_slru_block_key(key) {
// This bails out on first error without modifying pending_updates.
// That's Ok, cf this function's doc comment.
writer.put(key, self.lsn, &value).await?;
let mut result: anyhow::Result<()> = Ok(());
self.pending_updates.retain(|&key, value| {
if result.is_ok() && (is_rel_block_key(key) || is_slru_block_key(key)) {
result = writer.put(key, self.lsn, value);
false
} else {
retained_pending_updates.insert(key, value);
true
}
}
self.pending_updates.extend(retained_pending_updates);
});
result?;
if pending_nblocks != 0 {
writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ));
@@ -1158,17 +1143,17 @@ impl<'a> DatadirModification<'a> {
/// underlying timeline.
/// All the modifications in this atomic update are stamped by the specified LSN.
///
pub async fn commit(&mut self) -> anyhow::Result<()> {
let writer = self.tline.writer().await;
pub fn commit(&mut self) -> anyhow::Result<()> {
let writer = self.tline.writer();
let lsn = self.lsn;
let pending_nblocks = self.pending_nblocks;
self.pending_nblocks = 0;
for (key, value) in self.pending_updates.drain() {
writer.put(key, lsn, &value).await?;
writer.put(key, lsn, &value)?;
}
for key_range in self.pending_deletions.drain(..) {
writer.delete(key_range, lsn).await?;
writer.delete(key_range, lsn)?;
}
writer.finish_write(lsn);
@@ -1608,6 +1593,20 @@ fn is_slru_block_key(key: Key) -> bool {
&& key.field6 != 0xffffffff // and not SlruSegSize
}
#[cfg(test)]
pub fn create_test_timeline(
tenant: &crate::tenant::Tenant,
timeline_id: utils::id::TimelineId,
pg_version: u32,
ctx: &RequestContext,
) -> anyhow::Result<std::sync::Arc<Timeline>> {
let tline = tenant.create_test_timeline(timeline_id, Lsn(8), pg_version, ctx)?;
let mut m = tline.begin_modification(Lsn(8));
m.init_empty()?;
m.commit()?;
Ok(tline)
}
#[allow(clippy::bool_assert_comparison)]
#[cfg(test)]
mod tests {
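
The rewritten flush above leans on HashMap::retain to partition the pending updates while threading a fallible write out of the closure. A distilled sketch of that pattern with simplified types (all names invented):

    use std::collections::HashMap;

    /// Sketch only: write out entries matching a predicate, keep the rest.
    /// On the first write error, stop flushing and retain everything not yet visited.
    fn flush_matching<E>(
        pending: &mut HashMap<u64, String>,
        mut write: impl FnMut(u64, &str) -> Result<(), E>,
    ) -> Result<(), E> {
        let mut result = Ok(());
        pending.retain(|&key, value| {
            if result.is_ok() && key % 2 == 0 {
                result = write(key, value); // flush "data" entries
                false                       // drop visited entries (incl. a failed one)
            } else {
                true                        // keep "metadata", and everything after an error
            }
        });
        result
    }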

File diff suppressed because it is too large.

View File

@@ -1,198 +0,0 @@
use super::storage_layer::{PersistentLayer, PersistentLayerDesc, PersistentLayerKey, RemoteLayer};
use super::Timeline;
use crate::metrics::{STORAGE_PHYSICAL_SIZE, STORAGE_PHYSICAL_SIZE_FILE_TYPE};
use crate::tenant::layer_map::{self, LayerMap};
use anyhow::Result;
use std::sync::{Mutex, Weak};
use std::{collections::HashMap, sync::Arc};
use utils::id::{TenantId, TimelineId};
pub struct LayerCache {
/// Layer removal lock.
/// A lock to ensure that no layer of the timeline is removed concurrently by other tasks.
/// This lock is acquired in [`Timeline::gc`], [`Timeline::compact`],
/// and [`Tenant::delete_timeline`]. This is an `Arc<Mutex>` lock because we need an owned
/// lock guard in functions that will be spawned to tokio I/O pool (which requires `'static`).
pub layers_removal_lock: Arc<tokio::sync::RwLock<()>>,
/// We need this lock b/c we do not have any way to prevent GC/compaction from removing files in-use.
/// We need to do reference counting on Arc to prevent this from happening, and we can safely remove this lock.
pub layers_operation_lock: Arc<tokio::sync::RwLock<()>>,
/// Will be useful when we move evict / download to layer cache.
#[allow(unused)]
timeline: Weak<Timeline>,
pub tenant_id: TenantId,
pub timeline_id: TimelineId,
pub tenant_id_str: String,
pub timeline_id_str: String,
mapping: Mutex<HashMap<PersistentLayerKey, Arc<dyn PersistentLayer>>>,
}
pub struct LayerInUseWrite(tokio::sync::OwnedRwLockWriteGuard<()>);
pub struct LayerInUseRead(tokio::sync::OwnedRwLockReadGuard<()>);
#[derive(Clone)]
pub struct DeleteGuardRead(Arc<tokio::sync::OwnedRwLockReadGuard<()>>);
#[derive(Clone)]
pub struct DeleteGuardWrite(Arc<tokio::sync::OwnedRwLockWriteGuard<()>>);
impl LayerCache {
pub fn new(timeline: Weak<Timeline>, tenant_id: TenantId, timeline_id: TimelineId) -> Self {
Self {
layers_operation_lock: Arc::new(tokio::sync::RwLock::new(())),
layers_removal_lock: Arc::new(tokio::sync::RwLock::new(())),
mapping: Mutex::new(HashMap::new()),
timeline: timeline,
tenant_id: tenant_id,
timeline_id: timeline_id,
tenant_id_str: tenant_id.to_string(),
timeline_id_str: timeline_id.to_string(),
}
}
pub fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Arc<dyn PersistentLayer> {
let guard = self.mapping.lock().unwrap();
guard.get(&desc.key()).expect("not found").clone()
}
/// This function is to mock the original behavior of `layers` lock in `Timeline`. Can be removed after we ensure
/// we won't delete files that are being read.
pub async fn layer_in_use_write(&self) -> LayerInUseWrite {
LayerInUseWrite(self.layers_operation_lock.clone().write_owned().await)
}
/// This function is to mock the original behavior of `layers` lock in `Timeline`. Can be removed after we ensure
/// we won't delete files that are being read.
pub async fn layer_in_use_read(&self) -> LayerInUseRead {
LayerInUseRead(self.layers_operation_lock.clone().read_owned().await)
}
/// Ensures only one of compaction / gc can happen at a time.
pub async fn delete_guard_read(&self) -> DeleteGuardRead {
DeleteGuardRead(Arc::new(
self.layers_removal_lock.clone().read_owned().await,
))
}
/// Ensures only one of compaction / gc can happen at a time.
pub async fn delete_guard_write(&self) -> DeleteGuardWrite {
DeleteGuardWrite(Arc::new(
self.layers_removal_lock.clone().write_owned().await,
))
}
/// Should only be called when initializing the timeline. Bypass checks and layer operation lock.
pub fn remove_local_when_init(&self, layer: Arc<dyn PersistentLayer>) {
self.metrics_size_sub(&*layer);
let mut guard = self.mapping.lock().unwrap();
guard.remove(&layer.layer_desc().key());
}
/// Should only be called when initializing the timeline. Bypass checks and layer operation lock.
pub fn populate_remote_when_init(&self, layer: Arc<RemoteLayer>) {
self.metrics_size_add(&*layer);
let mut guard = self.mapping.lock().unwrap();
guard.insert(layer.layer_desc().key(), layer);
}
/// Should only be called when initializing the timeline. Bypass checks and layer operation lock.
pub fn populate_local_when_init(&self, layer: Arc<dyn PersistentLayer>) {
self.metrics_size_add(&*layer);
let mut guard = self.mapping.lock().unwrap();
guard.insert(layer.layer_desc().key(), layer);
}
/// Called within read path.
pub fn replace_and_verify(
&self,
expected: Arc<dyn PersistentLayer>,
new: Arc<dyn PersistentLayer>,
) -> Result<()> {
let mut guard = self.mapping.lock().unwrap();
let key: PersistentLayerKey = expected.layer_desc().key();
let other = new.layer_desc().key();
let expected_l0 = LayerMap::is_l0(expected.layer_desc());
let new_l0 = LayerMap::is_l0(new.layer_desc());
fail::fail_point!("layermap-replace-notfound", |_| anyhow::bail!(
"replacing downloaded layer into layermap failed because layer was not found"
));
anyhow::ensure!(
key == other,
"replacing downloaded layer into layermap failed because two layers have different keys: {key:?} != {other:?}"
);
anyhow::ensure!(
expected_l0 == new_l0,
"replacing downloaded layer into layermap failed because one layer is l0 while the other is not: {expected_l0} != {new_l0}"
);
if let Some(layer) = guard.get_mut(&expected.layer_desc().key()) {
anyhow::ensure!(
layer_map::compare_arced_layers(&expected, layer),
"replacing downloaded layer into layermap failed because another layer was found instead of expected, expected={expected:?}, new={new:?}",
expected = Arc::as_ptr(&expected),
new = Arc::as_ptr(layer),
);
*layer = new;
Ok(())
} else {
anyhow::bail!(
"replacing downloaded layer into layermap failed because layer was not found"
);
}
}
/// Called within write path. When compaction and image layer creation we will create new layers.
pub fn create_new_layer(&self, layer: Arc<dyn PersistentLayer>) {
self.metrics_size_add(&*layer);
let mut guard = self.mapping.lock().unwrap();
guard.insert(layer.layer_desc().key(), layer);
}
/// Called within write path. When GC and compaction we will remove layers and delete them on disk.
/// Will move logic to delete files here later.
pub fn delete_layer(&self, layer: Arc<dyn PersistentLayer>) {
self.metrics_size_sub(&*layer);
let mut guard = self.mapping.lock().unwrap();
guard.remove(&layer.layer_desc().key());
}
fn metrics_size_add(&self, layer: &dyn PersistentLayer) {
STORAGE_PHYSICAL_SIZE
.with_label_values(&[
Self::get_layer_type(layer),
&self.tenant_id_str,
&self.timeline_id_str,
])
.add(layer.file_size() as i64);
}
fn metrics_size_sub(&self, layer: &dyn PersistentLayer) {
STORAGE_PHYSICAL_SIZE
.with_label_values(&[
Self::get_layer_type(layer),
&self.tenant_id_str,
&self.timeline_id_str,
])
.sub(layer.file_size() as i64);
}
fn get_layer_type(layer: &dyn PersistentLayer) -> &'static str {
if layer.layer_desc().is_delta() {
&STORAGE_PHYSICAL_SIZE_FILE_TYPE[1]
} else if layer.layer_desc().is_incremental() {
&STORAGE_PHYSICAL_SIZE_FILE_TYPE[2]
} else {
&STORAGE_PHYSICAL_SIZE_FILE_TYPE[0]
}
}
}

View File

@@ -51,23 +51,25 @@ use crate::keyspace::KeyPartitioning;
use crate::repository::Key;
use crate::tenant::storage_layer::InMemoryLayer;
use crate::tenant::storage_layer::Layer;
use anyhow::Context;
use anyhow::Result;
use std::collections::HashMap;
use std::collections::VecDeque;
use std::ops::Range;
use std::sync::Arc;
use utils::lsn::Lsn;
use historic_layer_coverage::BufferedHistoricLayerCoverage;
pub use historic_layer_coverage::{LayerKey, Replacement};
pub use historic_layer_coverage::Replacement;
use super::storage_layer::range_eq;
use super::storage_layer::PersistentLayerDesc;
use super::storage_layer::PersistentLayerKey;
///
/// LayerMap tracks what layers exist on a timeline.
///
#[derive(Default)]
pub struct LayerMap {
pub struct LayerMap<L: ?Sized> {
//
// 'open_layer' holds the current InMemoryLayer that is accepting new
// records. If it is None, 'next_open_layer_at' will be set instead, indicating
@@ -94,56 +96,22 @@ pub struct LayerMap {
/// So L0 layers are held in l0_delta_layers vector, in addition to the R-tree.
l0_delta_layers: Vec<Arc<PersistentLayerDesc>>,
/// All sorted runs. For tiered compaction.
pub sorted_runs: SortedRuns,
/// Mapping from persistent layer key to the actual layer object. Currently, it stores delta, image, and
/// remote layers. In future refactors, this will be eventually moved out of LayerMap into Timeline, and
/// RemoteLayer will be removed.
mapping: HashMap<PersistentLayerKey, Arc<L>>,
}
#[derive(Default)]
pub struct SortedRuns {
pub runs: Vec<(usize, Vec<Arc<PersistentLayerDesc>>)>,
next_tier_id: usize,
}
impl SortedRuns {
/// Create a new sorted run and insert it at the top of the LSM tree.
pub fn create_new_run(&mut self, layers: Vec<Arc<PersistentLayerDesc>>) -> usize {
let tier_id = self.next_tier_id();
self.runs.insert(0, (tier_id, layers));
tier_id
}
/// Create a new sorted run and insert it at the bottom of the LSM tree.
pub fn create_new_bottom_run(&mut self, layers: Vec<Arc<PersistentLayerDesc>>) -> usize {
let tier_id = self.next_tier_id();
self.runs.push((tier_id, layers));
tier_id
}
pub fn compute_tier_sizes(&self) -> Vec<(usize, u64)> {
self.runs
.iter()
.map(|(tier_id, layers)| (*tier_id, layers.iter().map(|layer| layer.file_size()).sum()))
.collect::<Vec<_>>()
}
/// Remove a sorted run from the LSM tree.
pub fn remove_run(&mut self, tier_id: usize) {
self.runs.retain(|(id, _)| *id != tier_id);
}
/// Remove layers and the corresponding sorted runs.
pub fn insert_run_at(&mut self, idx: usize, layers: Vec<Arc<PersistentLayerDesc>>) {
unimplemented!()
}
pub fn num_of_tiers(&self) -> usize {
self.runs.len()
}
pub fn next_tier_id(&mut self) -> usize {
let ret = self.next_tier_id;
self.next_tier_id += 1;
ret
impl<L: ?Sized> Default for LayerMap<L> {
fn default() -> Self {
Self {
open_layer: None,
next_open_layer_at: None,
frozen_layers: VecDeque::default(),
l0_delta_layers: Vec::default(),
historic: BufferedHistoricLayerCoverage::default(),
mapping: HashMap::default(),
}
}
}
@@ -152,30 +120,24 @@ impl SortedRuns {
/// Batching historic layer insertions and removals is good for
/// performance and this struct helps us do that correctly.
#[must_use]
pub struct BatchedUpdates<'a> {
pub struct BatchedUpdates<'a, L: ?Sized + Layer> {
// While we hold this exclusive reference to the layer map the type checker
// will prevent us from accidentally reading any unflushed updates.
layer_map: &'a mut LayerMap,
layer_map: &'a mut LayerMap<L>,
}
/// Provide ability to batch more updates while hiding the read
/// API so we don't accidentally read without flushing.
impl BatchedUpdates<'_> {
impl<L> BatchedUpdates<'_, L>
where
L: ?Sized + Layer,
{
///
/// Insert an on-disk layer.
///
// TODO remove the `layer` argument when `mapping` is refactored out of `LayerMap`
pub fn insert_historic(&mut self, layer_desc: PersistentLayerDesc) {
self.insert_historic_new(layer_desc) // insert into layer map without populating tiering structure
}
pub fn insert_historic_new(&mut self, layer_desc: PersistentLayerDesc) {
self.layer_map.insert_historic_noflush(layer_desc)
}
/// Get a reference to the current sorted runs.
pub fn sorted_runs(&mut self) -> &mut SortedRuns {
&mut self.layer_map.sorted_runs
pub fn insert_historic(&mut self, layer_desc: PersistentLayerDesc, layer: Arc<L>) {
self.layer_map.insert_historic_noflush(layer_desc, layer)
}
///
@@ -183,12 +145,31 @@ impl BatchedUpdates<'_> {
///
/// This should be called when the corresponding file on disk has been deleted.
///
pub fn remove_historic(&mut self, layer_desc: PersistentLayerDesc) {
self.remove_historic_new(layer_desc) // remove from layer map without populating tiering structure
pub fn remove_historic(&mut self, layer_desc: PersistentLayerDesc, layer: Arc<L>) {
self.layer_map.remove_historic_noflush(layer_desc, layer)
}
pub fn remove_historic_new(&mut self, layer_desc: PersistentLayerDesc) {
self.layer_map.remove_historic_noflush(layer_desc)
/// Replaces existing layer iff it is the `expected`.
///
/// If the expected layer has been removed it will not be inserted by this function.
///
/// Returned `Replacement` describes succeeding in replacement or the reason why it could not
/// be done.
///
/// TODO replacement can be done without buffering and rebuilding layer map updates.
/// One way to do that is to add a layer of indirection for returned values, so
/// that we can replace values only by updating a hashmap.
pub fn replace_historic(
&mut self,
expected_desc: PersistentLayerDesc,
expected: &Arc<L>,
new_desc: PersistentLayerDesc,
new: Arc<L>,
) -> anyhow::Result<Replacement<Arc<L>>> {
fail::fail_point!("layermap-replace-notfound", |_| Ok(Replacement::NotFound));
self.layer_map
.replace_historic_noflush(expected_desc, expected, new_desc, new)
}
// We will flush on drop anyway, but this method makes it
@@ -204,19 +185,25 @@ impl BatchedUpdates<'_> {
// than panic later or read without flushing.
//
// TODO maybe warn if flush hasn't explicitly been called
impl Drop for BatchedUpdates<'_> {
impl<L> Drop for BatchedUpdates<'_, L>
where
L: ?Sized + Layer,
{
fn drop(&mut self) {
self.layer_map.flush_updates();
}
}
/// Return value of LayerMap::search
pub struct SearchResult {
pub layer: Arc<PersistentLayerDesc>,
pub struct SearchResult<L: ?Sized> {
pub layer: Arc<L>,
pub lsn_floor: Lsn,
}
impl LayerMap {
impl<L> LayerMap<L>
where
L: ?Sized + Layer,
{
///
/// Find the latest layer (by lsn.end) that covers the given
/// 'key', with lsn.start < 'end_lsn'.
@@ -248,29 +235,16 @@ impl LayerMap {
/// NOTE: This only searches the 'historic' layers, *not* the
/// 'open' and 'frozen' layers!
///
pub fn search(&self, key: Key, end_lsn: Lsn) -> Option<SearchResult> {
self.search_incremental(key, end_lsn, false)
}
pub fn search_incremental(
&self,
key: Key,
end_lsn: Lsn,
exclude_image: bool,
) -> Option<SearchResult> {
pub fn search(&self, key: Key, end_lsn: Lsn) -> Option<SearchResult<L>> {
let version = self.historic.get().unwrap().get_version(end_lsn.0 - 1)?;
let latest_delta = version.delta_coverage.query(key.to_i128());
let latest_image = if exclude_image {
let version = self.historic.get().unwrap().get_version(end_lsn.0 - 2)?;
version.image_coverage.query(key.to_i128())
} else {
version.image_coverage.query(key.to_i128())
};
let latest_image = version.image_coverage.query(key.to_i128());
match (latest_delta, latest_image) {
(None, None) => None,
(None, Some(image)) => {
let lsn_floor = image.get_lsn_range().end;
let lsn_floor = image.get_lsn_range().start;
let image = self.get_layer_from_mapping(&image.key()).clone();
Some(SearchResult {
layer: image,
lsn_floor,
@@ -278,6 +252,7 @@ impl LayerMap {
}
(Some(delta), None) => {
let lsn_floor = delta.get_lsn_range().start;
let delta = self.get_layer_from_mapping(&delta.key()).clone();
Some(SearchResult {
layer: delta,
lsn_floor,
@@ -288,13 +263,15 @@ impl LayerMap {
let image_is_newer = image.get_lsn_range().end >= delta.get_lsn_range().end;
let image_exact_match = img_lsn + 1 == end_lsn;
if image_is_newer || image_exact_match {
let image = self.get_layer_from_mapping(&image.key()).clone();
Some(SearchResult {
layer: image,
lsn_floor: img_lsn + 1,
lsn_floor: img_lsn,
})
} else {
let lsn_floor =
std::cmp::max(delta.get_lsn_range().start, image.get_lsn_range().start + 1);
let delta = self.get_layer_from_mapping(&delta.key()).clone();
Some(SearchResult {
layer: delta,
lsn_floor,
@@ -305,7 +282,7 @@ impl LayerMap {
}
/// Start a batch of updates, applied on drop
pub fn batch_update(&mut self) -> BatchedUpdates<'_> {
pub fn batch_update(&mut self) -> BatchedUpdates<'_, L> {
BatchedUpdates { layer_map: self }
}
@@ -315,32 +292,48 @@ impl LayerMap {
/// Helper function for BatchedUpdates::insert_historic
///
/// TODO(chi): remove L generic so that we do not need to pass layer object.
pub(self) fn insert_historic_noflush(&mut self, layer_desc: PersistentLayerDesc) {
pub(self) fn insert_historic_noflush(
&mut self,
layer_desc: PersistentLayerDesc,
layer: Arc<L>,
) {
self.mapping.insert(layer_desc.key(), layer.clone());
// TODO: See #3869, resulting #4088, attempted fix and repro #4094
if Self::is_l0(&layer_desc) {
if Self::is_l0(&layer) {
self.l0_delta_layers.push(layer_desc.clone().into());
}
self.historic.insert(
historic_layer_coverage::LayerKey::from(&layer_desc),
historic_layer_coverage::LayerKey::from(&*layer),
layer_desc.into(),
);
}
fn get_layer_from_mapping(&self, key: &PersistentLayerKey) -> &Arc<L> {
let layer = self
.mapping
.get(key)
.with_context(|| format!("{key:?}"))
.expect("inconsistent layer mapping");
layer
}
///
/// Remove an on-disk layer from the map.
///
/// Helper function for BatchedUpdates::remove_historic
///
pub fn remove_historic_noflush(&mut self, layer_desc: PersistentLayerDesc) {
pub fn remove_historic_noflush(&mut self, layer_desc: PersistentLayerDesc, layer: Arc<L>) {
self.historic
.remove(historic_layer_coverage::LayerKey::from(&layer_desc));
let layer_key = layer_desc.key();
if Self::is_l0(&layer_desc) {
.remove(historic_layer_coverage::LayerKey::from(&*layer));
if Self::is_l0(&layer) {
let len_before = self.l0_delta_layers.len();
let mut l0_delta_layers = std::mem::take(&mut self.l0_delta_layers);
l0_delta_layers.retain(|other| other.key() != layer_key);
l0_delta_layers.retain(|other| {
!Self::compare_arced_layers(self.get_layer_from_mapping(&other.key()), &layer)
});
self.l0_delta_layers = l0_delta_layers;
// this assertion is related to use of Arc::ptr_eq in Self::compare_arced_layers,
// there's a chance that the comparison fails at runtime due to it comparing (pointer,
@@ -351,6 +344,69 @@ impl LayerMap {
"failed to locate removed historic layer from l0_delta_layers"
);
}
self.mapping.remove(&layer_desc.key());
}
pub(self) fn replace_historic_noflush(
&mut self,
expected_desc: PersistentLayerDesc,
expected: &Arc<L>,
new_desc: PersistentLayerDesc,
new: Arc<L>,
) -> anyhow::Result<Replacement<Arc<L>>> {
let key = historic_layer_coverage::LayerKey::from(&**expected);
let other = historic_layer_coverage::LayerKey::from(&*new);
let expected_l0 = Self::is_l0(expected);
let new_l0 = Self::is_l0(&new);
anyhow::ensure!(
key == other,
"expected and new must have equal LayerKeys: {key:?} != {other:?}"
);
anyhow::ensure!(
expected_l0 == new_l0,
"expected and new must both be l0 deltas or neither should be: {expected_l0} != {new_l0}"
);
let l0_index = if expected_l0 {
// find the index in case replace worked, we need to replace that as well
let pos = self.l0_delta_layers.iter().position(|slot| {
Self::compare_arced_layers(self.get_layer_from_mapping(&slot.key()), expected)
});
if pos.is_none() {
return Ok(Replacement::NotFound);
}
pos
} else {
None
};
let new_desc = Arc::new(new_desc);
let replaced = self.historic.replace(&key, new_desc.clone(), |existing| {
**existing == expected_desc
});
if let Replacement::Replaced { .. } = &replaced {
self.mapping.remove(&expected_desc.key());
self.mapping.insert(new_desc.key(), new);
if let Some(index) = l0_index {
self.l0_delta_layers[index] = new_desc;
}
}
let replaced = match replaced {
Replacement::Replaced { in_buffered } => Replacement::Replaced { in_buffered },
Replacement::NotFound => Replacement::NotFound,
Replacement::RemovalBuffered => Replacement::RemovalBuffered,
Replacement::Unexpected(x) => {
Replacement::Unexpected(self.get_layer_from_mapping(&x.key()).clone())
}
};
Ok(replaced)
}
/// Helper function for BatchedUpdates::drop.
@@ -398,8 +454,10 @@ impl LayerMap {
Ok(true)
}
pub fn iter_historic_layers(&self) -> impl '_ + Iterator<Item = Arc<PersistentLayerDesc>> {
self.historic.iter()
pub fn iter_historic_layers(&self) -> impl '_ + Iterator<Item = Arc<L>> {
self.historic
.iter()
.map(|x| self.get_layer_from_mapping(&x.key()).clone())
}
///
@@ -414,7 +472,7 @@ impl LayerMap {
&self,
key_range: &Range<Key>,
lsn: Lsn,
) -> Result<Vec<(Range<Key>, Option<Arc<PersistentLayerDesc>>)>> {
) -> Result<Vec<(Range<Key>, Option<Arc<L>>)>> {
let version = match self.historic.get().unwrap().get_version(lsn.0) {
Some(v) => v,
None => return Ok(vec![]),
@@ -424,26 +482,36 @@ impl LayerMap {
let end = key_range.end.to_i128();
// Initialize loop variables
let mut coverage: Vec<(Range<Key>, Option<Arc<PersistentLayerDesc>>)> = vec![];
let mut coverage: Vec<(Range<Key>, Option<Arc<L>>)> = vec![];
let mut current_key = start;
let mut current_val = version.image_coverage.query(start);
// Loop through the change events and push intervals
for (change_key, change_val) in version.image_coverage.range(start..end) {
let kr = Key::from_i128(current_key)..Key::from_i128(change_key);
coverage.push((kr, current_val.take()));
coverage.push((
kr,
current_val
.take()
.map(|l| self.get_layer_from_mapping(&l.key()).clone()),
));
current_key = change_key;
current_val = change_val.clone();
}
// Add the final interval
let kr = Key::from_i128(current_key)..Key::from_i128(end);
coverage.push((kr, current_val.take()));
coverage.push((
kr,
current_val
.take()
.map(|l| self.get_layer_from_mapping(&l.key()).clone()),
));
Ok(coverage)
}
pub fn is_l0(layer: &PersistentLayerDesc) -> bool {
pub fn is_l0(layer: &L) -> bool {
range_eq(&layer.get_key_range(), &(Key::MIN..Key::MAX))
}
@@ -469,7 +537,7 @@ impl LayerMap {
/// TODO The optimal number should probably be slightly higher than 1, but to
/// implement that we need to plumb a lot more context into this function
/// than just the current partition_range.
pub fn is_reimage_worthy(layer: &PersistentLayerDesc, partition_range: &Range<Key>) -> bool {
pub fn is_reimage_worthy(layer: &L, partition_range: &Range<Key>) -> bool {
// Case 1
if !Self::is_l0(layer) {
return true;
@@ -527,7 +595,9 @@ impl LayerMap {
let kr = Key::from_i128(current_key)..Key::from_i128(change_key);
let lr = lsn.start..val.get_lsn_range().start;
if !kr.is_empty() {
let base_count = Self::is_reimage_worthy(&val, key) as usize;
let base_count =
Self::is_reimage_worthy(self.get_layer_from_mapping(&val.key()), key)
as usize;
let new_limit = limit.map(|l| l - base_count);
let max_stacked_deltas_underneath =
self.count_deltas(&kr, &lr, new_limit)?;
@@ -550,7 +620,9 @@ impl LayerMap {
let lr = lsn.start..val.get_lsn_range().start;
if !kr.is_empty() {
let base_count = Self::is_reimage_worthy(&val, key) as usize;
let base_count =
Self::is_reimage_worthy(self.get_layer_from_mapping(&val.key()), key)
as usize;
let new_limit = limit.map(|l| l - base_count);
let max_stacked_deltas_underneath = self.count_deltas(&kr, &lr, new_limit)?;
max_stacked_deltas = std::cmp::max(
@@ -700,8 +772,12 @@ impl LayerMap {
}
/// Return all L0 delta layers
pub fn get_level0_deltas(&self) -> Result<Vec<Arc<PersistentLayerDesc>>> {
Ok(self.l0_delta_layers.to_vec())
pub fn get_level0_deltas(&self) -> Result<Vec<Arc<L>>> {
Ok(self
.l0_delta_layers
.iter()
.map(|x| self.get_layer_from_mapping(&x.key()).clone())
.collect())
}
/// debugging function to print out the contents of the layer map
@@ -719,95 +795,104 @@ impl LayerMap {
frozen_layer.dump(verbose, ctx)?;
}
println!("l0_deltas:");
for layer in &self.l0_delta_layers {
println!("historic_layers:");
for layer in self.iter_historic_layers() {
layer.dump(verbose, ctx)?;
}
println!("sorted_runs:");
for (lvl, (tier_id, layer)) in self.sorted_runs.runs.iter().enumerate() {
println!("tier {}", tier_id);
for layer in layer {
layer.dump(verbose, ctx)?;
}
}
println!("End dump LayerMap");
Ok(())
}
}
/// Similar to `Arc::ptr_eq`, but only compares the object pointers, not vtables.
///
/// Returns `true` if the two `Arc` point to the same layer, false otherwise.
///
/// If comparing persistent layers, ALWAYS compare the layer descriptor key.
#[inline(always)]
pub fn compare_arced_layers<L: ?Sized>(left: &Arc<L>, right: &Arc<L>) -> bool {
// "dyn Trait" objects are "fat pointers" in that they have two components:
// - pointer to the object
// - pointer to the vtable
//
// Rust does not guarantee that these vtables are unique; however,
// `Arc::ptr_eq` as of writing (at least up to 1.67) uses a comparison where both the
// pointer and the vtable need to be equal.
//
// See: https://github.com/rust-lang/rust/issues/103763
//
// A future version of Rust will most likely use the form below, where we cast each
// pointer into a pointer to unit, which drops the inaccessible vtable pointer so it
// does not affect the comparison.
//
// See: https://github.com/rust-lang/rust/pull/106450
let left = Arc::as_ptr(left) as *const ();
let right = Arc::as_ptr(right) as *const ();
/// Similar to `Arc::ptr_eq`, but only compares the object pointers, not vtables.
///
/// Returns `true` if the two `Arc` point to the same layer, false otherwise.
#[inline(always)]
pub fn compare_arced_layers(left: &Arc<L>, right: &Arc<L>) -> bool {
// "dyn Trait" objects are "fat pointers" in that they have two components:
// - pointer to the object
// - pointer to the vtable
//
// Rust does not guarantee that these vtables are unique; however,
// `Arc::ptr_eq` as of writing (at least up to 1.67) uses a comparison where both the
// pointer and the vtable need to be equal.
//
// See: https://github.com/rust-lang/rust/issues/103763
//
// A future version of Rust will most likely use the form below, where we cast each
// pointer into a pointer to unit, which drops the inaccessible vtable pointer so it
// does not affect the comparison.
//
// See: https://github.com/rust-lang/rust/pull/106450
let left = Arc::as_ptr(left) as *const ();
let right = Arc::as_ptr(right) as *const ();
left == right
left == right
}
}
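Since `compare_arced_layers` is central to layer identity here, a self-contained illustration of the cast (a sketch only; `AnyLayer` and `Probe` are made-up types):
use std::sync::Arc;
trait AnyLayer {}
struct Probe;
impl AnyLayer for Probe {}
fn same_allocation(left: &Arc<dyn AnyLayer>, right: &Arc<dyn AnyLayer>) -> bool {
    // Casting to *const () keeps only the data half of the fat pointer,
    // so the (possibly non-unique) vtable pointer cannot affect the result.
    let left = Arc::as_ptr(left) as *const ();
    let right = Arc::as_ptr(right) as *const ();
    left == right
}
fn main() {
    let a: Arc<dyn AnyLayer> = Arc::new(Probe);
    let b = Arc::clone(&a);
    let c: Arc<dyn AnyLayer> = Arc::new(Probe);
    assert!(same_allocation(&a, &b));
    assert!(!same_allocation(&a, &c));
}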
#[cfg(test)]
mod tests {
use super::LayerMap;
use crate::tenant::storage_layer::{tests::LayerDescriptor, LayerFileName};
use super::{LayerMap, Replacement};
use crate::tenant::storage_layer::{Layer, LayerDescriptor, LayerFileName};
use std::str::FromStr;
use std::sync::Arc;
mod l0_delta_layers_updated {
use crate::tenant::storage_layer::{PersistentLayer, PersistentLayerDesc};
use super::*;
#[test]
#[ignore]
fn for_full_range_delta() {
// l0_delta_layers are used by compaction, and should observe all buffered updates
l0_delta_layers_updated_scenario(
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000053423C21-0000000053424D69",
true
)
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000053423C21-0000000053424D69",
true
)
}
#[test]
#[ignore]
fn for_non_full_range_delta() {
// has minimal uncovered areas compared to l0_delta_layers_updated_on_insert_replace_remove_for_full_range_delta
l0_delta_layers_updated_scenario(
"000000000000000000000000000000000001-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFE__0000000053423C21-0000000053424D69",
// because not full range
false
)
"000000000000000000000000000000000001-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFE__0000000053423C21-0000000053424D69",
// because not full range
false
)
}
#[test]
#[ignore]
fn for_image() {
l0_delta_layers_updated_scenario(
"000000000000000000000000000000000000-000000000000000000000000000000010000__0000000053424D69",
// the code only checks whether it is a full-range layer and doesn't care about images,
// which must mean we should in practice never have full-range images
false
)
"000000000000000000000000000000000000-000000000000000000000000000000010000__0000000053424D69",
// the code only checks whether it is a full-range layer and doesn't care about images,
// which must mean we should in practice never have full-range images
false
)
}
#[test]
fn replacing_missing_l0_is_notfound() {
// the original impl had an oversight: replacing a missing L0 produced an anyhow::Error,
// but anyhow::Error should only happen for precondition failures.
let layer = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000053423C21-0000000053424D69";
let layer = LayerFileName::from_str(layer).unwrap();
let layer = LayerDescriptor::from(layer);
// same skeleton construction; see the scenario below
let not_found = Arc::new(layer.clone());
let new_version = Arc::new(layer);
let mut map = LayerMap::default();
let res = map.batch_update().replace_historic(
not_found.get_persistent_layer_desc(),
&not_found,
new_version.get_persistent_layer_desc(),
new_version,
);
assert!(matches!(res, Ok(Replacement::NotFound)), "{res:?}");
}
fn l0_delta_layers_updated_scenario(layer_name: &str, expected_l0: bool) {
@@ -821,31 +906,46 @@ mod tests {
// two disjoint Arcs in different lifecycle phases. even if it seems they must be the
// same layer, we use LayerMap::compare_arced_layers as the identity of layers.
assert_eq!(remote.layer_desc(), downloaded.layer_desc());
assert!(!LayerMap::compare_arced_layers(&remote, &downloaded));
let expected_in_counts = (1, usize::from(expected_l0));
map.batch_update()
.insert_historic(remote.layer_desc().clone());
assert_eq!(
count_layer_in(&map, remote.layer_desc()),
expected_in_counts
.insert_historic(remote.get_persistent_layer_desc(), remote.clone());
assert_eq!(count_layer_in(&map, &remote), expected_in_counts);
let replaced = map
.batch_update()
.replace_historic(
remote.get_persistent_layer_desc(),
&remote,
downloaded.get_persistent_layer_desc(),
downloaded.clone(),
)
.expect("name derived attributes are the same");
assert!(
matches!(replaced, Replacement::Replaced { .. }),
"{replaced:?}"
);
assert_eq!(count_layer_in(&map, &downloaded), expected_in_counts);
map.batch_update()
.remove_historic(downloaded.layer_desc().clone());
assert_eq!(count_layer_in(&map, downloaded.layer_desc()), (0, 0));
.remove_historic(downloaded.get_persistent_layer_desc(), downloaded.clone());
assert_eq!(count_layer_in(&map, &downloaded), (0, 0));
}
fn count_layer_in(map: &LayerMap, layer: &PersistentLayerDesc) -> (usize, usize) {
fn count_layer_in<L: Layer + ?Sized>(map: &LayerMap<L>, layer: &Arc<L>) -> (usize, usize) {
let historic = map
.iter_historic_layers()
.filter(|x| x.key() == layer.key())
.filter(|x| LayerMap::compare_arced_layers(x, layer))
.count();
let l0s = map
.get_level0_deltas()
.expect("why does this return a result");
let l0 = l0s.iter().filter(|x| x.key() == layer.key()).count();
let l0 = l0s
.iter()
.filter(|x| LayerMap::compare_arced_layers(x, layer))
.count();
(historic, l0)
}

View File

@@ -3,8 +3,6 @@ use std::ops::Range;
use tracing::info;
use crate::tenant::storage_layer::PersistentLayerDesc;
use super::layer_coverage::LayerCoverageTuple;
/// Layers in this module are identified and indexed by this data.
@@ -43,14 +41,14 @@ impl Ord for LayerKey {
}
}
impl From<&PersistentLayerDesc> for LayerKey {
fn from(layer: &PersistentLayerDesc) -> Self {
impl<'a, L: crate::tenant::storage_layer::Layer + ?Sized> From<&'a L> for LayerKey {
fn from(layer: &'a L) -> Self {
let kr = layer.get_key_range();
let lr = layer.get_lsn_range();
LayerKey {
key: kr.start.to_i128()..kr.end.to_i128(),
lsn: lr.start.0..lr.end.0,
is_image: !layer.is_delta,
is_image: !layer.is_incremental(),
}
}
}
@@ -469,11 +467,6 @@ impl<Value: Clone> BufferedHistoricLayerCoverage<Value> {
///
/// Returns a `Replacement` value describing the outcome; only the case of
/// `Replacement::Replaced` modifies the map and requires a rebuild.
///
/// This function is unlikely to be used in the future because LayerMap now only records the
/// layer descriptors. Therefore, anything in the layer map is only ever added or
/// removed, never replaced.
#[allow(dead_code)]
pub fn replace<F>(
&mut self,
layer_key: &LayerKey,

View File

@@ -1,325 +0,0 @@
//! This module contains the encoding and decoding of the local manifest file.
//!
//! MANIFEST is a write-ahead log stored locally for each timeline. It
//! records the state of the storage engine. It contains a snapshot of the
//! state and all operations following that snapshot. The file begins with a
//! header recording the MANIFEST version number. After that, it contains a snapshot.
//! The snapshot is followed by a list of operations. Each operation is a list
//! of records. Each record is either an addition or a removal of a layer.
//!
//! With MANIFEST, we can:
//!
//! 1. recover state quickly by reading the file, potentially boosting
//! startup speed.
//! 2. ensure all operations are atomic and avoid corruption, solving issues
//! like redundant image layers and preparing us for future compaction
//! strategies.
//!
//! There is also a format for storing all layer files on S3, called
//! `index_part.json`. Compared with index_part, MANIFEST is a WAL that
//! records all operations as logs, so we can easily replay the
//! operations when recovering from a crash, while ensuring those operations
//! are atomic across restarts.
//!
//! Currently, this is not used in the system. Future refactors will ensure
//! that the storage state is recorded in this file and that the system can be
//! recovered from it. This is tracked in
//! https://github.com/neondatabase/neon/issues/4418
use std::io::{self, Read, Write};
use crate::virtual_file::VirtualFile;
use anyhow::Result;
use bytes::{Buf, BufMut, Bytes, BytesMut};
use crc32c::crc32c;
use serde::{Deserialize, Serialize};
use tracing::log::warn;
use utils::lsn::Lsn;
use super::storage_layer::PersistentLayerDesc;
pub struct Manifest {
file: VirtualFile,
}
#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Debug)]
pub struct Snapshot {
pub layers: Vec<PersistentLayerDesc>,
}
/// serde by default encodes this as a tagged enum, so it will be serialized as something
/// like `{ "AddLayer": { ... } }`.
#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Debug)]
pub enum Record {
AddLayer(PersistentLayerDesc),
RemoveLayer(PersistentLayerDesc),
}
/// `echo neon.manifest | sha1sum` and take the leading 8 bytes.
const MANIFEST_MAGIC_NUMBER: u64 = 0xf5c44592b806109c;
const MANIFEST_VERSION: u64 = 1;
#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Debug)]
pub struct ManifestHeader {
magic_number: u64,
version: u64,
}
const MANIFEST_HEADER_LEN: usize = 16;
impl ManifestHeader {
fn encode(&self) -> BytesMut {
let mut buf = BytesMut::with_capacity(MANIFEST_HEADER_LEN);
buf.put_u64(self.magic_number);
buf.put_u64(self.version);
buf
}
fn decode(mut buf: &[u8]) -> Self {
assert!(buf.len() == MANIFEST_HEADER_LEN, "invalid header");
Self {
magic_number: buf.get_u64(),
version: buf.get_u64(),
}
}
}
#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Debug)]
pub enum Operation {
/// A snapshot of the current state.
///
/// The Lsn field is the LSN that is persisted to disk for this snapshot.
Snapshot(Snapshot, Lsn),
/// An atomic operation that changes the state.
///
/// The Lsn field is the LSN that is persisted to disk after the operation is done.
/// This will only change when a new L0 is flushed to disk.
Operation(Vec<Record>, Lsn),
}
struct RecordHeader {
size: u32,
checksum: u32,
}
const RECORD_HEADER_LEN: usize = 8;
impl RecordHeader {
fn encode(&self) -> BytesMut {
let mut buf = BytesMut::with_capacity(RECORD_HEADER_LEN);
buf.put_u32(self.size);
buf.put_u32(self.checksum);
buf
}
fn decode(mut buf: &[u8]) -> Self {
assert!(buf.len() == RECORD_HEADER_LEN, "invalid header");
Self {
size: buf.get_u32(),
checksum: buf.get_u32(),
}
}
}
#[derive(Debug, thiserror::Error)]
pub enum ManifestLoadError {
#[error("manifest header is corrupted")]
CorruptedManifestHeader,
#[error("unsupported manifest version: got {0}, expected {1}")]
UnsupportedVersion(u64, u64),
#[error("error when decoding record: {0}")]
DecodeRecord(serde_json::Error),
#[error("I/O error: {0}")]
Io(io::Error),
}
#[must_use = "Should check if the manifest is partially corrupted"]
pub struct ManifestPartiallyCorrupted(bool);
impl Manifest {
/// Create a new manifest by writing the manifest header and a snapshot record to the given file.
pub fn init(file: VirtualFile, snapshot: Snapshot, lsn: Lsn) -> Result<Self> {
let mut manifest = Self { file };
manifest.append_manifest_header(ManifestHeader {
magic_number: MANIFEST_MAGIC_NUMBER,
version: MANIFEST_VERSION,
})?;
manifest.append_operation(Operation::Snapshot(snapshot, lsn))?;
Ok(manifest)
}
/// Load a manifest. Returns the manifest and a list of operations. If the manifest is corrupted,
/// the bool flag will be set to true, and the user is responsible for reconstructing a new
/// manifest and backing up the current one.
pub fn load(
mut file: VirtualFile,
) -> Result<(Self, Vec<Operation>, ManifestPartiallyCorrupted), ManifestLoadError> {
let mut buf = vec![];
file.read_to_end(&mut buf).map_err(ManifestLoadError::Io)?;
// Read manifest header
let mut buf = Bytes::from(buf);
if buf.remaining() < MANIFEST_HEADER_LEN {
return Err(ManifestLoadError::CorruptedManifestHeader);
}
let header = ManifestHeader::decode(&buf[..MANIFEST_HEADER_LEN]);
buf.advance(MANIFEST_HEADER_LEN);
if header.version != MANIFEST_VERSION {
return Err(ManifestLoadError::UnsupportedVersion(
header.version,
MANIFEST_VERSION,
));
}
// Read operations
let mut operations = Vec::new();
let corrupted = loop {
if buf.remaining() == 0 {
break false;
}
if buf.remaining() < RECORD_HEADER_LEN {
warn!("incomplete header when decoding manifest, could be corrupted");
break true;
}
let RecordHeader { size, checksum } = RecordHeader::decode(&buf[..RECORD_HEADER_LEN]);
let size = size as usize;
buf.advance(RECORD_HEADER_LEN);
if buf.remaining() < size {
warn!("incomplete data when decoding manifest, could be corrupted");
break true;
}
let data = &buf[..size];
if crc32c(data) != checksum {
warn!("checksum mismatch when decoding manifest, could be corrupted");
break true;
}
// if the following decode fails, we can neither use the manifest nor safely ignore any record.
operations.push(serde_json::from_slice(data).map_err(ManifestLoadError::DecodeRecord)?);
buf.advance(size);
};
Ok((
Self { file },
operations,
ManifestPartiallyCorrupted(corrupted),
))
}
fn append_data(&mut self, data: &[u8]) -> Result<()> {
if data.len() >= u32::MAX as usize {
panic!("data too large");
}
let header = RecordHeader {
size: data.len() as u32,
checksum: crc32c(data),
};
let header = header.encode();
self.file.write_all(&header)?;
self.file.write_all(data)?;
self.file.sync_all()?;
Ok(())
}
fn append_manifest_header(&mut self, header: ManifestHeader) -> Result<()> {
let encoded = header.encode();
self.file.write_all(&encoded)?;
Ok(())
}
/// Add an operation to the manifest. The operation will be appended to the end of the file,
/// and the file will be fsynced.
pub fn append_operation(&mut self, operation: Operation) -> Result<()> {
let encoded = Vec::from(serde_json::to_string(&operation)?);
self.append_data(&encoded)
}
}
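For orientation, the framing that append_data writes and load verifies can be sketched standalone (a minimal sketch mirroring RecordHeader; not part of the patch):
use bytes::{Buf, BufMut, BytesMut};
use crc32c::crc32c;
/// Frame a record the way `append_data` does: size | crc32c | payload.
fn frame_record(data: &[u8]) -> BytesMut {
    let mut buf = BytesMut::with_capacity(8 + data.len());
    buf.put_u32(data.len() as u32); // RecordHeader.size
    buf.put_u32(crc32c(data)); // RecordHeader.checksum
    buf.put_slice(data);
    buf
}
/// Decode one framed record the way `load` does, rejecting truncated
/// payloads and checksum mismatches.
fn unframe_record(mut buf: &[u8]) -> Option<&[u8]> {
    if buf.remaining() < 8 {
        return None;
    }
    let size = buf.get_u32() as usize;
    let checksum = buf.get_u32();
    if buf.remaining() < size || crc32c(&buf[..size]) != checksum {
        return None;
    }
    Some(&buf[..size])
}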
#[cfg(test)]
mod tests {
use std::fs::OpenOptions;
use crate::repository::Key;
use super::*;
#[test]
fn test_read_manifest() {
let testdir = crate::config::PageServerConf::test_repo_dir("test_read_manifest");
std::fs::create_dir_all(&testdir).unwrap();
let file = VirtualFile::create(&testdir.join("MANIFEST")).unwrap();
let layer1 = PersistentLayerDesc::new_test(Key::from_i128(0)..Key::from_i128(233));
let layer2 = PersistentLayerDesc::new_test(Key::from_i128(233)..Key::from_i128(2333));
let layer3 = PersistentLayerDesc::new_test(Key::from_i128(2333)..Key::from_i128(23333));
let layer4 = PersistentLayerDesc::new_test(Key::from_i128(23333)..Key::from_i128(233333));
// Write a manifest with a snapshot and some operations
let snapshot = Snapshot {
layers: vec![layer1, layer2],
};
let mut manifest = Manifest::init(file, snapshot.clone(), Lsn::from(0)).unwrap();
manifest
.append_operation(Operation::Operation(
vec![Record::AddLayer(layer3.clone())],
Lsn::from(1),
))
.unwrap();
drop(manifest);
// Open the second time and write
let file = VirtualFile::open_with_options(
&testdir.join("MANIFEST"),
OpenOptions::new()
.read(true)
.write(true)
.create_new(false)
.truncate(false),
)
.unwrap();
let (mut manifest, operations, corrupted) = Manifest::load(file).unwrap();
assert!(!corrupted.0);
assert_eq!(operations.len(), 2);
assert_eq!(
&operations[0],
&Operation::Snapshot(snapshot.clone(), Lsn::from(0))
);
assert_eq!(
&operations[1],
&Operation::Operation(vec![Record::AddLayer(layer3.clone())], Lsn::from(1))
);
manifest
.append_operation(Operation::Operation(
vec![
Record::RemoveLayer(layer3.clone()),
Record::AddLayer(layer4.clone()),
],
Lsn::from(2),
))
.unwrap();
drop(manifest);
// Open the third time and verify
let file = VirtualFile::open_with_options(
&testdir.join("MANIFEST"),
OpenOptions::new()
.read(true)
.write(true)
.create_new(false)
.truncate(false),
)
.unwrap();
let (_manifest, operations, corrupted) = Manifest::load(file).unwrap();
assert!(!corrupted.0);
assert_eq!(operations.len(), 3);
assert_eq!(&operations[0], &Operation::Snapshot(snapshot, Lsn::from(0)));
assert_eq!(
&operations[1],
&Operation::Operation(vec![Record::AddLayer(layer3.clone())], Lsn::from(1))
);
assert_eq!(
&operations[2],
&Operation::Operation(
vec![Record::RemoveLayer(layer3), Record::AddLayer(layer4)],
Lsn::from(2)
)
);
}
}

View File

@@ -1392,12 +1392,7 @@ mod tests {
let harness = TenantHarness::create(test_name)?;
let (tenant, ctx) = runtime.block_on(harness.load());
// create an empty timeline directory
let _ = runtime.block_on(tenant.create_test_timeline(
TIMELINE_ID,
Lsn(8),
DEFAULT_PG_VERSION,
&ctx,
))?;
let _ = tenant.create_test_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?;
let remote_fs_dir = harness.conf.workdir.join("remote_fs");
std::fs::create_dir_all(remote_fs_dir)?;

View File

@@ -176,10 +176,13 @@ impl LayerAccessStats {
/// Create an empty stats object and record a [`LayerLoad`] event with the given residence status.
///
/// See [`record_residence_event`] for why you need to do this while holding the layer map lock.
pub(crate) fn for_loading_layer(
layer_map_lock_held_witness: &BatchedUpdates<'_>,
pub(crate) fn for_loading_layer<L>(
layer_map_lock_held_witness: &BatchedUpdates<'_, L>,
status: LayerResidenceStatus,
) -> Self {
) -> Self
where
L: ?Sized + Layer,
{
let new = LayerAccessStats(Mutex::new(LayerAccessStatsLocked::default()));
new.record_residence_event(
layer_map_lock_held_witness,
@@ -194,11 +197,14 @@ impl LayerAccessStats {
/// The `new_status` is not recorded in `self`.
///
/// See [`record_residence_event`] for why you need to do this while holding the layer map lock.
pub(crate) fn clone_for_residence_change(
pub(crate) fn clone_for_residence_change<L>(
&self,
layer_map_lock_held_witness: &BatchedUpdates<'_>,
layer_map_lock_held_witness: &BatchedUpdates<'_, L>,
new_status: LayerResidenceStatus,
) -> LayerAccessStats {
) -> LayerAccessStats
where
L: ?Sized + Layer,
{
let clone = {
let inner = self.0.lock().unwrap();
inner.clone()
@@ -226,12 +232,14 @@ impl LayerAccessStats {
/// - Compact: Grab layer map lock, add the new L1 to layer map and remove the L0s, release layer map lock.
/// - Eviction: observes the new L1 layer whose only activity timestamp is the LayerCreate event.
///
pub(crate) fn record_residence_event(
pub(crate) fn record_residence_event<L>(
&self,
_layer_map_lock_held_witness: &BatchedUpdates<'_>,
_layer_map_lock_held_witness: &BatchedUpdates<'_, L>,
status: LayerResidenceStatus,
reason: LayerResidenceEventReason,
) {
) where
L: ?Sized + Layer,
{
let mut locked = self.0.lock().unwrap();
locked.iter_mut().for_each(|inner| {
inner
@@ -381,10 +389,10 @@ pub trait Layer: std::fmt::Debug + Send + Sync {
}
/// Returned by [`Layer::iter`]
pub type LayerIter<'i> = Box<dyn Iterator<Item = Result<(Key, Lsn, Value)>> + 'i + Send>;
pub type LayerIter<'i> = Box<dyn Iterator<Item = Result<(Key, Lsn, Value)>> + 'i>;
/// Returned by [`Layer::key_iter`]
pub type LayerKeyIter<'i> = Box<dyn Iterator<Item = (Key, Lsn, u64)> + 'i + Send>;
pub type LayerKeyIter<'i> = Box<dyn Iterator<Item = (Key, Lsn, u64)> + 'i>;
/// A Layer contains all data in a "rectangle" consisting of a range of keys and
/// range of LSNs.
@@ -465,125 +473,94 @@ pub fn downcast_remote_layer(
}
}
pub mod tests {
use super::*;
/// Holds metadata about a layer without any content. Used mostly for testing.
///
/// To use filenames as fixtures, parse them as [`LayerFileName`] then convert from that to a
/// LayerDescriptor.
#[derive(Clone, Debug)]
pub struct LayerDescriptor {
pub key: Range<Key>,
pub lsn: Range<Lsn>,
pub is_incremental: bool,
pub short_id: String,
}
/// Holds metadata about a layer without any content. Used mostly for testing.
///
/// To use filenames as fixtures, parse them as [`LayerFileName`] then convert from that to a
/// LayerDescriptor.
#[derive(Clone, Debug)]
pub struct LayerDescriptor {
base: PersistentLayerDesc,
impl LayerDescriptor {
/// `LayerDescriptor` is only used for testing purposes, so it does not matter whether it is an
/// image / delta layer, and the tenant / timeline id does not matter.
pub fn get_persistent_layer_desc(&self) -> PersistentLayerDesc {
PersistentLayerDesc::new_delta(
TenantId::from_array([0; 16]),
TimelineId::from_array([0; 16]),
self.key.clone(),
self.lsn.clone(),
233,
)
}
}
impl Layer for LayerDescriptor {
fn get_key_range(&self) -> Range<Key> {
self.key.clone()
}
impl From<PersistentLayerDesc> for LayerDescriptor {
fn from(base: PersistentLayerDesc) -> Self {
Self { base }
}
fn get_lsn_range(&self) -> Range<Lsn> {
self.lsn.clone()
}
impl Layer for LayerDescriptor {
fn get_value_reconstruct_data(
&self,
_key: Key,
_lsn_range: Range<Lsn>,
_reconstruct_data: &mut ValueReconstructState,
_ctx: &RequestContext,
) -> Result<ValueReconstructResult> {
todo!("This method shouldn't be part of the Layer trait")
}
fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
todo!()
}
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
fn get_key_range(&self) -> Range<Key> {
self.layer_desc().key_range.clone()
}
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
fn get_lsn_range(&self) -> Range<Lsn> {
self.layer_desc().lsn_range.clone()
}
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
fn is_incremental(&self) -> bool {
self.layer_desc().is_incremental
}
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
fn short_id(&self) -> String {
self.layer_desc().short_id()
}
fn is_incremental(&self) -> bool {
self.is_incremental
}
impl PersistentLayer for LayerDescriptor {
fn layer_desc(&self) -> &PersistentLayerDesc {
&self.base
}
fn local_path(&self) -> Option<PathBuf> {
unimplemented!()
}
fn iter(&self, _: &RequestContext) -> Result<LayerIter<'_>> {
unimplemented!()
}
fn key_iter(&self, _: &RequestContext) -> Result<LayerKeyIter<'_>> {
unimplemented!()
}
fn delete_resident_layer_file(&self) -> Result<()> {
unimplemented!()
}
fn info(&self, _: LayerAccessStatsReset) -> HistoricLayerInfo {
unimplemented!()
}
fn access_stats(&self) -> &LayerAccessStats {
unimplemented!()
}
fn get_value_reconstruct_data(
&self,
_key: Key,
_lsn_range: Range<Lsn>,
_reconstruct_data: &mut ValueReconstructState,
_ctx: &RequestContext,
) -> Result<ValueReconstructResult> {
todo!("This method shouldn't be part of the Layer trait")
}
impl From<DeltaFileName> for LayerDescriptor {
fn from(value: DeltaFileName) -> Self {
LayerDescriptor {
base: PersistentLayerDesc::new_delta(
TenantId::from_array([0; 16]),
TimelineId::from_array([0; 16]),
value.key_range,
value.lsn_range,
233,
),
}
}
fn short_id(&self) -> String {
self.short_id.clone()
}
impl From<ImageFileName> for LayerDescriptor {
fn from(value: ImageFileName) -> Self {
LayerDescriptor {
base: PersistentLayerDesc::new_img(
TenantId::from_array([0; 16]),
TimelineId::from_array([0; 16]),
value.key_range,
value.lsn,
false,
233,
),
}
fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
todo!()
}
}
impl From<DeltaFileName> for LayerDescriptor {
fn from(value: DeltaFileName) -> Self {
let short_id = value.to_string();
LayerDescriptor {
key: value.key_range,
lsn: value.lsn_range,
is_incremental: true,
short_id,
}
}
}
impl From<LayerFileName> for LayerDescriptor {
fn from(value: LayerFileName) -> Self {
match value {
LayerFileName::Delta(d) => Self::from(d),
LayerFileName::Image(i) => Self::from(i),
}
impl From<ImageFileName> for LayerDescriptor {
fn from(value: ImageFileName) -> Self {
let short_id = value.to_string();
let lsn = value.lsn_as_range();
LayerDescriptor {
key: value.key_range,
lsn,
is_incremental: false,
short_id,
}
}
}
impl From<LayerFileName> for LayerDescriptor {
fn from(value: LayerFileName) -> Self {
match value {
LayerFileName::Delta(d) => Self::from(d),
LayerFileName::Image(i) => Self::from(i),
}
}
}

View File

@@ -37,7 +37,6 @@ use crate::virtual_file::VirtualFile;
use crate::{walrecord, TEMP_FILE_SUFFIX};
use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
use anyhow::{bail, ensure, Context, Result};
use once_cell::sync::OnceCell;
use pageserver_api::models::{HistoricLayerInfo, LayerAccessKind};
use rand::{distributions::Alphanumeric, Rng};
use serde::{Deserialize, Serialize};
@@ -47,6 +46,7 @@ use std::io::{Seek, SeekFrom};
use std::ops::Range;
use std::os::unix::fs::FileExt;
use std::path::{Path, PathBuf};
use std::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};
use tracing::*;
use utils::{
@@ -184,7 +184,7 @@ pub struct DeltaLayer {
access_stats: LayerAccessStats,
inner: OnceCell<DeltaLayerInner>,
inner: RwLock<DeltaLayerInner>,
}
impl std::fmt::Debug for DeltaLayer {
@@ -201,17 +201,21 @@ impl std::fmt::Debug for DeltaLayer {
}
pub struct DeltaLayerInner {
/// If false, the fields below have not been loaded into memory yet.
loaded: bool,
// values copied from summary
index_start_blk: u32,
index_root_blk: u32,
/// Reader object for reading blocks from the file.
file: FileBlockReader<VirtualFile>,
/// Reader object for reading blocks from the file. (None if not loaded yet)
file: Option<FileBlockReader<VirtualFile>>,
}
impl std::fmt::Debug for DeltaLayerInner {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("DeltaLayerInner")
.field("loaded", &self.loaded)
.field("index_start_blk", &self.index_start_blk)
.field("index_root_blk", &self.index_root_blk)
.finish()
@@ -222,14 +226,13 @@ impl Layer for DeltaLayer {
/// debugging function to print out the contents of the layer
fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
println!(
"----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} size {} ----",
"----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} ----",
self.desc.tenant_id,
self.desc.timeline_id,
self.desc.key_range.start,
self.desc.key_range.end,
self.desc.lsn_range.start,
self.desc.lsn_range.end,
self.desc.file_size
self.desc.lsn_range.end
);
if !verbose {
@@ -243,7 +246,7 @@ impl Layer for DeltaLayer {
inner.index_start_blk, inner.index_root_blk
);
let file = &inner.file;
let file = inner.file.as_ref().unwrap();
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
inner.index_start_blk,
inner.index_root_blk,
@@ -312,7 +315,7 @@ impl Layer for DeltaLayer {
let inner = self.load(LayerAccessKind::GetValueReconstructData, ctx)?;
// Scan the page versions backwards, starting from `lsn`.
let file = &inner.file;
let file = inner.file.as_ref().unwrap();
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
inner.index_start_blk,
inner.index_root_blk,
@@ -497,22 +500,51 @@ impl DeltaLayer {
/// Open the underlying file and read the metadata into memory, if it's
/// not loaded already.
///
fn load(&self, access_kind: LayerAccessKind, ctx: &RequestContext) -> Result<&DeltaLayerInner> {
fn load(
&self,
access_kind: LayerAccessKind,
ctx: &RequestContext,
) -> Result<RwLockReadGuard<DeltaLayerInner>> {
self.access_stats
.record_access(access_kind, ctx.task_kind());
// Quick exit if already loaded
self.inner
.get_or_try_init(|| self.load_inner())
.with_context(|| format!("Failed to load delta layer {}", self.path().display()))
loop {
// Quick exit if already loaded
let inner = self.inner.read().unwrap();
if inner.loaded {
return Ok(inner);
}
// Need to open the file and load the metadata. Upgrade our lock to
// a write lock. (Or rather, release and re-lock in write mode.)
drop(inner);
let inner = self.inner.write().unwrap();
if !inner.loaded {
self.load_inner(inner).with_context(|| {
format!("Failed to load delta layer {}", self.path().display())
})?;
} else {
// Another thread loaded it while we were not holding the lock.
}
// We now have the file open and loaded. The std library RwLock has no
// way to downgrade a write lock to a read lock, so we have to release
// and re-lock in read mode. (To be precise, the lock guard was moved in the
// above call to `load_inner`, so it's already been released). And
// while we do that, another thread could unload again, so we have
// to re-check and retry if that happens.
}
}
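The release-and-re-lock dance above is a reusable pattern; a minimal standalone sketch of it on std's RwLock (hypothetical `Inner`, not the pageserver type):
use std::sync::{RwLock, RwLockReadGuard};
struct Inner {
    loaded: bool,
    data: Option<String>,
}
struct Lazy {
    inner: RwLock<Inner>,
}
impl Lazy {
    fn load(&self) -> RwLockReadGuard<'_, Inner> {
        loop {
            // Quick exit if already loaded.
            let guard = self.inner.read().unwrap();
            if guard.loaded {
                return guard;
            }
            // "Upgrade" by releasing and re-locking in write mode.
            drop(guard);
            let mut guard = self.inner.write().unwrap();
            if !guard.loaded {
                // Do the expensive load under the write lock.
                guard.data = Some("loaded".to_string());
                guard.loaded = true;
            }
            // The write guard drops here; retry the read path, re-checking
            // in case another thread unloads in between.
        }
    }
}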
fn load_inner(&self) -> Result<DeltaLayerInner> {
fn load_inner(&self, mut inner: RwLockWriteGuard<DeltaLayerInner>) -> Result<()> {
let path = self.path();
let file = VirtualFile::open(&path)
.with_context(|| format!("Failed to open file '{}'", path.display()))?;
let file = FileBlockReader::new(file);
// Open the file if it's not open already.
if inner.file.is_none() {
let file = VirtualFile::open(&path)
.with_context(|| format!("Failed to open file '{}'", path.display()))?;
inner.file = Some(FileBlockReader::new(file));
}
let file = inner.file.as_mut().unwrap();
let summary_blk = file.read_blk(0)?;
let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
@@ -539,13 +571,13 @@ impl DeltaLayer {
}
}
inner.index_start_blk = actual_summary.index_start_blk;
inner.index_root_blk = actual_summary.index_root_blk;
debug!("loaded from {}", &path.display());
Ok(DeltaLayerInner {
file,
index_start_blk: actual_summary.index_start_blk,
index_root_blk: actual_summary.index_root_blk,
})
inner.loaded = true;
Ok(())
}
/// Create a DeltaLayer struct representing an existing file on disk.
@@ -567,7 +599,12 @@ impl DeltaLayer {
file_size,
),
access_stats,
inner: once_cell::sync::OnceCell::new(),
inner: RwLock::new(DeltaLayerInner {
loaded: false,
file: None,
index_start_blk: 0,
index_root_blk: 0,
}),
}
}
@@ -594,7 +631,12 @@ impl DeltaLayer {
metadata.len(),
),
access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
inner: once_cell::sync::OnceCell::new(),
inner: RwLock::new(DeltaLayerInner {
loaded: false,
file: None,
index_start_blk: 0,
index_root_blk: 0,
}),
})
}
@@ -758,7 +800,12 @@ impl DeltaLayerWriterInner {
metadata.len(),
),
access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
inner: once_cell::sync::OnceCell::new(),
inner: RwLock::new(DeltaLayerInner {
loaded: false,
file: None,
index_start_blk,
index_root_blk,
}),
};
// fsync the file
@@ -893,13 +940,13 @@ struct DeltaValueIter<'a> {
reader: BlockCursor<Adapter<'a>>,
}
struct Adapter<'a>(&'a DeltaLayerInner);
struct Adapter<'a>(RwLockReadGuard<'a, DeltaLayerInner>);
impl<'a> BlockReader for Adapter<'a> {
type BlockLease = PageReadGuard<'static>;
fn read_blk(&self, blknum: u32) -> Result<Self::BlockLease, std::io::Error> {
self.0.file.read_blk(blknum)
self.0.file.as_ref().unwrap().read_blk(blknum)
}
}
@@ -912,8 +959,8 @@ impl<'a> Iterator for DeltaValueIter<'a> {
}
impl<'a> DeltaValueIter<'a> {
fn new(inner: &'a DeltaLayerInner) -> Result<Self> {
let file = &inner.file;
fn new(inner: RwLockReadGuard<'a, DeltaLayerInner>) -> Result<Self> {
let file = inner.file.as_ref().unwrap();
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
inner.index_start_blk,
inner.index_root_blk,
@@ -986,8 +1033,8 @@ impl Iterator for DeltaKeyIter {
}
impl<'a> DeltaKeyIter {
fn new(inner: &'a DeltaLayerInner) -> Result<Self> {
let file = &inner.file;
fn new(inner: RwLockReadGuard<'a, DeltaLayerInner>) -> Result<Self> {
let file = inner.file.as_ref().unwrap();
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
inner.index_start_blk,
inner.index_root_blk,
@@ -1027,21 +1074,3 @@ impl<'a> DeltaKeyIter {
Ok(iter)
}
}
#[cfg(test)]
mod test {
use super::DeltaKeyIter;
use super::DeltaLayer;
use super::DeltaValueIter;
// We will soon need the iters to be `Send` in the compaction code.
// Cf https://github.com/neondatabase/neon/pull/4462#issuecomment-1587398883
// Cf https://github.com/neondatabase/neon/issues/4471
#[test]
fn is_send() {
fn assert_send<T: Send>() {}
assert_send::<DeltaLayer>();
assert_send::<DeltaValueIter>();
assert_send::<DeltaKeyIter>();
}
}

View File

@@ -153,13 +153,12 @@ impl Layer for ImageLayer {
/// debugging function to print out the contents of the layer
fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
println!(
"----- image layer for ten {} tli {} key {}-{} at {} size {} ----",
"----- image layer for ten {} tli {} key {}-{} at {} ----",
self.desc.tenant_id,
self.desc.timeline_id,
self.desc.key_range.start,
self.desc.key_range.end,
self.lsn,
self.desc.file_size
self.lsn
);
if !verbose {
@@ -213,11 +212,7 @@ impl Layer for ImageLayer {
reconstruct_state.img = Some((self.lsn, value));
Ok(ValueReconstructResult::Complete)
} else {
if self.desc.is_incremental {
Ok(ValueReconstructResult::Continue)
} else {
Ok(ValueReconstructResult::Missing)
}
Ok(ValueReconstructResult::Missing)
}
}
@@ -409,7 +404,7 @@ impl ImageLayer {
timeline_id,
filename.key_range.clone(),
filename.lsn,
true,
false,
file_size,
), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
lsn: filename.lsn,
@@ -441,7 +436,7 @@ impl ImageLayer {
summary.timeline_id,
summary.key_range,
summary.lsn,
true,
false,
metadata.len(),
), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
lsn: summary.lsn,
@@ -486,14 +481,12 @@ struct ImageLayerWriterInner {
path: PathBuf,
timeline_id: TimelineId,
tenant_id: TenantId,
key_range: Range<Key>,
lsn: Lsn,
is_incremental: bool,
blob_writer: WriteBlobWriter<VirtualFile>,
tree: DiskBtreeBuilder<BlockBuf, KEY_SIZE>,
start_key: Key,
last_key: Option<Key>,
}
impl ImageLayerWriterInner {
@@ -504,8 +497,8 @@ impl ImageLayerWriterInner {
conf: &'static PageServerConf,
timeline_id: TimelineId,
tenant_id: TenantId,
key_range: &Range<Key>,
lsn: Lsn,
start_key: Key,
is_incremental: bool,
) -> anyhow::Result<Self> {
// Create the file initially with a temporary filename.
@@ -515,7 +508,7 @@ impl ImageLayerWriterInner {
timeline_id,
tenant_id,
&ImageFileName {
key_range: start_key..start_key, // TODO(chi): use number instead of dummy range
key_range: key_range.clone(),
lsn,
},
);
@@ -537,12 +530,11 @@ impl ImageLayerWriterInner {
path,
timeline_id,
tenant_id,
key_range: key_range.clone(),
lsn,
tree: tree_builder,
blob_writer,
is_incremental,
start_key,
last_key: None,
};
Ok(writer)
@@ -554,14 +546,7 @@ impl ImageLayerWriterInner {
/// The page versions must be appended in blknum order.
///
fn put_image(&mut self, key: Key, img: &[u8]) -> anyhow::Result<()> {
if cfg!(debug_assertions) {
ensure!(key >= self.start_key);
if let Some(last_key) = self.last_key.as_ref() {
ensure!(last_key < &key);
}
self.last_key = Some(key.clone());
}
ensure!(self.key_range.contains(&key));
let off = self.blob_writer.write_blob(img)?;
let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
@@ -574,7 +559,7 @@ impl ImageLayerWriterInner {
///
/// Finish writing the image layer.
///
fn finish(self, end_key: Key) -> anyhow::Result<ImageLayer> {
fn finish(self) -> anyhow::Result<ImageLayer> {
let index_start_blk =
((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;
@@ -587,15 +572,13 @@ impl ImageLayerWriterInner {
file.write_all(buf.as_ref())?;
}
let key_range = self.start_key.clone()..end_key;
// Fill in the summary on blk 0
let summary = Summary {
magic: IMAGE_FILE_MAGIC,
format_version: STORAGE_FORMAT_VERSION,
tenant_id: self.tenant_id,
timeline_id: self.timeline_id,
key_range: key_range.clone(),
key_range: self.key_range.clone(),
lsn: self.lsn,
index_start_blk,
index_root_blk,
@@ -610,7 +593,7 @@ impl ImageLayerWriterInner {
let desc = PersistentLayerDesc::new_img(
self.tenant_id,
self.timeline_id,
key_range.clone(),
self.key_range.clone(),
self.lsn,
self.is_incremental, // for now, image layer ALWAYS covers the full range
metadata.len(),
@@ -644,7 +627,7 @@ impl ImageLayerWriterInner {
self.timeline_id,
self.tenant_id,
&ImageFileName {
key_range,
key_range: self.key_range.clone(),
lsn: self.lsn,
},
);
@@ -654,10 +637,6 @@ impl ImageLayerWriterInner {
Ok(layer)
}
fn size(&self) -> u64 {
self.blob_writer.size() + self.tree.borrow_writer().size()
}
}
/// A builder object for constructing a new image layer.
@@ -694,7 +673,7 @@ impl ImageLayerWriter {
conf: &'static PageServerConf,
timeline_id: TimelineId,
tenant_id: TenantId,
start_key: Key,
key_range: &Range<Key>,
lsn: Lsn,
is_incremental: bool,
) -> anyhow::Result<ImageLayerWriter> {
@@ -703,8 +682,8 @@ impl ImageLayerWriter {
conf,
timeline_id,
tenant_id,
key_range,
lsn,
start_key,
is_incremental,
)?),
})
@@ -722,12 +701,8 @@ impl ImageLayerWriter {
///
/// Finish writing the image layer.
///
pub fn finish(mut self, end_key: Key) -> anyhow::Result<ImageLayer> {
self.inner.take().unwrap().finish(end_key)
}
pub fn size(&self) -> u64 {
self.inner.as_ref().unwrap().size()
pub fn finish(mut self) -> anyhow::Result<ImageLayer> {
self.inner.take().unwrap().finish()
}
}

View File

@@ -11,7 +11,6 @@ use crate::tenant::blob_io::{BlobCursor, BlobWriter};
use crate::tenant::block_io::BlockReader;
use crate::tenant::ephemeral_file::EphemeralFile;
use crate::tenant::storage_layer::{ValueReconstructResult, ValueReconstructState};
use crate::tenant::timeline::ENABLE_TIERED_COMPACTION;
use crate::walrecord;
use anyhow::{ensure, Result};
use pageserver_api::models::InMemoryLayerInfo;
@@ -150,8 +149,8 @@ impl Layer for InMemoryLayer {
.unwrap_or_default();
println!(
"----- in-memory layer LSNs {}-{} ----",
self.start_lsn, end_str,
"----- in-memory layer for tli {} LSNs {}-{} ----",
self.timeline_id, self.start_lsn, end_str,
);
if !verbose {
@@ -305,7 +304,7 @@ impl InMemoryLayer {
Ok(())
}
pub async fn put_tombstone(&self, _key_range: Range<Key>, _lsn: Lsn) -> Result<()> {
pub fn put_tombstone(&self, _key_range: Range<Key>, _lsn: Lsn) -> Result<()> {
// TODO: Currently, we just leak the storage for any deleted keys
Ok(())
@@ -342,18 +341,11 @@ impl InMemoryLayer {
// rare though, so we just accept the potential latency hit for now.
let inner = self.inner.read().unwrap();
let mut keys: Vec<(&Key, &VecMap<Lsn, u64>)> = inner.index.iter().collect();
keys.sort_by_key(|k| k.0);
let mut delta_layer_writer = DeltaLayerWriter::new(
self.conf,
self.timeline_id,
self.tenant_id,
if ENABLE_TIERED_COMPACTION {
keys.first().unwrap().0.clone()
} else {
Key::MIN
},
Key::MIN,
self.start_lsn..inner.end_lsn.unwrap(),
)?;
@@ -361,6 +353,9 @@ impl InMemoryLayer {
let mut cursor = inner.file.block_cursor();
let mut keys: Vec<(&Key, &VecMap<Lsn, u64>)> = inner.index.iter().collect();
keys.sort_by_key(|k| k.0);
for (key, vec_map) in keys.iter() {
let key = **key;
// Write all page versions
@@ -371,11 +366,7 @@ impl InMemoryLayer {
}
}
let delta_layer = delta_layer_writer.finish(if ENABLE_TIERED_COMPACTION {
keys.last().unwrap().0.next()
} else {
Key::MAX
})?;
let delta_layer = delta_layer_writer.finish(Key::MAX)?;
Ok(delta_layer)
}
}

View File

@@ -9,12 +9,10 @@ use crate::{context::RequestContext, repository::Key};
use super::{DeltaFileName, ImageFileName, LayerFileName};
use serde::{Deserialize, Serialize};
/// A unique identifier of a persistent layer. This is different from `LayerDescriptor`, which is only used in the
/// benchmarks. This struct contains all necessary information to find the image / delta layer. It also provides
/// a unified way to generate layer information like file name.
#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
#[derive(Debug, PartialEq, Eq, Clone)]
pub struct PersistentLayerDesc {
pub tenant_id: TenantId,
pub timeline_id: TimelineId,
@@ -52,19 +50,6 @@ impl PersistentLayerDesc {
self.filename().file_name()
}
#[cfg(test)]
pub fn new_test(key_range: Range<Key>) -> Self {
Self {
tenant_id: TenantId::generate(),
timeline_id: TimelineId::generate(),
key_range,
lsn_range: Lsn(0)..Lsn(1),
is_delta: false,
is_incremental: false,
file_size: 0,
}
}
pub fn new_img(
tenant_id: TenantId,
timeline_id: TimelineId,
@@ -173,14 +158,13 @@ impl PersistentLayerDesc {
pub fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
println!(
"----- layer for keys {}-{} lsn {}-{} size {} is_delta {} is_incremental {} ----",
"----- layer for ten {} tli {} keys {}-{} lsn {}-{} ----",
self.tenant_id,
self.timeline_id,
self.key_range.start,
self.key_range.end,
self.lsn_range.start,
self.lsn_range.end,
self.file_size,
self.is_delta,
self.is_incremental
self.lsn_range.end
);
Ok(())

View File

@@ -218,12 +218,15 @@ impl RemoteLayer {
}
/// Create a Layer struct representing this layer, after it has been downloaded.
pub fn create_downloaded_layer(
pub fn create_downloaded_layer<L>(
&self,
layer_map_lock_held_witness: &BatchedUpdates<'_>,
layer_map_lock_held_witness: &BatchedUpdates<'_, L>,
conf: &'static PageServerConf,
file_size: u64,
) -> Arc<dyn PersistentLayer> {
) -> Arc<dyn PersistentLayer>
where
L: ?Sized + Layer,
{
if self.desc.is_delta {
let fname = self.desc.delta_file_name();
Arc::new(DeltaLayer::new(

View File

@@ -14,43 +14,35 @@ use tokio_util::sync::CancellationToken;
use tracing::*;
use utils::completion;
use super::timeline::ENABLE_TIERED_COMPACTION;
/// Start per tenant background loops: compaction and gc.
pub fn start_background_loops(
tenant: &Arc<Tenant>,
background_jobs_can_start: Option<&completion::Barrier>,
) {
let tenant_id = tenant.tenant_id;
// start two compaction threads
let range = if ENABLE_TIERED_COMPACTION { 0..4 } else { 0..1 };
for cpt_id in range {
task_mgr::spawn(
BACKGROUND_RUNTIME.handle(),
TaskKind::Compaction,
Some(tenant_id),
None,
&format!("compactor for tenant {tenant_id}"),
false,
{
let tenant = Arc::clone(tenant);
let background_jobs_can_start = background_jobs_can_start.cloned();
async move {
let cancel = task_mgr::shutdown_token();
tokio::select! {
_ = cancel.cancelled() => { return Ok(()) },
_ = completion::Barrier::maybe_wait(background_jobs_can_start) => {}
};
compaction_loop(tenant, cancel)
.instrument(
info_span!("compaction_loop", tenant_id = %tenant_id, cpt_id = %cpt_id),
)
.await;
Ok(())
}
},
);
}
task_mgr::spawn(
BACKGROUND_RUNTIME.handle(),
TaskKind::Compaction,
Some(tenant_id),
None,
&format!("compactor for tenant {tenant_id}"),
false,
{
let tenant = Arc::clone(tenant);
let background_jobs_can_start = background_jobs_can_start.cloned();
async move {
let cancel = task_mgr::shutdown_token();
tokio::select! {
_ = cancel.cancelled() => { return Ok(()) },
_ = completion::Barrier::maybe_wait(background_jobs_can_start) => {}
};
compaction_loop(tenant, cancel)
.instrument(info_span!("compaction_loop", tenant_id = %tenant_id))
.await;
Ok(())
}
},
);
task_mgr::spawn(
BACKGROUND_RUNTIME.handle(),
TaskKind::GarbageCollector,

File diff suppressed because it is too large

View File

@@ -197,11 +197,9 @@ impl Timeline {
// We don't want to hold the layer map lock during eviction.
// So, we just need to deal with this.
let candidates: Vec<Arc<dyn PersistentLayer>> = {
let guard = self.layers.read().await;
let (layers, _) = &*guard;
let layers = self.layers.read().unwrap();
let mut candidates = Vec::new();
for hist_layer in layers.iter_historic_layers() {
let hist_layer = self.lcache.get_from_desc(&hist_layer);
if hist_layer.is_remote_layer() {
continue;
}

View File

@@ -1324,8 +1324,7 @@ mod tests {
async fn dummy_state(harness: &TenantHarness<'_>) -> ConnectionManagerState {
let (tenant, ctx) = harness.load().await;
let timeline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x8), crate::DEFAULT_PG_VERSION, &ctx)
.await
.create_test_timeline(TIMELINE_ID, Lsn(0), crate::DEFAULT_PG_VERSION, &ctx)
.expect("Failed to create an empty timeline for dummy wal connection manager");
ConnectionManagerState {

View File

@@ -304,15 +304,12 @@ pub(super) async fn handle_walreceiver_connection(
}
}
timeline
.check_checkpoint_distance()
.await
.with_context(|| {
format!(
"Failed to check checkpoint distance for timeline {}",
timeline.timeline_id
)
})?;
timeline.check_checkpoint_distance().with_context(|| {
format!(
"Failed to check checkpoint distance for timeline {}",
timeline.timeline_id
)
})?;
if let Some(last_lsn) = status_update {
let timeline_remote_consistent_lsn =

View File

@@ -333,7 +333,7 @@ impl<'a> WalIngest<'a> {
// Now that this record has been fully handled, including updating the
// checkpoint data, let the repository know that it is up-to-date to this LSN
modification.commit().await?;
modification.commit()?;
Ok(())
}
@@ -1171,6 +1171,7 @@ impl<'a> WalIngest<'a> {
#[cfg(test)]
mod tests {
use super::*;
use crate::pgdatadir_mapping::create_test_timeline;
use crate::tenant::harness::*;
use crate::tenant::Timeline;
use postgres_ffi::v14::xlog_utils::SIZEOF_CHECKPOINT;
@@ -1199,7 +1200,7 @@ mod tests {
let mut m = tline.begin_modification(Lsn(0x10));
m.put_checkpoint(ZERO_CHECKPOINT.clone())?;
m.put_relmap_file(0, 111, Bytes::from(""), ctx).await?; // dummy relmapper file
m.commit().await?;
m.commit()?;
let walingest = WalIngest::new(tline, Lsn(0x10), ctx).await?;
Ok(walingest)
@@ -1208,9 +1209,7 @@ mod tests {
#[tokio::test]
async fn test_relsize() -> Result<()> {
let (tenant, ctx) = TenantHarness::create("test_relsize")?.load().await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx)
.await?;
let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION, &ctx)?;
let mut walingest = init_walingest_test(&tline, &ctx).await?;
let mut m = tline.begin_modification(Lsn(0x20));
@@ -1218,22 +1217,22 @@ mod tests {
walingest
.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"), &ctx)
.await?;
m.commit().await?;
m.commit()?;
let mut m = tline.begin_modification(Lsn(0x30));
walingest
.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 3"), &ctx)
.await?;
m.commit().await?;
m.commit()?;
let mut m = tline.begin_modification(Lsn(0x40));
walingest
.put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1 at 4"), &ctx)
.await?;
m.commit().await?;
m.commit()?;
let mut m = tline.begin_modification(Lsn(0x50));
walingest
.put_rel_page_image(&mut m, TESTREL_A, 2, TEST_IMG("foo blk 2 at 5"), &ctx)
.await?;
m.commit().await?;
m.commit()?;
assert_current_logical_size(&tline, Lsn(0x50));
@@ -1319,7 +1318,7 @@ mod tests {
walingest
.put_rel_truncation(&mut m, TESTREL_A, 2, &ctx)
.await?;
m.commit().await?;
m.commit()?;
assert_current_logical_size(&tline, Lsn(0x60));
// Check reported size and contents after truncation
@@ -1361,7 +1360,7 @@ mod tests {
walingest
.put_rel_truncation(&mut m, TESTREL_A, 0, &ctx)
.await?;
m.commit().await?;
m.commit()?;
assert_eq!(
tline
.get_rel_size(TESTREL_A, Lsn(0x68), false, &ctx)
@@ -1374,7 +1373,7 @@ mod tests {
walingest
.put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1"), &ctx)
.await?;
m.commit().await?;
m.commit()?;
assert_eq!(
tline
.get_rel_size(TESTREL_A, Lsn(0x70), false, &ctx)
@@ -1399,7 +1398,7 @@ mod tests {
walingest
.put_rel_page_image(&mut m, TESTREL_A, 1500, TEST_IMG("foo blk 1500"), &ctx)
.await?;
m.commit().await?;
m.commit()?;
assert_eq!(
tline
.get_rel_size(TESTREL_A, Lsn(0x80), false, &ctx)
@@ -1429,16 +1428,14 @@ mod tests {
#[tokio::test]
async fn test_drop_extend() -> Result<()> {
let (tenant, ctx) = TenantHarness::create("test_drop_extend")?.load().await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx)
.await?;
let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION, &ctx)?;
let mut walingest = init_walingest_test(&tline, &ctx).await?;
let mut m = tline.begin_modification(Lsn(0x20));
walingest
.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"), &ctx)
.await?;
m.commit().await?;
m.commit()?;
// Check that rel exists and size is correct
assert_eq!(
@@ -1457,7 +1454,7 @@ mod tests {
// Drop rel
let mut m = tline.begin_modification(Lsn(0x30));
walingest.put_rel_drop(&mut m, TESTREL_A, &ctx).await?;
m.commit().await?;
m.commit()?;
// Check that rel is not visible anymore
assert_eq!(
@@ -1475,7 +1472,7 @@ mod tests {
walingest
.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 4"), &ctx)
.await?;
m.commit().await?;
m.commit()?;
// Check that rel exists and size is correct
assert_eq!(
@@ -1500,9 +1497,7 @@ mod tests {
#[tokio::test]
async fn test_truncate_extend() -> Result<()> {
let (tenant, ctx) = TenantHarness::create("test_truncate_extend")?.load().await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx)
.await?;
let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION, &ctx)?;
let mut walingest = init_walingest_test(&tline, &ctx).await?;
// Create a 20 MB relation (the size is arbitrary)
@@ -1514,7 +1509,7 @@ mod tests {
.put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data), &ctx)
.await?;
}
m.commit().await?;
m.commit()?;
// The relation was created at LSN 20, not visible at LSN 1 yet.
assert_eq!(
@@ -1559,7 +1554,7 @@ mod tests {
walingest
.put_rel_truncation(&mut m, TESTREL_A, 1, &ctx)
.await?;
m.commit().await?;
m.commit()?;
// Check reported size and contents after truncation
assert_eq!(
@@ -1608,7 +1603,7 @@ mod tests {
.put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data), &ctx)
.await?;
}
m.commit().await?;
m.commit()?;
assert_eq!(
tline
@@ -1642,9 +1637,7 @@ mod tests {
#[tokio::test]
async fn test_large_rel() -> Result<()> {
let (tenant, ctx) = TenantHarness::create("test_large_rel")?.load().await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx)
.await?;
let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION, &ctx)?;
let mut walingest = init_walingest_test(&tline, &ctx).await?;
let mut lsn = 0x10;
@@ -1655,7 +1648,7 @@ mod tests {
walingest
.put_rel_page_image(&mut m, TESTREL_A, blknum as BlockNumber, img, &ctx)
.await?;
m.commit().await?;
m.commit()?;
}
assert_current_logical_size(&tline, Lsn(lsn));
@@ -1671,7 +1664,7 @@ mod tests {
walingest
.put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE, &ctx)
.await?;
m.commit().await?;
m.commit()?;
assert_eq!(
tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?,
RELSEG_SIZE
@@ -1684,7 +1677,7 @@ mod tests {
walingest
.put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE - 1, &ctx)
.await?;
m.commit().await?;
m.commit()?;
assert_eq!(
tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?,
RELSEG_SIZE - 1
@@ -1700,7 +1693,7 @@ mod tests {
walingest
.put_rel_truncation(&mut m, TESTREL_A, size as BlockNumber, &ctx)
.await?;
m.commit().await?;
m.commit()?;
assert_eq!(
tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?,
size as BlockNumber

poetry.lock generated
View File

@@ -1,4 +1,4 @@
# This file is automatically @generated by Poetry and should not be changed by hand.
# This file is automatically @generated by Poetry 1.4.2 and should not be changed by hand.
[[package]]
name = "aiohttp"
@@ -855,31 +855,35 @@ files = [
[[package]]
name = "cryptography"
version = "41.0.0"
version = "39.0.1"
description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers."
category = "main"
optional = false
python-versions = ">=3.7"
python-versions = ">=3.6"
files = [
{file = "cryptography-41.0.0-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:3c5ef25d060c80d6d9f7f9892e1d41bb1c79b78ce74805b8cb4aa373cb7d5ec8"},
{file = "cryptography-41.0.0-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:8362565b3835ceacf4dc8f3b56471a2289cf51ac80946f9087e66dc283a810e0"},
{file = "cryptography-41.0.0-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3680248309d340fda9611498a5319b0193a8dbdb73586a1acf8109d06f25b92d"},
{file = "cryptography-41.0.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:84a165379cb9d411d58ed739e4af3396e544eac190805a54ba2e0322feb55c46"},
{file = "cryptography-41.0.0-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:4ab14d567f7bbe7f1cdff1c53d5324ed4d3fc8bd17c481b395db224fb405c237"},
{file = "cryptography-41.0.0-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:9f65e842cb02550fac96536edb1d17f24c0a338fd84eaf582be25926e993dde4"},
{file = "cryptography-41.0.0-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:b7f2f5c525a642cecad24ee8670443ba27ac1fab81bba4cc24c7b6b41f2d0c75"},
{file = "cryptography-41.0.0-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:7d92f0248d38faa411d17f4107fc0bce0c42cae0b0ba5415505df72d751bf62d"},
{file = "cryptography-41.0.0-cp37-abi3-win32.whl", hash = "sha256:34d405ea69a8b34566ba3dfb0521379b210ea5d560fafedf9f800a9a94a41928"},
{file = "cryptography-41.0.0-cp37-abi3-win_amd64.whl", hash = "sha256:344c6de9f8bda3c425b3a41b319522ba3208551b70c2ae00099c205f0d9fd3be"},
{file = "cryptography-41.0.0-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:88ff107f211ea696455ea8d911389f6d2b276aabf3231bf72c8853d22db755c5"},
{file = "cryptography-41.0.0-pp38-pypy38_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:b846d59a8d5a9ba87e2c3d757ca019fa576793e8758174d3868aecb88d6fc8eb"},
{file = "cryptography-41.0.0-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:f5d0bf9b252f30a31664b6f64432b4730bb7038339bd18b1fafe129cfc2be9be"},
{file = "cryptography-41.0.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:5c1f7293c31ebc72163a9a0df246f890d65f66b4a40d9ec80081969ba8c78cc9"},
{file = "cryptography-41.0.0-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:bf8fc66012ca857d62f6a347007e166ed59c0bc150cefa49f28376ebe7d992a2"},
{file = "cryptography-41.0.0-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:a4fc68d1c5b951cfb72dfd54702afdbbf0fb7acdc9b7dc4301bbf2225a27714d"},
{file = "cryptography-41.0.0-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:14754bcdae909d66ff24b7b5f166d69340ccc6cb15731670435efd5719294895"},
{file = "cryptography-41.0.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:0ddaee209d1cf1f180f1efa338a68c4621154de0afaef92b89486f5f96047c55"},
{file = "cryptography-41.0.0.tar.gz", hash = "sha256:6b71f64beeea341c9b4f963b48ee3b62d62d57ba93eb120e1196b31dc1025e78"},
{file = "cryptography-39.0.1-cp36-abi3-macosx_10_12_universal2.whl", hash = "sha256:6687ef6d0a6497e2b58e7c5b852b53f62142cfa7cd1555795758934da363a965"},
{file = "cryptography-39.0.1-cp36-abi3-macosx_10_12_x86_64.whl", hash = "sha256:706843b48f9a3f9b9911979761c91541e3d90db1ca905fd63fee540a217698bc"},
{file = "cryptography-39.0.1-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:5d2d8b87a490bfcd407ed9d49093793d0f75198a35e6eb1a923ce1ee86c62b41"},
{file = "cryptography-39.0.1-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:83e17b26de248c33f3acffb922748151d71827d6021d98c70e6c1a25ddd78505"},
{file = "cryptography-39.0.1-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e124352fd3db36a9d4a21c1aa27fd5d051e621845cb87fb851c08f4f75ce8be6"},
{file = "cryptography-39.0.1-cp36-abi3-manylinux_2_24_x86_64.whl", hash = "sha256:5aa67414fcdfa22cf052e640cb5ddc461924a045cacf325cd164e65312d99502"},
{file = "cryptography-39.0.1-cp36-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:35f7c7d015d474f4011e859e93e789c87d21f6f4880ebdc29896a60403328f1f"},
{file = "cryptography-39.0.1-cp36-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:f24077a3b5298a5a06a8e0536e3ea9ec60e4c7ac486755e5fb6e6ea9b3500106"},
{file = "cryptography-39.0.1-cp36-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:f0c64d1bd842ca2633e74a1a28033d139368ad959872533b1bab8c80e8240a0c"},
{file = "cryptography-39.0.1-cp36-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:0f8da300b5c8af9f98111ffd512910bc792b4c77392a9523624680f7956a99d4"},
{file = "cryptography-39.0.1-cp36-abi3-win32.whl", hash = "sha256:fe913f20024eb2cb2f323e42a64bdf2911bb9738a15dba7d3cce48151034e3a8"},
{file = "cryptography-39.0.1-cp36-abi3-win_amd64.whl", hash = "sha256:ced4e447ae29ca194449a3f1ce132ded8fcab06971ef5f618605aacaa612beac"},
{file = "cryptography-39.0.1-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:807ce09d4434881ca3a7594733669bd834f5b2c6d5c7e36f8c00f691887042ad"},
{file = "cryptography-39.0.1-pp38-pypy38_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:c5caeb8188c24888c90b5108a441c106f7faa4c4c075a2bcae438c6e8ca73cef"},
{file = "cryptography-39.0.1-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:4789d1e3e257965e960232345002262ede4d094d1a19f4d3b52e48d4d8f3b885"},
{file = "cryptography-39.0.1-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:96f1157a7c08b5b189b16b47bc9db2332269d6680a196341bf30046330d15388"},
{file = "cryptography-39.0.1-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:e422abdec8b5fa8462aa016786680720d78bdce7a30c652b7fadf83a4ba35336"},
{file = "cryptography-39.0.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:b0afd054cd42f3d213bf82c629efb1ee5f22eba35bf0eec88ea9ea7304f511a2"},
{file = "cryptography-39.0.1-pp39-pypy39_pp73-manylinux_2_24_x86_64.whl", hash = "sha256:6f8ba7f0328b79f08bdacc3e4e66fb4d7aab0c3584e0bd41328dce5262e26b2e"},
{file = "cryptography-39.0.1-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:ef8b72fa70b348724ff1218267e7f7375b8de4e8194d1636ee60510aae104cd0"},
{file = "cryptography-39.0.1-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:aec5a6c9864be7df2240c382740fcf3b96928c46604eaa7f3091f58b878c0bb6"},
{file = "cryptography-39.0.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:fdd188c8a6ef8769f148f88f859884507b954cc64db6b52f66ef199bb9ad660a"},
{file = "cryptography-39.0.1.tar.gz", hash = "sha256:d1f6198ee6d9148405e49887803907fe8962a23e6c6f83ea7d98f1c0de375695"},
]
[package.dependencies]
@@ -888,12 +892,12 @@ cffi = ">=1.12"
[package.extras]
docs = ["sphinx (>=5.3.0)", "sphinx-rtd-theme (>=1.1.1)"]
docstest = ["pyenchant (>=1.6.11)", "sphinxcontrib-spelling (>=4.0.1)", "twine (>=1.12.0)"]
nox = ["nox"]
pep8test = ["black", "check-sdist", "mypy", "ruff"]
sdist = ["build"]
pep8test = ["black", "check-manifest", "mypy", "ruff", "types-pytz", "types-requests"]
sdist = ["setuptools-rust (>=0.11.4)"]
ssh = ["bcrypt (>=3.1.5)"]
test = ["pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-xdist"]
test = ["hypothesis (>=1.11.4,!=3.79.2)", "iso8601", "pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-shard (>=0.1.2)", "pytest-subtests", "pytest-xdist", "pytz"]
test-randomorder = ["pytest-randomly"]
tox = ["tox"]
[[package]]
name = "docker"

View File

@@ -3,19 +3,15 @@
//
use anyhow::{bail, Context, Result};
use clap::Parser;
use futures::future::BoxFuture;
use futures::stream::FuturesUnordered;
use futures::{FutureExt, StreamExt};
use remote_storage::RemoteStorageConfig;
use tokio::runtime::Handle;
use tokio::signal::unix::{signal, SignalKind};
use tokio::task::JoinError;
use toml_edit::Document;
use utils::signals::ShutdownSignals;
use std::fs::{self, File};
use std::io::{ErrorKind, Write};
use std::path::{Path, PathBuf};
use std::sync::Arc;
use std::thread;
use std::time::Duration;
use storage_broker::Uri;
use tokio::sync::mpsc;
@@ -24,21 +20,22 @@ use tracing::*;
use utils::pid_file;
use metrics::set_build_info_metric;
use safekeeper::broker;
use safekeeper::control_file;
use safekeeper::defaults::{
DEFAULT_HEARTBEAT_TIMEOUT, DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_MAX_OFFLOADER_LAG_BYTES,
DEFAULT_PG_LISTEN_ADDR,
};
use safekeeper::http;
use safekeeper::remove_wal;
use safekeeper::wal_backup;
use safekeeper::wal_service;
use safekeeper::GlobalTimelines;
use safekeeper::SafeKeeperConf;
use safekeeper::{broker, WAL_SERVICE_RUNTIME};
use safekeeper::{control_file, BROKER_RUNTIME};
use safekeeper::{http, WAL_REMOVER_RUNTIME};
use safekeeper::{remove_wal, WAL_BACKUP_RUNTIME};
use safekeeper::{wal_backup, HTTP_RUNTIME};
use storage_broker::DEFAULT_ENDPOINT;
use utils::auth::JwtAuth;
use utils::{
http::endpoint,
id::NodeId,
logging::{self, LogFormat},
project_git_version,
@@ -107,6 +104,10 @@ struct Args {
/// Safekeeper won't be elected for WAL offloading if it is lagging for more than this value in bytes
#[arg(long, default_value_t = DEFAULT_MAX_OFFLOADER_LAG_BYTES)]
max_offloader_lag: u64,
/// Number of threads for the WAL backup runtime; defaults to the number of
/// cores available to the system.
#[arg(long)]
wal_backup_threads: Option<usize>,
/// Number of max parallel WAL segments to be offloaded to remote storage.
#[arg(long, default_value = "5")]
wal_backup_parallel_jobs: usize,
@@ -120,14 +121,9 @@ struct Args {
/// Format for logging, either 'plain' or 'json'.
#[arg(long, default_value = "plain")]
log_format: String,
/// Run everything in a single-threaded current-thread runtime; might be
/// useful for debugging.
#[arg(long)]
current_thread_runtime: bool,
}
#[tokio::main(flavor = "current_thread")]
async fn main() -> anyhow::Result<()> {
fn main() -> anyhow::Result<()> {
let args = Args::parse();
if let Some(addr) = args.dump_control_file {
@@ -187,10 +183,10 @@ async fn main() -> anyhow::Result<()> {
heartbeat_timeout: args.heartbeat_timeout,
remote_storage: args.remote_storage,
max_offloader_lag_bytes: args.max_offloader_lag,
backup_runtime_threads: args.wal_backup_threads,
wal_backup_enabled: !args.disable_wal_backup,
backup_parallel_jobs: args.wal_backup_parallel_jobs,
auth,
current_thread_runtime: args.current_thread_runtime,
};
// initialize sentry if SENTRY_DSN is provided
@@ -198,14 +194,10 @@ async fn main() -> anyhow::Result<()> {
Some(GIT_VERSION.into()),
&[("node_id", &conf.my_id.to_string())],
);
start_safekeeper(conf).await
start_safekeeper(conf)
}
/// Result of joining any of the main tasks: the outer error means the task
/// failed to complete (e.g. panicked); the inner one is the error produced by
/// the task itself.
type JoinTaskRes = Result<anyhow::Result<()>, JoinError>;
async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
// Prevent running multiple safekeepers on the same directory
let lock_file_path = conf.workdir.join(PID_FILE_NAME);
let lock_file =
@@ -216,18 +208,14 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
// we need to release the lock file only when the current process is gone
std::mem::forget(lock_file);
info!("starting safekeeper WAL service on {}", conf.listen_pg_addr);
let pg_listener = tcp_listener::bind(conf.listen_pg_addr.clone()).map_err(|e| {
error!("failed to bind to address {}: {}", conf.listen_pg_addr, e);
let http_listener = tcp_listener::bind(conf.listen_http_addr.clone()).map_err(|e| {
error!("failed to bind to address {}: {}", conf.listen_http_addr, e);
e
})?;
info!(
"starting safekeeper HTTP service on {}",
conf.listen_http_addr
);
let http_listener = tcp_listener::bind(conf.listen_http_addr.clone()).map_err(|e| {
error!("failed to bind to address {}: {}", conf.listen_http_addr, e);
info!("starting safekeeper on {}", conf.listen_pg_addr);
let pg_listener = tcp_listener::bind(conf.listen_pg_addr.clone()).map_err(|e| {
error!("failed to bind to address {}: {}", conf.listen_pg_addr, e);
e
})?;
@@ -236,88 +224,71 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
let timeline_collector = safekeeper::metrics::TimelineCollector::new();
metrics::register_internal(Box::new(timeline_collector))?;
let mut threads = vec![];
let (wal_backup_launcher_tx, wal_backup_launcher_rx) = mpsc::channel(100);
// Load all timelines from disk to memory.
GlobalTimelines::init(conf.clone(), wal_backup_launcher_tx)?;
// Keep handles to main tasks to die if any of them disappears.
let mut tasks_handles: FuturesUnordered<BoxFuture<(String, JoinTaskRes)>> =
FuturesUnordered::new();
let conf_ = conf.clone();
threads.push(
thread::Builder::new()
.name("http_endpoint_thread".into())
.spawn(|| {
let router = http::make_router(conf_);
endpoint::serve_thread_main(
router,
http_listener,
std::future::pending(), // never shut down
)
.unwrap();
})?,
);
let conf_cloned = conf.clone();
let safekeeper_thread = thread::Builder::new()
.name("WAL service thread".into())
.spawn(|| wal_service::thread_main(conf_cloned, pg_listener))
.unwrap();
threads.push(safekeeper_thread);
let conf_ = conf.clone();
// Run everything in current thread rt, if asked.
if conf.current_thread_runtime {
info!("running in current thread runtime");
}
let current_thread_rt = conf
.current_thread_runtime
.then(|| Handle::try_current().expect("no runtime in main"));
let wal_service_handle = current_thread_rt
.as_ref()
.unwrap_or_else(|| WAL_SERVICE_RUNTIME.handle())
.spawn(wal_service::task_main(conf_, pg_listener))
// wrap with task name for error reporting
.map(|res| ("WAL service main".to_owned(), res));
tasks_handles.push(Box::pin(wal_service_handle));
threads.push(
thread::Builder::new()
.name("broker thread".into())
.spawn(|| {
broker::thread_main(conf_);
})?,
);
let conf_ = conf.clone();
let http_handle = current_thread_rt
.as_ref()
.unwrap_or_else(|| HTTP_RUNTIME.handle())
.spawn(http::task_main(conf_, http_listener))
.map(|res| ("HTTP service main".to_owned(), res));
tasks_handles.push(Box::pin(http_handle));
threads.push(
thread::Builder::new()
.name("WAL removal thread".into())
.spawn(|| {
remove_wal::thread_main(conf_);
})?,
);
let conf_ = conf.clone();
let broker_task_handle = current_thread_rt
.as_ref()
.unwrap_or_else(|| BROKER_RUNTIME.handle())
.spawn(broker::task_main(conf_).instrument(info_span!("broker")))
.map(|res| ("broker main".to_owned(), res));
tasks_handles.push(Box::pin(broker_task_handle));
let conf_ = conf.clone();
let wal_remover_handle = current_thread_rt
.as_ref()
.unwrap_or_else(|| WAL_REMOVER_RUNTIME.handle())
.spawn(remove_wal::task_main(conf_))
.map(|res| ("WAL remover".to_owned(), res));
tasks_handles.push(Box::pin(wal_remover_handle));
let conf_ = conf.clone();
let wal_backup_handle = current_thread_rt
.as_ref()
.unwrap_or_else(|| WAL_BACKUP_RUNTIME.handle())
.spawn(wal_backup::wal_backup_launcher_task_main(
conf_,
wal_backup_launcher_rx,
))
.map(|res| ("WAL backup launcher".to_owned(), res));
tasks_handles.push(Box::pin(wal_backup_handle));
threads.push(
thread::Builder::new()
.name("WAL backup launcher thread".into())
.spawn(move || {
wal_backup::wal_backup_launcher_thread_main(conf, wal_backup_launcher_rx);
})?,
);
set_build_info_metric(GIT_VERSION);
// TODO: put more thought into handling of failed threads.
// We should catch them and die if they are in trouble.
// TODO: update tokio-stream, convert to a real async Stream with
// SignalStream, map it to obtain the missing signal name, and combine the
// streams into a single stream we can easily wait on.
let mut sigquit_stream = signal(SignalKind::quit())?;
let mut sigint_stream = signal(SignalKind::interrupt())?;
let mut sigterm_stream = signal(SignalKind::terminate())?;
tokio::select! {
Some((task_name, res)) = tasks_handles.next()=> {
error!("{} task failed: {:?}, exiting", task_name, res);
std::process::exit(1);
}
// On any shutdown signal, log its receipt and exit. Additionally, handling
// SIGQUIT prevents a coredump.
_ = sigquit_stream.recv() => info!("received SIGQUIT, terminating"),
_ = sigint_stream.recv() => info!("received SIGINT, terminating"),
_ = sigterm_stream.recv() => info!("received SIGTERM, terminating")
};
std::process::exit(0);
// On any shutdown signal, log its receipt and exit. Additionally, handling
// SIGQUIT prevents a coredump.
ShutdownSignals::handle(|signal| {
info!("received {}, terminating", signal.name());
std::process::exit(0);
})
}
/// Determine safekeeper id.
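
For reference, the select-over-FuturesUnordered shape on the async side of this file can be reduced to a few lines. This is only a sketch: wal_service and http_service are placeholder tasks, and plain eprintln stands in for the tracing macros.

use futures::future::BoxFuture;
use futures::stream::FuturesUnordered;
use futures::{FutureExt, StreamExt};
use tokio::task::JoinError;

// Outer error: the task panicked or was aborted; inner: the task's own error.
type JoinTaskRes = Result<anyhow::Result<()>, JoinError>;

async fn wal_service() -> anyhow::Result<()> { Ok(()) } // placeholder
async fn http_service() -> anyhow::Result<()> { Ok(()) } // placeholder

#[tokio::main]
async fn main() {
    let mut tasks: FuturesUnordered<BoxFuture<'static, (String, JoinTaskRes)>> =
        FuturesUnordered::new();
    // Tag each JoinHandle with a name so the failure report can say who died.
    tasks.push(Box::pin(
        tokio::spawn(wal_service()).map(|res| ("WAL service".to_owned(), res)),
    ));
    tasks.push(Box::pin(
        tokio::spawn(http_service()).map(|res| ("HTTP service".to_owned(), res)),
    ));
    tokio::select! {
        // The first main task to finish (by error or panic) takes the process down.
        Some((name, res)) = tasks.next() => {
            eprintln!("{name} task failed: {res:?}, exiting");
            std::process::exit(1);
        }
        _ = tokio::signal::ctrl_c() => eprintln!("received SIGINT, terminating"),
    }
}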

View File

@@ -8,7 +8,7 @@ use anyhow::Error;
use anyhow::Result;
use storage_broker::parse_proto_ttid;
use storage_broker::proto::broker_service_client::BrokerServiceClient;
use storage_broker::proto::subscribe_safekeeper_info_request::SubscriptionKey as ProtoSubscriptionKey;
use storage_broker::proto::SubscribeSafekeeperInfoRequest;
use storage_broker::Request;
@@ -16,7 +16,7 @@ use storage_broker::Request;
use std::time::Duration;
use std::time::Instant;
use tokio::task::JoinHandle;
use tokio::time::sleep;
use tokio::{runtime, time::sleep};
use tracing::*;
use crate::metrics::BROKER_ITERATION_TIMELINES;
@@ -29,10 +29,23 @@ use crate::SafeKeeperConf;
const RETRY_INTERVAL_MSEC: u64 = 1000;
const PUSH_INTERVAL_MSEC: u64 = 1000;
pub fn thread_main(conf: SafeKeeperConf) {
let runtime = runtime::Builder::new_current_thread()
.enable_all()
.build()
.unwrap();
let _enter = info_span!("broker").entered();
info!("started, broker endpoint {:?}", conf.broker_endpoint);
runtime.block_on(async {
main_loop(conf).await;
});
}
/// Periodically push data about all active timelines to the broker.
async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> {
let mut client =
storage_broker::connect(conf.broker_endpoint.clone(), conf.broker_keepalive_interval)?;
let mut client = BrokerServiceClient::connect(conf.broker_endpoint.clone()).await?;
let push_interval = Duration::from_millis(PUSH_INTERVAL_MSEC);
let outbound = async_stream::stream! {
@@ -42,27 +55,20 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> {
// sensitive and there is no risk of deadlock as we don't await while
// lock is held.
let now = Instant::now();
let all_tlis = GlobalTimelines::get_all();
let mut n_pushed_tlis = 0;
for tli in &all_tlis {
// filtering alternative futures::stream::iter(all_tlis)
// .filter(|tli| {let tli = tli.clone(); async move { tli.is_active().await}}).collect::<Vec<_>>().await;
// doesn't look better, and I'm not sure how to do that without collect.
if !tli.is_active().await {
continue;
}
let sk_info = tli.get_safekeeper_info(&conf).await;
let mut active_tlis = GlobalTimelines::get_all();
active_tlis.retain(|tli| tli.is_active());
for tli in &active_tlis {
let sk_info = tli.get_safekeeper_info(&conf);
yield sk_info;
BROKER_PUSHED_UPDATES.inc();
n_pushed_tlis += 1;
}
let elapsed = now.elapsed();
BROKER_PUSH_ALL_UPDATES_SECONDS.observe(elapsed.as_secs_f64());
BROKER_ITERATION_TIMELINES.observe(n_pushed_tlis as f64);
BROKER_ITERATION_TIMELINES.observe(active_tlis.len() as f64);
if elapsed > push_interval / 2 {
info!("broker push is too long, pushed {} timeline updates to broker in {:?}", n_pushed_tlis, elapsed);
info!("broker push is too long, pushed {} timeline updates to broker in {:?}", active_tlis.len(), elapsed);
}
sleep(push_interval).await;
@@ -119,13 +125,10 @@ async fn pull_loop(conf: SafeKeeperConf) -> Result<()> {
bail!("end of stream");
}
pub async fn task_main(conf: SafeKeeperConf) -> anyhow::Result<()> {
info!("started, broker endpoint {:?}", conf.broker_endpoint);
async fn main_loop(conf: SafeKeeperConf) {
let mut ticker = tokio::time::interval(Duration::from_millis(RETRY_INTERVAL_MSEC));
let mut push_handle: Option<JoinHandle<Result<(), Error>>> = None;
let mut pull_handle: Option<JoinHandle<Result<(), Error>>> = None;
// Selecting on JoinHandles requires some squats; is there a better way to
// reap tasks individually?
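
The thread-based variant restored here drives async code from a plain OS thread by building a current-thread runtime and blocking on the loop. A minimal sketch of that bridge, assuming only tokio; poll_broker is a stand-in for main_loop.

use std::time::Duration;

async fn poll_broker() {
    // stand-in for the real retry/push/pull loop
    tokio::time::sleep(Duration::from_millis(10)).await;
}

fn thread_main() {
    // The runtime is owned by this thread; no global executor is involved.
    let runtime = tokio::runtime::Builder::new_current_thread()
        .enable_all()
        .build()
        .unwrap();
    runtime.block_on(poll_broker());
}

fn main() {
    std::thread::Builder::new()
        .name("broker thread".into())
        .spawn(thread_main)
        .unwrap()
        .join()
        .unwrap();
}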

View File

@@ -2,10 +2,9 @@
use anyhow::{bail, ensure, Context, Result};
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
use tokio::fs::{self, File};
use tokio::io::AsyncWriteExt;
use std::io::Read;
use std::fs::{self, File, OpenOptions};
use std::io::{Read, Write};
use std::ops::Deref;
use std::path::{Path, PathBuf};
use std::time::Instant;
@@ -27,10 +26,9 @@ pub const CHECKSUM_SIZE: usize = std::mem::size_of::<u32>();
/// Storage should keep actual state inside of it. It should implement Deref
/// trait to access state fields and have persist method for updating that state.
#[async_trait::async_trait]
pub trait Storage: Deref<Target = SafeKeeperState> {
/// Persist safekeeper state on disk and update internal state.
async fn persist(&mut self, s: &SafeKeeperState) -> Result<()>;
fn persist(&mut self, s: &SafeKeeperState) -> Result<()>;
/// Timestamp of last persist.
fn last_persist_at(&self) -> Instant;
@@ -84,7 +82,7 @@ impl FileStorage {
/// Check the magic/version in the on-disk data and deserialize it, if possible.
fn deser_sk_state(buf: &mut &[u8]) -> Result<SafeKeeperState> {
// Read the version independent part
let magic = ReadBytesExt::read_u32::<LittleEndian>(buf)?;
let magic = buf.read_u32::<LittleEndian>()?;
if magic != SK_MAGIC {
bail!(
"bad control file magic: {:X}, expected {:X}",
@@ -92,7 +90,7 @@ impl FileStorage {
SK_MAGIC
);
}
let version = ReadBytesExt::read_u32::<LittleEndian>(buf)?;
let version = buf.read_u32::<LittleEndian>()?;
if version == SK_FORMAT_VERSION {
let res = SafeKeeperState::des(buf)?;
return Ok(res);
@@ -112,7 +110,7 @@ impl FileStorage {
/// Read in the control file.
pub fn load_control_file<P: AsRef<Path>>(control_file_path: P) -> Result<SafeKeeperState> {
let mut control_file = std::fs::OpenOptions::new()
let mut control_file = OpenOptions::new()
.read(true)
.write(true)
.open(&control_file_path)
@@ -161,31 +159,30 @@ impl Deref for FileStorage {
}
}
#[async_trait::async_trait]
impl Storage for FileStorage {
/// persists state durably to underlying storage
/// for description see https://lwn.net/Articles/457667/
async fn persist(&mut self, s: &SafeKeeperState) -> Result<()> {
fn persist(&mut self, s: &SafeKeeperState) -> Result<()> {
let _timer = PERSIST_CONTROL_FILE_SECONDS.start_timer();
// write data to safekeeper.control.partial
let control_partial_path = self.timeline_dir.join(CONTROL_FILE_NAME_PARTIAL);
let mut control_partial = File::create(&control_partial_path).await.with_context(|| {
let mut control_partial = File::create(&control_partial_path).with_context(|| {
format!(
"failed to create partial control file at: {}",
&control_partial_path.display()
)
})?;
let mut buf: Vec<u8> = Vec::new();
WriteBytesExt::write_u32::<LittleEndian>(&mut buf, SK_MAGIC)?;
WriteBytesExt::write_u32::<LittleEndian>(&mut buf, SK_FORMAT_VERSION)?;
buf.write_u32::<LittleEndian>(SK_MAGIC)?;
buf.write_u32::<LittleEndian>(SK_FORMAT_VERSION)?;
s.ser_into(&mut buf)?;
// calculate checksum before resize
let checksum = crc32c::crc32c(&buf);
buf.extend_from_slice(&checksum.to_le_bytes());
control_partial.write_all(&buf).await.with_context(|| {
control_partial.write_all(&buf).with_context(|| {
format!(
"failed to write safekeeper state into control file at: {}",
control_partial_path.display()
@@ -194,7 +191,7 @@ impl Storage for FileStorage {
// fsync the file
if !self.conf.no_sync {
control_partial.sync_all().await.with_context(|| {
control_partial.sync_all().with_context(|| {
format!(
"failed to sync partial control file at {}",
control_partial_path.display()
@@ -205,22 +202,21 @@ impl Storage for FileStorage {
let control_path = self.timeline_dir.join(CONTROL_FILE_NAME);
// rename should be atomic
fs::rename(&control_partial_path, &control_path).await?;
fs::rename(&control_partial_path, &control_path)?;
// this sync is not required by any standard but postgres does this (see durable_rename)
if !self.conf.no_sync {
let new_f = File::open(&control_path).await?;
new_f.sync_all().await.with_context(|| {
format!(
"failed to sync control file at: {}",
&control_path.display()
)
})?;
File::open(&control_path)
.and_then(|f| f.sync_all())
.with_context(|| {
format!(
"failed to sync control file at: {}",
&control_path.display()
)
})?;
// fsync the directory (linux specific)
let tli_dir = File::open(&self.timeline_dir).await?;
tli_dir
.sync_all()
.await
File::open(&self.timeline_dir)
.and_then(|f| f.sync_all())
.context("failed to sync control file directory")?;
}
@@ -240,6 +236,7 @@ mod test {
use super::*;
use crate::{safekeeper::SafeKeeperState, SafeKeeperConf};
use anyhow::Result;
use std::fs;
use utils::{id::TenantTimelineId, lsn::Lsn};
fn stub_conf() -> SafeKeeperConf {
@@ -250,75 +247,59 @@ mod test {
}
}
async fn load_from_control_file(
fn load_from_control_file(
conf: &SafeKeeperConf,
ttid: &TenantTimelineId,
) -> Result<(FileStorage, SafeKeeperState)> {
fs::create_dir_all(conf.timeline_dir(ttid))
.await
.expect("failed to create timeline dir");
fs::create_dir_all(conf.timeline_dir(ttid)).expect("failed to create timeline dir");
Ok((
FileStorage::restore_new(ttid, conf)?,
FileStorage::load_control_file_conf(conf, ttid)?,
))
}
async fn create(
fn create(
conf: &SafeKeeperConf,
ttid: &TenantTimelineId,
) -> Result<(FileStorage, SafeKeeperState)> {
fs::create_dir_all(conf.timeline_dir(ttid))
.await
.expect("failed to create timeline dir");
fs::create_dir_all(conf.timeline_dir(ttid)).expect("failed to create timeline dir");
let state = SafeKeeperState::empty();
let storage = FileStorage::create_new(ttid, conf, state.clone())?;
Ok((storage, state))
}
#[tokio::test]
async fn test_read_write_safekeeper_state() {
#[test]
fn test_read_write_safekeeper_state() {
let conf = stub_conf();
let ttid = TenantTimelineId::generate();
{
let (mut storage, mut state) =
create(&conf, &ttid).await.expect("failed to create state");
let (mut storage, mut state) = create(&conf, &ttid).expect("failed to create state");
// change something
state.commit_lsn = Lsn(42);
storage
.persist(&state)
.await
.expect("failed to persist state");
storage.persist(&state).expect("failed to persist state");
}
let (_, state) = load_from_control_file(&conf, &ttid)
.await
.expect("failed to read state");
let (_, state) = load_from_control_file(&conf, &ttid).expect("failed to read state");
assert_eq!(state.commit_lsn, Lsn(42));
}
#[tokio::test]
async fn test_safekeeper_state_checksum_mismatch() {
#[test]
fn test_safekeeper_state_checksum_mismatch() {
let conf = stub_conf();
let ttid = TenantTimelineId::generate();
{
let (mut storage, mut state) =
create(&conf, &ttid).await.expect("failed to read state");
let (mut storage, mut state) = create(&conf, &ttid).expect("failed to read state");
// change something
state.commit_lsn = Lsn(42);
storage
.persist(&state)
.await
.expect("failed to persist state");
storage.persist(&state).expect("failed to persist state");
}
let control_path = conf.timeline_dir(&ttid).join(CONTROL_FILE_NAME);
let mut data = fs::read(&control_path).await.unwrap();
let mut data = fs::read(&control_path).unwrap();
data[0] += 1; // change the first byte of the file to fail checksum validation
fs::write(&control_path, &data)
.await
.expect("failed to write control file");
fs::write(&control_path, &data).expect("failed to write control file");
match load_from_control_file(&conf, &ttid).await {
match load_from_control_file(&conf, &ttid) {
Err(err) => assert!(err
.to_string()
.contains("safekeeper control file checksum mismatch")),

View File

@@ -121,7 +121,7 @@ pub struct FileInfo {
}
/// Build debug dump response, using the provided [`Args`] filters.
pub async fn build(args: Args) -> Result<Response> {
pub fn build(args: Args) -> Result<Response> {
let start_time = Utc::now();
let timelines_count = GlobalTimelines::timelines_count();
@@ -155,7 +155,7 @@ pub async fn build(args: Args) -> Result<Response> {
}
let control_file = if args.dump_control_file {
let mut state = tli.get_state().await.1;
let mut state = tli.get_state().1;
if !args.dump_term_history {
state.acceptor_state.term_history = TermHistory(vec![]);
}
@@ -165,7 +165,7 @@ pub async fn build(args: Args) -> Result<Response> {
};
let memory = if args.dump_memory {
Some(tli.memory_dump().await)
Some(tli.memory_dump())
} else {
None
};

View File

@@ -256,14 +256,14 @@ impl SafekeeperPostgresHandler {
let lsn = if self.is_walproposer_recovery() {
// walproposer should get all local WAL until flush_lsn
tli.get_flush_lsn().await
tli.get_flush_lsn()
} else {
// other clients shouldn't get any uncommitted WAL
tli.get_state().await.0.commit_lsn
tli.get_state().0.commit_lsn
}
.to_string();
let sysid = tli.get_state().await.1.server.system_id.to_string();
let sysid = tli.get_state().1.server.system_id.to_string();
let lsn_bytes = lsn.as_bytes();
let tli = PG_TLI.to_string();
let tli_bytes = tli.as_bytes();

View File

@@ -2,18 +2,3 @@ pub mod routes;
pub use routes::make_router;
pub use safekeeper_api::models;
use crate::SafeKeeperConf;
pub async fn task_main(
conf: SafeKeeperConf,
http_listener: std::net::TcpListener,
) -> anyhow::Result<()> {
let router = make_router(conf)
.build()
.map_err(|err| anyhow::anyhow!(err))?;
let service = utils::http::RouterService::new(router).unwrap();
let server = hyper::Server::from_tcp(http_listener)?;
server.serve(service).await?;
Ok(()) // unreachable
}
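
The removed task_main is the stock hyper 0.14 shape for adopting an already-bound std::net::TcpListener. A generic sketch with hyper's service_fn standing in for the project's RouterService; the handler body is illustrative.

use std::convert::Infallible;
use hyper::service::{make_service_fn, service_fn};
use hyper::{Body, Request, Response, Server};

async fn handler(_req: Request<Body>) -> Result<Response<Body>, Infallible> {
    Ok(Response::new(Body::from("ok")))
}

pub async fn serve(http_listener: std::net::TcpListener) -> anyhow::Result<()> {
    // tokio can only adopt a std listener that is in non-blocking mode.
    http_listener.set_nonblocking(true)?;
    let service = make_service_fn(|_conn| async { Ok::<_, Infallible>(service_fn(handler)) });
    Server::from_tcp(http_listener)?.serve(service).await?;
    Ok(()) // unreachable: serve() runs until an error occurs
}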

View File

@@ -13,7 +13,7 @@ use storage_broker::proto::SafekeeperTimelineInfo;
use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId;
use tokio::fs::File;
use tokio::io::AsyncReadExt;
use utils::http::endpoint::request_span;
use tokio::task::JoinError;
use crate::safekeeper::ServerInfo;
use crate::safekeeper::Term;
@@ -116,8 +116,8 @@ async fn timeline_status_handler(request: Request<Body>) -> Result<Response<Body
check_permission(&request, Some(ttid.tenant_id))?;
let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
let (inmem, state) = tli.get_state().await;
let flush_lsn = tli.get_flush_lsn().await;
let (inmem, state) = tli.get_state();
let flush_lsn = tli.get_flush_lsn();
let epoch = state.acceptor_state.get_epoch(flush_lsn);
let term_history = state
@@ -232,11 +232,13 @@ async fn timeline_delete_force_handler(
);
check_permission(&request, Some(ttid.tenant_id))?;
ensure_no_body(&mut request).await?;
// FIXME: `delete_force` can fail from both internal errors and bad requests. Add better
// error handling here when we're able to.
let resp = GlobalTimelines::delete_force(&ttid)
.await
.map_err(ApiError::InternalServerError)?;
let resp = tokio::task::spawn_blocking(move || {
// FIXME: `delete_force` can fail from both internal errors and bad requests. Add better
// error handling here when we're able to.
GlobalTimelines::delete_force(&ttid).map_err(ApiError::InternalServerError)
})
.await
.map_err(|e: JoinError| ApiError::InternalServerError(e.into()))??;
json_response(StatusCode::OK, resp)
}
@@ -248,11 +250,14 @@ async fn tenant_delete_force_handler(
let tenant_id = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?;
ensure_no_body(&mut request).await?;
// FIXME: `delete_force_all_for_tenant` can return an error for multiple different reasons;
// Using an `InternalServerError` should be fixed when the types support it
let delete_info = GlobalTimelines::delete_force_all_for_tenant(&tenant_id)
.await
.map_err(ApiError::InternalServerError)?;
let delete_info = tokio::task::spawn_blocking(move || {
// FIXME: `delete_force_all_for_tenant` can return an error for multiple different reasons;
// Using an `InternalServerError` should be fixed when the types support it
GlobalTimelines::delete_force_all_for_tenant(&tenant_id)
.map_err(ApiError::InternalServerError)
})
.await
.map_err(|e: JoinError| ApiError::InternalServerError(e.into()))??;
json_response(
StatusCode::OK,
delete_info
@@ -348,9 +353,11 @@ async fn dump_debug_handler(mut request: Request<Body>) -> Result<Response<Body>
timeline_id,
};
let resp = debug_dump::build(args)
.await
.map_err(ApiError::InternalServerError)?;
let resp = tokio::task::spawn_blocking(move || {
debug_dump::build(args).map_err(ApiError::InternalServerError)
})
.await
.map_err(|e: JoinError| ApiError::InternalServerError(e.into()))??;
// TODO: use streaming response
json_response(StatusCode::OK, resp)
@@ -379,32 +386,29 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError>
router
.data(Arc::new(conf))
.data(auth)
.get("/v1/status", |r| request_span(r, status_handler))
.get("/v1/status", status_handler)
// Will be used in the future instead of implicit timeline creation
.post("/v1/tenant/timeline", |r| {
request_span(r, timeline_create_handler)
})
.get("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
request_span(r, timeline_status_handler)
})
.delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
request_span(r, timeline_delete_force_handler)
})
.delete("/v1/tenant/:tenant_id", |r| {
request_span(r, tenant_delete_force_handler)
})
.post("/v1/pull_timeline", |r| {
request_span(r, timeline_pull_handler)
})
.post("/v1/tenant/timeline", timeline_create_handler)
.get(
"/v1/tenant/:tenant_id/timeline/:timeline_id",
timeline_status_handler,
)
.delete(
"/v1/tenant/:tenant_id/timeline/:timeline_id",
timeline_delete_force_handler,
)
.delete("/v1/tenant/:tenant_id", tenant_delete_force_handler)
.post("/v1/pull_timeline", timeline_pull_handler)
.get(
"/v1/tenant/:tenant_id/timeline/:timeline_id/file/:filename",
|r| request_span(r, timeline_files_handler),
timeline_files_handler,
)
// for tests
.post("/v1/record_safekeeper_info/:tenant_id/:timeline_id", |r| {
request_span(r, record_safekeeper_info)
})
.get("/v1/debug_dump", |r| request_span(r, dump_debug_handler))
.post(
"/v1/record_safekeeper_info/:tenant_id/:timeline_id",
record_safekeeper_info,
)
.get("/v1/debug_dump", dump_debug_handler)
}
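
The handlers rewritten in this file all bridge to the now-synchronous timeline calls the same way: move the call onto tokio's blocking pool with spawn_blocking, then unwrap the two error layers (join error, then the call's own error) with ??. A stripped-down sketch; expensive_sync_call is a made-up stand-in for the GlobalTimelines functions.

use tokio::task::JoinError;

fn expensive_sync_call() -> anyhow::Result<u64> {
    Ok(42) // imagine lock-holding, disk-touching work here
}

async fn handler() -> anyhow::Result<u64> {
    let value = tokio::task::spawn_blocking(move || {
        // Runs on the blocking thread pool, off the async worker threads.
        expensive_sync_call()
    })
    .await
    // Outer `?`: the blocking task panicked; inner `?`: the call itself failed.
    .map_err(|e: JoinError| anyhow::anyhow!(e))??;
    Ok(value)
}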
#[cfg(test)]

View File

@@ -73,12 +73,12 @@ pub async fn handle_json_ctrl<IO: AsyncRead + AsyncWrite + Unpin>(
// if send_proposer_elected is true, we need to update local history
if append_request.send_proposer_elected {
send_proposer_elected(&tli, append_request.term, append_request.epoch_start_lsn).await?;
send_proposer_elected(&tli, append_request.term, append_request.epoch_start_lsn)?;
}
let inserted_wal = append_logical_message(&tli, append_request).await?;
let inserted_wal = append_logical_message(&tli, append_request)?;
let response = AppendResult {
state: tli.get_state().await.1,
state: tli.get_state().1,
inserted_wal,
};
let response_data = serde_json::to_vec(&response)
@@ -114,9 +114,9 @@ async fn prepare_safekeeper(
.await
}
async fn send_proposer_elected(tli: &Arc<Timeline>, term: Term, lsn: Lsn) -> anyhow::Result<()> {
fn send_proposer_elected(tli: &Arc<Timeline>, term: Term, lsn: Lsn) -> anyhow::Result<()> {
// add new term to existing history
let history = tli.get_state().await.1.acceptor_state.term_history;
let history = tli.get_state().1.acceptor_state.term_history;
let history = history.up_to(lsn.checked_sub(1u64).unwrap());
let mut history_entries = history.0;
history_entries.push(TermSwitchEntry { term, lsn });
@@ -129,7 +129,7 @@ async fn send_proposer_elected(tli: &Arc<Timeline>, term: Term, lsn: Lsn) -> any
timeline_start_lsn: lsn,
});
tli.process_msg(&proposer_elected_request).await?;
tli.process_msg(&proposer_elected_request)?;
Ok(())
}
@@ -142,12 +142,12 @@ pub struct InsertedWAL {
/// Extend local WAL with new LogicalMessage record. To do that,
/// create AppendRequest with new WAL and pass it to safekeeper.
pub async fn append_logical_message(
pub fn append_logical_message(
tli: &Arc<Timeline>,
msg: &AppendLogicalMessage,
) -> anyhow::Result<InsertedWAL> {
let wal_data = encode_logical_message(&msg.lm_prefix, &msg.lm_message);
let sk_state = tli.get_state().await.1;
let sk_state = tli.get_state().1;
let begin_lsn = msg.begin_lsn;
let end_lsn = begin_lsn + wal_data.len() as u64;
@@ -171,7 +171,7 @@ pub async fn append_logical_message(
wal_data: Bytes::from(wal_data),
});
let response = tli.process_msg(&append_request).await?;
let response = tli.process_msg(&append_request)?;
let append_response = match response {
Some(AcceptorProposerMessage::AppendResponse(resp)) => resp,

View File

@@ -1,6 +1,4 @@
use once_cell::sync::Lazy;
use remote_storage::RemoteStorageConfig;
use tokio::runtime::Runtime;
use std::path::PathBuf;
use std::time::Duration;
@@ -38,6 +36,7 @@ pub mod defaults {
DEFAULT_PG_LISTEN_PORT,
};
pub const DEFAULT_WAL_BACKUP_RUNTIME_THREADS: usize = 8;
pub const DEFAULT_HEARTBEAT_TIMEOUT: &str = "5000ms";
pub const DEFAULT_MAX_OFFLOADER_LAG_BYTES: u64 = 128 * (1 << 20);
}
@@ -61,10 +60,10 @@ pub struct SafeKeeperConf {
pub heartbeat_timeout: Duration,
pub remote_storage: Option<RemoteStorageConfig>,
pub max_offloader_lag_bytes: u64,
pub backup_runtime_threads: Option<usize>,
pub backup_parallel_jobs: usize,
pub wal_backup_enabled: bool,
pub auth: Option<Arc<JwtAuth>>,
pub current_thread_runtime: bool,
}
impl SafeKeeperConf {
@@ -93,64 +92,12 @@ impl SafeKeeperConf {
.parse()
.expect("failed to parse default broker endpoint"),
broker_keepalive_interval: Duration::from_secs(5),
backup_runtime_threads: None,
wal_backup_enabled: true,
backup_parallel_jobs: 1,
auth: None,
heartbeat_timeout: Duration::new(5, 0),
max_offloader_lag_bytes: defaults::DEFAULT_MAX_OFFLOADER_LAG_BYTES,
current_thread_runtime: false,
}
}
}
// Tokio runtimes.
pub static WAL_SERVICE_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
tokio::runtime::Builder::new_multi_thread()
.thread_name("WAL service worker")
.enable_all()
.build()
.expect("Failed to create WAL service runtime")
});
pub static HTTP_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
tokio::runtime::Builder::new_multi_thread()
.thread_name("HTTP worker")
.enable_all()
.build()
.expect("Failed to create WAL service runtime")
});
pub static BROKER_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
tokio::runtime::Builder::new_multi_thread()
.thread_name("broker worker")
.worker_threads(2) // there are only 2 tasks, having more threads doesn't make sense
.enable_all()
.build()
.expect("Failed to create broker runtime")
});
pub static WAL_REMOVER_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
tokio::runtime::Builder::new_multi_thread()
.thread_name("WAL remover")
.worker_threads(1)
.enable_all()
.build()
.expect("Failed to create broker runtime")
});
pub static WAL_BACKUP_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
tokio::runtime::Builder::new_multi_thread()
.thread_name("WAL backup worker")
.enable_all()
.build()
.expect("Failed to create WAL backup runtime")
});
pub static METRICS_SHIFTER_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
tokio::runtime::Builder::new_multi_thread()
.thread_name("metric shifter")
.worker_threads(1)
.enable_all()
.build()
.expect("Failed to create broker runtime")
});
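
The deleted statics follow a common shape: one lazily constructed, named, multi-threaded runtime per subsystem, spawned onto via its Handle from non-async code. A minimal sketch of one such static; BACKGROUND_RUNTIME is a placeholder name.

use once_cell::sync::Lazy;
use tokio::runtime::Runtime;

pub static BACKGROUND_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
    tokio::runtime::Builder::new_multi_thread()
        .thread_name("background worker")
        .worker_threads(2) // cap the pool when the subsystem has few tasks
        .enable_all()
        .build()
        .expect("Failed to create background runtime")
});

fn main() {
    // Spawn onto the dedicated runtime from plain sync code...
    let task = BACKGROUND_RUNTIME.handle().spawn(async { 1 + 1 });
    // ...and block on the same runtime to collect the result.
    let result = BACKGROUND_RUNTIME.block_on(task).unwrap();
    assert_eq!(result, 2);
}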

View File

@@ -7,7 +7,6 @@ use std::{
use ::metrics::{register_histogram, GaugeVec, Histogram, IntGauge, DISK_WRITE_SECONDS_BUCKETS};
use anyhow::Result;
use futures::Future;
use metrics::{
core::{AtomicU64, Collector, Desc, GenericCounter, GenericGaugeVec, Opts},
proto::MetricFamily,
@@ -293,17 +292,14 @@ impl WalStorageMetrics {
}
}
/// Accepts an async function that returns an empty anyhow result, and returns the duration of its execution.
pub async fn time_io_closure<E: Into<anyhow::Error>>(
closure: impl Future<Output = Result<(), E>>,
) -> Result<f64> {
/// Accepts a closure that returns a result, and returns the duration of the closure.
pub fn time_io_closure(closure: impl FnOnce() -> Result<()>) -> Result<f64> {
let start = std::time::Instant::now();
closure.await.map_err(|e| e.into())?;
closure()?;
Ok(start.elapsed().as_secs_f64())
}
/// Metrics for a single timeline.
#[derive(Clone)]
pub struct FullTimelineInfo {
pub ttid: TenantTimelineId,
pub ps_feedback: PageserverFeedback,
@@ -579,19 +575,13 @@ impl Collector for TimelineCollector {
let timelines = GlobalTimelines::get_all();
let timelines_count = timelines.len();
// Prometheus Collector is sync, and data is stored under async lock. To
// bridge the gap with a crutch, collect data in a spawned thread with a
// local tokio runtime.
let infos = std::thread::spawn(|| {
let rt = tokio::runtime::Builder::new_current_thread()
.build()
.expect("failed to create rt");
rt.block_on(collect_timeline_metrics())
})
.join()
.expect("collect_timeline_metrics thread panicked");
for arc_tli in timelines {
let tli = arc_tli.info_for_metrics();
if tli.is_none() {
continue;
}
let tli = tli.unwrap();
for tli in &infos {
let tenant_id = tli.ttid.tenant_id.to_string();
let timeline_id = tli.ttid.timeline_id.to_string();
let labels = &[tenant_id.as_str(), timeline_id.as_str()];
@@ -692,15 +682,3 @@ impl Collector for TimelineCollector {
mfs
}
}
async fn collect_timeline_metrics() -> Vec<FullTimelineInfo> {
let mut res = vec![];
let timelines = GlobalTimelines::get_all();
for tli in timelines {
if let Some(info) = tli.info_for_metrics().await {
res.push(info);
}
}
res
}
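
Collector::collect is synchronous, so the removed bridge hops onto a fresh thread, builds a throwaway current-thread runtime there, and blocks on the async gathering; a nested block_on on the caller's own runtime would panic. A reduced sketch, with gather_async standing in for collect_timeline_metrics.

async fn gather_async() -> Vec<u64> {
    // imagine awaiting per-timeline async locks here
    vec![1, 2, 3]
}

fn collect_sync() -> Vec<u64> {
    // Run the bridge on a dedicated thread with its own runtime so this
    // function stays callable from inside another runtime's worker thread.
    std::thread::spawn(|| {
        let rt = tokio::runtime::Builder::new_current_thread()
            .build()
            .expect("failed to create rt");
        rt.block_on(gather_async())
    })
    .join()
    .expect("metrics bridge thread panicked")
}

fn main() {
    assert_eq!(collect_sync(), vec![1, 2, 3]);
}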

View File

@@ -231,7 +231,7 @@ async fn pull_timeline(status: TimelineStatus, host: String) -> Result<Response>
info!(
"Loaded timeline {}, flush_lsn={}",
ttid,
tli.get_flush_lsn().await
tli.get_flush_lsn()
);
Ok(Response {

View File

@@ -18,14 +18,15 @@ use postgres_backend::QueryError;
use pq_proto::BeMessage;
use std::net::SocketAddr;
use std::sync::Arc;
use std::thread;
use std::thread::JoinHandle;
use tokio::io::AsyncRead;
use tokio::io::AsyncWrite;
use tokio::sync::mpsc::channel;
use tokio::sync::mpsc::error::TryRecvError;
use tokio::sync::mpsc::Receiver;
use tokio::sync::mpsc::Sender;
use tokio::task;
use tokio::task::JoinHandle;
use tokio::task::spawn_blocking;
use tokio::time::Duration;
use tokio::time::Instant;
use tracing::*;
@@ -96,7 +97,7 @@ impl SafekeeperPostgresHandler {
Err(res.expect_err("no error though WalAcceptor was not spawned"))
}
Some(handle) => {
let wal_acceptor_res = handle.await;
let wal_acceptor_res = handle.join();
// If there was any network error, return it.
res?;
@@ -106,7 +107,7 @@ impl SafekeeperPostgresHandler {
Ok(Ok(_)) => Ok(()), // can't happen currently; would be if we add graceful termination
Ok(Err(e)) => Err(CopyStreamHandlerEnd::Other(e.context("WAL acceptor"))),
Err(_) => Err(CopyStreamHandlerEnd::Other(anyhow!(
"WalAcceptor task panicked",
"WalAcceptor thread panicked",
))),
}
}
@@ -153,12 +154,10 @@ impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> {
}
};
*self.acceptor_handle = Some(WalAcceptor::spawn(
tli.clone(),
msg_rx,
reply_tx,
self.conn_id,
));
*self.acceptor_handle = Some(
WalAcceptor::spawn(tli.clone(), msg_rx, reply_tx, self.conn_id)
.context("spawn WalAcceptor thread")?,
);
// Forward all messages to WalAcceptor
read_network_loop(self.pgb_reader, msg_tx, next_msg).await
@@ -227,19 +226,28 @@ impl WalAcceptor {
msg_rx: Receiver<ProposerAcceptorMessage>,
reply_tx: Sender<AcceptorProposerMessage>,
conn_id: ConnectionId,
) -> JoinHandle<anyhow::Result<()>> {
task::spawn(async move {
let mut wa = WalAcceptor {
tli,
msg_rx,
reply_tx,
};
) -> anyhow::Result<JoinHandle<anyhow::Result<()>>> {
let thread_name = format!("WAL acceptor {}", tli.ttid);
thread::Builder::new()
.name(thread_name)
.spawn(move || -> anyhow::Result<()> {
let mut wa = WalAcceptor {
tli,
msg_rx,
reply_tx,
};
let span_ttid = wa.tli.ttid; // satisfy borrow checker
wa.run()
.instrument(info_span!("WAL acceptor", cid = %conn_id, ttid = %span_ttid))
.await
})
let runtime = tokio::runtime::Builder::new_current_thread()
.enable_all()
.build()?;
let span_ttid = wa.tli.ttid; // satisfy borrow checker
runtime.block_on(
wa.run()
.instrument(info_span!("WAL acceptor", cid = %conn_id, ttid = %span_ttid)),
)
})
.map_err(anyhow::Error::from)
}
/// The main loop. Returns Ok(()) if either msg_rx or reply_tx got closed;
@@ -273,7 +281,7 @@ impl WalAcceptor {
while let ProposerAcceptorMessage::AppendRequest(append_request) = next_msg {
let noflush_msg = ProposerAcceptorMessage::NoFlushAppendRequest(append_request);
if let Some(reply) = self.tli.process_msg(&noflush_msg).await? {
if let Some(reply) = self.tli.process_msg(&noflush_msg)? {
if self.reply_tx.send(reply).await.is_err() {
return Ok(()); // chan closed, streaming terminated
}
@@ -292,12 +300,10 @@ impl WalAcceptor {
}
// flush all written WAL to the disk
self.tli
.process_msg(&ProposerAcceptorMessage::FlushWAL)
.await?
self.tli.process_msg(&ProposerAcceptorMessage::FlushWAL)?
} else {
// process message other than AppendRequest
self.tli.process_msg(&next_msg).await?
self.tli.process_msg(&next_msg)?
};
if let Some(reply) = reply_msg {
@@ -320,8 +326,8 @@ impl Drop for ComputeConnectionGuard {
let tli = self.timeline.clone();
// tokio forbids calling blocking_send inside the runtime; see the
// comments in on_compute_disconnect for why we call blocking_send.
tokio::spawn(async move {
if let Err(e) = tli.on_compute_disconnect().await {
spawn_blocking(move || {
if let Err(e) = tli.on_compute_disconnect() {
error!("failed to unregister compute connection: {}", e);
}
});
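
The guard at the end of this hunk notifies the timeline from Drop, where nothing can be awaited; the workaround is spawn_blocking plus blocking_send. A compact sketch of that Drop-time notification; it assumes drop runs inside a multi-threaded tokio runtime, and DisconnectGuard is a made-up name.

use tokio::sync::mpsc::{channel, Sender};

struct DisconnectGuard {
    notify_tx: Sender<&'static str>,
}

impl Drop for DisconnectGuard {
    fn drop(&mut self) {
        let tx = self.notify_tx.clone();
        // Drop is sync, and blocking_send would panic on an async worker
        // thread, so hop onto the blocking pool first.
        tokio::task::spawn_blocking(move || {
            if tx.blocking_send("compute disconnected").is_err() {
                eprintln!("receiver gone, notification dropped");
            }
        });
    }
}

#[tokio::main]
async fn main() {
    let (tx, mut rx) = channel(8);
    drop(DisconnectGuard { notify_tx: tx });
    assert_eq!(rx.recv().await, Some("compute disconnected"));
}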

View File

@@ -1,36 +1,29 @@
//! Thread removing old WAL.
use std::time::Duration;
use std::{thread, time::Duration};
use tokio::time::sleep;
use tracing::*;
use crate::{GlobalTimelines, SafeKeeperConf};
pub async fn task_main(conf: SafeKeeperConf) -> anyhow::Result<()> {
pub fn thread_main(conf: SafeKeeperConf) {
let wal_removal_interval = Duration::from_millis(5000);
loop {
let tlis = GlobalTimelines::get_all();
for tli in &tlis {
if !tli.is_active().await {
if !tli.is_active() {
continue;
}
let ttid = tli.ttid;
if let Err(e) = tli
.maybe_persist_control_file()
.instrument(info_span!("", tenant = %ttid.tenant_id, timeline = %ttid.timeline_id))
.await
{
let _enter =
info_span!("", tenant = %ttid.tenant_id, timeline = %ttid.timeline_id).entered();
if let Err(e) = tli.maybe_persist_control_file() {
warn!("failed to persist control file: {e}");
}
if let Err(e) = tli
.remove_old_wal(conf.wal_backup_enabled)
.instrument(info_span!("", tenant = %ttid.tenant_id, timeline = %ttid.timeline_id))
.await
{
error!("failed to remove WAL: {}", e);
if let Err(e) = tli.remove_old_wal(conf.wal_backup_enabled) {
warn!("failed to remove WAL: {}", e);
}
}
sleep(wal_removal_interval).await;
thread::sleep(wal_removal_interval)
}
}

View File

@@ -568,27 +568,25 @@ where
/// Process message from proposer and possibly form reply. Concurrent
/// callers must exclude each other.
pub async fn process_msg(
pub fn process_msg(
&mut self,
msg: &ProposerAcceptorMessage,
) -> Result<Option<AcceptorProposerMessage>> {
match msg {
ProposerAcceptorMessage::Greeting(msg) => self.handle_greeting(msg).await,
ProposerAcceptorMessage::VoteRequest(msg) => self.handle_vote_request(msg).await,
ProposerAcceptorMessage::Elected(msg) => self.handle_elected(msg).await,
ProposerAcceptorMessage::AppendRequest(msg) => {
self.handle_append_request(msg, true).await
}
ProposerAcceptorMessage::Greeting(msg) => self.handle_greeting(msg),
ProposerAcceptorMessage::VoteRequest(msg) => self.handle_vote_request(msg),
ProposerAcceptorMessage::Elected(msg) => self.handle_elected(msg),
ProposerAcceptorMessage::AppendRequest(msg) => self.handle_append_request(msg, true),
ProposerAcceptorMessage::NoFlushAppendRequest(msg) => {
self.handle_append_request(msg, false).await
self.handle_append_request(msg, false)
}
ProposerAcceptorMessage::FlushWAL => self.handle_flush().await,
ProposerAcceptorMessage::FlushWAL => self.handle_flush(),
}
}
/// Handle initial message from proposer: check its sanity and send my
/// current term.
async fn handle_greeting(
fn handle_greeting(
&mut self,
msg: &ProposerGreeting,
) -> Result<Option<AcceptorProposerMessage>> {
@@ -651,7 +649,7 @@ where
if msg.pg_version != UNKNOWN_SERVER_VERSION {
state.server.pg_version = msg.pg_version;
}
self.state.persist(&state).await?;
self.state.persist(&state)?;
}
info!(
@@ -666,7 +664,7 @@ where
}
/// Give vote for the given term, if we haven't done that previously.
async fn handle_vote_request(
fn handle_vote_request(
&mut self,
msg: &VoteRequest,
) -> Result<Option<AcceptorProposerMessage>> {
@@ -680,7 +678,7 @@ where
// handle_elected instead. Currently not a big deal, as proposer is the
// only source of WAL; with peer2peer recovery it would be more
// important.
self.wal_store.flush_wal().await?;
self.wal_store.flush_wal()?;
// initialize with refusal
let mut resp = VoteResponse {
term: self.state.acceptor_state.term,
@@ -694,7 +692,7 @@ where
let mut state = self.state.clone();
state.acceptor_state.term = msg.term;
// persist vote before sending it out
self.state.persist(&state).await?;
self.state.persist(&state)?;
resp.term = self.state.acceptor_state.term;
resp.vote_given = true as u64;
@@ -717,15 +715,12 @@ where
ar
}
async fn handle_elected(
&mut self,
msg: &ProposerElected,
) -> Result<Option<AcceptorProposerMessage>> {
fn handle_elected(&mut self, msg: &ProposerElected) -> Result<Option<AcceptorProposerMessage>> {
info!("received ProposerElected {:?}", msg);
if self.state.acceptor_state.term < msg.term {
let mut state = self.state.clone();
state.acceptor_state.term = msg.term;
self.state.persist(&state).await?;
self.state.persist(&state)?;
}
// If our term is higher, ignore the message (next feedback will inform the compute)
@@ -755,7 +750,7 @@ where
// intersection of our history and history from msg
// truncate wal, update the LSNs
self.wal_store.truncate_wal(msg.start_streaming_at).await?;
self.wal_store.truncate_wal(msg.start_streaming_at)?;
// and now adopt term history from proposer
{
@@ -789,7 +784,7 @@ where
self.inmem.backup_lsn = max(self.inmem.backup_lsn, state.timeline_start_lsn);
state.acceptor_state.term_history = msg.term_history.clone();
self.persist_control_file(state).await?;
self.persist_control_file(state)?;
}
info!("start receiving WAL since {:?}", msg.start_streaming_at);
@@ -801,7 +796,7 @@ where
///
/// Note: it is assumed that the 'WAL we have is from the right term' check has
/// already been done outside.
async fn update_commit_lsn(&mut self, mut candidate: Lsn) -> Result<()> {
fn update_commit_lsn(&mut self, mut candidate: Lsn) -> Result<()> {
// Both peers and walproposer communicate this value, we might already
// have a fresher (higher) version.
candidate = max(candidate, self.inmem.commit_lsn);
@@ -823,32 +818,29 @@ where
// that we receive new epoch_start_lsn, and we still need to sync
// control file in this case.
if commit_lsn == self.epoch_start_lsn && self.state.commit_lsn != commit_lsn {
self.persist_control_file(self.state.clone()).await?;
self.persist_control_file(self.state.clone())?;
}
Ok(())
}
/// Persist control file to disk, called only after timeline creation (bootstrap).
pub async fn persist(&mut self) -> Result<()> {
self.persist_control_file(self.state.clone()).await
pub fn persist(&mut self) -> Result<()> {
self.persist_control_file(self.state.clone())
}
/// Persist in-memory state to the disk, taking other data from state.
async fn persist_control_file(&mut self, mut state: SafeKeeperState) -> Result<()> {
fn persist_control_file(&mut self, mut state: SafeKeeperState) -> Result<()> {
state.commit_lsn = self.inmem.commit_lsn;
state.backup_lsn = self.inmem.backup_lsn;
state.peer_horizon_lsn = self.inmem.peer_horizon_lsn;
state.proposer_uuid = self.inmem.proposer_uuid;
self.state.persist(&state).await
self.state.persist(&state)
}
/// Persist control file if there is something to save and enough time
/// passed after the last save.
pub async fn maybe_persist_control_file(
&mut self,
inmem_remote_consistent_lsn: Lsn,
) -> Result<()> {
pub fn maybe_persist_control_file(&mut self, inmem_remote_consistent_lsn: Lsn) -> Result<()> {
const CF_SAVE_INTERVAL: Duration = Duration::from_secs(300);
if self.state.last_persist_at().elapsed() < CF_SAVE_INTERVAL {
return Ok(());
@@ -860,7 +852,7 @@ where
if need_persist {
let mut state = self.state.clone();
state.remote_consistent_lsn = inmem_remote_consistent_lsn;
self.persist_control_file(state).await?;
self.persist_control_file(state)?;
trace!("saved control file: {CF_SAVE_INTERVAL:?} passed");
}
Ok(())
@@ -868,7 +860,7 @@ where
/// Handle request to append WAL.
#[allow(clippy::comparison_chain)]
async fn handle_append_request(
fn handle_append_request(
&mut self,
msg: &AppendRequest,
require_flush: bool,
@@ -891,19 +883,17 @@ where
// do the job
if !msg.wal_data.is_empty() {
self.wal_store
.write_wal(msg.h.begin_lsn, &msg.wal_data)
.await?;
self.wal_store.write_wal(msg.h.begin_lsn, &msg.wal_data)?;
}
// flush wal to the disk, if required
if require_flush {
self.wal_store.flush_wal().await?;
self.wal_store.flush_wal()?;
}
// Update commit_lsn.
if msg.h.commit_lsn != Lsn(0) {
self.update_commit_lsn(msg.h.commit_lsn).await?;
self.update_commit_lsn(msg.h.commit_lsn)?;
}
// Value calculated by walproposer can always lag:
// - safekeepers can forget inmem value and send to proposer lower
@@ -919,7 +909,7 @@ where
if self.state.peer_horizon_lsn + (self.state.server.wal_seg_size as u64)
< self.inmem.peer_horizon_lsn
{
self.persist_control_file(self.state.clone()).await?;
self.persist_control_file(self.state.clone())?;
}
trace!(
@@ -941,15 +931,15 @@ where
}
/// Flush WAL to disk. Return AppendResponse with latest LSNs.
async fn handle_flush(&mut self) -> Result<Option<AcceptorProposerMessage>> {
self.wal_store.flush_wal().await?;
fn handle_flush(&mut self) -> Result<Option<AcceptorProposerMessage>> {
self.wal_store.flush_wal()?;
Ok(Some(AcceptorProposerMessage::AppendResponse(
self.append_response(),
)))
}
/// Update timeline state with peer safekeeper data.
pub async fn record_safekeeper_info(&mut self, sk_info: &SafekeeperTimelineInfo) -> Result<()> {
pub fn record_safekeeper_info(&mut self, sk_info: &SafekeeperTimelineInfo) -> Result<()> {
let mut sync_control_file = false;
if (Lsn(sk_info.commit_lsn) != Lsn::INVALID) && (sk_info.last_log_term != INVALID_TERM) {
@@ -957,7 +947,7 @@ where
// commit_lsn if our history matches (is part of) history of advanced
// commit_lsn provider.
if sk_info.last_log_term == self.get_epoch() {
self.update_commit_lsn(Lsn(sk_info.commit_lsn)).await?;
self.update_commit_lsn(Lsn(sk_info.commit_lsn))?;
}
}
@@ -983,7 +973,7 @@ where
// Note: we could make remote_consistent_lsn update in cf common by
// storing Arc to walsenders in Safekeeper.
state.remote_consistent_lsn = new_remote_consistent_lsn;
self.persist_control_file(state).await?;
self.persist_control_file(state)?;
}
Ok(())
}
@@ -1007,7 +997,6 @@ where
#[cfg(test)]
mod tests {
use futures::future::BoxFuture;
use postgres_ffi::WAL_SEGMENT_SIZE;
use super::*;
@@ -1019,9 +1008,8 @@ mod tests {
persisted_state: SafeKeeperState,
}
#[async_trait::async_trait]
impl control_file::Storage for InMemoryState {
async fn persist(&mut self, s: &SafeKeeperState) -> Result<()> {
fn persist(&mut self, s: &SafeKeeperState) -> Result<()> {
self.persisted_state = s.clone();
Ok(())
}
@@ -1051,28 +1039,27 @@ mod tests {
lsn: Lsn,
}
#[async_trait::async_trait]
impl wal_storage::Storage for DummyWalStore {
fn flush_lsn(&self) -> Lsn {
self.lsn
}
async fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()> {
fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()> {
self.lsn = startpos + buf.len() as u64;
Ok(())
}
async fn truncate_wal(&mut self, end_pos: Lsn) -> Result<()> {
fn truncate_wal(&mut self, end_pos: Lsn) -> Result<()> {
self.lsn = end_pos;
Ok(())
}
async fn flush_wal(&mut self) -> Result<()> {
fn flush_wal(&mut self) -> Result<()> {
Ok(())
}
fn remove_up_to(&self, _segno_up_to: XLogSegNo) -> BoxFuture<'static, anyhow::Result<()>> {
Box::pin(async { Ok(()) })
fn remove_up_to(&self) -> Box<dyn Fn(XLogSegNo) -> Result<()>> {
Box::new(move |_segno_up_to: XLogSegNo| Ok(()))
}
fn get_metrics(&self) -> crate::metrics::WalStorageMetrics {
@@ -1080,8 +1067,8 @@ mod tests {
}
}
#[tokio::test]
async fn test_voting() {
#[test]
fn test_voting() {
let storage = InMemoryState {
persisted_state: test_sk_state(),
};
@@ -1090,7 +1077,7 @@ mod tests {
// check voting for 1 is ok
let vote_request = ProposerAcceptorMessage::VoteRequest(VoteRequest { term: 1 });
let mut vote_resp = sk.process_msg(&vote_request).await;
let mut vote_resp = sk.process_msg(&vote_request);
match vote_resp.unwrap() {
Some(AcceptorProposerMessage::VoteResponse(resp)) => assert!(resp.vote_given != 0),
r => panic!("unexpected response: {:?}", r),
@@ -1105,15 +1092,15 @@ mod tests {
sk = SafeKeeper::new(storage, sk.wal_store, NodeId(0)).unwrap();
// and ensure voting second time for 1 is not ok
vote_resp = sk.process_msg(&vote_request).await;
vote_resp = sk.process_msg(&vote_request);
match vote_resp.unwrap() {
Some(AcceptorProposerMessage::VoteResponse(resp)) => assert!(resp.vote_given == 0),
r => panic!("unexpected response: {:?}", r),
}
}
#[tokio::test]
async fn test_epoch_switch() {
#[test]
fn test_epoch_switch() {
let storage = InMemoryState {
persisted_state: test_sk_state(),
};
@@ -1145,13 +1132,10 @@ mod tests {
timeline_start_lsn: Lsn(0),
};
sk.process_msg(&ProposerAcceptorMessage::Elected(pem))
.await
.unwrap();
// check that AppendRequest before epochStartLsn doesn't switch epoch
let resp = sk
.process_msg(&ProposerAcceptorMessage::AppendRequest(append_request))
.await;
let resp = sk.process_msg(&ProposerAcceptorMessage::AppendRequest(append_request));
assert!(resp.is_ok());
assert_eq!(sk.get_epoch(), 0);
@@ -1162,11 +1146,9 @@ mod tests {
h: ar_hdr,
wal_data: Bytes::from_static(b"b"),
};
let resp = sk
.process_msg(&ProposerAcceptorMessage::AppendRequest(append_request))
.await;
let resp = sk.process_msg(&ProposerAcceptorMessage::AppendRequest(append_request));
assert!(resp.is_ok());
sk.wal_store.truncate_wal(Lsn(3)).await.unwrap(); // imitate the complete record at 3 %)
sk.wal_store.truncate_wal(Lsn(3)).unwrap(); // imitate the complete record at 3 %)
assert_eq!(sk.get_epoch(), 1);
}
}
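
maybe_persist_control_file encodes a small write-throttling rule: skip the disk write unless a fixed interval has elapsed since the last persist and something actually advanced. A sketch of the rule in isolation, with illustrative type and field names.

use std::time::{Duration, Instant};

const CF_SAVE_INTERVAL: Duration = Duration::from_secs(300);

struct ThrottledPersister {
    last_persist_at: Instant,
    persisted_value: u64,
}

impl ThrottledPersister {
    /// Persist only if enough time has passed *and* the value advanced.
    fn maybe_persist(&mut self, in_memory_value: u64) -> bool {
        if self.last_persist_at.elapsed() < CF_SAVE_INTERVAL {
            return false; // too soon; keep the change in memory
        }
        if in_memory_value <= self.persisted_value {
            return false; // nothing new to save
        }
        // The real code serializes the state and durably renames it here.
        self.persisted_value = in_memory_value;
        self.last_persist_at = Instant::now();
        true
    }
}

fn main() {
    let mut p = ThrottledPersister {
        last_persist_at: Instant::now() - CF_SAVE_INTERVAL, // pretend 5 min passed
        persisted_value: 0,
    };
    assert!(p.maybe_persist(42));
    assert!(!p.maybe_persist(43)); // throttled: the interval restarted just now
}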

View File

@@ -396,7 +396,7 @@ impl SafekeeperPostgresHandler {
// on this safekeeper itself. That's ok as (old) proposer will never be
// able to commit such WAL.
let stop_pos: Option<Lsn> = if self.is_walproposer_recovery() {
let wal_end = tli.get_flush_lsn().await;
let wal_end = tli.get_flush_lsn();
Some(wal_end)
} else {
None
@@ -418,7 +418,7 @@ impl SafekeeperPostgresHandler {
// switch to copy
pgb.write_message(&BeMessage::CopyBothResponse).await?;
let (_, persisted_state) = tli.get_state().await;
let (_, persisted_state) = tli.get_state();
let wal_reader = WalReader::new(
self.conf.workdir.clone(),
self.conf.timeline_dir(&tli.ttid),
@@ -562,7 +562,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> WalSender<'_, IO> {
.walsenders
.get_ws_remote_consistent_lsn(self.ws_guard.id)
{
if self.tli.should_walsender_stop(remote_consistent_lsn).await {
if self.tli.should_walsender_stop(remote_consistent_lsn) {
// Terminate if there is nothing more to send.
return Err(CopyStreamHandlerEnd::ServerInitiated(format!(
"ending streaming to {:?} at {}, receiver is caughtup and there is no computes",

View File

@@ -2,13 +2,12 @@
//! to glue together SafeKeeper and all other background services.
use anyhow::{anyhow, bail, Result};
use parking_lot::{Mutex, MutexGuard};
use postgres_ffi::XLogSegNo;
use tokio::fs;
use std::cmp::max;
use std::path::PathBuf;
use std::sync::Arc;
use tokio::sync::{Mutex, MutexGuard};
use tokio::{
sync::{mpsc::Sender, watch},
time::Instant,
@@ -287,9 +286,8 @@ pub struct Timeline {
commit_lsn_watch_tx: watch::Sender<Lsn>,
commit_lsn_watch_rx: watch::Receiver<Lsn>,
/// Safekeeper and other state that should remain consistent and
/// synchronized with the disk. This is a tokio mutex, as we write WAL to disk
/// while holding it, ensuring that consensus checks are in order.
/// Safekeeper and other state that should remain consistent and synchronized
/// with the disk.
mutex: Mutex<SharedState>,
walsenders: Arc<WalSenders>,
@@ -363,8 +361,8 @@ impl Timeline {
///
/// Bootstrap is transactional, so if it fails, created files will be deleted,
/// and state on disk should remain unchanged.
pub async fn bootstrap(&self, shared_state: &mut MutexGuard<'_, SharedState>) -> Result<()> {
match fs::metadata(&self.timeline_dir).await {
pub fn bootstrap(&self, shared_state: &mut MutexGuard<SharedState>) -> Result<()> {
match std::fs::metadata(&self.timeline_dir) {
Ok(_) => {
// Timeline directory exists on disk, we should leave state unchanged
// and return error.
@@ -377,51 +375,53 @@ impl Timeline {
}
// Create timeline directory.
fs::create_dir_all(&self.timeline_dir).await?;
std::fs::create_dir_all(&self.timeline_dir)?;
// Write timeline to disk and TODO: start background tasks.
if let Err(e) = shared_state.sk.persist().await {
// Bootstrap failed, cancel timeline and remove timeline directory.
self.cancel(shared_state);
match || -> Result<()> {
shared_state.sk.persist()?;
// TODO: add more initialization steps here
self.update_status(shared_state);
Ok(())
}() {
Ok(_) => Ok(()),
Err(e) => {
// Bootstrap failed, cancel timeline and remove timeline directory.
self.cancel(shared_state);
if let Err(fs_err) = fs::remove_dir_all(&self.timeline_dir).await {
warn!(
"failed to remove timeline {} directory after bootstrap failure: {}",
self.ttid, fs_err
);
if let Err(fs_err) = std::fs::remove_dir_all(&self.timeline_dir) {
warn!(
"failed to remove timeline {} directory after bootstrap failure: {}",
self.ttid, fs_err
);
}
Err(e)
}
return Err(e);
}
// TODO: add more initialization steps here
self.update_status(shared_state);
Ok(())
}
/// Delete timeline from disk completely, by removing timeline directory. Background
/// timeline activities will stop eventually.
pub async fn delete_from_disk(
pub fn delete_from_disk(
&self,
shared_state: &mut MutexGuard<'_, SharedState>,
shared_state: &mut MutexGuard<SharedState>,
) -> Result<(bool, bool)> {
let was_active = shared_state.active;
self.cancel(shared_state);
let dir_existed = delete_dir(&self.timeline_dir).await?;
let dir_existed = delete_dir(&self.timeline_dir)?;
Ok((dir_existed, was_active))
}
/// Cancel the timeline to prevent further usage. Background tasks will stop
/// eventually after receiving the cancellation signal.
///
/// Note that we can't notify the backup launcher here while holding the
/// shared_state lock, as this is a potential deadlock: the caller is
/// responsible for that. Generally we should probably make WAL backup tasks
/// shut down on their own, checking once in a while whether it is time.
fn cancel(&self, shared_state: &mut MutexGuard<'_, SharedState>) {
fn cancel(&self, shared_state: &mut MutexGuard<SharedState>) {
info!("timeline {} is cancelled", self.ttid);
let _ = self.cancellation_tx.send(true);
let res = self.wal_backup_launcher_tx.blocking_send(self.ttid);
if let Err(e) = res {
error!("Failed to send stop signal to wal_backup_launcher: {}", e);
}
// Close associated FDs. Nobody will be able to touch timeline data once
// it is cancelled, so WAL storage won't be opened again.
shared_state.sk.wal_store.close();
@@ -433,8 +433,8 @@ impl Timeline {
}
/// Take a writing mutually exclusive lock on timeline shared_state.
pub async fn write_shared_state(&self) -> MutexGuard<SharedState> {
self.mutex.lock().await
pub fn write_shared_state(&self) -> MutexGuard<SharedState> {
self.mutex.lock()
}
fn update_status(&self, shared_state: &mut SharedState) -> bool {
@@ -450,7 +450,7 @@ impl Timeline {
let is_wal_backup_action_pending: bool;
{
let mut shared_state = self.write_shared_state().await;
let mut shared_state = self.write_shared_state();
shared_state.num_computes += 1;
is_wal_backup_action_pending = self.update_status(&mut shared_state);
}
@@ -464,17 +464,22 @@ impl Timeline {
/// De-register compute connection, shutting down timeline activity if
/// pageserver doesn't need catchup.
pub async fn on_compute_disconnect(&self) -> Result<()> {
pub fn on_compute_disconnect(&self) -> Result<()> {
let is_wal_backup_action_pending: bool;
{
let mut shared_state = self.write_shared_state().await;
let mut shared_state = self.write_shared_state();
shared_state.num_computes -= 1;
is_wal_backup_action_pending = self.update_status(&mut shared_state);
}
// Wake up wal backup launcher, if it is time to stop the offloading.
if is_wal_backup_action_pending {
// Can fail only if channel to a static thread got closed, which is not normal at all.
self.wal_backup_launcher_tx.send(self.ttid).await?;
//
// Note: this is blocking_send because on_compute_disconnect is called in Drop; there is
// no async Drop, and we use current-thread runtimes. With a current-thread rt, spawning a
// task in a drop impl is racy, as the thread along with its runtime might finish before the task runs.
// This should be switched to send().await when/if we go to full async.
self.wal_backup_launcher_tx.blocking_send(self.ttid)?;
}
Ok(())
}
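Since Rust has no async Drop, the disconnect notification above has to be synchronous. A sketch of the guard pattern this implies (illustrative types; note that blocking_send panics if called from inside an async context):

use tokio::sync::mpsc;

struct ComputeConnectionGuard {
    launcher_tx: mpsc::Sender<u64>, // stand-in for the launcher channel
    ttid: u64,                      // stand-in for TenantTimelineId
}

impl Drop for ComputeConnectionGuard {
    fn drop(&mut self) {
        // Assumes the guard is dropped on a plain thread. Spawning a task
        // here instead would be racy on a current-thread runtime, which can
        // shut down before the spawned task ever runs.
        if let Err(e) = self.launcher_tx.blocking_send(self.ttid) {
            eprintln!("failed to notify wal backup launcher: {e}");
        }
    }
}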
@@ -484,11 +489,11 @@ impl Timeline {
/// computes. While there might be nothing to stream already, we learn about
/// remote_consistent_lsn update through replication feedback, and we want
/// to stop pushing to the broker if the pageserver is fully caught up.
pub async fn should_walsender_stop(&self, reported_remote_consistent_lsn: Lsn) -> bool {
pub fn should_walsender_stop(&self, reported_remote_consistent_lsn: Lsn) -> bool {
if self.is_cancelled() {
return true;
}
let shared_state = self.write_shared_state().await;
let shared_state = self.write_shared_state();
if shared_state.num_computes == 0 {
return shared_state.sk.inmem.commit_lsn == Lsn(0) || // no data at all yet
reported_remote_consistent_lsn >= shared_state.sk.inmem.commit_lsn;
@@ -498,12 +503,12 @@ impl Timeline {
/// Returns whether s3 offloading is required and sets the current status
/// to match it.
pub async fn wal_backup_attend(&self) -> bool {
pub fn wal_backup_attend(&self) -> bool {
if self.is_cancelled() {
return false;
}
self.write_shared_state().await.wal_backup_attend()
self.write_shared_state().wal_backup_attend()
}
/// Returns commit_lsn watch channel.
@@ -512,7 +517,7 @@ impl Timeline {
}
/// Pass arrived message to the safekeeper.
pub async fn process_msg(
pub fn process_msg(
&self,
msg: &ProposerAcceptorMessage,
) -> Result<Option<AcceptorProposerMessage>> {
@@ -523,8 +528,8 @@ impl Timeline {
let mut rmsg: Option<AcceptorProposerMessage>;
let commit_lsn: Lsn;
{
let mut shared_state = self.write_shared_state().await;
rmsg = shared_state.sk.process_msg(msg).await?;
let mut shared_state = self.write_shared_state();
rmsg = shared_state.sk.process_msg(msg)?;
// if this is AppendResponse, fill in proper pageserver and hot
// standby feedback.
@@ -541,37 +546,37 @@ impl Timeline {
}
/// Returns wal_seg_size.
pub async fn get_wal_seg_size(&self) -> usize {
self.write_shared_state().await.get_wal_seg_size()
pub fn get_wal_seg_size(&self) -> usize {
self.write_shared_state().get_wal_seg_size()
}
/// Returns true only if the timeline is loaded and active.
pub async fn is_active(&self) -> bool {
pub fn is_active(&self) -> bool {
if self.is_cancelled() {
return false;
}
self.write_shared_state().await.active
self.write_shared_state().active
}
/// Returns state of the timeline.
pub async fn get_state(&self) -> (SafekeeperMemState, SafeKeeperState) {
let state = self.write_shared_state().await;
pub fn get_state(&self) -> (SafekeeperMemState, SafeKeeperState) {
let state = self.write_shared_state();
(state.sk.inmem.clone(), state.sk.state.clone())
}
/// Returns latest backup_lsn.
pub async fn get_wal_backup_lsn(&self) -> Lsn {
self.write_shared_state().await.sk.inmem.backup_lsn
pub fn get_wal_backup_lsn(&self) -> Lsn {
self.write_shared_state().sk.inmem.backup_lsn
}
/// Sets backup_lsn to the given value.
pub async fn set_wal_backup_lsn(&self, backup_lsn: Lsn) -> Result<()> {
pub fn set_wal_backup_lsn(&self, backup_lsn: Lsn) -> Result<()> {
if self.is_cancelled() {
bail!(TimelineError::Cancelled(self.ttid));
}
let mut state = self.write_shared_state().await;
let mut state = self.write_shared_state();
state.sk.inmem.backup_lsn = max(state.sk.inmem.backup_lsn, backup_lsn);
// we should check whether to shut down offloader, but this will be done
// soon by peer communication anyway.
@@ -579,8 +584,8 @@ impl Timeline {
}
/// Get safekeeper info for broadcasting to broker and other peers.
pub async fn get_safekeeper_info(&self, conf: &SafeKeeperConf) -> SafekeeperTimelineInfo {
let shared_state = self.write_shared_state().await;
pub fn get_safekeeper_info(&self, conf: &SafeKeeperConf) -> SafekeeperTimelineInfo {
let shared_state = self.write_shared_state();
shared_state.get_safekeeper_info(
&self.ttid,
conf,
@@ -599,8 +604,8 @@ impl Timeline {
let is_wal_backup_action_pending: bool;
let commit_lsn: Lsn;
{
let mut shared_state = self.write_shared_state().await;
shared_state.sk.record_safekeeper_info(&sk_info).await?;
let mut shared_state = self.write_shared_state();
shared_state.sk.record_safekeeper_info(&sk_info)?;
let peer_info = PeerInfo::from_sk_info(&sk_info, Instant::now());
shared_state.peers_info.upsert(&peer_info);
is_wal_backup_action_pending = self.update_status(&mut shared_state);
@@ -617,8 +622,8 @@ impl Timeline {
/// Get our latest view of alive peers' status on the timeline.
/// We pass our own info through the broker as well, so when we don't have a connection
/// to the broker the returned vec is empty.
pub async fn get_peers(&self, conf: &SafeKeeperConf) -> Vec<PeerInfo> {
let shared_state = self.write_shared_state().await;
pub fn get_peers(&self, conf: &SafeKeeperConf) -> Vec<PeerInfo> {
let shared_state = self.write_shared_state();
let now = Instant::now();
shared_state
.peers_info
@@ -635,34 +640,34 @@ impl Timeline {
}
/// Returns flush_lsn.
pub async fn get_flush_lsn(&self) -> Lsn {
self.write_shared_state().await.sk.wal_store.flush_lsn()
pub fn get_flush_lsn(&self) -> Lsn {
self.write_shared_state().sk.wal_store.flush_lsn()
}
/// Delete WAL segments from disk that are no longer needed. This is determined
/// based on pageserver's remote_consistent_lsn and local backup_lsn/peer_lsn.
pub async fn remove_old_wal(&self, wal_backup_enabled: bool) -> Result<()> {
pub fn remove_old_wal(&self, wal_backup_enabled: bool) -> Result<()> {
if self.is_cancelled() {
bail!(TimelineError::Cancelled(self.ttid));
}
let horizon_segno: XLogSegNo;
let remover = {
let shared_state = self.write_shared_state().await;
let remover: Box<dyn Fn(u64) -> Result<(), anyhow::Error>>;
{
let shared_state = self.write_shared_state();
horizon_segno = shared_state.sk.get_horizon_segno(wal_backup_enabled);
remover = shared_state.sk.wal_store.remove_up_to();
if horizon_segno <= 1 || horizon_segno <= shared_state.last_removed_segno {
return Ok(()); // nothing to do
return Ok(());
}
let remover = shared_state.sk.wal_store.remove_up_to(horizon_segno - 1);
// release the lock before removing
remover
};
}
// delete old WAL files
remover.await?;
remover(horizon_segno - 1)?;
// update last_removed_segno
let mut shared_state = self.write_shared_state().await;
let mut shared_state = self.write_shared_state();
shared_state.last_removed_segno = horizon_segno;
Ok(())
}
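The rewritten remove_old_wal keeps the slow deletion outside the lock: a closure is built while holding shared_state and invoked only after the guard is dropped. A condensed sketch of the pattern (illustrative types):

use anyhow::Result;
use parking_lot::Mutex;

struct DemoShared {
    last_removed_segno: u64,
}

fn remove_old(state: &Mutex<DemoShared>, horizon_segno: u64) -> Result<()> {
    let remover: Box<dyn Fn(u64) -> Result<()>>;
    {
        let shared = state.lock();
        if horizon_segno <= 1 || horizon_segno <= shared.last_removed_segno {
            return Ok(()); // nothing to do
        }
        remover = Box::new(move |up_to| {
            // slow filesystem removal happens here, lock not held
            println!("removing segments <= {up_to}");
            Ok(())
        });
    } // lock released before the slow part runs
    remover(horizon_segno - 1)?;
    state.lock().last_removed_segno = horizon_segno;
    Ok(())
}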
@@ -671,24 +676,22 @@ impl Timeline {
/// passed after the last save. This helps to keep remote_consistent_lsn up
/// to date so that a storage node restart doesn't cause many pageserver ->
/// safekeeper reconnections.
pub async fn maybe_persist_control_file(&self) -> Result<()> {
pub fn maybe_persist_control_file(&self) -> Result<()> {
let remote_consistent_lsn = self.walsenders.get_remote_consistent_lsn();
self.write_shared_state()
.await
.sk
.maybe_persist_control_file(remote_consistent_lsn)
.await
}
/// Gather timeline data for metrics. If the timeline is not active, returns
/// None; we do not collect these.
pub async fn info_for_metrics(&self) -> Option<FullTimelineInfo> {
/// Returns full timeline info, required for the metrics. If the timeline is
/// not active, returns None instead.
pub fn info_for_metrics(&self) -> Option<FullTimelineInfo> {
if self.is_cancelled() {
return None;
}
let ps_feedback = self.walsenders.get_ps_feedback();
let state = self.write_shared_state().await;
let state = self.write_shared_state();
if state.active {
Some(FullTimelineInfo {
ttid: self.ttid,
@@ -710,8 +713,8 @@ impl Timeline {
}
/// Returns in-memory timeline state to build a full debug dump.
pub async fn memory_dump(&self) -> debug_dump::Memory {
let state = self.write_shared_state().await;
pub fn memory_dump(&self) -> debug_dump::Memory {
let state = self.write_shared_state();
let (write_lsn, write_record_lsn, flush_lsn, file_open) =
state.sk.wal_store.internal_state();
@@ -735,8 +738,8 @@ impl Timeline {
}
/// Deletes the directory and its contents. Returns false if the directory does not exist.
async fn delete_dir(path: &PathBuf) -> Result<bool> {
match fs::remove_dir_all(path).await {
fn delete_dir(path: &PathBuf) -> Result<bool> {
match std::fs::remove_dir_all(path) {
Ok(_) => Ok(true),
Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(false),
Err(e) => Err(e.into()),


@@ -113,17 +113,9 @@ impl GlobalTimelines {
Ok(())
}
/// Loads all timelines for the given tenant to memory. Returns fs::read_dir
/// errors if any.
///
/// Note: This function (and all reading/loading below) is sync because
/// timelines are loaded while holding the GlobalTimelinesState lock. This is
/// fine as it is called only from the single-threaded main runtime on boot,
/// but clippy complains anyway, and suppressing that isn't trivial as async
/// is the keyword, ha. The only other user is pull_timeline.rs, for which
/// being blocked is not that bad, and we can do spawn_blocking.
/// Loads all timelines for the given tenant to memory. Returns fs::read_dir errors if any.
fn load_tenant_timelines(
state: &mut MutexGuard<'_, GlobalTimelinesState>,
state: &mut MutexGuard<GlobalTimelinesState>,
tenant_id: TenantId,
) -> Result<()> {
let timelines_dir = state.get_conf().tenant_dir(&tenant_id);
@@ -228,7 +220,7 @@ impl GlobalTimelines {
// Take a lock and finish the initialization holding this mutex. No other threads
// can interfere with creation after we insert the timeline into the map.
{
let mut shared_state = timeline.write_shared_state().await;
let mut shared_state = timeline.write_shared_state();
// We can get a race condition here in case of concurrent create calls, but only
// in theory. create() will return a valid timeline on the next try.
@@ -240,7 +232,7 @@ impl GlobalTimelines {
// Write the new timeline to the disk and start background workers.
// Bootstrap is transactional, so if it fails, the timeline will be deleted,
// and the state on disk should remain unchanged.
if let Err(e) = timeline.bootstrap(&mut shared_state).await {
if let Err(e) = timeline.bootstrap(&mut shared_state) {
// Note: the most likely reason for bootstrap failure is that the timeline
// directory already exists on disk. This happens when the timeline is corrupted
// and wasn't loaded from disk on startup because of that. We want to preserve
@@ -302,16 +294,15 @@ impl GlobalTimelines {
}
/// Cancels timeline, then deletes the corresponding data directory.
pub async fn delete_force(ttid: &TenantTimelineId) -> Result<TimelineDeleteForceResult> {
pub fn delete_force(ttid: &TenantTimelineId) -> Result<TimelineDeleteForceResult> {
let tli_res = TIMELINES_STATE.lock().unwrap().get(ttid);
match tli_res {
Ok(timeline) => {
// Take a lock and finish the deletion holding this mutex.
let mut shared_state = timeline.write_shared_state().await;
let mut shared_state = timeline.write_shared_state();
info!("deleting timeline {}", ttid);
let (dir_existed, was_active) =
timeline.delete_from_disk(&mut shared_state).await?;
let (dir_existed, was_active) = timeline.delete_from_disk(&mut shared_state)?;
// Remove timeline from the map.
// FIXME: re-enable it once we fix the issue with recreation of deleted timelines
@@ -344,7 +335,7 @@ impl GlobalTimelines {
/// the tenant had, `true` if a timeline was active. There may be a race if new timelines are
/// created simultaneously. In that case the function will return an error and the caller should
/// retry tenant deletion later.
pub async fn delete_force_all_for_tenant(
pub fn delete_force_all_for_tenant(
tenant_id: &TenantId,
) -> Result<HashMap<TenantTimelineId, TimelineDeleteForceResult>> {
info!("deleting all timelines for tenant {}", tenant_id);
@@ -354,7 +345,7 @@ impl GlobalTimelines {
let mut deleted = HashMap::new();
for tli in &to_delete {
match Self::delete_force(&tli.ttid).await {
match Self::delete_force(&tli.ttid) {
Ok(result) => {
deleted.insert(tli.ttid, result);
}


@@ -17,6 +17,7 @@ use postgres_ffi::XLogFileName;
use postgres_ffi::{XLogSegNo, PG_TLI};
use remote_storage::{GenericRemoteStorage, RemotePath};
use tokio::fs::File;
use tokio::runtime::Builder;
use tokio::select;
use tokio::sync::mpsc::{self, Receiver, Sender};
@@ -35,16 +36,30 @@ use once_cell::sync::OnceCell;
const UPLOAD_FAILURE_RETRY_MIN_MS: u64 = 10;
const UPLOAD_FAILURE_RETRY_MAX_MS: u64 = 5000;
pub fn wal_backup_launcher_thread_main(
conf: SafeKeeperConf,
wal_backup_launcher_rx: Receiver<TenantTimelineId>,
) {
let mut builder = Builder::new_multi_thread();
if let Some(num_threads) = conf.backup_runtime_threads {
builder.worker_threads(num_threads);
}
let rt = builder
.enable_all()
.build()
.expect("failed to create wal backup runtime");
rt.block_on(async {
wal_backup_launcher_main_loop(conf, wal_backup_launcher_rx).await;
});
}
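The launcher thread owns a dedicated runtime, conditionally sized by the backup_runtime_threads config knob shown above. The builder pattern in isolation (a sketch):

use tokio::runtime::{Builder, Runtime};

// worker_threads defaults to the number of cores when the knob is unset.
fn build_backup_runtime(num_threads: Option<usize>) -> Runtime {
    let mut builder = Builder::new_multi_thread();
    if let Some(n) = num_threads {
        builder.worker_threads(n);
    }
    builder
        .enable_all()
        .build()
        .expect("failed to create wal backup runtime")
}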
/// Check whether wal backup is required for the timeline. If yes, mark that the launcher is
/// aware of the current status and return the timeline.
async fn is_wal_backup_required(ttid: TenantTimelineId) -> Option<Arc<Timeline>> {
match GlobalTimelines::get(ttid).ok() {
Some(tli) => {
tli.wal_backup_attend().await;
Some(tli)
}
None => None,
}
fn is_wal_backup_required(ttid: TenantTimelineId) -> Option<Arc<Timeline>> {
GlobalTimelines::get(ttid)
.ok()
.filter(|tli| tli.wal_backup_attend())
}
struct WalBackupTaskHandle {
@@ -128,8 +143,8 @@ async fn update_task(
ttid: TenantTimelineId,
entry: &mut WalBackupTimelineEntry,
) {
let alive_peers = entry.timeline.get_peers(conf).await;
let wal_backup_lsn = entry.timeline.get_wal_backup_lsn().await;
let alive_peers = entry.timeline.get_peers(conf);
let wal_backup_lsn = entry.timeline.get_wal_backup_lsn();
let (offloader, election_dbg_str) =
determine_offloader(&alive_peers, wal_backup_lsn, ttid, conf);
let elected_me = Some(conf.my_id) == offloader;
@@ -168,10 +183,10 @@ const CHECK_TASKS_INTERVAL_MSEC: u64 = 1000;
/// Sits on wal_backup_launcher_rx and starts/stops per-timeline wal backup
/// tasks. Having this in a separate task simplifies locking, allows reaping
/// panics, and separates elections from offloading itself.
pub async fn wal_backup_launcher_task_main(
async fn wal_backup_launcher_main_loop(
conf: SafeKeeperConf,
mut wal_backup_launcher_rx: Receiver<TenantTimelineId>,
) -> anyhow::Result<()> {
) {
info!(
"WAL backup launcher started, remote config {:?}",
conf.remote_storage
@@ -199,7 +214,7 @@ pub async fn wal_backup_launcher_task_main(
if conf.remote_storage.is_none() || !conf.wal_backup_enabled {
continue; /* just drain the channel and do nothing */
}
let timeline = is_wal_backup_required(ttid).await;
let timeline = is_wal_backup_required(ttid);
// do we need to do anything at all?
if timeline.is_some() != tasks.contains_key(&ttid) {
if let Some(timeline) = timeline {
@@ -254,7 +269,7 @@ async fn backup_task_main(
let tli = res.unwrap();
let mut wb = WalBackupTask {
wal_seg_size: tli.get_wal_seg_size().await,
wal_seg_size: tli.get_wal_seg_size(),
commit_lsn_watch_rx: tli.get_commit_lsn_watch_rx(),
timeline: tli,
timeline_dir,
@@ -311,7 +326,7 @@ impl WalBackupTask {
continue; /* nothing to do, common case as we wake up on every commit_lsn bump */
}
// Perhaps peers advanced the position, check shmem value.
backup_lsn = self.timeline.get_wal_backup_lsn().await;
backup_lsn = self.timeline.get_wal_backup_lsn();
if backup_lsn.segment_number(self.wal_seg_size)
>= commit_lsn.segment_number(self.wal_seg_size)
{
@@ -387,7 +402,6 @@ pub async fn backup_lsn_range(
let new_backup_lsn = segment.end_lsn;
timeline
.set_wal_backup_lsn(new_backup_lsn)
.await
.context("setting wal_backup_lsn")?;
*backup_lsn = new_backup_lsn;
} else {


@@ -4,7 +4,7 @@
//!
use anyhow::{Context, Result};
use postgres_backend::QueryError;
use std::{future, time::Duration};
use std::{future, thread, time::Duration};
use tokio::net::TcpStream;
use tokio_io_timeout::TimeoutReader;
use tracing::*;
@@ -16,82 +16,104 @@ use crate::SafeKeeperConf;
use postgres_backend::{AuthType, PostgresBackend};
/// Accept incoming TCP connections and spawn each into its own background thread.
pub async fn task_main(
conf: SafeKeeperConf,
pg_listener: std::net::TcpListener,
) -> anyhow::Result<()> {
// Tokio's from_std won't do this for us, per its comment.
pg_listener.set_nonblocking(true)?;
pub fn thread_main(conf: SafeKeeperConf, pg_listener: std::net::TcpListener) {
let runtime = tokio::runtime::Builder::new_current_thread()
.enable_all()
.build()
.context("create runtime")
// TODO: catch error in main thread
.expect("failed to create runtime");
let listener = tokio::net::TcpListener::from_std(pg_listener)?;
let mut connection_count: ConnectionCount = 0;
runtime
.block_on(async move {
// Tokio's from_std won't do this for us, per its comment.
pg_listener.set_nonblocking(true)?;
let listener = tokio::net::TcpListener::from_std(pg_listener)?;
let mut connection_count: ConnectionCount = 0;
loop {
let (socket, peer_addr) = listener.accept().await.context("accept")?;
debug!("accepted connection from {}", peer_addr);
let conf = conf.clone();
let conn_id = issue_connection_id(&mut connection_count);
loop {
match listener.accept().await {
Ok((socket, peer_addr)) => {
debug!("accepted connection from {}", peer_addr);
let conf = conf.clone();
let conn_id = issue_connection_id(&mut connection_count);
tokio::spawn(async move {
if let Err(err) = handle_socket(socket, conf, conn_id)
.instrument(info_span!("", cid = %conn_id))
.await
{
error!("connection handler exited: {}", err);
let _ = thread::Builder::new()
.name("WAL service thread".into())
.spawn(move || {
if let Err(err) = handle_socket(socket, conf, conn_id) {
error!("connection handler exited: {}", err);
}
})
.unwrap();
}
Err(e) => error!("Failed to accept connection: {}", e),
}
}
});
}
#[allow(unreachable_code)] // hint compiler the closure return type
Ok::<(), anyhow::Error>(())
})
.expect("listener failed")
}
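Stripped of metrics and auth, thread_main boils down to: one current-thread runtime accepts connections, and each connection gets a dedicated OS thread with its own runtime. A simplified sketch (this variant accepts on the blocking std listener to keep it self-contained; it is not the exact code above):

use std::thread;

fn serve(listener: std::net::TcpListener) -> std::io::Result<()> {
    loop {
        let (socket, _peer_addr) = listener.accept()?;
        thread::Builder::new()
            .name("connection thread".into())
            .spawn(move || {
                // each connection gets its own single-threaded runtime
                let rt = tokio::runtime::Builder::new_current_thread()
                    .enable_all()
                    .build()
                    .expect("failed to create runtime");
                rt.block_on(async move {
                    socket.set_nonblocking(true).expect("set_nonblocking");
                    let socket = tokio::net::TcpStream::from_std(socket)
                        .expect("from_std");
                    // drive the connection protocol here
                    let _ = socket.peer_addr();
                });
            })?;
    }
}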
/// This is run by `task_main` above, inside a background thread.
/// This is run by `thread_main` above, inside a background thread.
///
async fn handle_socket(
fn handle_socket(
socket: TcpStream,
conf: SafeKeeperConf,
conn_id: ConnectionId,
) -> Result<(), QueryError> {
let _enter = info_span!("", cid = %conn_id).entered();
let runtime = tokio::runtime::Builder::new_current_thread()
.enable_all()
.build()?;
socket.set_nodelay(true)?;
let peer_addr = socket.peer_addr()?;
// Set a timeout on reading from the socket. It prevents a hung connection
// if the client suddenly disappears. Note that TCP_KEEPALIVE is not enabled by
// default, and tokio doesn't provide the ability to set it out of the box.
let mut socket = TimeoutReader::new(socket);
let wal_service_timeout = Duration::from_secs(60 * 10);
socket.set_timeout(Some(wal_service_timeout));
// pin! is here because TimeoutReader (due to storing sleep future inside)
// is not Unpin, and all pgbackend/framed/tokio dependencies require the stream
// to be Unpin, which is reasonable, as indeed something like TimeoutReader
// shouldn't be moved.
tokio::pin!(socket);
// TimeoutReader wants an async runtime during creation.
runtime.block_on(async move {
// Set a timeout on reading from the socket. It prevents a hung connection
// if the client suddenly disappears. Note that TCP_KEEPALIVE is not enabled by
// default, and tokio doesn't provide the ability to set it out of the box.
let mut socket = TimeoutReader::new(socket);
let wal_service_timeout = Duration::from_secs(60 * 10);
socket.set_timeout(Some(wal_service_timeout));
// pin! is here because TimeoutReader (due to storing sleep future inside)
// is not Unpin, and all pgbackend/framed/tokio dependencies require the stream
// to be Unpin, which is reasonable, as indeed something like TimeoutReader
// shouldn't be moved.
tokio::pin!(socket);
let traffic_metrics = TrafficMetrics::new();
if let Some(current_az) = conf.availability_zone.as_deref() {
traffic_metrics.set_sk_az(current_az);
}
let traffic_metrics = TrafficMetrics::new();
if let Some(current_az) = conf.availability_zone.as_deref() {
traffic_metrics.set_sk_az(current_az);
}
let socket = MeasuredStream::new(
socket,
|cnt| {
traffic_metrics.observe_read(cnt);
},
|cnt| {
traffic_metrics.observe_write(cnt);
},
);
let socket = MeasuredStream::new(
socket,
|cnt| {
traffic_metrics.observe_read(cnt);
},
|cnt| {
traffic_metrics.observe_write(cnt);
},
);
let auth_type = match conf.auth {
None => AuthType::Trust,
Some(_) => AuthType::NeonJWT,
};
let mut conn_handler =
SafekeeperPostgresHandler::new(conf, conn_id, Some(traffic_metrics.clone()));
let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?;
// libpq protocol between safekeeper and walproposer / pageserver
// We don't use shutdown.
pgbackend
.run(&mut conn_handler, future::pending::<()>)
.await
let auth_type = match conf.auth {
None => AuthType::Trust,
Some(_) => AuthType::NeonJWT,
};
let mut conn_handler =
SafekeeperPostgresHandler::new(conf, conn_id, Some(traffic_metrics.clone()));
let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?;
// libpq protocol between safekeeper and walproposer / pageserver
// We don't use shutdown.
pgbackend
.run(&mut conn_handler, future::pending::<()>)
.await
})
}
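The pin! comment is the crux of the TimeoutReader setup; the pattern in isolation looks like this (a sketch using the same tokio_io_timeout API as above):

use std::time::Duration;
use tokio::io::AsyncReadExt;
use tokio::net::TcpStream;
use tokio_io_timeout::TimeoutReader;

async fn read_with_timeout(socket: TcpStream) -> std::io::Result<Vec<u8>> {
    let mut reader = TimeoutReader::new(socket);
    reader.set_timeout(Some(Duration::from_secs(600)));
    // TimeoutReader stores a Sleep future internally, so it is !Unpin;
    // pin it before using it where an Unpin AsyncRead is required.
    tokio::pin!(reader);
    let mut buf = vec![0u8; 1024];
    let n = reader.read(&mut buf).await?;
    buf.truncate(n);
    Ok(buf)
}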
/// Unique WAL service connection ids are logged in spans for observability.


@@ -8,47 +8,54 @@
//! Note that the last file has a `.partial` suffix; that's different from postgres.
use anyhow::{bail, Context, Result};
use bytes::Bytes;
use futures::future::BoxFuture;
use remote_storage::RemotePath;
use std::io::{self, Seek, SeekFrom};
use std::pin::Pin;
use tokio::io::AsyncRead;
use postgres_ffi::v14::xlog_utils::{IsPartialXLogFileName, IsXLogFileName, XLogFromFileName};
use postgres_ffi::{XLogSegNo, PG_TLI};
use remote_storage::RemotePath;
use std::cmp::{max, min};
use std::io::{self, SeekFrom};
use bytes::Bytes;
use std::fs::{self, remove_file, File, OpenOptions};
use std::io::Write;
use std::path::{Path, PathBuf};
use std::pin::Pin;
use tokio::fs::{self, remove_file, File, OpenOptions};
use tokio::io::{AsyncRead, AsyncWriteExt};
use tokio::io::{AsyncReadExt, AsyncSeekExt};
use tracing::*;
use utils::{id::TenantTimelineId, lsn::Lsn};
use crate::metrics::{time_io_closure, WalStorageMetrics, REMOVED_WAL_SEGMENTS};
use crate::safekeeper::SafeKeeperState;
use crate::wal_backup::read_object;
use crate::SafeKeeperConf;
use postgres_ffi::waldecoder::WalStreamDecoder;
use postgres_ffi::XLogFileName;
use postgres_ffi::XLOG_BLCKSZ;
use pq_proto::SystemId;
use utils::{id::TenantTimelineId, lsn::Lsn};
#[async_trait::async_trait]
use postgres_ffi::waldecoder::WalStreamDecoder;
use pq_proto::SystemId;
use tokio::io::{AsyncReadExt, AsyncSeekExt};
pub trait Storage {
/// LSN of last durably stored WAL record.
fn flush_lsn(&self) -> Lsn;
/// Write piece of WAL from buf to disk, but not necessarily sync it.
async fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()>;
fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()>;
/// Truncate WAL at specified LSN, which must be the end of WAL record.
async fn truncate_wal(&mut self, end_pos: Lsn) -> Result<()>;
fn truncate_wal(&mut self, end_pos: Lsn) -> Result<()>;
/// Durably store WAL on disk, up to the last written WAL record.
async fn flush_wal(&mut self) -> Result<()>;
fn flush_wal(&mut self) -> Result<()>;
/// Remove all segments <= given segno. Returns a function doing that, as we
/// want to perform it without the timeline lock.
fn remove_up_to(&self, segno_up_to: XLogSegNo) -> BoxFuture<'static, anyhow::Result<()>>;
/// Remove all segments <= given segno. Returns a closure, as we want to do
/// that without the timeline lock.
fn remove_up_to(&self) -> Box<dyn Fn(XLogSegNo) -> Result<()>>;
/// Release resources associated with the storage -- technically, close FDs.
/// Currently we don't remove timelines until restart (#3146), so need to
@@ -98,22 +105,6 @@ pub struct PhysicalStorage {
/// - points to write_lsn, so no seek is needed for writing
/// - doesn't point to the end of the segment
file: Option<File>,
/// When false, we have just initialized storage using the LSN from find_end_of_wal().
/// In this case, [`write_lsn`] can be less than the WAL actually written to disk. In particular,
/// there can be a case with an unexpected .partial file.
///
/// Imagine the following:
/// - 000000010000000000000001
/// - it was fully written, but the last record is split between 2 segments
/// - after restart, find_end_of_wal() returned 0/1FFFFF0, which is at the end of this segment
/// - write_lsn, write_record_lsn and flush_record_lsn were initialized to 0/1FFFFF0
/// - 000000010000000000000002.partial
/// - it has only 1 byte written, which is not enough to make a full WAL record
///
/// Partial segment 002 has no WAL records, and it will be removed by the next truncate_wal().
/// This flag will be set to true after the first truncate_wal() call.
is_truncated_after_restart: bool,
}
impl PhysicalStorage {
@@ -173,7 +164,6 @@ impl PhysicalStorage {
flush_record_lsn: flush_lsn,
decoder: WalStreamDecoder::new(write_lsn, state.server.pg_version / 10000),
file: None,
is_truncated_after_restart: false,
})
}
@@ -188,37 +178,33 @@ impl PhysicalStorage {
}
/// Call fdatasync if the config requires it.
async fn fdatasync_file(&mut self, file: &mut File) -> Result<()> {
fn fdatasync_file(&mut self, file: &mut File) -> Result<()> {
if !self.conf.no_sync {
self.metrics
.observe_flush_seconds(time_io_closure(file.sync_data()).await?);
.observe_flush_seconds(time_io_closure(|| Ok(file.sync_data()?))?);
}
Ok(())
}
/// Call fsync if the config requires it.
async fn fsync_file(&mut self, file: &mut File) -> Result<()> {
fn fsync_file(&mut self, file: &mut File) -> Result<()> {
if !self.conf.no_sync {
self.metrics
.observe_flush_seconds(time_io_closure(file.sync_all()).await?);
.observe_flush_seconds(time_io_closure(|| Ok(file.sync_all()?))?);
}
Ok(())
}
/// Open or create a WAL segment file. The caller must seek to the wanted position.
/// Returns `file` and `is_partial`.
async fn open_or_create(&mut self, segno: XLogSegNo) -> Result<(File, bool)> {
fn open_or_create(&mut self, segno: XLogSegNo) -> Result<(File, bool)> {
let (wal_file_path, wal_file_partial_path) =
wal_file_paths(&self.timeline_dir, segno, self.wal_seg_size)?;
// Try to open already completed segment
if let Ok(file) = OpenOptions::new().write(true).open(&wal_file_path).await {
if let Ok(file) = OpenOptions::new().write(true).open(&wal_file_path) {
Ok((file, false))
} else if let Ok(file) = OpenOptions::new()
.write(true)
.open(&wal_file_partial_path)
.await
{
} else if let Ok(file) = OpenOptions::new().write(true).open(&wal_file_partial_path) {
// Try to open existing partial file
Ok((file, true))
} else {
@@ -227,36 +213,35 @@ impl PhysicalStorage {
.create(true)
.write(true)
.open(&wal_file_partial_path)
.await
.with_context(|| format!("Failed to open log file {:?}", &wal_file_path))?;
write_zeroes(&mut file, self.wal_seg_size).await?;
self.fsync_file(&mut file).await?;
write_zeroes(&mut file, self.wal_seg_size)?;
self.fsync_file(&mut file)?;
Ok((file, true))
}
}
/// Write WAL bytes, which are known to be located in a single WAL segment.
async fn write_in_segment(&mut self, segno: u64, xlogoff: usize, buf: &[u8]) -> Result<()> {
fn write_in_segment(&mut self, segno: u64, xlogoff: usize, buf: &[u8]) -> Result<()> {
let mut file = if let Some(file) = self.file.take() {
file
} else {
let (mut file, is_partial) = self.open_or_create(segno).await?;
let (mut file, is_partial) = self.open_or_create(segno)?;
assert!(is_partial, "unexpected write into non-partial segment file");
file.seek(SeekFrom::Start(xlogoff as u64)).await?;
file.seek(SeekFrom::Start(xlogoff as u64))?;
file
};
file.write_all(buf).await?;
file.write_all(buf)?;
if xlogoff + buf.len() == self.wal_seg_size {
// If we reached the end of a WAL segment, flush and close it.
self.fdatasync_file(&mut file).await?;
self.fdatasync_file(&mut file)?;
// Rename partial file to completed file
let (wal_file_path, wal_file_partial_path) =
wal_file_paths(&self.timeline_dir, segno, self.wal_seg_size)?;
fs::rename(wal_file_partial_path, wal_file_path).await?;
fs::rename(wal_file_partial_path, wal_file_path)?;
} else {
// otherwise, file can be reused later
self.file = Some(file);
@@ -270,11 +255,11 @@ impl PhysicalStorage {
/// be flushed separately later.
///
/// Updates `write_lsn`.
async fn write_exact(&mut self, pos: Lsn, mut buf: &[u8]) -> Result<()> {
fn write_exact(&mut self, pos: Lsn, mut buf: &[u8]) -> Result<()> {
if self.write_lsn != pos {
// need to flush the file before discarding it
if let Some(mut file) = self.file.take() {
self.fdatasync_file(&mut file).await?;
self.fdatasync_file(&mut file)?;
}
self.write_lsn = pos;
@@ -292,8 +277,7 @@ impl PhysicalStorage {
buf.len()
};
self.write_in_segment(segno, xlogoff, &buf[..bytes_write])
.await?;
self.write_in_segment(segno, xlogoff, &buf[..bytes_write])?;
self.write_lsn += bytes_write as u64;
buf = &buf[bytes_write..];
}
@@ -302,7 +286,6 @@ impl PhysicalStorage {
}
}
#[async_trait::async_trait]
impl Storage for PhysicalStorage {
/// flush_lsn returns LSN of last durably stored WAL record.
fn flush_lsn(&self) -> Lsn {
@@ -310,7 +293,7 @@ impl Storage for PhysicalStorage {
}
/// Write WAL to disk.
async fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()> {
fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()> {
// Disallow any non-sequential writes, which can result in gaps or overwrites.
// If we need to move the pointer, use truncate_wal() instead.
if self.write_lsn > startpos {
@@ -328,7 +311,7 @@ impl Storage for PhysicalStorage {
);
}
let write_seconds = time_io_closure(self.write_exact(startpos, buf)).await?;
let write_seconds = time_io_closure(|| self.write_exact(startpos, buf))?;
// WAL is written, updating write metrics
self.metrics.observe_write_seconds(write_seconds);
self.metrics.observe_write_bytes(buf.len());
@@ -357,14 +340,14 @@ impl Storage for PhysicalStorage {
Ok(())
}
async fn flush_wal(&mut self) -> Result<()> {
fn flush_wal(&mut self) -> Result<()> {
if self.flush_record_lsn == self.write_record_lsn {
// no need to do extra flush
return Ok(());
}
if let Some(mut unflushed_file) = self.file.take() {
self.fdatasync_file(&mut unflushed_file).await?;
self.fdatasync_file(&mut unflushed_file)?;
self.file = Some(unflushed_file);
} else {
// We have unflushed data (write_lsn != flush_lsn), but no file.
@@ -386,7 +369,7 @@ impl Storage for PhysicalStorage {
/// Truncate written WAL by removing all WAL segments after the given LSN.
/// end_pos must point to the end of the WAL record.
async fn truncate_wal(&mut self, end_pos: Lsn) -> Result<()> {
fn truncate_wal(&mut self, end_pos: Lsn) -> Result<()> {
// Streaming must not create a hole, so truncate cannot be called on non-written lsn
if self.write_lsn != Lsn(0) && end_pos > self.write_lsn {
bail!(
@@ -398,51 +381,47 @@ impl Storage for PhysicalStorage {
// Quick exit if nothing to do to avoid writing up to 16 MiB of zeros on
// disk (this happens on each connect).
if self.is_truncated_after_restart
&& end_pos == self.write_lsn
&& end_pos == self.flush_record_lsn
{
if end_pos == self.write_lsn {
return Ok(());
}
// Close previously opened file, if any
if let Some(mut unflushed_file) = self.file.take() {
self.fdatasync_file(&mut unflushed_file).await?;
self.fdatasync_file(&mut unflushed_file)?;
}
let xlogoff = end_pos.segment_offset(self.wal_seg_size);
let segno = end_pos.segment_number(self.wal_seg_size);
// Remove all segments after the given LSN.
remove_segments_from_disk(&self.timeline_dir, self.wal_seg_size, |x| x > segno).await?;
remove_segments_from_disk(&self.timeline_dir, self.wal_seg_size, |x| x > segno)?;
let (mut file, is_partial) = self.open_or_create(segno).await?;
let (mut file, is_partial) = self.open_or_create(segno)?;
// Fill end with zeroes
file.seek(SeekFrom::Start(xlogoff as u64)).await?;
write_zeroes(&mut file, self.wal_seg_size - xlogoff).await?;
self.fdatasync_file(&mut file).await?;
file.seek(SeekFrom::Start(xlogoff as u64))?;
write_zeroes(&mut file, self.wal_seg_size - xlogoff)?;
self.fdatasync_file(&mut file)?;
if !is_partial {
// Make segment partial once again
let (wal_file_path, wal_file_partial_path) =
wal_file_paths(&self.timeline_dir, segno, self.wal_seg_size)?;
fs::rename(wal_file_path, wal_file_partial_path).await?;
fs::rename(wal_file_path, wal_file_partial_path)?;
}
// Update LSNs
self.write_lsn = end_pos;
self.write_record_lsn = end_pos;
self.flush_record_lsn = end_pos;
self.is_truncated_after_restart = true;
Ok(())
}
fn remove_up_to(&self, segno_up_to: XLogSegNo) -> BoxFuture<'static, anyhow::Result<()>> {
fn remove_up_to(&self) -> Box<dyn Fn(XLogSegNo) -> Result<()>> {
let timeline_dir = self.timeline_dir.clone();
let wal_seg_size = self.wal_seg_size;
Box::pin(async move {
remove_segments_from_disk(&timeline_dir, wal_seg_size, |x| x <= segno_up_to).await
Box::new(move |segno_up_to: XLogSegNo| {
remove_segments_from_disk(&timeline_dir, wal_seg_size, |x| x <= segno_up_to)
})
}
@@ -457,7 +436,7 @@ impl Storage for PhysicalStorage {
}
/// Remove all WAL segments in timeline_dir that match the given predicate.
async fn remove_segments_from_disk(
fn remove_segments_from_disk(
timeline_dir: &Path,
wal_seg_size: usize,
remove_predicate: impl Fn(XLogSegNo) -> bool,
@@ -466,8 +445,8 @@ async fn remove_segments_from_disk(
let mut min_removed = u64::MAX;
let mut max_removed = u64::MIN;
let mut entries = fs::read_dir(timeline_dir).await?;
while let Some(entry) = entries.next_entry().await? {
for entry in fs::read_dir(timeline_dir)? {
let entry = entry?;
let entry_path = entry.path();
let fname = entry_path.file_name().unwrap();
@@ -478,7 +457,7 @@ async fn remove_segments_from_disk(
}
let (segno, _) = XLogFromFileName(fname_str, wal_seg_size);
if remove_predicate(segno) {
remove_file(entry_path).await?;
remove_file(entry_path)?;
n_removed += 1;
min_removed = min(min_removed, segno);
max_removed = max(max_removed, segno);
@@ -710,12 +689,12 @@ impl WalReader {
const ZERO_BLOCK: &[u8] = &[0u8; XLOG_BLCKSZ];
/// Helper for filling a file with zeroes.
async fn write_zeroes(file: &mut File, mut count: usize) -> Result<()> {
fn write_zeroes(file: &mut File, mut count: usize) -> Result<()> {
while count >= XLOG_BLCKSZ {
file.write_all(ZERO_BLOCK).await?;
file.write_all(ZERO_BLOCK)?;
count -= XLOG_BLCKSZ;
}
file.write_all(&ZERO_BLOCK[0..count]).await?;
file.write_all(&ZERO_BLOCK[0..count])?;
Ok(())
}
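As the module header notes, an under-construction segment carries a .partial suffix and is renamed once it is full and durable. A minimal sketch of that finalization step (hypothetical helper, std I/O as in the new code):

use std::fs;
use std::path::Path;

// fsync the filled segment first, then atomically drop the ".partial"
// suffix so readers only ever see complete segment files.
fn finalize_segment(timeline_dir: &Path, segname: &str) -> std::io::Result<()> {
    let partial = timeline_dir.join(format!("{segname}.partial"));
    let full = timeline_dir.join(segname);
    let file = fs::OpenOptions::new().write(true).open(&partial)?;
    file.sync_data()?; // durability before the rename, as in fdatasync_file
    fs::rename(&partial, &full)
}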


@@ -32,7 +32,6 @@ pub const DEFAULT_LISTEN_ADDR: &str = "127.0.0.1:50051";
pub const DEFAULT_ENDPOINT: &str = const_format::formatcp!("http://{DEFAULT_LISTEN_ADDR}");
pub const DEFAULT_KEEPALIVE_INTERVAL: &str = "5000 ms";
pub const DEFAULT_CONNECT_TIMEOUT: Duration = Duration::from_millis(5000);
// BrokerServiceClient equipped with the tonic-provided Channel transport; helps to
// avoid depending on tonic directly in user crates.
@@ -59,8 +58,7 @@ where
}
tonic_endpoint = tonic_endpoint
.http2_keep_alive_interval(keepalive_interval)
.keep_alive_while_idle(true)
.connect_timeout(DEFAULT_CONNECT_TIMEOUT);
.keep_alive_while_idle(true);
// keep_alive_timeout is 20s by default on both client and server side
let channel = tonic_endpoint.connect_lazy();
Ok(BrokerClientChannel::new(channel))


@@ -2912,6 +2912,7 @@ SKIP_FILES = frozenset(
"pg_internal.init",
"pg.log",
"zenith.signal",
"neon_compute_spec_id.txt",
"pg_hba.conf",
"postgresql.conf",
"postmaster.opts",


@@ -163,6 +163,7 @@ def test_forward_params_to_client(static_proxy: NeonProxy):
assert conn.get_parameter_status(name) == value
@pytest.mark.timeout(5)
def test_close_on_connections_exit(static_proxy: NeonProxy):
# Open two connections, send SIGTERM, then ensure that proxy doesn't exit
# until after connections close.