Skip sync safekeepers if possible

RemoteTimelineClient::delete_all() to use s3::delete_objects (#4461)
## Problem [#4325](https://github.com/neondatabase/neon/issues/4325) ## Summary of changes Use delete_objects() method
2026-03-13 05:10:37 +00:00 · 2023-06-27 10:35:25 -04:00 · 2023-06-27 15:01:32 +03:00 · 2023-06-27 13:56:32 +03:00 · 2023-06-27 10:55:03 +01:00 · 2023-06-27 10:57:28 +03:00
156 changed files with 8233 additions and 3783 deletions
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -180,7 +180,8 @@ jobs:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      options: --init
-    timeout-minutes: 360 # 6h
+    # Increase timeout to 8h, default timeout is 6h
    timeout-minutes: 480
    steps:
    - uses: actions/checkout@v3
@@ -321,8 +322,6 @@ jobs:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      options: --init
    timeout-minutes: 360 # 6h
    steps:
    - uses: actions/checkout@v3
@@ -414,8 +413,6 @@ jobs:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      options: --init
    timeout-minutes: 360 # 6h
    steps:
    - uses: actions/checkout@v3
@@ -501,8 +498,6 @@ jobs:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      options: --init
    timeout-minutes: 360 # 6h
    steps:
    - uses: actions/checkout@v3
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -264,7 +264,7 @@ jobs:
          export REMOTE_STORAGE_S3_BUCKET=neon-github-public-dev
          export REMOTE_STORAGE_S3_REGION=eu-central-1
          # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
-          ${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test pagination_tests -- s3_pagination_should_work --exact
+          ${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test test_real_s3
      - name: Install rust binaries
        run: |
@@ -623,51 +623,6 @@ jobs:
      - name: Cleanup ECR folder
        run: rm -rf ~/.ecr
  neon-image-depot:
    # For testing this will run side-by-side for a few merges.
    # This action is not really optimized yet, but gets the job done
    runs-on: [ self-hosted, gen3, large ]
    needs: [ tag ]
    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
    permissions:
      contents: read
      id-token: write
    steps:
      - name: Checkout
        uses: actions/checkout@v3
        with:
          submodules: true
          fetch-depth: 0
      - name: Setup go
        uses: actions/setup-go@v3
        with:
          go-version: '1.19'
      - name: Set up Depot CLI
        uses: depot/setup-action@v1
      - name: Install Crane & ECR helper
        run: go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@69c85dc22db6511932bbf119e1a0cc5c90c69a7f # v0.6.0
      - name: Configure ECR login
        run: |
          mkdir /github/home/.docker/
          echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json
      - name: Build and push
        uses: depot/build-push-action@v1
        with:
          # if no depot.json file is at the root of your repo, you must specify the project id
          project: nrdv0s4kcs
          push: true
          tags: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:depot-${{needs.tag.outputs.build-tag}}
          build-args: |
            GIT_VERSION=${{ github.sha }}
            REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
  compute-tools-image:
    runs-on: [ self-hosted, gen3, large ]
    needs: [ tag ]
@@ -704,6 +659,7 @@ jobs:
                           --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
                           --context .
                           --build-arg GIT_VERSION=${{ github.sha }}
                           --build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}}
                           --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
                           --dockerfile Dockerfile.compute-tools
                           --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}}
@@ -761,6 +717,7 @@ jobs:
                           --context .
                           --build-arg GIT_VERSION=${{ github.sha }}
                           --build-arg PG_VERSION=${{ matrix.version }}
                           --build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}}
                           --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
                           --dockerfile Dockerfile.compute-node
                           --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
@@ -781,7 +738,7 @@ jobs:
      run:
        shell: sh -eu {0}
    env:
-      VM_BUILDER_VERSION: v0.8.0
+      VM_BUILDER_VERSION: v0.11.0
    steps:
      - name: Checkout
@@ -959,6 +916,20 @@ jobs:
            exit 1
          fi
      - name: Create git tag
        if: github.ref_name == 'release'
        uses: actions/github-script@v6
        with:
          # Retry script for 5XX server errors: https://github.com/actions/github-script#retries
          retries: 5
          script: |
            github.rest.git.createRef({
              owner: context.repo.owner,
              repo: context.repo.repo,
              ref: "refs/tags/${{ needs.tag.outputs.build-tag }}",
              sha: context.sha,
            })
  promote-compatibility-data:
    runs-on: [ self-hosted, gen3, small ]
    container:
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -3,6 +3,7 @@ name: Create Release Branch
 on:
  schedule:
    - cron: '0 10 * * 2'
  workflow_dispatch:
 jobs:
  create_release_branch:
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2349,9 +2349,9 @@ checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575"
 [[package]]
 name = "openssl"
-version = "0.10.52"
+version = "0.10.55"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "01b8574602df80f7b85fdfc5392fa884a4e3b3f4f35402c070ab34c3d3f78d56"
+checksum = "345df152bc43501c5eb9e4654ff05f794effb78d4efe3d53abc158baddc0703d"
 dependencies = [
 "bitflags",
 "cfg-if",
@@ -2381,9 +2381,9 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf"
 [[package]]
 name = "openssl-sys"
-version = "0.9.87"
+version = "0.9.90"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8e17f59264b2809d77ae94f0e1ebabc434773f370d6ca667bd223ea10e06cc7e"
+checksum = "374533b0e45f3a7ced10fcaeccca020e66656bc03dac384f852e4e5a7a8104a6"
 dependencies = [
 "cc",
 "libc",
@@ -2770,7 +2770,7 @@ dependencies = [
 [[package]]
 name = "postgres"
 version = "0.19.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9#2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
 dependencies = [
 "bytes",
 "fallible-iterator",
@@ -2783,7 +2783,7 @@ dependencies = [
 [[package]]
 name = "postgres-native-tls"
 version = "0.5.0"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9#2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
 dependencies = [
 "native-tls",
 "tokio",
@@ -2794,7 +2794,7 @@ dependencies = [
 [[package]]
 name = "postgres-protocol"
 version = "0.6.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9#2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
 dependencies = [
 "base64 0.20.0",
 "byteorder",
@@ -2812,7 +2812,7 @@ dependencies = [
 [[package]]
 name = "postgres-types"
 version = "0.2.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9#2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
 dependencies = [
 "bytes",
 "fallible-iterator",
@@ -2874,7 +2874,6 @@ dependencies = [
 "serde",
 "thiserror",
 "utils",
 "wal_craft",
 "workspace_hack",
 ]
@@ -4273,7 +4272,7 @@ dependencies = [
 [[package]]
 name = "tokio-postgres"
 version = "0.7.7"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9#2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
 dependencies = [
 "async-trait",
 "byteorder",
@@ -4894,7 +4893,9 @@ dependencies = [
 "once_cell",
 "postgres",
 "postgres_ffi",
 "regex",
 "tempfile",
 "utils",
 "workspace_hack",
 ]
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -9,7 +9,20 @@ members = [
    "storage_broker",
    "workspace_hack",
    "trace",
-    "libs/*",
+    "libs/compute_api",
    "libs/pageserver_api",
    "libs/postgres_ffi",
    "libs/safekeeper_api",
    "libs/utils",
    "libs/consumption_metrics",
    "libs/postgres_backend",
    "libs/pq_proto",
    "libs/tenant_size_model",
    "libs/metrics",
    "libs/postgres_connection",
    "libs/remote_storage",
    "libs/tracing-utils",
    "libs/postgres_ffi/wal_craft",
 ]
 [workspace.package]
@@ -127,11 +140,11 @@ env_logger = "0.10"
 log = "0.4"
 ## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" }
+postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
-postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" }
+postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
-postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" }
+postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
-postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" }
+postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
 tokio-tar = { git = "https://github.com/neondatabase/tokio-tar.git", rev="404df61437de0feef49ba2ccdbdd94eb8ad6e142" }
 ## Other git libraries
@@ -167,7 +180,7 @@ tonic-build = "0.9"
 # This is only needed for proxy's tests.
 # TODO: we should probably fork `tokio-postgres-rustls` instead.
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
 # Changes the MAX_THREADS limit from 4096 to 32768.
 # This is a temporary workaround for using tracing from many threads in safekeepers code,
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -2,6 +2,7 @@ ARG PG_VERSION
 ARG REPOSITORY=neondatabase
 ARG IMAGE=rust
 ARG TAG=pinned
 ARG BUILD_TAG
 #########################################################################################
 #
@@ -67,7 +68,7 @@ RUN apt update && \
 RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar.gz -O SFCGAL.tar.gz && \
    echo "4e39b3b2adada6254a7bdba6d297bb28e1a9835a9f879b74f37e2dab70203232 SFCGAL.tar.gz" | sha256sum --check && \
    mkdir sfcgal-src && cd sfcgal-src && tar xvzf ../SFCGAL.tar.gz --strip-components=1 -C . && \
-    cmake . && make -j $(getconf _NPROCESSORS_ONLN) && \
+    cmake -DCMAKE_BUILD_TYPE=Release . && make -j $(getconf _NPROCESSORS_ONLN) && \
    DESTDIR=/sfcgal make install -j $(getconf _NPROCESSORS_ONLN) && \
    make clean && cp -R /sfcgal/* /
@@ -95,7 +96,7 @@ RUN wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouti
    mkdir pgrouting-src && cd pgrouting-src && tar xvzf ../pgrouting.tar.gz --strip-components=1 -C . && \
    mkdir build && \
    cd build && \
-    cmake .. && \
+    cmake -DCMAKE_BUILD_TYPE=Release .. && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrouting.control
@@ -355,7 +356,7 @@ RUN apt-get update && \
    wget https://github.com/timescale/timescaledb/archive/refs/tags/2.10.1.tar.gz -O timescaledb.tar.gz && \
    echo "6fca72a6ed0f6d32d2b3523951ede73dc5f9b0077b38450a029a5f411fdb8c73 timescaledb.tar.gz" | sha256sum --check && \
    mkdir timescaledb-src && cd timescaledb-src && tar xvzf ../timescaledb.tar.gz --strip-components=1 -C . && \
-    ./bootstrap -DSEND_TELEMETRY_DEFAULT:BOOL=OFF -DUSE_TELEMETRY:BOOL=OFF -DAPACHE_ONLY:BOOL=ON && \
+    ./bootstrap -DSEND_TELEMETRY_DEFAULT:BOOL=OFF -DUSE_TELEMETRY:BOOL=OFF -DAPACHE_ONLY:BOOL=ON -DCMAKE_BUILD_TYPE=Release && \
    cd build && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make install -j $(getconf _NPROCESSORS_ONLN) && \
@@ -410,7 +411,7 @@ RUN apt-get update && \
    mkdir kq_imcx-src && cd kq_imcx-src && tar xvzf ../kq_imcx.tar.gz --strip-components=1 -C . && \
    mkdir build && \
    cd build && \
-    cmake .. && \
+    cmake -DCMAKE_BUILD_TYPE=Release .. && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/kq_imcx.control
@@ -432,6 +433,88 @@ RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.5.2.tar.gz -O
    make -j $(getconf _NPROCESSORS_ONLN) install && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_cron.control
 #########################################################################################
 #
 # Layer "rdkit-pg-build"
 # compile rdkit extension
 #
 #########################################################################################
 FROM build-deps AS rdkit-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 RUN apt-get update && \
    apt-get install -y \
        cmake \
        libboost-iostreams1.74-dev \
        libboost-regex1.74-dev \
        libboost-serialization1.74-dev \
        libboost-system1.74-dev \
        libeigen3-dev \
        libfreetype6-dev
 ENV PATH "/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH"
 RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_1.tar.gz -O rdkit.tar.gz && \
    echo "db346afbd0ba52c843926a2a62f8a38c7b774ffab37eaf382d789a824f21996c rdkit.tar.gz" | sha256sum --check && \
    mkdir rdkit-src && cd rdkit-src && tar xvzf ../rdkit.tar.gz --strip-components=1 -C . && \
    cmake \
        -D RDK_BUILD_CAIRO_SUPPORT=OFF \
        -D RDK_BUILD_INCHI_SUPPORT=ON \
        -D RDK_BUILD_AVALON_SUPPORT=ON \
        -D RDK_BUILD_PYTHON_WRAPPERS=OFF \
        -D RDK_BUILD_DESCRIPTORS3D=OFF \
        -D RDK_BUILD_FREESASA_SUPPORT=OFF \
        -D RDK_BUILD_COORDGEN_SUPPORT=ON \
        -D RDK_BUILD_MOLINTERCHANGE_SUPPORT=OFF \
        -D RDK_BUILD_YAEHMOP_SUPPORT=OFF \
        -D RDK_BUILD_STRUCTCHECKER_SUPPORT=OFF \
        -D RDK_USE_URF=OFF \
        -D RDK_BUILD_PGSQL=ON \
        -D RDK_PGSQL_STATIC=ON \
        -D PostgreSQL_CONFIG=pg_config \
        -D PostgreSQL_INCLUDE_DIR=`pg_config --includedir` \
        -D PostgreSQL_TYPE_INCLUDE_DIR=`pg_config --includedir-server` \
        -D PostgreSQL_LIBRARY_DIR=`pg_config --libdir` \
        -D RDK_INSTALL_INTREE=OFF \
        -D CMAKE_BUILD_TYPE=Release \
        . && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/rdkit.control
 #########################################################################################
 #
 # Layer "pg-uuidv7-pg-build"
 # compile pg_uuidv7 extension
 #
 #########################################################################################
 FROM build-deps AS pg-uuidv7-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 ENV PATH "/usr/local/pgsql/bin/:$PATH"
 RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.0.1.tar.gz -O pg_uuidv7.tar.gz && \
    echo "0d0759ab01b7fb23851ecffb0bce27822e1868a4a5819bfd276101c716637a7a pg_uuidv7.tar.gz" | sha256sum --check && \
    mkdir pg_uuidv7-src && cd pg_uuidv7-src && tar xvzf ../pg_uuidv7.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_uuidv7.control
 #########################################################################################
 #
 # Layer "pg-roaringbitmap-pg-build"
 # compile pg_roaringbitmap extension
 #
 #########################################################################################
 FROM build-deps AS pg-roaringbitmap-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 ENV PATH "/usr/local/pgsql/bin/:$PATH"
 RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4.tar.gz -O pg_roaringbitmap.tar.gz && \
    echo "b75201efcb1c2d1b014ec4ae6a22769cc7a224e6e406a587f5784a37b6b5a2aa pg_roaringbitmap.tar.gz" | sha256sum --check && \
    mkdir pg_roaringbitmap-src && cd pg_roaringbitmap-src && tar xvzf ../pg_roaringbitmap.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/roaringbitmap.control
 #########################################################################################
 #
 # Layer "rust extensions"
@@ -517,6 +600,22 @@ RUN wget https://github.com/kelvich/pg_tiktoken/archive/801f84f08c6881c8aa30f405
    cargo pgx install --release && \
    echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_tiktoken.control
 #########################################################################################
 #
 # Layer "pg-pgx-ulid-build"
 # Compile "pgx_ulid" extension
 #
 #########################################################################################
 FROM rust-extensions-build AS pg-pgx-ulid-build
 RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.0.tar.gz -O pgx_ulid.tar.gz && \
    echo "908b7358e6f846e87db508ae5349fb56a88ee6305519074b12f3d5b0ff09f791 pgx_ulid.tar.gz" | sha256sum --check && \
    mkdir pgx_ulid-src && cd pgx_ulid-src && tar xvzf ../pgx_ulid.tar.gz --strip-components=1 -C . && \
    sed -i 's/pgx        = "=0.7.3"/pgx = { version = "0.7.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
    cargo pgx install --release && \
    echo "trusted = true" >> /usr/local/pgsql/share/extension/ulid.control
 #########################################################################################
 #
 # Layer "neon-pg-ext-build"
@@ -547,6 +646,10 @@ COPY --from=timescaledb-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-hint-plan-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=kq-imcx-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-cron-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-pgx-ulid-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=rdkit-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-uuidv7-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-roaringbitmap-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY pgxn/ pgxn/
 RUN make -j $(getconf _NPROCESSORS_ONLN) \
@@ -556,6 +659,10 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \
    make -j $(getconf _NPROCESSORS_ONLN) \
        PG_CONFIG=/usr/local/pgsql/bin/pg_config \
        -C pgxn/neon_utils \
        -s install && \
    make -j $(getconf _NPROCESSORS_ONLN) \
        PG_CONFIG=/usr/local/pgsql/bin/pg_config \
        -C pgxn/hnsw \
        -s install
 #########################################################################################
@@ -564,6 +671,9 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \
 #
 #########################################################################################
 FROM $REPOSITORY/$IMAGE:$TAG AS compute-tools
 ARG BUILD_TAG
 ENV BUILD_TAG=$BUILD_TAG
 USER nonroot
 # Copy entire project to get Cargo.* files with proper dependencies for the whole project
 COPY --chown=nonroot . .
@@ -616,14 +726,19 @@ COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-deb
 # libgeos, libgdal, libsfcgal1, libproj and libprotobuf-c1 for PostGIS
 # libxml2, libxslt1.1 for xml2
 # libzstd1 for zstd
 # libboost*, libfreetype6, and zlib1g for rdkit
 RUN apt update &&  \
    apt install --no-install-recommends -y \
        gdb \
        locales \
        libicu67 \
        liblz4-1 \
        libreadline8 \
        libboost-iostreams1.74.0 \
        libboost-regex1.74.0 \
        libboost-serialization1.74.0 \
        libboost-system1.74.0 \
        libossp-uuid16 \
        libfreetype6 \
        libgeos-c1v5 \
        libgdal28 \
        libproj19 \
@@ -633,7 +748,9 @@ RUN apt update &&  \
        libxslt1.1 \
        libzstd1 \
        libcurl4-openssl-dev \
-        procps && \
+        locales \
        procps \
        zlib1g && \
    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
    localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8
--- a/Dockerfile.compute-tools
+++ b/Dockerfile.compute-tools
@@ -3,6 +3,7 @@
 ARG REPOSITORY=neondatabase
 ARG IMAGE=rust
 ARG TAG=pinned
 ARG BUILD_TAG
 FROM $REPOSITORY/$IMAGE:$TAG AS rust-build
 WORKDIR /home/nonroot
@@ -16,6 +17,8 @@ ENV CACHEPOT_S3_KEY_PREFIX=cachepot
 ARG CACHEPOT_BUCKET=neon-github-dev
 #ARG AWS_ACCESS_KEY_ID
 #ARG AWS_SECRET_ACCESS_KEY
 ARG BUILD_TAG
 ENV BUILD_TAG=$BUILD_TAG
 COPY . .
--- a/8
+++ b/8
@@ -138,6 +138,11 @@ neon-pg-ext-%: postgres-%
 	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
 		-C $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* \
 		-f $(ROOT_PROJECT_DIR)/pgxn/neon_utils/Makefile install
 	+@echo "Compiling hnsw $*"
 	mkdir -p $(POSTGRES_INSTALL_DIR)/build/hnsw-$*
 	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
 		-C $(POSTGRES_INSTALL_DIR)/build/hnsw-$* \
 		-f $(ROOT_PROJECT_DIR)/pgxn/hnsw/Makefile install
 .PHONY: neon-pg-ext-clean-%
 neon-pg-ext-clean-%:
@@ -153,6 +158,9 @@ neon-pg-ext-clean-%:
 	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \
 	-C $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* \
 	-f $(ROOT_PROJECT_DIR)/pgxn/neon_utils/Makefile clean
 	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \
 	-C $(POSTGRES_INSTALL_DIR)/build/hnsw-$* \
 	-f $(ROOT_PROJECT_DIR)/pgxn/hnsw/Makefile clean
 .PHONY: neon-pg-ext
 neon-pg-ext: \
--- a/README.md
+++ b/README.md
@@ -28,18 +28,19 @@ See developer documentation in [SUMMARY.md](/docs/SUMMARY.md) for more informati
 * On Ubuntu or Debian, this set of packages should be sufficient to build the code:
 ```bash
 apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libseccomp-dev \
-libssl-dev clang pkg-config libpq-dev cmake postgresql-client protobuf-compiler
+libssl-dev clang pkg-config libpq-dev cmake postgresql-client protobuf-compiler \
 libcurl4-openssl-dev
 ```
 * On Fedora, these packages are needed:
 ```bash
 dnf install flex bison readline-devel zlib-devel openssl-devel \
  libseccomp-devel perl clang cmake postgresql postgresql-contrib protobuf-compiler \
-  protobuf-devel
+  protobuf-devel libcurl-devel
 ```
 * On Arch based systems, these packages are needed:
 ```bash
 pacman -S base-devel readline zlib libseccomp openssl clang \
-postgresql-libs cmake postgresql protobuf
+postgresql-libs cmake postgresql protobuf curl
 ```
 Building Neon requires version 3.15 or newer of `protoc` (protobuf-compiler). If your distribution provides an older version, you can install a newer version from [here](https://github.com/protocolbuffers/protobuf/releases).
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -54,11 +54,20 @@ use compute_tools::monitor::launch_monitor;
 use compute_tools::params::*;
 use compute_tools::spec::*;
 const BUILD_TAG_DEFAULT: &str = "local";
 fn main() -> Result<()> {
    init_tracing_and_logging(DEFAULT_LOG_LEVEL)?;
    let build_tag = option_env!("BUILD_TAG").unwrap_or(BUILD_TAG_DEFAULT);
    info!("build_tag: {build_tag}");
    let matches = cli().get_matches();
    let http_port = *matches
        .get_one::<u16>("http-port")
        .expect("http-port is required");
    let pgdata = matches
        .get_one::<String>("pgdata")
        .expect("PGDATA path is required");
@@ -178,7 +187,8 @@ fn main() -> Result<()> {
    // Launch the http service first, so that we are able to serve control-plane
    // requests while configuration is still in progress.
-    let _http_handle = launch_http_server(&compute).expect("cannot launch http endpoint thread");
+    let _http_handle =
        launch_http_server(http_port, &compute).expect("cannot launch http endpoint thread");
    if !spec_set {
        // No spec provided, hang waiting for it.
@@ -286,6 +296,14 @@ fn cli() -> clap::Command {
    let version = option_env!("CARGO_PKG_VERSION").unwrap_or("unknown");
    clap::Command::new("compute_ctl")
        .version(version)
        .arg(
            Arg::new("http-port")
                .long("http-port")
                .value_name("HTTP_PORT")
                .default_value("3080")
                .value_parser(clap::value_parser!(u16))
                .required(false),
        )
        .arg(
            Arg::new("connstr")
                .short('C')
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -1,19 +1,3 @@
 //
 // XXX: This starts to be scarily similar to the `PostgresNode` from `control_plane`,
 // but there are several things that make `PostgresNode` usage inconvenient in the
 // cloud:
 // - it inherits from `LocalEnv`, which contains **all-all** the information about
 //   a complete service running
 // - it uses `PageServerNode` with information about http endpoint, which we do not
 //   need in the cloud again
 // - many tiny pieces like, for example, we do not use `pg_ctl` in the cloud
 //
 // Thus, to use `PostgresNode` in the cloud, we need to 'mock' a bunch of required
 // attributes (not required for the cloud). Yet, it is still tempting to unify these
 // `PostgresNode` and `ComputeNode` and use one in both places.
 //
 // TODO: stabilize `ComputeNode` and think about using it in the `control_plane`.
 //
 use std::fs;
 use std::os::unix::fs::PermissionsExt;
 use std::path::Path;
@@ -106,26 +90,38 @@ pub struct ParsedSpec {
 impl TryFrom<ComputeSpec> for ParsedSpec {
    type Error = String;
    fn try_from(spec: ComputeSpec) -> Result<Self, String> {
        // Extract the options from the spec file that are needed to connect to
        // the storage system.
        //
        // For backwards-compatibility, the top-level fields in the spec file
        // may be empty. In that case, we need to dig them from the GUCs in the
        // cluster.settings field.
        let pageserver_connstr = spec
-            .cluster
+            .pageserver_connstring
-            .settings
+            .clone()
-            .find("neon.pageserver_connstring")
+            .or_else(|| spec.cluster.settings.find("neon.pageserver_connstring"))
            .ok_or("pageserver connstr should be provided")?;
        let storage_auth_token = spec.storage_auth_token.clone();
-        let tenant_id: TenantId = spec
+        let tenant_id: TenantId = if let Some(tenant_id) = spec.tenant_id {
-            .cluster
+            tenant_id
-            .settings
+        } else {
-            .find("neon.tenant_id")
+            spec.cluster
-            .ok_or("tenant id should be provided")
+                .settings
-            .map(|s| TenantId::from_str(&s))?
+                .find("neon.tenant_id")
-            .or(Err("invalid tenant id"))?;
+                .ok_or("tenant id should be provided")
-        let timeline_id: TimelineId = spec
+                .map(|s| TenantId::from_str(&s))?
-            .cluster
+                .or(Err("invalid tenant id"))?
-            .settings
+        };
-            .find("neon.timeline_id")
+        let timeline_id: TimelineId = if let Some(timeline_id) = spec.timeline_id {
-            .ok_or("timeline id should be provided")
+            timeline_id
-            .map(|s| TimelineId::from_str(&s))?
+        } else {
-            .or(Err("invalid timeline id"))?;
+            spec.cluster
                .settings
                .find("neon.timeline_id")
                .ok_or("timeline id should be provided")
                .map(|s| TimelineId::from_str(&s))?
                .or(Err("invalid timeline id"))?
        };
        Ok(ParsedSpec {
            spec,
@@ -137,6 +133,84 @@ impl TryFrom<ComputeSpec> for ParsedSpec {
    }
 }
 /// Create the special `neon_superuser` role: a slightly nerfed version of a real
 /// superuser that we give to customers.
 ///
 /// A single `DO $$ ... $$` block is assembled from the role and database names
 /// found in `spec.cluster`, logged, and then sent through `client.simple_query`.
 /// Inside that block the role is only created (and the grants only issued) when
 /// no `neon_superuser` row exists in `pg_catalog.pg_roles`.
 ///
 /// # Errors
 ///
 /// Returns the error from `simple_query`, with the full query text attached as
 /// context.
 fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
    // Every role / database name from the spec, rendered as a quoted SQL literal.
    let role_literals: Vec<String> = spec
        .cluster
        .roles
        .iter()
        .map(|role| format!("'{}'", escape_literal(&role.name)))
        .collect();
    let db_literals: Vec<String> = spec
        .cluster
        .databases
        .iter()
        .map(|database| format!("'{}'", escape_literal(&database.name)))
        .collect();

    // DECLARE entry for `roles`: the spec roles that exist in pg_roles, or NULL
    // when the spec lists none (an empty `IN ()` list would not be valid SQL).
    let roles_declaration = if !role_literals.is_empty() {
        format!(
            r#"
               roles text[] := ARRAY(SELECT rolname
                                     FROM pg_catalog.pg_roles
                                     WHERE rolname IN ({}));"#,
            role_literals.join(", ")
        )
    } else {
        String::from("roles text[] := NULL;")
    };

    // DECLARE entry for `dbs`, built the same way from pg_database.
    let dbs_declaration = if !db_literals.is_empty() {
        format!(
            r#"
               dbs text[] := ARRAY(SELECT datname
                                   FROM pg_catalog.pg_database
                                   WHERE datname IN ({}));"#,
            db_literals.join(", ")
        )
    } else {
        String::from("dbs text[] := NULL;")
    };

    // ALL PRIVILEGES grants CREATE, CONNECT, and TEMPORARY on all databases
    // (see https://www.postgresql.org/docs/current/ddl-priv.html)
    let query = format!(
        r#"
            DO $$
                DECLARE
                    r text;
                    {}
                    {}
                BEGIN
                    IF NOT EXISTS (
                        SELECT FROM pg_catalog.pg_roles WHERE rolname = 'neon_superuser')
                    THEN
                        CREATE ROLE neon_superuser CREATEDB CREATEROLE NOLOGIN IN ROLE pg_read_all_data, pg_write_all_data;
                        IF array_length(roles, 1) IS NOT NULL THEN
                            EXECUTE format('GRANT neon_superuser TO %s',
                                           array_to_string(ARRAY(SELECT quote_ident(x) FROM unnest(roles) as x), ', '));
                            FOREACH r IN ARRAY roles LOOP
                                EXECUTE format('ALTER ROLE %s CREATEROLE CREATEDB', quote_ident(r));
                            END LOOP;
                        END IF;
                        IF array_length(dbs, 1) IS NOT NULL THEN
                            EXECUTE format('GRANT ALL PRIVILEGES ON DATABASE %s TO neon_superuser',
                                           array_to_string(ARRAY(SELECT quote_ident(x) FROM unnest(dbs) as x), ', '));
                        END IF;
                    END IF;
                END
            $$;"#,
        roles_declaration, dbs_declaration,
    );

    // Note: this message is emitted before the query is actually executed.
    info!("Neon superuser created:\n{}", &query);

    // On failure, attach the query text so the error shows what was being run.
    let outcome = client.simple_query(&query);
    outcome.map_err(|e| anyhow::anyhow!(e).context(query))?;
    Ok(())
 }
 impl ComputeNode {
    pub fn set_status(&self, status: ComputeStatus) {
        let mut state = self.state.lock().unwrap();
@@ -264,9 +338,13 @@ impl ComputeNode {
        let lsn = match spec.mode {
            ComputeMode::Primary => {
                info!("starting safekeepers syncing");
-                let lsn = self
+                let lsn = if let Some(synced_lsn) = spec.skip_sync_safekeepers {
-                    .sync_safekeepers(pspec.storage_auth_token.clone())
+                    info!("no need to sync");
-                    .with_context(|| "failed to sync safekeepers")?;
+                    synced_lsn
                } else {
                    self.sync_safekeepers(pspec.storage_auth_token.clone())
                        .with_context(|| "failed to sync safekeepers")?
                };
                info!("safekeepers synced at LSN {}", lsn);
                lsn
            }
@@ -295,8 +373,8 @@ impl ComputeNode {
        update_pg_hba(pgdata_path)?;
        match spec.mode {
-            ComputeMode::Primary | ComputeMode::Static(..) => {}
+            ComputeMode::Primary => {}
-            ComputeMode::Replica => {
+            ComputeMode::Replica | ComputeMode::Static(..) => {
                add_standby_signal(pgdata_path)?;
            }
        }
@@ -351,6 +429,8 @@ impl ComputeNode {
                    .map_err(|_| anyhow::anyhow!("invalid connstr"))?;
                let mut client = Client::connect(zenith_admin_connstr.as_str(), NoTls)?;
                // Disable forwarding so that users don't get a cloud_admin role
                client.simple_query("SET neon.forward_ddl = false")?;
                client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?;
                client.simple_query("GRANT zenith_admin TO cloud_admin")?;
                drop(client);
@@ -361,24 +441,21 @@ impl ComputeNode {
            Ok(client) => client,
        };
        // Proceed with post-startup configuration. Note, that order of operations is important.
        // Disable DDL forwarding because control plane already knows about these roles/databases.
        client.simple_query("SET neon.forward_ddl = false")?;
        // Proceed with post-startup configuration. Note, that order of operations is important.
        let spec = &compute_state.pspec.as_ref().expect("spec must be set").spec;
        create_neon_superuser(spec, &mut client)?;
        handle_roles(spec, &mut client)?;
        handle_databases(spec, &mut client)?;
        handle_role_deletions(spec, self.connstr.as_str(), &mut client)?;
-        handle_grants(spec, self.connstr.as_str(), &mut client)?;
+        handle_grants(spec, self.connstr.as_str())?;
        handle_extensions(spec, &mut client)?;
        // 'Close' connection
        drop(client);
        info!(
            "finished configuration of compute for project {}",
            spec.cluster.cluster_id
        );
        Ok(())
    }
@@ -411,7 +488,7 @@ impl ComputeNode {
            handle_roles(&spec, &mut client)?;
            handle_databases(&spec, &mut client)?;
            handle_role_deletions(&spec, self.connstr.as_str(), &mut client)?;
-            handle_grants(&spec, self.connstr.as_str(), &mut client)?;
+            handle_grants(&spec, self.connstr.as_str())?;
            handle_extensions(&spec, &mut client)?;
        }
@@ -431,22 +508,22 @@ impl ComputeNode {
    #[instrument(skip(self))]
    pub fn start_compute(&self) -> Result<std::process::Child> {
        let compute_state = self.state.lock().unwrap().clone();
-        let spec = compute_state.pspec.as_ref().expect("spec must be set");
+        let pspec = compute_state.pspec.as_ref().expect("spec must be set");
        info!(
            "starting compute for project {}, operation {}, tenant {}, timeline {}",
-            spec.spec.cluster.cluster_id,
+            pspec.spec.cluster.cluster_id.as_deref().unwrap_or("None"),
-            spec.spec.operation_uuid.as_deref().unwrap_or("None"),
+            pspec.spec.operation_uuid.as_deref().unwrap_or("None"),
-            spec.tenant_id,
+            pspec.tenant_id,
-            spec.timeline_id,
+            pspec.timeline_id,
        );
        self.prepare_pgdata(&compute_state)?;
        let start_time = Utc::now();
-        let pg = self.start_postgres(spec.storage_auth_token.clone())?;
+        let pg = self.start_postgres(pspec.storage_auth_token.clone())?;
-        if spec.spec.mode == ComputeMode::Primary {
+        if pspec.spec.mode == ComputeMode::Primary && !pspec.spec.skip_pg_catalog_updates {
            self.apply_config(&compute_state)?;
        }
@@ -466,6 +543,11 @@ impl ComputeNode {
        }
        self.set_status(ComputeStatus::Running);
        info!(
            "finished configuration of compute for project {}",
            pspec.spec.cluster.cluster_id.as_deref().unwrap_or("None")
        );
        Ok(pg)
    }
--- a/compute_tools/src/config.rs
+++ b/compute_tools/src/config.rs
@@ -5,6 +5,7 @@ use std::path::Path;
 use anyhow::Result;
 use crate::pg_helpers::escape_conf_value;
 use crate::pg_helpers::PgOptionsSerialize;
 use compute_api::spec::{ComputeMode, ComputeSpec};
@@ -36,10 +37,44 @@ pub fn write_postgres_conf(path: &Path, spec: &ComputeSpec) -> Result<()> {
    // File::create() destroys the file content if it exists.
    let mut file = File::create(path)?;
-    writeln!(file, "# Managed by compute_ctl: begin")?;
+    // Write the postgresql.conf content from the spec file as is.
    if let Some(conf) = &spec.cluster.postgresql_conf {
        writeln!(file, "{}", conf)?;
    }
    write!(file, "{}", &spec.cluster.settings.as_pg_settings())?;
    // Add options for connecting to storage
    writeln!(file, "# Neon storage settings")?;
    if let Some(s) = &spec.pageserver_connstring {
        writeln!(
            file,
            "neon.pageserver_connstring='{}'",
            escape_conf_value(s)
        )?;
    }
    if !spec.safekeeper_connstrings.is_empty() {
        writeln!(
            file,
            "neon.safekeepers='{}'",
            escape_conf_value(&spec.safekeeper_connstrings.join(","))
        )?;
    }
    if let Some(s) = &spec.tenant_id {
        writeln!(
            file,
            "neon.tenant_id='{}'",
            escape_conf_value(&s.to_string())
        )?;
    }
    if let Some(s) = &spec.timeline_id {
        writeln!(
            file,
            "neon.timeline_id='{}'",
            escape_conf_value(&s.to_string())
        )?;
    }
    match spec.mode {
        ComputeMode::Primary => {}
        ComputeMode::Static(lsn) => {
@@ -53,7 +88,12 @@ pub fn write_postgres_conf(path: &Path, spec: &ComputeSpec) -> Result<()> {
        }
    }
-    writeln!(file, "# Managed by compute_ctl: end")?;
+    // If there are any extra options in the 'settings' field, append those
    if spec.cluster.settings.is_some() {
        writeln!(file, "# Managed by compute_ctl: begin")?;
        write!(file, "{}", spec.cluster.settings.as_pg_settings())?;
        writeln!(file, "# Managed by compute_ctl: end")?;
    }
    Ok(())
 }
--- a/compute_tools/src/http/api.rs
+++ b/compute_tools/src/http/api.rs
@@ -220,8 +220,8 @@ fn render_json_error(e: &str, status: StatusCode) -> Response<Body> {
 // Main Hyper HTTP server function that runs it and blocks waiting on it forever.
 #[tokio::main]
-async fn serve(state: Arc<ComputeNode>) {
+async fn serve(port: u16, state: Arc<ComputeNode>) {
-    let addr = SocketAddr::from(([0, 0, 0, 0], 3080));
+    let addr = SocketAddr::from(([0, 0, 0, 0], port));
    let make_service = make_service_fn(move |_conn| {
        let state = state.clone();
@@ -256,10 +256,10 @@ async fn serve(state: Arc<ComputeNode>) {
 }
 /// Launch a separate Hyper HTTP API server thread and return its `JoinHandle`.
-pub fn launch_http_server(state: &Arc<ComputeNode>) -> Result<thread::JoinHandle<()>> {
+pub fn launch_http_server(port: u16, state: &Arc<ComputeNode>) -> Result<thread::JoinHandle<()>> {
    let state = Arc::clone(state);
    Ok(thread::Builder::new()
        .name("http-endpoint".into())
-        .spawn(move || serve(state))?)
+        .spawn(move || serve(port, state))?)
 }
--- a/compute_tools/src/pg_helpers.rs
+++ b/compute_tools/src/pg_helpers.rs
@@ -17,13 +17,13 @@ use compute_api::spec::{Database, GenericOption, GenericOptions, PgIdent, Role};
 const POSTGRES_WAIT_TIMEOUT: Duration = Duration::from_millis(60 * 1000); // milliseconds
 /// Escape a string for including it in a SQL literal
-fn escape_literal(s: &str) -> String {
+pub fn escape_literal(s: &str) -> String {
    s.replace('\'', "''").replace('\\', "\\\\")
 }
 /// Escape a string so that it can be used in postgresql.conf.
 /// Same as escape_literal, currently.
-fn escape_conf_value(s: &str) -> String {
+pub fn escape_conf_value(s: &str) -> String {
    s.replace('\'', "''").replace('\\', "\\\\")
 }
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -269,17 +269,13 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
                xact.execute(query.as_str(), &[])?;
            }
            RoleAction::Create => {
-                let mut query: String = format!("CREATE ROLE {} ", name.pg_quote());
+                let mut query: String = format!(
                    "CREATE ROLE {} CREATEROLE CREATEDB IN ROLE neon_superuser",
                    name.pg_quote()
                );
                info!("role create query: '{}'", &query);
                query.push_str(&role.to_pg_options());
                xact.execute(query.as_str(), &[])?;
                let grant_query = format!(
                    "GRANT pg_read_all_data, pg_write_all_data TO {}",
                    name.pg_quote()
                );
                xact.execute(grant_query.as_str(), &[])?;
                info!("role grant query: '{}'", &grant_query);
            }
        }
@@ -476,6 +472,11 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
                query.push_str(&db.to_pg_options());
                let _guard = info_span!("executing", query).entered();
                client.execute(query.as_str(), &[])?;
                let grant_query: String = format!(
                    "GRANT ALL PRIVILEGES ON DATABASE {} TO neon_superuser",
                    name.pg_quote()
                );
                client.execute(grant_query.as_str(), &[])?;
            }
        };
@@ -495,35 +496,9 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
 /// Grant CREATE ON DATABASE to the database owner and do some other alters and grants
 /// to allow users creating trusted extensions and re-creating `public` schema, for example.
 #[instrument(skip_all)]
-pub fn handle_grants(spec: &ComputeSpec, connstr: &str, client: &mut Client) -> Result<()> {
+pub fn handle_grants(spec: &ComputeSpec, connstr: &str) -> Result<()> {
    info!("cluster spec grants:");
    // We now have a separate `web_access` role to connect to the database
    // via the web interface and proxy link auth. And also we grant a
    // read / write all data privilege to every role. So also grant
    // create to everyone.
    // XXX: later we should stop messing with Postgres ACL in such horrible
    // ways.
    let roles = spec
        .cluster
        .roles
        .iter()
        .map(|r| r.name.pg_quote())
        .collect::<Vec<_>>();
    for db in &spec.cluster.databases {
        let dbname = &db.name;
        let query: String = format!(
            "GRANT CREATE ON DATABASE {} TO {}",
            dbname.pg_quote(),
            roles.join(", ")
        );
        info!("grant query {}", &query);
        client.execute(query.as_str(), &[])?;
    }
    // Do some per-database access adjustments. We'd better do this at db creation time,
    // but CREATE DATABASE isn't transactional. So we cannot create db + do some grants
    // atomically.
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -476,10 +476,11 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
            println!("Creating endpoint for imported timeline ...");
            cplane.new_endpoint(
                tenant_id,
                name,
                tenant_id,
                timeline_id,
                None,
                None,
                pg_version,
                ComputeMode::Primary,
            )?;
@@ -591,7 +592,7 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                table.add_row([
                    endpoint_id.as_str(),
-                    &endpoint.address.to_string(),
+                    &endpoint.pg_address.to_string(),
                    &endpoint.timeline_id.to_string(),
                    branch_name,
                    lsn_str.as_str(),
@@ -620,8 +621,8 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                .get_branch_timeline_id(branch_name, tenant_id)
                .ok_or_else(|| anyhow!("Found no timeline id for branch name '{branch_name}'"))?;
-            let port: Option<u16> = sub_args.get_one::<u16>("port").copied();
+            let pg_port: Option<u16> = sub_args.get_one::<u16>("pg-port").copied();
-
+            let http_port: Option<u16> = sub_args.get_one::<u16>("http-port").copied();
            let pg_version = sub_args
                .get_one::<u32>("pg-version")
                .copied()
@@ -639,14 +640,38 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                (Some(_), true) => anyhow::bail!("cannot specify both lsn and hot-standby"),
            };
-            cplane.new_endpoint(tenant_id, &endpoint_id, timeline_id, port, pg_version, mode)?;
+            cplane.new_endpoint(
                &endpoint_id,
                tenant_id,
                timeline_id,
                pg_port,
                http_port,
                pg_version,
                mode,
            )?;
        }
        "start" => {
-            let port: Option<u16> = sub_args.get_one::<u16>("port").copied();
+            let pg_port: Option<u16> = sub_args.get_one::<u16>("pg-port").copied();
            let http_port: Option<u16> = sub_args.get_one::<u16>("http-port").copied();
            let endpoint_id = sub_args
                .get_one::<String>("endpoint_id")
                .ok_or_else(|| anyhow!("No endpoint ID was provided to start"))?;
            // If --safekeepers argument is given, use only the listed safekeeper nodes.
            let safekeepers =
                if let Some(safekeepers_str) = sub_args.get_one::<String>("safekeepers") {
                    let mut safekeepers: Vec<NodeId> = Vec::new();
                    for sk_id in safekeepers_str.split(',').map(str::trim) {
                        let sk_id = NodeId(u64::from_str(sk_id).map_err(|_| {
                            anyhow!("invalid node ID \"{sk_id}\" in --safekeepers list")
                        })?);
                        safekeepers.push(sk_id);
                    }
                    safekeepers
                } else {
                    env.safekeepers.iter().map(|sk| sk.id).collect()
                };
            let endpoint = cplane.endpoints.get(endpoint_id.as_str());
            let auth_token = if matches!(env.pageserver.pg_auth_type, AuthType::NeonJWT) {
@@ -673,7 +698,7 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                    _ => {}
                }
                println!("Starting existing endpoint {endpoint_id}...");
-                endpoint.start(&auth_token)?;
+                endpoint.start(&auth_token, safekeepers)?;
            } else {
                let branch_name = sub_args
                    .get_one::<String>("branch-name")
@@ -709,14 +734,15 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                println!("Starting new endpoint {endpoint_id} (PostgreSQL v{pg_version}) on timeline {timeline_id} ...");
                let ep = cplane.new_endpoint(
                    tenant_id,
                    endpoint_id,
                    tenant_id,
                    timeline_id,
-                    port,
+                    pg_port,
                    http_port,
                    pg_version,
                    mode,
                )?;
-                ep.start(&auth_token)?;
+                ep.start(&auth_token, safekeepers)?;
            }
        }
        "stop" => {
@@ -944,11 +970,22 @@ fn cli() -> Command {
        .value_parser(value_parser!(u32))
        .default_value(DEFAULT_PG_VERSION);
-    let port_arg = Arg::new("port")
+    let pg_port_arg = Arg::new("pg-port")
-        .long("port")
+        .long("pg-port")
        .required(false)
        .value_parser(value_parser!(u16))
-        .value_name("port");
+        .value_name("pg-port");
    let http_port_arg = Arg::new("http-port")
        .long("http-port")
        .required(false)
        .value_parser(value_parser!(u16))
        .value_name("http-port");
    let safekeepers_arg = Arg::new("safekeepers")
        .long("safekeepers")
        .required(false)
        .value_name("safekeepers");
    let stop_mode_arg = Arg::new("stop-mode")
        .short('m')
@@ -1093,7 +1130,8 @@ fn cli() -> Command {
                    .arg(branch_name_arg.clone())
                    .arg(tenant_id_arg.clone())
                    .arg(lsn_arg.clone())
-                    .arg(port_arg.clone())
+                    .arg(pg_port_arg.clone())
                    .arg(http_port_arg.clone())
                    .arg(
                        Arg::new("config-only")
                            .help("Don't do basebackup, create endpoint directory with only config files")
@@ -1109,9 +1147,11 @@ fn cli() -> Command {
                    .arg(branch_name_arg)
                    .arg(timeline_id_arg)
                    .arg(lsn_arg)
-                    .arg(port_arg)
+                    .arg(pg_port_arg)
                    .arg(http_port_arg)
                    .arg(pg_version_arg)
                    .arg(hot_standby_arg)
                    .arg(safekeepers_arg)
                )
                .subcommand(
                    Command::new("stop")
--- a/control_plane/src/broker.rs
+++ b/control_plane/src/broker.rs
@@ -1,3 +1,9 @@
 //! Code to manage the storage broker
 //!
 //! In the local test environment, the data for each safekeeper is stored in
 //!
 //!   .neon/safekeepers/<safekeeper id>
 //!
 use anyhow::Context;
 use std::path::PathBuf;
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -1,41 +1,74 @@
 //! Code to manage compute endpoints
 //!
 //! In the local test environment, the data for each endpoint is stored in
 //!
 //!   .neon/endpoints/<endpoint id>
 //!
 //! Some basic information about the endpoint, like the tenant and timeline IDs,
 //! are stored in the `endpoint.json` file. The `endpoint.json` file is created
 //! when the endpoint is created, and doesn't change afterwards.
 //!
 //! The endpoint is managed by the `compute_ctl` binary. When an endpoint is
 //! started, we launch `compute_ctl`. It synchronizes the safekeepers, downloads
 //! the basebackup from the pageserver to initialize the data directory, and
 //! finally launches the PostgreSQL process. It watches the PostgreSQL process
 //! until it exits.
 //!
 //! When an endpoint is created, a `postgresql.conf` file is also created in
 //! the endpoint's directory. The file can be modified before starting PostgreSQL.
 //! However, the `postgresql.conf` file in the endpoint directory is not used directly
 //! by PostgreSQL. It is passed to `compute_ctl`, and `compute_ctl` writes another
 //! copy of it in the data directory.
 //!
 //! Directory contents:
 //!
 //! ```ignore
 //! .neon/endpoints/main/
 //!     compute.log               - log output of `compute_ctl` and `postgres`
 //!     endpoint.json             - serialized `EndpointConf` struct
 //!     postgresql.conf           - postgresql settings
 //!     spec.json                 - passed to `compute_ctl`
 //!     pgdata/
 //!         postgresql.conf       - copy of postgresql.conf created by `compute_ctl`
 //!         zenith.signal
 //!         <other PostgreSQL files>
 //! ```
 //!
 use std::collections::BTreeMap;
 use std::fs::{self, File};
 use std::io::Write;
 use std::net::SocketAddr;
 use std::net::TcpStream;
 use std::os::unix::fs::PermissionsExt;
 use std::path::PathBuf;
-use std::process::{Command, Stdio};
+use std::process::Command;
 use std::str::FromStr;
 use std::sync::Arc;
 use std::time::Duration;
-use anyhow::{Context, Result};
+use anyhow::{anyhow, bail, Context, Result};
 use serde::{Deserialize, Serialize};
 use serde_with::{serde_as, DisplayFromStr};
-use utils::{
+use utils::id::{NodeId, TenantId, TimelineId};
    id::{TenantId, TimelineId},
    lsn::Lsn,
 };
 use crate::local_env::LocalEnv;
 use crate::pageserver::PageServerNode;
 use crate::postgresql_conf::PostgresConf;
-use compute_api::spec::ComputeMode;
+use compute_api::responses::{ComputeState, ComputeStatus};
 use compute_api::spec::{Cluster, ComputeMode, ComputeSpec};
 // contents of an endpoint.json file
 #[serde_as]
 #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
 pub struct EndpointConf {
-    name: String,
+    endpoint_id: String,
    #[serde_as(as = "DisplayFromStr")]
    tenant_id: TenantId,
    #[serde_as(as = "DisplayFromStr")]
    timeline_id: TimelineId,
    mode: ComputeMode,
-    port: u16,
+    pg_port: u16,
    http_port: u16,
    pg_version: u32,
    skip_pg_catalog_updates: bool,
    skip_sync_safekeepers: Option<utils::lsn::Lsn>,
 }
 //
@@ -57,11 +90,11 @@ impl ComputeControlPlane {
        let pageserver = Arc::new(PageServerNode::from_env(&env));
        let mut endpoints = BTreeMap::default();
-        for endpoint_dir in fs::read_dir(env.endpoints_path())
+        for endpoint_dir in std::fs::read_dir(env.endpoints_path())
            .with_context(|| format!("failed to list {}", env.endpoints_path().display()))?
        {
            let ep = Endpoint::from_dir_entry(endpoint_dir?, &env, &pageserver)?;
-            endpoints.insert(ep.name.clone(), Arc::new(ep));
+            endpoints.insert(ep.endpoint_id.clone(), Arc::new(ep));
        }
        Ok(ComputeControlPlane {
@@ -76,47 +109,60 @@ impl ComputeControlPlane {
        1 + self
            .endpoints
            .values()
-            .map(|ep| ep.address.port())
+            .map(|ep| std::cmp::max(ep.pg_address.port(), ep.http_address.port()))
            .max()
            .unwrap_or(self.base_port)
    }
    #[allow(clippy::too_many_arguments)]
    pub fn new_endpoint(
        &mut self,
        endpoint_id: &str,
        tenant_id: TenantId,
        name: &str,
        timeline_id: TimelineId,
-        port: Option<u16>,
+        pg_port: Option<u16>,
        http_port: Option<u16>,
        pg_version: u32,
        mode: ComputeMode,
    ) -> Result<Arc<Endpoint>> {
-        let port = port.unwrap_or_else(|| self.get_port());
+        let pg_port = pg_port.unwrap_or_else(|| self.get_port());
-
+        let http_port = http_port.unwrap_or_else(|| self.get_port() + 1);
        let ep = Arc::new(Endpoint {
-            name: name.to_owned(),
+            endpoint_id: endpoint_id.to_owned(),
-            address: SocketAddr::new("127.0.0.1".parse().unwrap(), port),
+            pg_address: SocketAddr::new("127.0.0.1".parse().unwrap(), pg_port),
            http_address: SocketAddr::new("127.0.0.1".parse().unwrap(), http_port),
            env: self.env.clone(),
            pageserver: Arc::clone(&self.pageserver),
            timeline_id,
            mode,
            tenant_id,
            pg_version,
            skip_pg_catalog_updates: false,
            skip_sync_safekeepers: None,
        });
-        ep.create_pgdata()?;
+
        ep.create_endpoint_dir()?;
        std::fs::write(
            ep.endpoint_path().join("endpoint.json"),
            serde_json::to_string_pretty(&EndpointConf {
-                name: name.to_string(),
+                endpoint_id: endpoint_id.to_string(),
                tenant_id,
                timeline_id,
                mode,
-                port,
+                http_port,
                pg_port,
                pg_version,
                skip_pg_catalog_updates: false,
                skip_sync_safekeepers: None,
            })?,
        )?;
-        ep.setup_pg_conf()?;
+        std::fs::write(
            ep.endpoint_path().join("postgresql.conf"),
            ep.setup_pg_conf()?.to_string(),
        )?;
-        self.endpoints.insert(ep.name.clone(), Arc::clone(&ep));
+        self.endpoints
            .insert(ep.endpoint_id.clone(), Arc::clone(&ep));
        Ok(ep)
    }
@@ -127,13 +173,15 @@ impl ComputeControlPlane {
 #[derive(Debug)]
 pub struct Endpoint {
    /// used as the directory name
-    name: String,
+    endpoint_id: String,
    pub tenant_id: TenantId,
    pub timeline_id: TimelineId,
    pub mode: ComputeMode,
-    // port and address of the Postgres server
+    // port and address of the Postgres server and `compute_ctl`'s HTTP API
-    pub address: SocketAddr,
+    pub pg_address: SocketAddr,
    pub http_address: SocketAddr,
    // postgres major version in the format: 14, 15, etc.
    pg_version: u32,
@@ -141,6 +189,10 @@ pub struct Endpoint {
    // the endpoint runs in.
    pub env: LocalEnv,
    pageserver: Arc<PageServerNode>,
    // Optimizations
    skip_pg_catalog_updates: bool,
    skip_sync_safekeepers: Option<utils::lsn::Lsn>,
 }
 impl Endpoint {
@@ -158,123 +210,38 @@ impl Endpoint {
        // parse data directory name
        let fname = entry.file_name();
-        let name = fname.to_str().unwrap().to_string();
+        let endpoint_id = fname.to_str().unwrap().to_string();
        // Read the endpoint.json file
        let conf: EndpointConf =
            serde_json::from_slice(&std::fs::read(entry.path().join("endpoint.json"))?)?;
        // ok now
        Ok(Endpoint {
-            address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.port),
+            pg_address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.pg_port),
-            name,
+            http_address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.http_port),
            endpoint_id,
            env: env.clone(),
            pageserver: Arc::clone(pageserver),
            timeline_id: conf.timeline_id,
            mode: conf.mode,
            tenant_id: conf.tenant_id,
            pg_version: conf.pg_version,
            skip_pg_catalog_updates: conf.skip_pg_catalog_updates,
            skip_sync_safekeepers: conf.skip_sync_safekeepers,
        })
    }
-    fn sync_safekeepers(&self, auth_token: &Option<String>, pg_version: u32) -> Result<Lsn> {
+    fn create_endpoint_dir(&self) -> Result<()> {
-        let pg_path = self.env.pg_bin_dir(pg_version)?.join("postgres");
+        std::fs::create_dir_all(self.endpoint_path()).with_context(|| {
        let mut cmd = Command::new(pg_path);
        cmd.arg("--sync-safekeepers")
            .env_clear()
            .env(
                "LD_LIBRARY_PATH",
                self.env.pg_lib_dir(pg_version)?.to_str().unwrap(),
            )
            .env(
                "DYLD_LIBRARY_PATH",
                self.env.pg_lib_dir(pg_version)?.to_str().unwrap(),
            )
            .env("PGDATA", self.pgdata().to_str().unwrap())
            .stdout(Stdio::piped())
            // Comment this to avoid capturing stderr (useful if command hangs)
            .stderr(Stdio::piped());
        if let Some(token) = auth_token {
            cmd.env("NEON_AUTH_TOKEN", token);
        }
        let sync_handle = cmd
            .spawn()
            .expect("postgres --sync-safekeepers failed to start");
        let sync_output = sync_handle
            .wait_with_output()
            .expect("postgres --sync-safekeepers failed");
        if !sync_output.status.success() {
            anyhow::bail!(
                "sync-safekeepers failed: '{}'",
                String::from_utf8_lossy(&sync_output.stderr)
            );
        }
        let lsn = Lsn::from_str(std::str::from_utf8(&sync_output.stdout)?.trim())?;
        println!("Safekeepers synced on {}", lsn);
        Ok(lsn)
    }
    /// Get basebackup from the pageserver as a tar archive and extract it
    /// to the `self.pgdata()` directory.
    ///
    /// When `lsn` is `Some`, it is appended to the `basebackup` command sent
    /// to the pageserver; when it is `None`, the command carries only the
    /// tenant and timeline IDs.
    ///
    /// # Errors
    ///
    /// Returns an error if connecting to the pageserver fails, if the
    /// `basebackup` command is rejected, or if unpacking the archive fails.
    fn do_basebackup(&self, lsn: Option<Lsn>) -> Result<()> {
        println!(
            "Extracting base backup to create postgres instance: path={} port={}",
            self.pgdata().display(),
            self.address.port()
        );
        // Build the pageserver command; the trailing LSN argument is optional.
        let sql = if let Some(lsn) = lsn {
            format!("basebackup {} {} {}", self.tenant_id, self.timeline_id, lsn)
        } else {
            format!("basebackup {} {}", self.tenant_id, self.timeline_id)
        };
        // Connect to the pageserver and issue the command as a COPY OUT query.
        let mut client = self
            .pageserver
            .page_server_psql_client()
            .context("connecting to page server failed")?;
        let copyreader = client
            .copy_out(sql.as_str())
            .context("page server 'basebackup' command failed")?;
        // Read the archive directly from the `CopyOutReader`
        //
        // Set `ignore_zeros` so that unpack() reads all the Copy data and
        // doesn't stop at the end-of-archive marker. Otherwise, if the server
        // sends an Error after finishing the tarball, we will not notice it.
        let mut ar = tar::Archive::new(copyreader);
        ar.set_ignore_zeros(true);
        ar.unpack(&self.pgdata())
            .context("extracting base backup failed")?;
        Ok(())
    }
    fn create_pgdata(&self) -> Result<()> {
        fs::create_dir_all(self.pgdata()).with_context(|| {
            format!(
-                "could not create data directory {}",
+                "could not create endpoint directory {}",
-                self.pgdata().display()
+                self.endpoint_path().display()
            )
-        })?;
+        })
        fs::set_permissions(self.pgdata().as_path(), fs::Permissions::from_mode(0o700))
            .with_context(|| {
                format!(
                    "could not set permissions in data directory {}",
                    self.pgdata().display()
                )
            })
    }
-    // Write postgresql.conf with default configuration
+    // Generate postgresql.conf with default configuration
-    // and PG_VERSION file to the data directory of a new endpoint.
+    fn setup_pg_conf(&self) -> Result<PostgresConf> {
    fn setup_pg_conf(&self) -> Result<()> {
        let mut conf = PostgresConf::new();
        conf.append("max_wal_senders", "10");
        conf.append("wal_log_hints", "off");
@@ -287,25 +254,14 @@ impl Endpoint {
        // wal_sender_timeout is the maximum time to wait for WAL replication.
        // It also defines how often the walreceiver will send a feedback message to the wal sender.
        conf.append("wal_sender_timeout", "5s");
-        conf.append("listen_addresses", &self.address.ip().to_string());
+        conf.append("listen_addresses", &self.pg_address.ip().to_string());
-        conf.append("port", &self.address.port().to_string());
+        conf.append("port", &self.pg_address.port().to_string());
        conf.append("wal_keep_size", "0");
        // walproposer panics when basebackup is invalid, it is pointless to restart in this case.
        conf.append("restart_after_crash", "off");
-        // Configure the Neon Postgres extension to fetch pages from pageserver
+        // Load the 'neon' extension
        let pageserver_connstr = {
            let config = &self.pageserver.pg_connection_config;
            let (host, port) = (config.host(), config.port());
            // NOTE: avoid spaces in connection string, because it is less error prone if we forward it somewhere.
            format!("postgresql://no_user@{host}:{port}")
        };
        conf.append("shared_preload_libraries", "neon");
        conf.append_line("");
        conf.append("neon.pageserver_connstring", &pageserver_connstr);
        conf.append("neon.tenant_id", &self.tenant_id.to_string());
        conf.append("neon.timeline_id", &self.timeline_id.to_string());
        conf.append_line("");
        // Replication-related configurations, such as WAL sending
@@ -390,46 +346,11 @@ impl Endpoint {
            }
        }
-        let mut file = File::create(self.pgdata().join("postgresql.conf"))?;
+        Ok(conf)
        file.write_all(conf.to_string().as_bytes())?;
        let mut file = File::create(self.pgdata().join("PG_VERSION"))?;
        file.write_all(self.pg_version.to_string().as_bytes())?;
        Ok(())
    }
    /// Pick the LSN to take the basebackup at, based on the compute mode,
    /// and fetch the basebackup via `do_basebackup`.
    ///
    /// `None` is passed when no specific LSN is selected (primary without
    /// safekeepers, primary whose safekeeper sync reported LSN 0, or replica).
    fn load_basebackup(&self, auth_token: &Option<String>) -> Result<()> {
        let backup_lsn = match &self.mode {
            // No safekeepers configured: nothing to sync.
            ComputeMode::Primary if self.env.safekeepers.is_empty() => None,
            ComputeMode::Primary => {
                let synced_lsn = self.sync_safekeepers(auth_token, self.pg_version)?;
                // LSN 0 means that it is bootstrap and we need to download just
                // latest data from the pageserver. That is a bit clumsy but whole bootstrap
                // procedure evolves quite actively right now, so let's think about it again
                // when things would be more stable (TODO).
                Some(synced_lsn).filter(|lsn| *lsn != Lsn(0))
            }
            ComputeMode::Static(lsn) => Some(*lsn),
            // Take the latest snapshot available to start with
            ComputeMode::Replica => None,
        };

        self.do_basebackup(backup_lsn)?;
        Ok(())
    }
    pub fn endpoint_path(&self) -> PathBuf {
-        self.env.endpoints_path().join(&self.name)
+        self.env.endpoints_path().join(&self.endpoint_id)
    }
    pub fn pgdata(&self) -> PathBuf {
@@ -439,7 +360,7 @@ impl Endpoint {
    pub fn status(&self) -> &str {
        let timeout = Duration::from_millis(300);
        let has_pidfile = self.pgdata().join("postmaster.pid").exists();
-        let can_connect = TcpStream::connect_timeout(&self.address, timeout).is_ok();
+        let can_connect = TcpStream::connect_timeout(&self.pg_address, timeout).is_ok();
        match (has_pidfile, can_connect) {
            (true, true) => "running",
@@ -457,8 +378,6 @@ impl Endpoint {
                &[
                    "-D",
                    self.pgdata().to_str().unwrap(),
                    "-l",
                    self.pgdata().join("pg.log").to_str().unwrap(),
                    "-w", //wait till pg_ctl actually does what was asked
                ],
                args,
@@ -494,36 +413,185 @@ impl Endpoint {
        Ok(())
    }
-    pub fn start(&self, auth_token: &Option<String>) -> Result<()> {
+    pub fn start(&self, auth_token: &Option<String>, safekeepers: Vec<NodeId>) -> Result<()> {
        if self.status() == "running" {
            anyhow::bail!("The endpoint is already running");
        }
-        // 1. We always start Postgres from scratch, so
+        // Slurp the endpoints/<endpoint id>/postgresql.conf file into
-        // if old dir exists, preserve 'postgresql.conf' and drop the directory
+        // memory. We will include it in the spec file that we pass to
-        let postgresql_conf_path = self.pgdata().join("postgresql.conf");
+        // `compute_ctl`, and `compute_ctl` will write it to the postgresql.conf
-        let postgresql_conf = fs::read(&postgresql_conf_path).with_context(|| {
+        // in the data directory.
-            format!(
+        let postgresql_conf_path = self.endpoint_path().join("postgresql.conf");
-                "failed to read config file in {}",
+        let postgresql_conf = match std::fs::read(&postgresql_conf_path) {
-                postgresql_conf_path.to_str().unwrap()
+            Ok(content) => String::from_utf8(content)?,
-            )
+            Err(e) if e.kind() == std::io::ErrorKind::NotFound => "".to_string(),
-        })?;
+            Err(e) => {
-        fs::remove_dir_all(self.pgdata())?;
+                return Err(anyhow::Error::new(e).context(format!(
-        self.create_pgdata()?;
+                    "failed to read config file in {}",
                    postgresql_conf_path.to_str().unwrap()
                )))
            }
        };
-        // 2. Bring back config files
+        // We always start the compute node from scratch, so if the Postgres
-        fs::write(&postgresql_conf_path, postgresql_conf)?;
+        // data dir exists from a previous launch, remove it first.
-
+        if self.pgdata().exists() {
-        // 3. Load basebackup
+            std::fs::remove_dir_all(self.pgdata())?;
        self.load_basebackup(auth_token)?;
        if self.mode != ComputeMode::Primary {
            File::create(self.pgdata().join("standby.signal"))?;
        }
-        // 4. Finally start postgres
+        let pageserver_connstring = {
-        println!("Starting postgres at '{}'", self.connstr());
+            let config = &self.pageserver.pg_connection_config;
-        self.pg_ctl(&["start"], auth_token)
+            let (host, port) = (config.host(), config.port());
            // NOTE: avoid spaces in connection string, because it is less error prone if we forward it somewhere.
            format!("postgresql://no_user@{host}:{port}")
        };
        let mut safekeeper_connstrings = Vec::new();
        if self.mode == ComputeMode::Primary {
            for sk_id in safekeepers {
                let sk = self
                    .env
                    .safekeepers
                    .iter()
                    .find(|node| node.id == sk_id)
                    .ok_or_else(|| anyhow!("safekeeper {sk_id} does not exist"))?;
                safekeeper_connstrings.push(format!("127.0.0.1:{}", sk.pg_port));
            }
        }
        // Create spec file
        let spec = ComputeSpec {
            skip_sync_safekeepers: self.skip_sync_safekeepers,
            skip_pg_catalog_updates: self.skip_pg_catalog_updates,
            format_version: 1.0,
            operation_uuid: None,
            cluster: Cluster {
                cluster_id: None, // project ID: not used
                name: None,       // project name: not used
                state: None,
                roles: vec![],
                databases: vec![],
                settings: None,
                postgresql_conf: Some(postgresql_conf),
            },
            delta_operations: None,
            tenant_id: Some(self.tenant_id),
            timeline_id: Some(self.timeline_id),
            mode: self.mode,
            pageserver_connstring: Some(pageserver_connstring),
            safekeeper_connstrings,
            storage_auth_token: auth_token.clone(),
        };
        let spec_path = self.endpoint_path().join("spec.json");
        std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
        // Open log file. We'll redirect the stdout and stderr of `compute_ctl` to it.
        let logfile = std::fs::OpenOptions::new()
            .create(true)
            .append(true)
            .open(self.endpoint_path().join("compute.log"))?;
        // Launch compute_ctl
        println!("Starting postgres node at '{}'", self.connstr());
        let mut cmd = Command::new(self.env.neon_distrib_dir.join("compute_ctl"));
        cmd.args(["--http-port", &self.http_address.port().to_string()])
            .args(["--pgdata", self.pgdata().to_str().unwrap()])
            .args(["--connstr", &self.connstr()])
            .args([
                "--spec-path",
                self.endpoint_path().join("spec.json").to_str().unwrap(),
            ])
            .args([
                "--pgbin",
                self.env
                    .pg_bin_dir(self.pg_version)?
                    .join("postgres")
                    .to_str()
                    .unwrap(),
            ])
            .stdin(std::process::Stdio::null())
            .stderr(logfile.try_clone()?)
            .stdout(logfile);
        let _child = cmd.spawn()?;
        // Wait for it to start
        let mut attempt = 0;
        const ATTEMPT_INTERVAL: Duration = Duration::from_millis(100);
        const MAX_ATTEMPTS: u32 = 10 * 30; // Wait up to 30 s
        loop {
            attempt += 1;
            match self.get_status() {
                Ok(state) => {
                    match state.status {
                        ComputeStatus::Init => {
                            if attempt == MAX_ATTEMPTS {
                                bail!("compute startup timed out; still in Init state");
                            }
                            // keep retrying
                        }
                        ComputeStatus::Running => {
                            // All good!
                            break;
                        }
                        ComputeStatus::Failed => {
                            bail!(
                                "compute startup failed: {}",
                                state
                                    .error
                                    .as_deref()
                                    .unwrap_or("<no error from compute_ctl>")
                            );
                        }
                        ComputeStatus::Empty
                        | ComputeStatus::ConfigurationPending
                        | ComputeStatus::Configuration => {
                            bail!("unexpected compute status: {:?}", state.status)
                        }
                    }
                }
                Err(e) => {
                    if attempt == MAX_ATTEMPTS {
                        return Err(e).context(
                            "timed out waiting to connect to compute_ctl HTTP",
                        );
                    }
                }
            }
            std::thread::sleep(ATTEMPT_INTERVAL);
        }
        Ok(())
    }
    /// Call the `/status` HTTP API at `self.http_address` and parse the JSON
    /// response body as a [`ComputeState`].
    ///
    /// A 4xx/5xx response is turned into an error carrying the response body,
    /// or the status code and URL if the body cannot be read.
    pub fn get_status(&self) -> Result<ComputeState> {
        let status_url = format!(
            "http://{}:{}/status",
            self.http_address.ip(),
            self.http_address.port()
        );
        let response = reqwest::blocking::Client::new()
            .request(reqwest::Method::GET, status_url)
            .send()?;

        // Interpret the response
        let status = response.status();
        if status.is_client_error() || status.is_server_error() {
            // reqwest does not export its error construction utility functions, so let's craft the message ourselves
            let url = response.url().to_owned();
            let msg = match response.text() {
                Ok(err_body) => format!("Error: {}", err_body),
                Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url),
            };
            return Err(anyhow::anyhow!(msg));
        }

        Ok(response.json()?)
    }
    pub fn stop(&self, destroy: bool) -> Result<()> {
@@ -540,7 +608,7 @@ impl Endpoint {
                "Destroying postgres data directory '{}'",
                self.pgdata().to_str().unwrap()
            );
-            fs::remove_dir_all(self.endpoint_path())?;
+            std::fs::remove_dir_all(self.endpoint_path())?;
        } else {
            self.pg_ctl(&["stop"], &None)?;
        }
@@ -549,10 +617,10 @@ impl Endpoint {
    pub fn connstr(&self) -> String {
        format!(
-            "host={} port={} user={} dbname={}",
+            "postgresql://{}@{}:{}/{}",
            self.address.ip(),
            self.address.port(),
            "cloud_admin",
            self.pg_address.ip(),
            self.pg_address.port(),
            "postgres"
        )
    }
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -37,7 +37,7 @@ pub const DEFAULT_PG_VERSION: u32 = 15;
 #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
 pub struct LocalEnv {
    // Base directory for all the nodes (the pageserver, safekeepers and
-    // compute nodes).
+    // compute endpoints).
    //
    // This is not stored in the config file. Rather, this is the path where the
    // config file itself is. It is read from the NEON_REPO_DIR env variable or
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -1,3 +1,9 @@
 //! Code to manage pageservers
 //!
 //! In the local test environment, the pageserver stores its data directly in
 //!
 //!   .neon/
 //!
 use std::borrow::Cow;
 use std::collections::HashMap;
 use std::fs::File;
--- a/control_plane/src/safekeeper.rs
+++ b/control_plane/src/safekeeper.rs
@@ -1,3 +1,9 @@
 //! Code to manage safekeepers
 //!
 //! In the local test environment, the data for each safekeeper is stored in
 //!
 //!   .neon/safekeepers/<safekeeper id>
 //!
 use std::io::Write;
 use std::path::PathBuf;
 use std::process::Child;
--- a/libs/compute_api/src/responses.rs
+++ b/libs/compute_api/src/responses.rs
@@ -5,13 +5,13 @@ use serde::{Deserialize, Serialize, Serializer};
 use crate::spec::ComputeSpec;
-#[derive(Serialize, Debug)]
+#[derive(Serialize, Debug, Deserialize)]
 pub struct GenericAPIError {
    pub error: String,
 }
 /// Response of the /status API
-#[derive(Serialize, Debug)]
+#[derive(Serialize, Debug, Deserialize)]
 #[serde(rename_all = "snake_case")]
 pub struct ComputeStatusResponse {
    pub start_time: DateTime<Utc>,
@@ -23,7 +23,7 @@ pub struct ComputeStatusResponse {
    pub error: Option<String>,
 }
-#[derive(Serialize)]
+#[derive(Deserialize, Serialize)]
 #[serde(rename_all = "snake_case")]
 pub struct ComputeState {
    pub status: ComputeStatus,
@@ -33,7 +33,7 @@ pub struct ComputeState {
    pub error: Option<String>,
 }
-#[derive(Serialize, Clone, Copy, Debug, PartialEq, Eq)]
+#[derive(Serialize, Clone, Copy, Debug, Deserialize, PartialEq, Eq)]
 #[serde(rename_all = "snake_case")]
 pub enum ComputeStatus {
    // Spec wasn't provided at start, waiting for it to be
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -5,6 +5,7 @@
 //! and connect it to the storage nodes.
 use serde::{Deserialize, Serialize};
 use serde_with::{serde_as, DisplayFromStr};
 use utils::id::{TenantId, TimelineId};
 use utils::lsn::Lsn;
 /// String type alias representing Postgres identifier and
@@ -14,7 +15,7 @@ pub type PgIdent = String;
 /// Cluster spec or configuration represented as an optional number of
 /// delta operations + final cluster state description.
 #[serde_as]
-#[derive(Clone, Debug, Default, Deserialize)]
+#[derive(Clone, Debug, Default, Deserialize, Serialize)]
 pub struct ComputeSpec {
    pub format_version: f32,
@@ -26,9 +27,47 @@ pub struct ComputeSpec {
    pub cluster: Cluster,
    pub delta_operations: Option<Vec<DeltaOp>>,
    /// An optional hint that can be passed to speed up startup time if we know
    /// that no pg catalog mutations (like role creation, database creation,
    /// extension creation) need to be done on the actual database to start.
    #[serde(default)] // Default false
    pub skip_pg_catalog_updates: bool,
    /// An optional hint that can be passed to speed up startup time if we know
    /// that safekeepers have already been synced at the given LSN.
    ///
    /// NOTE: If there's any possibility that the safekeepers could have advanced
    ///       (e.g. if we started compute, and it crashed) we should stay on the
    ///       safe side and provide None.
    #[serde(default)]
    pub skip_sync_safekeepers: Option<Lsn>,
    // Information needed to connect to the storage layer.
    //
    // `tenant_id`, `timeline_id` and `pageserver_connstring` are always needed.
    //
    // Depending on `mode`, this can be a primary read-write node, a read-only
    // replica, or a read-only node pinned at an older LSN.
    // `safekeeper_connstrings` must be set for a primary.
    //
    // For backwards compatibility, the control plane may leave out all of
    // these, and instead set the "neon.tenant_id", "neon.timeline_id",
    // etc. GUCs in cluster.settings. TODO: Once the control plane has been
    // updated to fill these fields, we can make these non optional.
    #[serde_as(as = "Option<DisplayFromStr>")]
    pub tenant_id: Option<TenantId>,
    #[serde_as(as = "Option<DisplayFromStr>")]
    pub timeline_id: Option<TimelineId>,
    #[serde_as(as = "Option<DisplayFromStr>")]
    pub pageserver_connstring: Option<String>,
    #[serde(default)]
    pub safekeeper_connstrings: Vec<String>,
    #[serde(default)]
    pub mode: ComputeMode,
    /// If set, 'storage_auth_token' is used as the password to authenticate to
    /// the pageserver and safekeepers.
    pub storage_auth_token: Option<String>,
 }
@@ -47,13 +86,19 @@ pub enum ComputeMode {
    Replica,
 }
-#[derive(Clone, Debug, Default, Deserialize)]
+#[derive(Clone, Debug, Default, Deserialize, Serialize)]
 pub struct Cluster {
-    pub cluster_id: String,
+    pub cluster_id: Option<String>,
-    pub name: String,
+    pub name: Option<String>,
    pub state: Option<String>,
    pub roles: Vec<Role>,
    pub databases: Vec<Database>,
    /// Desired contents of 'postgresql.conf' file. (The 'compute_ctl'
    /// tool may add additional settings to the final file.)
    pub postgresql_conf: Option<String>,
    /// Additional settings that will be appended to the 'postgresql.conf' file.
    pub settings: GenericOptions,
 }
@@ -63,7 +108,7 @@ pub struct Cluster {
 /// - DROP ROLE
 /// - ALTER ROLE name RENAME TO new_name
 /// - ALTER DATABASE name RENAME TO new_name
-#[derive(Clone, Debug, Deserialize)]
+#[derive(Clone, Debug, Deserialize, Serialize)]
 pub struct DeltaOp {
    pub action: String,
    pub name: PgIdent,
@@ -72,7 +117,7 @@ pub struct DeltaOp {
 /// Rust representation of Postgres role info with only those fields
 /// that matter for us.
-#[derive(Clone, Debug, Deserialize)]
+#[derive(Clone, Debug, Deserialize, Serialize)]
 pub struct Role {
    pub name: PgIdent,
    pub encrypted_password: Option<String>,
@@ -81,7 +126,7 @@ pub struct Role {
 /// Rust representation of Postgres database info with only those fields
 /// that matter for us.
-#[derive(Clone, Debug, Deserialize)]
+#[derive(Clone, Debug, Deserialize, Serialize)]
 pub struct Database {
    pub name: PgIdent,
    pub owner: PgIdent,
@@ -91,7 +136,7 @@ pub struct Database {
 /// Common type representing both SQL statement params with or without value,
 /// like `LOGIN` or `OWNER username` in the `CREATE/ALTER ROLE`, and config
 /// options like `wal_level = logical`.
-#[derive(Clone, Debug, Deserialize)]
+#[derive(Clone, Debug, Deserialize, Serialize)]
 pub struct GenericOption {
    pub name: String,
    pub value: Option<String>,
@@ -112,4 +157,14 @@ mod tests {
        let file = File::open("tests/cluster_spec.json").unwrap();
        let _spec: ComputeSpec = serde_json::from_reader(file).unwrap();
    }
    /// Forward compatibility test: a spec containing a field this version
    /// does not know about must still deserialize.
    #[test]
    fn parse_unknown_fields() {
        let spec_file = File::open("tests/cluster_spec.json").unwrap();
        let mut spec_json: serde_json::Value = serde_json::from_reader(spec_file).unwrap();
        spec_json
            .as_object_mut()
            .unwrap()
            .insert("unknown_field_123123123".into(), "hello".into());
        let _spec: ComputeSpec = serde_json::from_value(spec_json).unwrap();
    }
 }
--- a/libs/metrics/src/lib.rs
+++ b/libs/metrics/src/lib.rs
@@ -23,6 +23,7 @@ use prometheus::{Registry, Result};
 pub mod launch_timestamp;
 mod wrappers;
 pub use wrappers::{CountedReader, CountedWriter};
 pub mod metric_vec_duration;
 pub type UIntGauge = GenericGauge<AtomicU64>;
 pub type UIntGaugeVec = GenericGaugeVec<AtomicU64>;
--- a/libs/metrics/src/metric_vec_duration.rs
+++ b/libs/metrics/src/metric_vec_duration.rs
@@ -0,0 +1,23 @@
 //! Helpers for observing duration on HistogramVec / CounterVec / GaugeVec / MetricVec<T>.
 use std::{future::Future, time::Instant};
 /// Observer that is handed the result of an operation together with how
 /// long the operation took.
 ///
 /// `observe_async_block_duration_by_result` calls this once per awaited block.
 pub trait DurationResultObserver {
    /// Record `duration` for an operation that finished with `res`.
    /// The result is only borrowed; the caller keeps ownership of it.
    fn observe_result<T, E>(&self, res: &Result<T, E>, duration: std::time::Duration);
 }
 /// Await `block`, measure how long the await took, report the result and
 /// the elapsed time to `observer`, and then return the result unchanged.
 pub async fn observe_async_block_duration_by_result<T, E, F, O>(
    observer: &O,
    block: F,
 ) -> Result<T, E>
 where
    F: Future<Output = Result<T, E>>,
    O: DurationResultObserver,
 {
    let started_at = Instant::now();
    let outcome = block.await;
    observer.observe_result(&outcome, started_at.elapsed());
    outcome
 }
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -110,12 +110,11 @@ impl TenantState {
            Self::Active => Attached,
            // If the (initial or resumed) attach procedure fails, the tenant becomes Broken.
            // However, it also becomes Broken if the regular load fails.
-            // We would need a separate TenantState variant to distinguish these cases.
+            // From Console's perspective there's no practical difference
-            // However, there's no practical difference from Console's perspective.
+            // because attachment_status is polled by console only during attach operation execution.
-            // It will run a Postgres-level health check as soon as it observes Attached.
+            Self::Broken { reason, .. } => Failed {
-            // That will fail on Broken tenants.
+                reason: reason.to_owned(),
-            // Console can then rollback the attach, or, wait for operator to fix the Broken tenant.
+            },
            Self::Broken { .. } => Attached,
            // Why is Stopping a Maybe case? Because, during pageserver shutdown,
            // we set the Stopping state irrespective of whether the tenant
            // has finished attaching or not.
@@ -153,7 +152,7 @@ pub enum ActivatingFrom {
 }
 /// A state of a timeline in pageserver's memory.
-#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
+#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
 pub enum TimelineState {
    /// The timeline is recognized by the pageserver but is not yet operational.
    /// In particular, the walreceiver connection loop is not running for this timeline.
@@ -166,7 +165,7 @@ pub enum TimelineState {
    /// It cannot transition back into any other state.
    Stopping,
    /// The timeline is broken and not operational (previous states: Loading or Active).
-    Broken,
+    Broken { reason: String, backtrace: String },
 }
 #[serde_as]
@@ -312,10 +311,11 @@ impl std::ops::Deref for TenantAttachConfig {
 /// See [`TenantState::attachment_status`] and the OpenAPI docs for context.
 #[derive(Serialize, Deserialize, Clone)]
-#[serde(rename_all = "snake_case")]
+#[serde(tag = "slug", content = "data", rename_all = "snake_case")]
 pub enum TenantAttachmentStatus {
    Maybe,
    Attached,
    Failed { reason: String },
 }
 #[serde_as]
@@ -809,7 +809,9 @@ mod tests {
                "slug": "Active",
            },
            "current_physical_size": 42,
-            "attachment_status": "attached",
+            "attachment_status": {
                "slug":"attached",
            }
        });
        let original_broken = TenantInfo {
@@ -831,7 +833,9 @@ mod tests {
                }
            },
            "current_physical_size": 42,
-            "attachment_status": "attached",
+            "attachment_status": {
                "slug":"attached",
            }
        });
        assert_eq!(
--- a/libs/postgres_ffi/Cargo.toml
+++ b/libs/postgres_ffi/Cargo.toml
@@ -24,7 +24,6 @@ workspace_hack.workspace = true
 [dev-dependencies]
 env_logger.workspace = true
 postgres.workspace = true
 wal_craft = { path = "wal_craft" }
 [build-dependencies]
 anyhow.workspace = true
--- a/libs/postgres_ffi/src/lib.rs
+++ b/libs/postgres_ffi/src/lib.rs
@@ -33,6 +33,7 @@ macro_rules! postgres_ffi {
            }
            pub mod controlfile_utils;
            pub mod nonrelfile_utils;
            pub mod wal_craft_test_export;
            pub mod waldecoder_handler;
            pub mod xlog_utils;
@@ -45,8 +46,15 @@ macro_rules! postgres_ffi {
    };
 }
-postgres_ffi!(v14);
+#[macro_export]
-postgres_ffi!(v15);
+macro_rules! for_all_postgres_versions {
    ($macro:tt) => {
        $macro!(v14);
        $macro!(v15);
    };
 }
 for_all_postgres_versions! { postgres_ffi }
 pub mod pg_constants;
 pub mod relfile_utils;
--- a/libs/postgres_ffi/src/wal_craft_test_export.rs
+++ b/libs/postgres_ffi/src/wal_craft_test_export.rs
@@ -0,0 +1,6 @@
 //! This module exists so that the wal_craft crate can test postgres_ffi. It should not be imported in normal usage.
 pub use super::PG_MAJORVERSION;
 pub use super::xlog_utils::*;
 pub use super::bindings::*;
 pub use crate::WAL_SEGMENT_SIZE;
--- a/libs/postgres_ffi/src/xlog_utils.rs
+++ b/libs/postgres_ffi/src/xlog_utils.rs
@@ -481,220 +481,4 @@ pub fn encode_logical_message(prefix: &str, message: &str) -> Vec<u8> {
    wal
 }
-#[cfg(test)]
+// If you need to craft WAL and write tests for this module, put them in the wal_craft crate.
 mod tests {
    use super::super::PG_MAJORVERSION;
    use super::*;
    use regex::Regex;
    use std::cmp::min;
    use std::fs;
    use std::{env, str::FromStr};
    use utils::const_assert;
    /// Initialize `env_logger` for tests. The filter comes from the
    /// environment when set, otherwise from the default built below.
    /// The result of `try_init` is ignored, so repeated calls are harmless.
    fn init_logging() {
        let default_filter =
            format!("wal_craft=info,postgres_ffi::{PG_MAJORVERSION}::xlog_utils=trace");
        let env = env_logger::Env::default().default_filter_or(default_filter);
        let _ = env_logger::Builder::from_env(env)
            .is_test(true)
            .try_init();
    }
    /// Shared driver for the `find_end_of_wal` tests.
    ///
    /// Initializes a fresh data directory named after `test_name`, starts a
    /// server, lets the crafter `C` generate WAL, kills the server, and then
    /// checks the end of WAL both with `pg_waldump` and with `find_end_of_wal`
    /// from each LSN the crafter returned, followed by the expected end itself.
    fn test_end_of_wal<C: wal_craft::Crafter>(test_name: &str) {
        use wal_craft::*;
        // Characters 1..3 of PG_MAJORVERSION are parsed as the numeric version
        // (presumably "v14" -> 14; confirm against where PG_MAJORVERSION is defined).
        let pg_version = PG_MAJORVERSION[1..3].parse::<u32>().unwrap();
        // Craft some WAL
        let top_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
            .join("..")
            .join("..");
        let cfg = Conf {
            pg_version,
            pg_distrib_dir: top_path.join("pg_install"),
            datadir: top_path.join(format!("test_output/{}-{PG_MAJORVERSION}", test_name)),
        };
        // Start from a clean data directory; leftovers from a previous run are removed.
        if cfg.datadir.exists() {
            fs::remove_dir_all(&cfg.datadir).unwrap();
        }
        cfg.initdb().unwrap();
        let srv = cfg.start_server().unwrap();
        let (intermediate_lsns, expected_end_of_wal_partial) =
            C::craft(&mut srv.connect_with_timeout().unwrap()).unwrap();
        // Convert the crafter's LSN values into `Lsn` by going through u64.
        let intermediate_lsns: Vec<Lsn> = intermediate_lsns
            .iter()
            .map(|&lsn| u64::from(lsn).into())
            .collect();
        let expected_end_of_wal: Lsn = u64::from(expected_end_of_wal_partial).into();
        srv.kill();
        // Check find_end_of_wal on the initial WAL
        // The last segment is the greatest WAL file name in the WAL directory.
        let last_segment = cfg
            .wal_dir()
            .read_dir()
            .unwrap()
            .map(|f| f.unwrap().file_name().into_string().unwrap())
            .filter(|fname| IsXLogFileName(fname))
            .max()
            .unwrap();
        check_pg_waldump_end_of_wal(&cfg, &last_segment, expected_end_of_wal);
        for start_lsn in intermediate_lsns
            .iter()
            .chain(std::iter::once(&expected_end_of_wal))
        {
            // Erase all WAL before `start_lsn` to ensure it's not used by `find_end_of_wal`.
            // We assume that `start_lsn` is non-decreasing.
            info!(
                "Checking with start_lsn={}, erasing WAL before it",
                start_lsn
            );
            for file in fs::read_dir(cfg.wal_dir()).unwrap().flatten() {
                let fname = file.file_name().into_string().unwrap();
                if !IsXLogFileName(&fname) {
                    continue;
                }
                let (segno, _) = XLogFromFileName(&fname, WAL_SEGMENT_SIZE);
                let seg_start_lsn = XLogSegNoOffsetToRecPtr(segno, 0, WAL_SEGMENT_SIZE);
                // Segments that begin after `start_lsn` are left untouched.
                if seg_start_lsn > u64::from(*start_lsn) {
                    continue;
                }
                let mut f = File::options().write(true).open(file.path()).unwrap();
                const ZEROS: [u8; WAL_SEGMENT_SIZE] = [0u8; WAL_SEGMENT_SIZE];
                // Overwrite the segment from its start up to `start_lsn`
                // (capped at one full segment) with zeros.
                f.write_all(
                    &ZEROS[0..min(
                        WAL_SEGMENT_SIZE,
                        (u64::from(*start_lsn) - seg_start_lsn) as usize,
                    )],
                )
                .unwrap();
            }
            check_end_of_wal(&cfg, &last_segment, *start_lsn, expected_end_of_wal);
        }
    }
    fn check_pg_waldump_end_of_wal(
        cfg: &wal_craft::Conf,
        last_segment: &str,
        expected_end_of_wal: Lsn,
    ) {
        // Get the actual end of WAL by pg_waldump
        let waldump_output = cfg
            .pg_waldump("000000010000000000000001", last_segment)
            .unwrap()
            .stderr;
        let waldump_output = std::str::from_utf8(&waldump_output).unwrap();
        let caps = match Regex::new(r"invalid record length at (.+):")
            .unwrap()
            .captures(waldump_output)
        {
            Some(caps) => caps,
            None => {
                error!("Unable to parse pg_waldump's stderr:\n{}", waldump_output);
                panic!();
            }
        };
        let waldump_wal_end = Lsn::from_str(caps.get(1).unwrap().as_str()).unwrap();
        info!(
            "waldump erred on {}, expected wal end at {}",
            waldump_wal_end, expected_end_of_wal
        );
        assert_eq!(waldump_wal_end, expected_end_of_wal);
    }
    /// Assert that `find_end_of_wal`, started at `start_lsn`, returns
    /// `expected_end_of_wal` when the last segment carries a `.partial` suffix.
    ///
    /// The segment is renamed to `<last_segment>.partial` for the check and
    /// renamed back afterwards.
    fn check_end_of_wal(
        cfg: &wal_craft::Conf,
        last_segment: &str,
        start_lsn: Lsn,
        expected_end_of_wal: Lsn,
    ) {
        let wal_dir = cfg.wal_dir();
        let segment_path = wal_dir.join(last_segment);
        let partial_path = wal_dir.join(format!("{}.partial", last_segment));

        // NOTE: the check of end_of_wal on a non-partial WAL segment (treated
        // as fully populated) is currently disabled.

        // Rename file to partial to actually find last valid lsn, then rename it back.
        fs::rename(&segment_path, &partial_path).unwrap();
        let wal_end = find_end_of_wal(&wal_dir, WAL_SEGMENT_SIZE, start_lsn).unwrap();
        info!(
            "find_end_of_wal returned wal_end={} with partial WAL segment",
            wal_end
        );
        assert_eq!(wal_end, expected_end_of_wal);
        fs::rename(&partial_path, &segment_path).unwrap();
    }
    const_assert!(WAL_SEGMENT_SIZE == 16 * 1024 * 1024);
    /// End-of-WAL check using the `Simple` crafter.
    #[test]
    pub fn test_find_end_of_wal_simple() {
        const TEST_NAME: &str = "test_find_end_of_wal_simple";
        init_logging();
        test_end_of_wal::<wal_craft::Simple>(TEST_NAME);
    }
    /// End-of-WAL check using the `WalRecordCrossingSegmentFollowedBySmallOne` crafter.
    #[test]
    pub fn test_find_end_of_wal_crossing_segment_followed_by_small_one() {
        const TEST_NAME: &str = "test_find_end_of_wal_crossing_segment_followed_by_small_one";
        init_logging();
        test_end_of_wal::<wal_craft::WalRecordCrossingSegmentFollowedBySmallOne>(TEST_NAME);
    }
    /// End-of-WAL check using the `LastWalRecordCrossingSegment` crafter.
    #[test]
    pub fn test_find_end_of_wal_last_crossing_segment() {
        const TEST_NAME: &str = "test_find_end_of_wal_last_crossing_segment";
        init_logging();
        test_end_of_wal::<wal_craft::LastWalRecordCrossingSegment>(TEST_NAME);
    }
    /// Check the math in update_next_xid
    ///
    /// NOTE: These checks are sensitive to the value of XID_CHECKPOINT_INTERVAL,
    /// currently 1024.
    #[test]
    pub fn test_update_next_xid() {
        let zeroed = [0u8; std::mem::size_of::<CheckPoint>()];
        let mut checkpoint = CheckPoint::decode(&zeroed).unwrap();

        checkpoint.nextXid = FullTransactionId { value: 10 };
        assert_eq!(checkpoint.nextXid.value, 10);

        // (highest XID seen so far, expected nextXid afterwards), applied in order.
        let cases = [
            // The input XID gets rounded up to the next XID_CHECKPOINT_INTERVAL
            // boundary
            (100, 1024),
            // No change
            (500, 1024),
            (1023, 1024),
            // The function returns the *next* XID, given the highest XID seen so
            // far. So when we pass 1024, the nextXid gets bumped up to the next
            // XID_CHECKPOINT_INTERVAL boundary.
            (1024, 2048),
        ];
        for (highest_xid, expected_next) in cases {
            checkpoint.update_next_xid(highest_xid);
            assert_eq!(checkpoint.nextXid.value, expected_next);
        }
    }
    /// `encode_logical_message("prefix", "message")` must produce exactly
    /// the byte sequence pinned below.
    #[test]
    pub fn test_encode_logical_message() {
        let expected: &[u8] = &[
            64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 0, 0, 170, 34, 166, 227, 255,
            38, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 112, 114,
            101, 102, 105, 120, 0, 109, 101, 115, 115, 97, 103, 101,
        ];
        let encoded = encode_logical_message("prefix", "message");
        assert_eq!(expected, &encoded[..]);
    }
 }
--- a/libs/postgres_ffi/wal_craft/Cargo.toml
+++ b/libs/postgres_ffi/wal_craft/Cargo.toml
@@ -15,3 +15,7 @@ postgres_ffi.workspace = true
 tempfile.workspace = true
 workspace_hack.workspace = true
 [dev-dependencies]
 regex.workspace = true
 utils.workspace = true
--- a/libs/postgres_ffi/wal_craft/src/lib.rs
+++ b/libs/postgres_ffi/wal_craft/src/lib.rs
@@ -10,6 +10,20 @@ use std::process::Command;
 use std::time::{Duration, Instant};
 use tempfile::{tempdir, TempDir};
 // Declares a module named after the given Postgres version identifier.
 // The module re-exports that version's
 // `postgres_ffi::$version::wal_craft_test_export` items and, in test builds
 // only, includes the shared `xlog_utils_test` file, so the same tests are
 // compiled once per version.
 macro_rules! xlog_utils_test {
     ($version:ident) => {
         // Resolve the nested `mod xlog_utils_test;` relative to this
         // directory, so every version module loads the same source file.
         #[path = "."]
         mod $version {
             pub use postgres_ffi::$version::wal_craft_test_export::*;
             // The same file is deliberately included once per version module.
             #[allow(clippy::duplicate_mod)]
             #[cfg(test)]
             mod xlog_utils_test;
         }
     };
 }
 // Instantiate the macro through `for_all_postgres_versions!`.
 postgres_ffi::for_all_postgres_versions! { xlog_utils_test }
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct Conf {
    pub pg_version: u32,
--- a/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs
+++ b/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs
@@ -0,0 +1,219 @@
 //! Tests for postgres_ffi xlog_utils module. Put it here to break cyclic dependency.
 use super::*;
 use crate::{error, info};
 use regex::Regex;
 use std::cmp::min;
 use std::fs::{self, File};
 use std::io::Write;
 use std::{env, str::FromStr};
 use utils::const_assert;
 use utils::lsn::Lsn;
 /// Installs an `env_logger` test logger whose default filter enables
 /// `info` for `crate` and `trace` for this Postgres version's `xlog_utils`.
 fn init_logging() {
     let default_filter =
         format!("crate=info,postgres_ffi::{PG_MAJORVERSION}::xlog_utils=trace");
     let env = env_logger::Env::default().default_filter_or(default_filter);
     // The result of `try_init` is deliberately ignored: every test calls
     // this function.
     let _ = env_logger::Builder::from_env(env).is_test(true).try_init();
 }
 /// Shared driver for the `find_end_of_wal` tests.
 ///
 /// Initializes a fresh data directory, starts a server, lets the crafter `C`
 /// generate WAL, kills the server, and then checks the end of WAL twice:
 /// once via `pg_waldump` and then via `find_end_of_wal` for every
 /// intermediate LSN reported by the crafter plus the expected end itself.
 ///
 /// `test_name` is used to build the per-test data directory name.
 fn test_end_of_wal<C: crate::Crafter>(test_name: &str) {
     use crate::*;
     // Characters 1..3 of PG_MAJORVERSION are parsed as the numeric version
     // (presumably the constant looks like "v14" -- TODO confirm).
     let pg_version = PG_MAJORVERSION[1..3].parse::<u32>().unwrap();
     // Craft some WAL
     // Paths are resolved three directories above this crate's manifest dir.
     let top_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
         .join("..")
         .join("..")
         .join("..");
     let cfg = Conf {
         pg_version,
         pg_distrib_dir: top_path.join("pg_install"),
         datadir: top_path.join(format!("test_output/{}-{PG_MAJORVERSION}", test_name)),
     };
     // Start from a clean data directory if a previous run left one behind.
     if cfg.datadir.exists() {
         fs::remove_dir_all(&cfg.datadir).unwrap();
     }
     cfg.initdb().unwrap();
     let srv = cfg.start_server().unwrap();
     let (intermediate_lsns, expected_end_of_wal_partial) =
         C::craft(&mut srv.connect_with_timeout().unwrap()).unwrap();
     // Convert the crafter's LSN values into `utils::lsn::Lsn` via u64.
     let intermediate_lsns: Vec<Lsn> = intermediate_lsns
         .iter()
         .map(|&lsn| u64::from(lsn).into())
         .collect();
     let expected_end_of_wal: Lsn = u64::from(expected_end_of_wal_partial).into();
     srv.kill();
     // Check find_end_of_wal on the initial WAL
     // The last segment is the greatest WAL file name in the WAL directory.
     let last_segment = cfg
         .wal_dir()
         .read_dir()
         .unwrap()
         .map(|f| f.unwrap().file_name().into_string().unwrap())
         .filter(|fname| IsXLogFileName(fname))
         .max()
         .unwrap();
     check_pg_waldump_end_of_wal(&cfg, &last_segment, expected_end_of_wal);
     for start_lsn in intermediate_lsns
         .iter()
         .chain(std::iter::once(&expected_end_of_wal))
     {
         // Erase all WAL before `start_lsn` to ensure it's not used by `find_end_of_wal`.
         // We assume that `start_lsn` is non-decreasing.
         info!(
             "Checking with start_lsn={}, erasing WAL before it",
             start_lsn
         );
         for file in fs::read_dir(cfg.wal_dir()).unwrap().flatten() {
             let fname = file.file_name().into_string().unwrap();
             if !IsXLogFileName(&fname) {
                 continue;
             }
             let (segno, _) = XLogFromFileName(&fname, WAL_SEGMENT_SIZE);
             let seg_start_lsn = XLogSegNoOffsetToRecPtr(segno, 0, WAL_SEGMENT_SIZE);
             // Segments that begin after `start_lsn` are left untouched.
             if seg_start_lsn > u64::from(*start_lsn) {
                 continue;
             }
             // Overwrite the head of the segment with zeros, up to `start_lsn`
             // or the end of the segment, whichever comes first.
             let mut f = File::options().write(true).open(file.path()).unwrap();
             const ZEROS: [u8; WAL_SEGMENT_SIZE] = [0u8; WAL_SEGMENT_SIZE];
             f.write_all(
                 &ZEROS[0..min(
                     WAL_SEGMENT_SIZE,
                     (u64::from(*start_lsn) - seg_start_lsn) as usize,
                 )],
             )
             .unwrap();
         }
         check_end_of_wal(&cfg, &last_segment, *start_lsn, expected_end_of_wal);
     }
 }
 /// Asserts that `pg_waldump` stops reading at `expected_end_of_wal`.
 ///
 /// Runs `pg_waldump` from segment `000000010000000000000001` to
 /// `last_segment` and extracts the LSN from the
 /// "invalid record length at ..." message on its stderr.
 /// Panics if that message is missing or the LSN differs.
 fn check_pg_waldump_end_of_wal(
     cfg: &crate::Conf,
     last_segment: &str,
     expected_end_of_wal: Lsn,
 ) {
     // Get the actual end of WAL by pg_waldump
     let stderr_bytes = cfg
         .pg_waldump("000000010000000000000001", last_segment)
         .unwrap()
         .stderr;
     let waldump_output = std::str::from_utf8(&stderr_bytes).unwrap();
     let end_marker = Regex::new(r"invalid record length at (.+):").unwrap();
     let caps = end_marker.captures(waldump_output).unwrap_or_else(|| {
         // Log the raw output first so the failure can be diagnosed.
         error!("Unable to parse pg_waldump's stderr:\n{}", waldump_output);
         panic!()
     });
     let waldump_wal_end = Lsn::from_str(caps.get(1).unwrap().as_str()).unwrap();
     info!(
         "waldump erred on {}, expected wal end at {}",
         waldump_wal_end, expected_end_of_wal
     );
     assert_eq!(waldump_wal_end, expected_end_of_wal);
 }
 /// Asserts that `find_end_of_wal`, started from `start_lsn`, reports
 /// `expected_end_of_wal`.
 ///
 /// The last segment is renamed to `<name>.partial` for the duration of the
 /// check and renamed back afterwards. Only this partial-segment case is
 /// exercised here.
 fn check_end_of_wal(
     cfg: &crate::Conf,
     last_segment: &str,
     start_lsn: Lsn,
     expected_end_of_wal: Lsn,
 ) {
     let wal_dir = cfg.wal_dir();
     let segment_path = wal_dir.join(last_segment);
     let partial_path = wal_dir.join(format!("{}.partial", last_segment));

     // Rename file to partial to actually find last valid lsn, then rename it back.
     fs::rename(&segment_path, &partial_path).unwrap();
     let wal_end = find_end_of_wal(&wal_dir, WAL_SEGMENT_SIZE, start_lsn).unwrap();
     info!(
         "find_end_of_wal returned wal_end={} with partial WAL segment",
         wal_end
     );
     assert_eq!(wal_end, expected_end_of_wal);
     fs::rename(&partial_path, &segment_path).unwrap();
 }
 const_assert!(WAL_SEGMENT_SIZE == 16 * 1024 * 1024);
 /// Runs the shared `test_end_of_wal` scenario with the `crate::Simple` crafter.
 #[test]
 pub fn test_find_end_of_wal_simple() {
     init_logging();
     test_end_of_wal::<crate::Simple>("test_find_end_of_wal_simple");
 }
 /// Runs the shared `test_end_of_wal` scenario with the
 /// `crate::WalRecordCrossingSegmentFollowedBySmallOne` crafter.
 #[test]
 pub fn test_find_end_of_wal_crossing_segment_followed_by_small_one() {
     init_logging();
     test_end_of_wal::<crate::WalRecordCrossingSegmentFollowedBySmallOne>(
         "test_find_end_of_wal_crossing_segment_followed_by_small_one",
     );
 }
 /// Runs the shared `test_end_of_wal` scenario with the
 /// `crate::LastWalRecordCrossingSegment` crafter.
 #[test]
 pub fn test_find_end_of_wal_last_crossing_segment() {
     init_logging();
     test_end_of_wal::<crate::LastWalRecordCrossingSegment>(
         "test_find_end_of_wal_last_crossing_segment",
     );
 }
 /// Verifies the rounding performed by `update_next_xid`.
 ///
 /// NOTE: The expected values depend on XID_CHECKPOINT_INTERVAL,
 /// currently 1024.
 #[test]
 pub fn test_update_next_xid() {
     let zeroed = [0u8; std::mem::size_of::<CheckPoint>()];
     let mut checkpoint = CheckPoint::decode(&zeroed).unwrap();
     checkpoint.nextXid = FullTransactionId { value: 10 };
     assert_eq!(checkpoint.nextXid.value, 10);

     // Each pair is (highest XID seen so far, expected nextXid afterwards).
     // The steps run in order against the same checkpoint:
     //  * 100 rounds up to the next XID_CHECKPOINT_INTERVAL boundary;
     //  * 500 and 1023 leave the value untouched;
     //  * 1024 is itself "seen", so the *next* XID moves to the following
     //    boundary.
     let steps = [(100, 1024), (500, 1024), (1023, 1024), (1024, 2048)];
     for (highest_xid_seen, expected_next_xid) in steps {
         checkpoint.update_next_xid(highest_xid_seen);
         assert_eq!(checkpoint.nextXid.value, expected_next_xid);
     }
 }
 /// Compares the output of `encode_logical_message("prefix", "message")`
 /// with a fixed, expected byte sequence.
 #[test]
 pub fn test_encode_logical_message() {
     // The trailing bytes are the ASCII text "prefix", a 0 byte, and then
     // the ASCII text "message".
     let expected = [
         64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 0, 0, 170, 34, 166, 227, 255,
         38, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 112, 114,
         101, 102, 105, 120, 0, 109, 101, 115, 115, 97, 103, 101,
     ];
     let actual = encode_logical_message("prefix", "message");
     assert_eq!(expected, actual[..]);
 }
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -70,6 +70,14 @@ impl RemotePath {
     /// Returns a new `RemotePath` with `segment` appended to this path.
     pub fn join(&self, segment: &Path) -> Self {
         Self(self.0.join(segment))
     }
     /// Borrows the wrapped path.
     pub fn get_path(&self) -> &PathBuf {
         &self.0
     }
     /// Returns the extension of the wrapped path as `&str`.
     ///
     /// Yields `None` when the path has no extension or when the extension
     /// is not valid UTF-8.
     pub fn extension(&self) -> Option<&str> {
         self.0.extension()?.to_str()
     }
 }
 /// Storage (potentially remote) API to manage its state.
@@ -86,6 +94,19 @@ pub trait RemoteStorage: Send + Sync + 'static {
        prefix: Option<&RemotePath>,
    ) -> Result<Vec<RemotePath>, DownloadError>;
     /// Lists all files in directory "recursively"
     /// (not really recursively, because AWS has a flat namespace)
     /// Note: This is subtly different from list_prefixes,
     /// because it is for listing files instead of listing
     /// names sharing common prefixes.
     /// For example,
     /// list_files("foo/bar") = ["foo/bar/cat123.txt",
     /// "foo/bar/cat567.txt", "foo/bar/dog123.txt", "foo/bar/dog456.txt"]
     /// whereas,
     /// list_prefixes("foo/bar/") = ["cat", "dog"]
     /// See `test_real_s3.rs` for more details.
     async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>>;
     /// Streams the local file contents into the remote storage entry.
    async fn upload(
        &self,
@@ -111,6 +132,8 @@ pub trait RemoteStorage: Send + Sync + 'static {
    ) -> Result<Download, DownloadError>;
    async fn delete(&self, path: &RemotePath) -> anyhow::Result<()>;
     /// Deletes all objects named by `paths`, returning an error if the
     /// deletion fails.
     /// NOTE(review): what remains deleted after a partial failure depends on
     /// the implementation -- check the concrete storage before relying on it.
     async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()>;
 }
 pub struct Download {
@@ -172,6 +195,14 @@ impl GenericRemoteStorage {
        }
    }
     /// Forwards `list_files` to whichever storage backend this value wraps.
     pub async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
         match self {
             Self::LocalFs(s) => s.list_files(folder).await,
             Self::AwsS3(s) => s.list_files(folder).await,
             Self::Unreliable(s) => s.list_files(folder).await,
         }
     }
    pub async fn upload(
        &self,
        from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
@@ -223,6 +254,14 @@ impl GenericRemoteStorage {
            Self::Unreliable(s) => s.delete(path).await,
        }
    }
     /// Forwards `delete_objects` to whichever storage backend this value wraps.
     pub async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
         match self {
             Self::LocalFs(s) => s.delete_objects(paths).await,
             Self::AwsS3(s) => s.delete_objects(paths).await,
             Self::Unreliable(s) => s.delete_objects(paths).await,
         }
     }
 }
 impl GenericRemoteStorage {
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -17,7 +17,7 @@ use tokio::{
    io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt},
 };
 use tracing::*;
-use utils::crashsafe::path_with_suffix_extension;
+use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty};
 use crate::{Download, DownloadError, RemotePath};
@@ -48,6 +48,14 @@ impl LocalFs {
        Ok(Self { storage_root })
    }
     // mirrors S3Bucket::s3_object_to_relative_path
     //
     // Converts an absolute local path into a `RemotePath` by removing the
     // `storage_root` prefix.
     // Panics if `key` does not start with `storage_root`.
     fn local_file_to_relative_path(&self, key: PathBuf) -> RemotePath {
         let relative_path = key
             .strip_prefix(&self.storage_root)
             .expect("relative path must contain storage_root as prefix");
         RemotePath(relative_path.into())
     }
    async fn read_storage_metadata(
        &self,
        file_path: &Path,
@@ -101,19 +109,63 @@ impl RemoteStorage for LocalFs {
            Some(prefix) => Cow::Owned(prefix.with_base(&self.storage_root)),
            None => Cow::Borrowed(&self.storage_root),
        };
-        Ok(get_all_files(path.as_ref(), false)
+
        let prefixes_to_filter = get_all_files(path.as_ref(), false)
            .await
-            .map_err(DownloadError::Other)?
+            .map_err(DownloadError::Other)?;
-            .into_iter()
+
-            .map(|path| {
+        let mut prefixes = Vec::with_capacity(prefixes_to_filter.len());
-                path.strip_prefix(&self.storage_root)
+
-                    .context("Failed to strip preifix")
+        // filter out empty directories to mirror s3 behavior.
        for prefix in prefixes_to_filter {
            if prefix.is_dir()
                && is_directory_empty(&prefix)
                    .await
                    .map_err(DownloadError::Other)?
            {
                continue;
            }
            prefixes.push(
                prefix
                    .strip_prefix(&self.storage_root)
                    .context("Failed to strip prefix")
                    .and_then(RemotePath::new)
                    .expect(
                        "We list files for storage root, hence should be able to remote the prefix",
-                    )
+                    ),
-            })
+            )
-            .collect())
+        }
        Ok(prefixes)
    }
    // recursively lists all files in a directory,
    // mirroring the `list_files` for `s3_bucket`
    async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
        let full_path = match folder {
            Some(folder) => folder.with_base(&self.storage_root),
            None => self.storage_root.clone(),
        };
        let mut files = vec![];
        let mut directory_queue = vec![full_path.clone()];
        while !directory_queue.is_empty() {
            let cur_folder = directory_queue
                .pop()
                .expect("queue cannot be empty: we just checked");
            let mut entries = fs::read_dir(cur_folder.clone()).await?;
            while let Some(entry) = entries.next_entry().await? {
                let file_name: PathBuf = entry.file_name().into();
                let full_file_name = cur_folder.clone().join(&file_name);
                let file_remote_path = self.local_file_to_relative_path(full_file_name.clone());
                files.push(file_remote_path.clone());
                if full_file_name.is_dir() {
                    directory_queue.push(full_file_name);
                }
            }
        }
        Ok(files)
    }
    async fn upload(
@@ -291,11 +343,25 @@ impl RemoteStorage for LocalFs {
    async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
        let file_path = path.with_base(&self.storage_root);
-        if file_path.exists() && file_path.is_file() {
+        if !file_path.exists() {
-            Ok(fs::remove_file(file_path).await?)
+            // See https://docs.aws.amazon.com/AmazonS3/latest/API/API_DeleteObject.html
-        } else {
+            // > If there isn't a null version, Amazon S3 does not remove any objects but will still respond that the command was successful.
-            bail!("File {file_path:?} either does not exist or is not a file")
+            return Ok(());
        }
        if !file_path.is_file() {
            anyhow::bail!("{file_path:?} is not a file");
        }
        Ok(fs::remove_file(file_path)
            .await
            .map_err(|e| anyhow::anyhow!(e))?)
    }
     /// Deletes the given paths one by one through `delete`.
     ///
     /// Stops at the first failure and returns that error; paths after the
     /// failing one are not attempted.
     async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
         for path in paths {
             self.delete(path).await?
         }
         Ok(())
     }
 }
@@ -320,7 +386,7 @@ where
                    let file_type = dir_entry.file_type().await?;
                    let entry_path = dir_entry.path();
                    if file_type.is_symlink() {
-                        debug!("{entry_path:?} us a symlink, skipping")
+                        debug!("{entry_path:?} is a symlink, skipping")
                    } else if file_type.is_dir() {
                        if recursive {
                            paths.extend(get_all_files(&entry_path, true).await?.into_iter())
@@ -595,15 +661,11 @@ mod fs_tests {
        storage.delete(&upload_target).await?;
        assert!(storage.list().await?.is_empty());
-        match storage.delete(&upload_target).await {
+        storage
-            Ok(()) => panic!("Should not allow deleting non-existing storage files"),
+            .delete(&upload_target)
-            Err(e) => {
+            .await
-                let error_string = e.to_string();
+            .expect("Should allow deleting non-existing storage files");
-                assert!(error_string.contains("does not exist"));
+
                let expected_path = upload_target.with_base(&storage.storage_root);
                assert!(error_string.contains(expected_path.to_str().unwrap()));
            }
        }
        Ok(())
    }
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -17,6 +17,7 @@ use aws_sdk_s3::{
    error::SdkError,
    operation::get_object::GetObjectError,
    primitives::ByteStream,
    types::{Delete, ObjectIdentifier},
    Client,
 };
 use aws_smithy_http::body::SdkBody;
@@ -33,6 +34,8 @@ use crate::{
    Download, DownloadError, RemotePath, RemoteStorage, S3Config, REMOTE_STORAGE_PREFIX_SEPARATOR,
 };
 /// Largest number of object identifiers sent in a single `delete_objects`
 /// request; longer lists are split into chunks of this size.
 /// The S3 DeleteObjects API accepts at most 1000 keys per request.
 const MAX_DELETE_OBJECTS_REQUEST_SIZE: usize = 1000;
 pub(super) mod metrics {
    use metrics::{register_int_counter_vec, IntCounterVec};
    use once_cell::sync::Lazy;
@@ -81,12 +84,24 @@ pub(super) mod metrics {
            .inc();
    }
     /// Adds `count` to the S3 request counter under the `delete_object`
     /// label.
     pub fn inc_delete_objects(count: u64) {
         S3_REQUESTS_COUNT
             .with_label_values(&["delete_object"])
             .inc_by(count);
     }
     /// Increments the S3 failure counter for the `delete_object` label by one.
     pub fn inc_delete_object_fail() {
         S3_REQUESTS_FAIL_COUNT
             .with_label_values(&["delete_object"])
             .inc();
     }
     /// Adds `count` to the S3 failure counter under the `delete_object`
     /// label, the same label `inc_delete_object_fail` uses.
     pub fn inc_delete_objects_fail(count: u64) {
         S3_REQUESTS_FAIL_COUNT
             .with_label_values(&["delete_object"])
             .inc_by(count);
     }
    pub fn inc_list_objects() {
        S3_REQUESTS_COUNT.with_label_values(&["list_objects"]).inc();
    }
@@ -332,6 +347,51 @@ impl RemoteStorage for S3Bucket {
        Ok(document_keys)
    }
    /// See the doc for `RemoteStorage::list_files`
    async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
        let folder_name = folder
            .map(|p| self.relative_path_to_s3_object(p))
            .or_else(|| self.prefix_in_bucket.clone());
        // AWS may need to break the response into several parts
        let mut continuation_token = None;
        let mut all_files = vec![];
        loop {
            let _guard = self
                .concurrency_limiter
                .acquire()
                .await
                .context("Concurrency limiter semaphore got closed during S3 list_files")?;
            metrics::inc_list_objects();
            let response = self
                .client
                .list_objects_v2()
                .bucket(self.bucket_name.clone())
                .set_prefix(folder_name.clone())
                .set_continuation_token(continuation_token)
                .set_max_keys(self.max_keys_per_list_response)
                .send()
                .await
                .map_err(|e| {
                    metrics::inc_list_objects_fail();
                    e
                })
                .context("Failed to list files in S3 bucket")?;
            for object in response.contents().unwrap_or_default() {
                let object_path = object.key().expect("response does not contain a key");
                let remote_path = self.s3_object_to_relative_path(object_path);
                all_files.push(remote_path);
            }
            match response.next_continuation_token {
                Some(new_token) => continuation_token = Some(new_token),
                None => break,
            }
        }
        Ok(all_files)
    }
    async fn upload(
        &self,
        from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
@@ -396,6 +456,50 @@ impl RemoteStorage for S3Bucket {
        })
        .await
    }
    async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
        let _guard = self
            .concurrency_limiter
            .acquire()
            .await
            .context("Concurrency limiter semaphore got closed during S3 delete")?;
        let mut delete_objects = Vec::with_capacity(paths.len());
        for path in paths {
            let obj_id = ObjectIdentifier::builder()
                .set_key(Some(self.relative_path_to_s3_object(path)))
                .build();
            delete_objects.push(obj_id);
        }
        for chunk in delete_objects.chunks(MAX_DELETE_OBJECTS_REQUEST_SIZE) {
            metrics::inc_delete_objects(chunk.len() as u64);
            let resp = self
                .client
                .delete_objects()
                .bucket(self.bucket_name.clone())
                .delete(Delete::builder().set_objects(Some(chunk.to_vec())).build())
                .send()
                .await;
            match resp {
                Ok(resp) => {
                    if let Some(errors) = resp.errors {
                        metrics::inc_delete_objects_fail(errors.len() as u64);
                        return Err(anyhow::format_err!(
                            "Failed to delete {} objects",
                            errors.len()
                        ));
                    }
                }
                Err(e) => {
                    metrics::inc_delete_objects_fail(chunk.len() as u64);
                    return Err(e.into());
                }
            }
        }
        Ok(())
    }
    async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
        let _guard = self
--- a/libs/remote_storage/src/simulate_failures.rs
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -24,6 +24,7 @@ enum RemoteOp {
    Upload(RemotePath),
    Download(RemotePath),
    Delete(RemotePath),
    DeleteObjects(Vec<RemotePath>),
 }
 impl UnreliableWrapper {
@@ -82,6 +83,11 @@ impl RemoteStorage for UnreliableWrapper {
        self.inner.list_prefixes(prefix).await
    }
     /// Runs the failure-injection check, then lists files through the inner
     /// storage.
     async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
         // Recorded as a `ListPrefixes` operation, not under a dedicated variant.
         self.attempt(RemoteOp::ListPrefixes(folder.cloned()))?;
         self.inner.list_files(folder).await
     }
    async fn upload(
        &self,
        data: impl tokio::io::AsyncRead + Unpin + Send + Sync + 'static,
@@ -119,4 +125,21 @@ impl RemoteStorage for UnreliableWrapper {
        self.attempt(RemoteOp::Delete(path.clone()))?;
        self.inner.delete(path).await
    }
    async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
        self.attempt(RemoteOp::DeleteObjects(paths.to_vec()))?;
        let mut error_counter = 0;
        for path in paths {
            if (self.delete(path).await).is_err() {
                error_counter += 1;
            }
        }
        if error_counter > 0 {
            return Err(anyhow::anyhow!(
                "failed to delete {} objects",
                error_counter
            ));
        }
        Ok(())
    }
 }
--- a/libs/remote_storage/tests/pagination_tests.rs
+++ b/libs/remote_storage/tests/pagination_tests.rs
@@ -1,274 +0,0 @@
 use std::collections::HashSet;
 use std::env;
 use std::num::{NonZeroU32, NonZeroUsize};
 use std::ops::ControlFlow;
 use std::path::{Path, PathBuf};
 use std::sync::Arc;
 use std::time::UNIX_EPOCH;
 use anyhow::Context;
 use remote_storage::{
    GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config,
 };
 use test_context::{test_context, AsyncTestContext};
 use tokio::task::JoinSet;
 use tracing::{debug, error, info};
 const ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_S3_REMOTE_STORAGE";
 /// Tests that S3 client can list all prefixes, even if the response come paginated and requires multiple S3 queries.
 /// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and related S3 cred env vars specified.
 /// See the client creation in [`create_s3_client`] for details on the required env vars.
 /// If real S3 tests are disabled, the test passes, skipping any real test run: currently, there's no way to mark the test ignored in runtime with the
 /// default test framework, see https://github.com/rust-lang/rust/issues/68007 for details.
 ///
 /// First, the test creates a set of S3 objects with keys `/${random_prefix_part}/${base_prefix_str}/sub_prefix_${i}/blob_${i}` in [`upload_s3_data`]
 /// where
 /// * `random_prefix_part` is set for the entire S3 client during the S3 client creation in [`create_s3_client`], to avoid multiple test runs interference
 /// * `base_prefix_str` is a common prefix to use in the client requests: we would want to ensure that the client is able to list nested prefixes inside the bucket
 ///
 /// Then, verifies that the client does return correct prefixes when queried:
 /// * with no prefix, it lists everything after its `${random_prefix_part}/` — that should be `${base_prefix_str}` value only
 /// * with `${base_prefix_str}/` prefix, it lists every `sub_prefix_${i}`
 ///
 /// With the real S3 enabled and `#[cfg(test)]` Rust configuration used, the S3 client test adds a `max-keys` param to limit the response keys.
 /// This way, we are able to test the pagination implicitly, by ensuring all results are returned from the remote storage and avoid uploading too many blobs to S3,
 /// since current default AWS S3 pagination limit is 1000.
 /// (see https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax)
 ///
 /// Lastly, the test attempts to clean up and remove all uploaded S3 files.
 /// If any errors appear during the clean up, they get logged, but the test is not failed or stopped until clean up is finished.
 #[test_context(MaybeEnabledS3)]
 #[tokio::test]
 async fn s3_pagination_should_work(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()> {
     // Pass trivially when real S3 is disabled; fail when the fixture upload failed.
     let ctx = match ctx {
         MaybeEnabledS3::Enabled(ctx) => ctx,
         MaybeEnabledS3::Disabled => return Ok(()),
         MaybeEnabledS3::UploadsFailed(e, _) => anyhow::bail!("S3 init failed: {e:?}"),
     };

     let test_client = Arc::clone(&ctx.client_with_excessive_pagination);
     let expected_remote_prefixes = ctx.remote_prefixes.clone();
     let base_prefix =
         RemotePath::new(Path::new(ctx.base_prefix_str)).context("common_prefix construction")?;

     // Listing from the root must yield exactly the base prefix.
     let root_listing = test_client
         .list_prefixes(None)
         .await
         .context("client list root prefixes failure")?;
     let root_remote_prefixes: HashSet<_> = root_listing.into_iter().collect();
     assert_eq!(
         root_remote_prefixes, HashSet::from([base_prefix.clone()]),
         "remote storage root prefixes list mismatches with the uploads. Returned prefixes: {root_remote_prefixes:?}"
     );

     // Listing under the base prefix must yield exactly the uploaded sub-prefixes.
     let nested_listing = test_client
         .list_prefixes(Some(&base_prefix))
         .await
         .context("client list nested prefixes failure")?;
     let nested_remote_prefixes: HashSet<_> = nested_listing.into_iter().collect();
     let remote_only_prefixes: HashSet<_> = nested_remote_prefixes
         .difference(&expected_remote_prefixes)
         .collect();
     let missing_uploaded_prefixes: HashSet<_> = expected_remote_prefixes
         .difference(&nested_remote_prefixes)
         .collect();
     assert_eq!(
         remote_only_prefixes.len() + missing_uploaded_prefixes.len(), 0,
         "remote storage nested prefixes list mismatches with the uploads. Remote only prefixes: {remote_only_prefixes:?}, missing uploaded prefixes: {missing_uploaded_prefixes:?}",
     );
     Ok(())
 }
 /// Test context for the real-S3 tests, produced by `setup`.
 enum MaybeEnabledS3 {
     /// Real S3 tests are enabled and all fixture blobs were uploaded.
     Enabled(S3WithTestBlobs),
     /// The enabling env variable is not set; tests should pass without doing anything.
     Disabled,
     /// At least one fixture upload failed. The blobs are kept so teardown
     /// can still clean up whatever was uploaded.
     UploadsFailed(anyhow::Error, S3WithTestBlobs),
 }
 /// S3 client plus the record of what was uploaded for the test.
 struct S3WithTestBlobs {
     // Client created with a small `max_keys_per_list_response`, so that
     // listings are split across several responses.
     client_with_excessive_pagination: Arc<GenericRemoteStorage>,
     // Common prefix under which all test blobs are uploaded.
     base_prefix_str: &'static str,
     // Prefixes of the successfully uploaded blobs.
     remote_prefixes: HashSet<RemotePath>,
     // Full paths of the successfully uploaded blobs; removed in teardown.
     remote_blobs: HashSet<RemotePath>,
 }
 #[async_trait::async_trait]
 impl AsyncTestContext for MaybeEnabledS3 {
     /// Initializes logging and, when real S3 tests are enabled, creates the
     /// client and uploads the fixture blobs.
     async fn setup() -> Self {
         utils::logging::init(
             utils::logging::LogFormat::Test,
             utils::logging::TracingErrorLayerEnablement::Disabled,
         )
         .expect("logging init failed");

         if env::var(ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME).is_err() {
             info!(
                 "`{}` env variable is not set, skipping the test",
                 ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME
             );
             return Self::Disabled;
         }

         // Upload more than twice as many blobs as fit into one list response.
         let max_keys_in_list_response = 10;
         let upload_tasks_count = 1 + (2 * usize::try_from(max_keys_in_list_response).unwrap());
         let client_with_excessive_pagination = create_s3_client(max_keys_in_list_response)
             .context("S3 client creation")
             .expect("S3 client creation failed");
         let base_prefix_str = "test/";

         let upload_outcome = upload_s3_data(
             &client_with_excessive_pagination,
             base_prefix_str,
             upload_tasks_count,
         )
         .await;
         let (uploads, all_uploaded) = match upload_outcome {
             ControlFlow::Continue(uploads) => (uploads, true),
             ControlFlow::Break(uploads) => (uploads, false),
         };

         // Both outcomes carry the same bookkeeping so teardown can clean up.
         let test_blobs = S3WithTestBlobs {
             client_with_excessive_pagination,
             base_prefix_str,
             remote_prefixes: uploads.prefixes,
             remote_blobs: uploads.blobs,
         };
         if all_uploaded {
             info!("Remote objects created successfully");
             Self::Enabled(test_blobs)
         } else {
             Self::UploadsFailed(
                 anyhow::anyhow!("One or multiple blobs failed to upload to S3"),
                 test_blobs,
             )
         }
     }

     /// Removes every uploaded blob, regardless of whether setup fully succeeded.
     async fn teardown(self) {
         let ctx = match self {
             Self::Disabled => return,
             Self::Enabled(ctx) | Self::UploadsFailed(_, ctx) => ctx,
         };
         cleanup(&ctx.client_with_excessive_pagination, ctx.remote_blobs).await;
     }
 }
 /// Builds an S3-backed `GenericRemoteStorage` for the test.
 ///
 /// Reads the bucket and region from `REMOTE_STORAGE_S3_BUCKET` and
 /// `REMOTE_STORAGE_S3_REGION`. The prefix inside the bucket embeds the
 /// current time in milliseconds, giving each run its own prefix.
 fn create_s3_client(max_keys_per_list_response: i32) -> anyhow::Result<Arc<GenericRemoteStorage>> {
     let bucket_name = env::var("REMOTE_STORAGE_S3_BUCKET")
         .context("`REMOTE_STORAGE_S3_BUCKET` env var is not set, but real S3 tests are enabled")?;
     let bucket_region = env::var("REMOTE_STORAGE_S3_REGION")
         .context("`REMOTE_STORAGE_S3_REGION` env var is not set, but real S3 tests are enabled")?;
     let millis_since_epoch = std::time::SystemTime::now()
         .duration_since(UNIX_EPOCH)
         .context("random s3 test prefix part calculation")?
         .as_millis();

     let s3_config = S3Config {
         bucket_name,
         bucket_region,
         prefix_in_bucket: Some(format!("pagination_should_work_test_{millis_since_epoch}/")),
         endpoint: None,
         concurrency_limit: NonZeroUsize::new(100).unwrap(),
         max_keys_per_list_response: Some(max_keys_per_list_response),
     };
     let storage_config = RemoteStorageConfig {
         max_concurrent_syncs: NonZeroUsize::new(100).unwrap(),
         max_sync_errors: NonZeroU32::new(5).unwrap(),
         storage: RemoteStorageKind::AwsS3(s3_config),
     };

     let storage =
         GenericRemoteStorage::from_config(&storage_config).context("remote storage init")?;
     Ok(Arc::new(storage))
 }
 /// Result of `upload_s3_data`: what was successfully uploaded.
 struct Uploads {
     // Prefixes of the uploaded blobs.
     prefixes: HashSet<RemotePath>,
     // Full paths of the uploaded blobs.
     blobs: HashSet<RemotePath>,
 }
 /// Uploads `upload_tasks_count` small blobs concurrently, one per
 /// `sub_prefix_{i}` under `base_prefix_str`.
 ///
 /// Returns `Continue` when every upload succeeded and `Break` when at least
 /// one failed. Both variants carry the successfully uploaded prefixes and
 /// blob paths.
 async fn upload_s3_data(
     client: &Arc<GenericRemoteStorage>,
     base_prefix_str: &'static str,
     upload_tasks_count: usize,
 ) -> ControlFlow<Uploads, Uploads> {
     info!("Creating {upload_tasks_count} S3 files");
     let mut upload_tasks = JoinSet::new();
     for i in 1..=upload_tasks_count {
         let task_client = Arc::clone(client);
         upload_tasks.spawn(async move {
             let prefix = PathBuf::from(format!("{base_prefix_str}/sub_prefix_{i}/"));
             let blob_prefix = RemotePath::new(&prefix)
                 .with_context(|| format!("{prefix:?} to RemotePath conversion"))?;
             let blob_path = blob_prefix.join(Path::new(&format!("blob_{i}")));
             debug!("Creating remote item {i} at path {blob_path:?}");
             let data = format!("remote blob data {i}").into_bytes();
             let data_len = data.len();
             task_client
                 .upload(std::io::Cursor::new(data), data_len, &blob_path, None)
                 .await?;
             Ok::<_, anyhow::Error>((blob_prefix, blob_path))
         });
     }

     let mut uploads = Uploads {
         prefixes: HashSet::with_capacity(upload_tasks_count),
         blobs: HashSet::with_capacity(upload_tasks_count),
     };
     let mut any_upload_failed = false;
     // Drain all tasks even after a failure, so every successful upload is recorded.
     while let Some(task_run_result) = upload_tasks.join_next().await {
         let outcome = task_run_result
             .context("task join failed")
             .and_then(|task_result| task_result.context("upload task failed"));
         match outcome {
             Ok((upload_prefix, upload_path)) => {
                 uploads.prefixes.insert(upload_prefix);
                 uploads.blobs.insert(upload_path);
             }
             Err(e) => {
                 error!("Upload task failed: {e:?}");
                 any_upload_failed = true;
             }
         }
     }

     if any_upload_failed {
         ControlFlow::Break(uploads)
     } else {
         ControlFlow::Continue(uploads)
     }
 }
 async fn cleanup(client: &Arc<GenericRemoteStorage>, objects_to_delete: HashSet<RemotePath>) {
    info!(
        "Removing {} objects from the remote storage during cleanup",
        objects_to_delete.len()
    );
    let mut delete_tasks = JoinSet::new();
    for object_to_delete in objects_to_delete {
        let task_client = Arc::clone(client);
        delete_tasks.spawn(async move {
            debug!("Deleting remote item at path {object_to_delete:?}");
            task_client
                .delete(&object_to_delete)
                .await
                .with_context(|| format!("{object_to_delete:?} removal"))
        });
    }
    while let Some(task_run_result) = delete_tasks.join_next().await {
        match task_run_result {
            Ok(task_result) => match task_result {
                Ok(()) => {}
                Err(e) => error!("Delete task failed: {e:?}"),
            },
            Err(join_err) => error!("Delete task did not finish correctly: {join_err}"),
        }
    }
 }
--- a/libs/remote_storage/tests/test_real_s3.rs
+++ b/libs/remote_storage/tests/test_real_s3.rs
@@ -0,0 +1,542 @@
 use std::collections::HashSet;
 use std::env;
 use std::num::{NonZeroU32, NonZeroUsize};
 use std::ops::ControlFlow;
 use std::path::{Path, PathBuf};
 use std::sync::Arc;
 use std::time::UNIX_EPOCH;
 use anyhow::Context;
 use once_cell::sync::OnceCell;
 use remote_storage::{
    GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config,
 };
 use test_context::{test_context, AsyncTestContext};
 use tokio::task::JoinSet;
 use tracing::{debug, error, info};
 /// Guards the one-time logging initialization performed by `ensure_logging_ready`.
 static LOGGING_DONE: OnceCell<()> = OnceCell::new();
 /// Name of the env variable that must be set for the tests to run against the real S3;
 /// when it is not set, every test context's `setup` returns its `Disabled` variant.
 const ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_S3_REMOTE_STORAGE";
 /// Value stored as `base_prefix` in every `EnabledS3` test context.
 const BASE_PREFIX: &str = "test/";
 /// Tests that the S3 client can list all prefixes, even if the response comes paginated and requires multiple S3 queries.
 /// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and the related S3 credential env vars to be specified.
 /// See the client creation in [`create_s3_client`] for details on the required env vars.
 /// If real S3 tests are disabled, the test passes without running anything real: currently, there's no way to mark a test as ignored at runtime with the
 /// default test framework, see https://github.com/rust-lang/rust/issues/68007 for details.
 ///
 /// First, a set of S3 objects with keys `/${random_prefix_part}/${base_prefix_str}/sub_prefix_${i}/blob_${i}` is created in [`upload_s3_data`]
 /// where
 /// * `random_prefix_part` is set for the entire S3 client during its creation in [`create_s3_client`], to keep multiple test runs from interfering with each other
 /// * `base_prefix_str` is a common prefix to use in the client requests: we want to ensure that the client is able to list nested prefixes inside the bucket
 ///
 /// Then, the test verifies that the client returns correct prefixes when queried:
 /// * with no prefix, it lists everything after its `${random_prefix_part}/` — that should be the `${base_prefix_str}` value only
 /// * with the `${base_prefix_str}/` prefix, it lists every `sub_prefix_${i}`
 ///
 /// With the real S3 enabled and the `#[cfg(test)]` Rust configuration used, the S3 client test adds a `max-keys` param to limit the response keys.
 /// This way, pagination is tested implicitly, by ensuring all results are returned from the remote storage, without uploading too many blobs to S3,
 /// since the current default AWS S3 pagination limit is 1000.
 /// (see https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax)
 ///
 /// Lastly, the test attempts to clean up and remove all uploaded S3 files.
 /// If any errors appear during the clean up, they get logged, but the test is not failed or stopped until the clean up is finished.
 #[test_context(MaybeEnabledS3WithTestBlobs)]
 #[tokio::test]
 async fn s3_pagination_should_work(ctx: &mut MaybeEnabledS3WithTestBlobs) -> anyhow::Result<()> {
    let ctx = match ctx {
        MaybeEnabledS3WithTestBlobs::Disabled => return Ok(()),
        MaybeEnabledS3WithTestBlobs::UploadsFailed(e, _) => anyhow::bail!("S3 init failed: {e:?}"),
        MaybeEnabledS3WithTestBlobs::Enabled(ctx) => ctx,
    };
    let storage = Arc::clone(&ctx.enabled.client);
    let base_prefix = RemotePath::new(Path::new(ctx.enabled.base_prefix))
        .context("common_prefix construction")?;

    // Root listing: only the base prefix is expected to show up.
    let listed_at_root = storage
        .list_prefixes(None)
        .await
        .context("client list root prefixes failure")?;
    let root_remote_prefixes: HashSet<_> = listed_at_root.into_iter().collect();
    assert_eq!(
        root_remote_prefixes, HashSet::from([base_prefix.clone()]),
        "remote storage root prefixes list mismatches with the uploads. Returned prefixes: {root_remote_prefixes:?}"
    );

    // Nested listing: has to match the uploaded prefixes exactly, in both directions.
    let listed_under_base = storage
        .list_prefixes(Some(&base_prefix))
        .await
        .context("client list nested prefixes failure")?;
    let listed_prefixes: HashSet<_> = listed_under_base.into_iter().collect();
    let uploaded_prefixes = &ctx.remote_prefixes;
    let remote_only_prefixes: HashSet<_> = listed_prefixes.difference(uploaded_prefixes).collect();
    let missing_uploaded_prefixes: HashSet<_> =
        uploaded_prefixes.difference(&listed_prefixes).collect();
    assert_eq!(
        remote_only_prefixes.len() + missing_uploaded_prefixes.len(), 0,
        "remote storage nested prefixes list mismatches with the uploads. Remote only prefixes: {remote_only_prefixes:?}, missing uploaded prefixes: {missing_uploaded_prefixes:?}",
    );
    Ok(())
 }
 /// Tests that the S3 client can list all files in a folder, even if the response comes paginated and requires multiple S3 queries.
 /// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and the related S3 credential env vars to be specified.
 /// The test skips the real code and passes if the env vars are not set.
 /// See `s3_pagination_should_work` for more information.
 ///
 /// First, a set of S3 objects with keys `random_prefix/folder{j}/blob_{i}.txt` is created in [`upload_simple_s3_data`].
 /// Then the following queries are performed:
 ///    1. `list_files(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt`
 ///    2. `list_files("folder1")`. This should return all files `random_prefix/folder1/blob_{i}.txt`
 #[test_context(MaybeEnabledS3WithSimpleTestBlobs)]
 #[tokio::test]
 async fn s3_list_files_works(ctx: &mut MaybeEnabledS3WithSimpleTestBlobs) -> anyhow::Result<()> {
    let ctx = match ctx {
        MaybeEnabledS3WithSimpleTestBlobs::Disabled => return Ok(()),
        MaybeEnabledS3WithSimpleTestBlobs::UploadsFailed(e, _) => {
            anyhow::bail!("S3 init failed: {e:?}")
        }
        MaybeEnabledS3WithSimpleTestBlobs::Enabled(ctx) => ctx,
    };
    let storage = Arc::clone(&ctx.enabled.client);
    let base_prefix =
        RemotePath::new(Path::new("folder1")).context("common_prefix construction")?;

    // Query 1: everything that was uploaded must be listed from the root.
    let listed_at_root = storage
        .list_files(None)
        .await
        .context("client list root files failure")?;
    let root_files: HashSet<_> = listed_at_root.into_iter().collect();
    assert_eq!(
        root_files,
        ctx.remote_blobs,
        "remote storage list_files on root mismatches with the uploads."
    );

    // Query 2: only the uploads whose path starts with "folder1" are expected.
    let listed_in_folder = storage
        .list_files(Some(&base_prefix))
        .await
        .context("client list nested files failure")?;
    let nested_remote_files: HashSet<_> = listed_in_folder.into_iter().collect();
    let trim_remote_blobs = ctx
        .remote_blobs
        .iter()
        .map(|blob| blob.get_path().to_str().expect("must be valid name"))
        .filter(|blob_name| blob_name.starts_with("folder1"))
        .map(|blob_name| RemotePath::new(Path::new(blob_name)).expect("must be valid name"))
        .collect::<HashSet<_>>();
    assert_eq!(
        nested_remote_files, trim_remote_blobs,
        "remote storage list_files on subdirrectory mismatches with the uploads."
    );
    Ok(())
 }
 #[test_context(MaybeEnabledS3)]
 #[tokio::test]
 async fn s3_delete_non_exising_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()> {
    let ctx = match ctx {
        MaybeEnabledS3::Enabled(ctx) => ctx,
        MaybeEnabledS3::Disabled => return Ok(()),
    };
    let path = RemotePath::new(&PathBuf::from(format!(
        "{}/for_sure_there_is_nothing_there_really",
        ctx.base_prefix,
    )))
    .with_context(|| "RemotePath conversion")?;
    ctx.client.delete(&path).await.expect("should succeed");
    Ok(())
 }
 #[test_context(MaybeEnabledS3)]
 #[tokio::test]
 async fn s3_delete_objects_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()> {
    let ctx = match ctx {
        MaybeEnabledS3::Enabled(ctx) => ctx,
        MaybeEnabledS3::Disabled => return Ok(()),
    };
    let path1 = RemotePath::new(&PathBuf::from(format!("{}/path1", ctx.base_prefix,)))
        .with_context(|| "RemotePath conversion")?;
    let path2 = RemotePath::new(&PathBuf::from(format!("{}/path2", ctx.base_prefix,)))
        .with_context(|| "RemotePath conversion")?;
    let path3 = RemotePath::new(&PathBuf::from(format!("{}/path3", ctx.base_prefix,)))
        .with_context(|| "RemotePath conversion")?;
    let data1 = "remote blob data1".as_bytes();
    let data1_len = data1.len();
    let data2 = "remote blob data2".as_bytes();
    let data2_len = data2.len();
    let data3 = "remote blob data3".as_bytes();
    let data3_len = data3.len();
    ctx.client
        .upload(std::io::Cursor::new(data1), data1_len, &path1, None)
        .await?;
    ctx.client
        .upload(std::io::Cursor::new(data2), data2_len, &path2, None)
        .await?;
    ctx.client
        .upload(std::io::Cursor::new(data3), data3_len, &path3, None)
        .await?;
    ctx.client.delete_objects(&[path1, path2]).await?;
    let prefixes = ctx.client.list_prefixes(None).await?;
    assert_eq!(prefixes.len(), 1);
    ctx.client.delete_objects(&[path3]).await?;
    Ok(())
 }
 fn ensure_logging_ready() {
    LOGGING_DONE.get_or_init(|| {
        utils::logging::init(
            utils::logging::LogFormat::Test,
            utils::logging::TracingErrorLayerEnablement::Disabled,
        )
        .expect("logging init failed");
    });
 }
 /// A real S3 client together with the base prefix the tests work under.
 struct EnabledS3 {
    client: Arc<GenericRemoteStorage>,
    base_prefix: &'static str,
 }
 impl EnabledS3 {
    /// Builds the context around a freshly created S3 client, forwarding
    /// `max_keys_in_list_response` to [`create_s3_client`].
    ///
    /// Panics if the client cannot be created.
    async fn setup(max_keys_in_list_response: Option<i32>) -> Self {
        let creation_result =
            create_s3_client(max_keys_in_list_response).context("S3 client creation");
        Self {
            client: creation_result.expect("S3 client creation failed"),
            base_prefix: BASE_PREFIX,
        }
    }
 }
 /// Test context without any pre-uploaded data: either a usable S3 client or a marker that
 /// the real S3 tests are disabled.
 enum MaybeEnabledS3 {
    Enabled(EnabledS3),
    Disabled,
 }
 #[async_trait::async_trait]
 impl AsyncTestContext for MaybeEnabledS3 {
    /// Returns `Disabled` unless the enabling env variable is set; otherwise creates the client
    /// without limiting the keys per list response.
    async fn setup() -> Self {
        ensure_logging_ready();
        let real_s3_enabled = env::var(ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME).is_ok();
        if real_s3_enabled {
            Self::Enabled(EnabledS3::setup(None).await)
        } else {
            info!(
                "`{}` env variable is not set, skipping the test",
                ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME
            );
            Self::Disabled
        }
    }
 }
 /// Test context for the prefix listing test.
 ///
 /// `UploadsFailed` still carries whatever was uploaded successfully, so that `teardown` can
 /// remove it.
 enum MaybeEnabledS3WithTestBlobs {
    Enabled(S3WithTestBlobs),
    Disabled,
    UploadsFailed(anyhow::Error, S3WithTestBlobs),
 }
 /// An S3 client plus the prefixes and blobs that were uploaded for the test.
 struct S3WithTestBlobs {
    enabled: EnabledS3,
    remote_prefixes: HashSet<RemotePath>,
    remote_blobs: HashSet<RemotePath>,
 }
 #[async_trait::async_trait]
 impl AsyncTestContext for MaybeEnabledS3WithTestBlobs {
    /// Returns `Disabled` unless the enabling env variable is set; otherwise creates a client
    /// limited to 10 keys per list response and uploads the test blobs with [`upload_s3_data`].
    async fn setup() -> Self {
        ensure_logging_ready();
        if env::var(ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME).is_err() {
            info!(
                "`{}` env variable is not set, skipping the test",
                ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME
            );
            return Self::Disabled;
        }

        // Upload more objects than two list responses can hold, so that the listing has to
        // go through several pages.
        let max_keys_in_list_response: i32 = 10;
        let upload_tasks_count = 2 * usize::try_from(max_keys_in_list_response).unwrap() + 1;
        let enabled = EnabledS3::setup(Some(max_keys_in_list_response)).await;

        let (uploads, all_uploaded) =
            match upload_s3_data(&enabled.client, enabled.base_prefix, upload_tasks_count).await {
                ControlFlow::Continue(uploads) => (uploads, true),
                ControlFlow::Break(uploads) => (uploads, false),
            };
        let with_blobs = S3WithTestBlobs {
            enabled,
            remote_prefixes: uploads.prefixes,
            remote_blobs: uploads.blobs,
        };
        if all_uploaded {
            info!("Remote objects created successfully");
            Self::Enabled(with_blobs)
        } else {
            Self::UploadsFailed(
                anyhow::anyhow!("One or multiple blobs failed to upload to S3"),
                with_blobs,
            )
        }
    }
    /// Removes every uploaded blob, regardless of whether all the uploads succeeded.
    async fn teardown(self) {
        if let Self::Enabled(ctx) | Self::UploadsFailed(_, ctx) = self {
            cleanup(&ctx.enabled.client, ctx.remote_blobs).await;
        }
    }
 }
 // NOTE: the setups for the list_prefixes test and the list_files test are very similar.
 // However, they are not identical: the list_prefixes function is concerned with listing prefixes,
 // whereas the list_files function is concerned with listing files.
 // See the `RemoteStorage::list_files` documentation for more details.
 enum MaybeEnabledS3WithSimpleTestBlobs {
    Enabled(S3WithSimpleTestBlobs),
    Disabled,
    UploadsFailed(anyhow::Error, S3WithSimpleTestBlobs),
 }
 /// An S3 client plus the blobs that were uploaded for the file listing test.
 struct S3WithSimpleTestBlobs {
    enabled: EnabledS3,
    remote_blobs: HashSet<RemotePath>,
 }
 #[async_trait::async_trait]
 impl AsyncTestContext for MaybeEnabledS3WithSimpleTestBlobs {
    /// Returns `Disabled` unless the enabling env variable is set; otherwise creates a client
    /// limited to 10 keys per list response and uploads the test blobs with
    /// [`upload_simple_s3_data`].
    async fn setup() -> Self {
        ensure_logging_ready();
        if env::var(ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME).is_err() {
            info!(
                "`{}` env variable is not set, skipping the test",
                ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME
            );
            return Self::Disabled;
        }

        // Upload more objects than two list responses can hold, so that the listing has to
        // go through several pages.
        let max_keys_in_list_response: i32 = 10;
        let upload_tasks_count = 2 * usize::try_from(max_keys_in_list_response).unwrap() + 1;
        let enabled = EnabledS3::setup(Some(max_keys_in_list_response)).await;

        let (remote_blobs, all_uploaded) =
            match upload_simple_s3_data(&enabled.client, upload_tasks_count).await {
                ControlFlow::Continue(uploads) => (uploads, true),
                ControlFlow::Break(uploads) => (uploads, false),
            };
        let with_blobs = S3WithSimpleTestBlobs {
            enabled,
            remote_blobs,
        };
        if all_uploaded {
            info!("Remote objects created successfully");
            Self::Enabled(with_blobs)
        } else {
            Self::UploadsFailed(
                anyhow::anyhow!("One or multiple blobs failed to upload to S3"),
                with_blobs,
            )
        }
    }
    /// Removes every uploaded blob, regardless of whether all the uploads succeeded.
    async fn teardown(self) {
        if let Self::Enabled(ctx) | Self::UploadsFailed(_, ctx) = self {
            cleanup(&ctx.enabled.client, ctx.remote_blobs).await;
        }
    }
 }
 /// Creates a client for the real S3 bucket described by the environment.
 ///
 /// The bucket name and region are read from the `REMOTE_STORAGE_S3_BUCKET` and
 /// `REMOTE_STORAGE_S3_REGION` env vars. Every client gets its own prefix inside the bucket,
 /// derived from the current time in nanoseconds since the Unix epoch.
 ///
 /// # Errors
 /// Fails if either env var is not set, if the system clock is before the Unix epoch, or if the
 /// remote storage cannot be initialized from the resulting config.
 fn create_s3_client(
    max_keys_per_list_response: Option<i32>,
 ) -> anyhow::Result<Arc<GenericRemoteStorage>> {
    let bucket_name = env::var("REMOTE_STORAGE_S3_BUCKET")
        .context("`REMOTE_STORAGE_S3_BUCKET` env var is not set, but real S3 tests are enabled")?;
    let bucket_region = env::var("REMOTE_STORAGE_S3_REGION")
        .context("`REMOTE_STORAGE_S3_REGION` env var is not set, but real S3 tests are enabled")?;

    let since_epoch = std::time::SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .context("random s3 test prefix part calculation")?;
    let random_prefix_part = since_epoch.as_nanos();

    let s3_config = S3Config {
        bucket_name,
        bucket_region,
        prefix_in_bucket: Some(format!("pagination_should_work_test_{random_prefix_part}/")),
        endpoint: None,
        concurrency_limit: NonZeroUsize::new(100).unwrap(),
        max_keys_per_list_response,
    };
    let remote_storage_config = RemoteStorageConfig {
        max_concurrent_syncs: NonZeroUsize::new(100).unwrap(),
        max_sync_errors: NonZeroU32::new(5).unwrap(),
        storage: RemoteStorageKind::AwsS3(s3_config),
    };
    let storage =
        GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?;
    Ok(Arc::new(storage))
 }
 /// What `upload_s3_data` managed to upload: the results of the upload tasks that succeeded.
 struct Uploads {
    /// The `sub_prefix_{i}` remote paths the blobs were uploaded under.
    prefixes: HashSet<RemotePath>,
    /// Full remote paths of the uploaded blobs.
    blobs: HashSet<RemotePath>,
 }
 /// Uploads `upload_tasks_count` blobs to `{base_prefix_str}/sub_prefix_{i}/blob_{i}`
 /// (`i` counts from 1), one spawned task per blob.
 ///
 /// Returns `ControlFlow::Continue` when every upload succeeded and `ControlFlow::Break` when at
 /// least one task failed. Both variants carry the prefixes and blobs that did get uploaded;
 /// failures are logged.
 async fn upload_s3_data(
    client: &Arc<GenericRemoteStorage>,
    base_prefix_str: &'static str,
    upload_tasks_count: usize,
 ) -> ControlFlow<Uploads, Uploads> {
    info!("Creating {upload_tasks_count} S3 files");
    let mut pending_uploads = JoinSet::new();
    for i in 1..=upload_tasks_count {
        let storage = Arc::clone(client);
        pending_uploads.spawn(async move {
            let prefix = PathBuf::from(format!("{base_prefix_str}/sub_prefix_{i}/"));
            let blob_prefix = RemotePath::new(&prefix)
                .with_context(|| format!("{prefix:?} to RemotePath conversion"))?;
            let blob_path = blob_prefix.join(Path::new(&format!("blob_{i}")));
            debug!("Creating remote item {i} at path {blob_path:?}");
            let data = format!("remote blob data {i}").into_bytes();
            let data_len = data.len();
            storage
                .upload(std::io::Cursor::new(data), data_len, &blob_path, None)
                .await?;
            Ok::<_, anyhow::Error>((blob_prefix, blob_path))
        });
    }

    let mut uploads = Uploads {
        prefixes: HashSet::with_capacity(upload_tasks_count),
        blobs: HashSet::with_capacity(upload_tasks_count),
    };
    let mut any_upload_failed = false;
    while let Some(joined) = pending_uploads.join_next().await {
        // Fold the join error and the upload error into a single error chain.
        let finished = joined
            .context("task join failed")
            .and_then(|task_result| task_result.context("upload task failed"));
        match finished {
            Ok((upload_prefix, upload_path)) => {
                uploads.prefixes.insert(upload_prefix);
                uploads.blobs.insert(upload_path);
            }
            Err(e) => {
                error!("Upload task failed: {e:?}");
                any_upload_failed = true;
            }
        }
    }

    if any_upload_failed {
        ControlFlow::Break(uploads)
    } else {
        ControlFlow::Continue(uploads)
    }
 }
 async fn cleanup(client: &Arc<GenericRemoteStorage>, objects_to_delete: HashSet<RemotePath>) {
    info!(
        "Removing {} objects from the remote storage during cleanup",
        objects_to_delete.len()
    );
    let mut delete_tasks = JoinSet::new();
    for object_to_delete in objects_to_delete {
        let task_client = Arc::clone(client);
        delete_tasks.spawn(async move {
            debug!("Deleting remote item at path {object_to_delete:?}");
            task_client
                .delete(&object_to_delete)
                .await
                .with_context(|| format!("{object_to_delete:?} removal"))
        });
    }
    while let Some(task_run_result) = delete_tasks.join_next().await {
        match task_run_result {
            Ok(task_result) => match task_result {
                Ok(()) => {}
                Err(e) => error!("Delete task failed: {e:?}"),
            },
            Err(join_err) => error!("Delete task did not finish correctly: {join_err}"),
        }
    }
 }
 // Uploads `upload_tasks_count` files named `folder{i / 7}/blob_{i}.txt` (`i` counts from 1),
 // one spawned task per file. See the `s3_list_files_works` description for more details.
 //
 // Returns `ControlFlow::Continue` when every upload succeeded and `ControlFlow::Break` when at
 // least one task failed; both carry the paths that did get uploaded. Failures are logged.
 async fn upload_simple_s3_data(
    client: &Arc<GenericRemoteStorage>,
    upload_tasks_count: usize,
 ) -> ControlFlow<HashSet<RemotePath>, HashSet<RemotePath>> {
    info!("Creating {upload_tasks_count} S3 files");
    let mut pending_uploads = JoinSet::new();
    for i in 1..=upload_tasks_count {
        let storage = Arc::clone(client);
        pending_uploads.spawn(async move {
            let blob_path = PathBuf::from(format!("folder{}/blob_{}.txt", i / 7, i));
            let blob_path = RemotePath::new(&blob_path)
                .with_context(|| format!("{blob_path:?} to RemotePath conversion"))?;
            debug!("Creating remote item {i} at path {blob_path:?}");
            let data = format!("remote blob data {i}").into_bytes();
            let data_len = data.len();
            storage
                .upload(std::io::Cursor::new(data), data_len, &blob_path, None)
                .await?;
            Ok::<_, anyhow::Error>(blob_path)
        });
    }

    let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count);
    let mut any_upload_failed = false;
    while let Some(joined) = pending_uploads.join_next().await {
        // Fold the join error and the upload error into a single error chain.
        let finished = joined
            .context("task join failed")
            .and_then(|task_result| task_result.context("upload task failed"));
        match finished {
            Ok(upload_path) => {
                uploaded_blobs.insert(upload_path);
            }
            Err(e) => {
                error!("Upload task failed: {e:?}");
                any_upload_failed = true;
            }
        }
    }

    if any_upload_failed {
        ControlFlow::Break(uploaded_blobs)
    } else {
        ControlFlow::Continue(uploaded_blobs)
    }
 }
--- a/libs/utils/src/fs_ext.rs
+++ b/libs/utils/src/fs_ext.rs
@@ -1,6 +1,8 @@
 /// Extensions to `std::fs` types.
 use std::{fs, io, path::Path};
 use anyhow::Context;
 pub trait PathExt {
    /// Returns an error if `self` is not a directory.
    fn is_empty_dir(&self) -> io::Result<bool>;
@@ -15,10 +17,19 @@ where
    }
 }
 /// Returns `true` if the directory at `path` contains no entries.
 ///
 /// # Errors
 /// Fails if `path` cannot be opened with `read_dir` (the error then names the path), or if
 /// reading the first directory entry fails.
 pub async fn is_directory_empty(path: impl AsRef<Path>) -> anyhow::Result<bool> {
    let mut dir = tokio::fs::read_dir(&path)
        .await
        // Build the message lazily: it is only needed on the error path.
        .with_context(|| format!("read_dir({})", path.as_ref().display()))?;
    // The directory is empty exactly when there is no first entry.
    Ok(dir.next_entry().await?.is_none())
 }
 #[cfg(test)]
 mod test {
    use std::path::PathBuf;
    use crate::fs_ext::is_directory_empty;
    #[test]
    fn is_empty_dir() {
        use super::PathExt;
@@ -42,4 +53,26 @@ mod test {
        std::fs::remove_file(&file_path).unwrap();
        assert!(file_path.is_empty_dir().is_err());
    }
    /// Exercises `is_directory_empty` on an empty directory, on a regular file and on a path
    /// that no longer exists.
    #[tokio::test]
    async fn is_empty_dir_async() {
        let tmp_dir = tempfile::tempdir().unwrap();
        let dir_path = tmp_dir.path();

        // positive case: a freshly created directory has no entries
        let fresh_dir_is_empty = is_directory_empty(dir_path).await.expect("test failure");
        assert!(fresh_dir_is_empty, "new tempdir should be empty");

        // a regular file is not a directory, so an error is expected
        let file_path: PathBuf = dir_path.join("testfile");
        drop(std::fs::File::create(&file_path).unwrap());
        assert!(is_directory_empty(&file_path).await.is_err());

        // the same path again, once the file is removed and the path is known to be nonexistent
        std::fs::remove_file(&file_path).unwrap();
        assert!(is_directory_empty(file_path).await.is_err());
    }
 }
--- a/libs/utils/src/http/endpoint.rs
+++ b/libs/utils/src/http/endpoint.rs
@@ -1,19 +1,18 @@
 use crate::auth::{Claims, JwtAuth};
 use crate::http::error::{api_error_handler, route_error_handler, ApiError};
-use anyhow::{anyhow, Context};
+use anyhow::Context;
 use hyper::header::{HeaderName, AUTHORIZATION};
 use hyper::http::HeaderValue;
 use hyper::Method;
-use hyper::{header::CONTENT_TYPE, Body, Request, Response, Server};
+use hyper::{header::CONTENT_TYPE, Body, Request, Response};
 use metrics::{register_int_counter, Encoder, IntCounter, TextEncoder};
 use once_cell::sync::Lazy;
 use routerify::ext::RequestExt;
-use routerify::{Middleware, RequestInfo, Router, RouterBuilder, RouterService};
+use routerify::{Middleware, RequestInfo, Router, RouterBuilder};
 use tokio::task::JoinError;
 use tracing::{self, debug, info, info_span, warn, Instrument};
 use std::future::Future;
 use std::net::TcpListener;
 use std::str::FromStr;
 static SERVE_METRICS_COUNT: Lazy<IntCounter> = Lazy::new(|| {
@@ -348,40 +347,6 @@ pub fn check_permission_with(
    }
 }
 ///
 /// Start listening for HTTP requests on given socket.
 ///
 /// 'shutdown_future' can be used to stop. If the Future becomes
 /// ready, we stop listening for new requests, and the function returns.
 ///
 /// The server is driven with `block_on` on a current-thread tokio runtime created here, so the
 /// call blocks the calling thread until the server finishes.
 ///
 /// Returns an error if the listener's local address cannot be read, the router cannot be built,
 /// the runtime cannot be created, the server cannot be created from the listener, or the server
 /// itself fails.
 ///
 pub fn serve_thread_main<S>(
    router_builder: RouterBuilder<hyper::Body, ApiError>,
    listener: TcpListener,
    shutdown_future: S,
 ) -> anyhow::Result<()>
 where
    S: Future<Output = ()> + Send + Sync,
 {
    info!("Starting an HTTP endpoint at {}", listener.local_addr()?);
    // Create a Service from the router above to handle incoming requests.
    // NOTE(review): a router build failure is returned as an error, but a `RouterService::new`
    // failure panics because of the `unwrap()`.
    let service = RouterService::new(router_builder.build().map_err(|err| anyhow!(err))?).unwrap();
    // Enter a single-threaded tokio runtime bound to the current thread
    let runtime = tokio::runtime::Builder::new_current_thread()
        .enable_all()
        .build()?;
    // The guard keeps the runtime context entered for the rest of the function; presumably
    // `Server::from_tcp` below needs it — TODO confirm.
    let _guard = runtime.enter();
    let server = Server::from_tcp(listener)?
        .serve(service)
        .with_graceful_shutdown(shutdown_future);
    runtime.block_on(server)?;
    Ok(())
 }
 #[cfg(test)]
 mod tests {
    use super::*;
--- a/libs/utils/src/http/error.rs
+++ b/libs/utils/src/http/error.rs
@@ -1,5 +1,6 @@
 use hyper::{header, Body, Response, StatusCode};
 use serde::{Deserialize, Serialize};
 use std::error::Error as StdError;
 use thiserror::Error;
 use tracing::error;
@@ -15,13 +16,13 @@ pub enum ApiError {
    Unauthorized(String),
    #[error("NotFound: {0}")]
-    NotFound(anyhow::Error),
+    NotFound(Box<dyn StdError + Send + Sync + 'static>),
    #[error("Conflict: {0}")]
    Conflict(String),
    #[error("Precondition failed: {0}")]
-    PreconditionFailed(&'static str),
+    PreconditionFailed(Box<str>),
    #[error(transparent)]
    InternalServerError(anyhow::Error),
--- a/pageserver/benches/bench_layer_map.rs
+++ b/pageserver/benches/bench_layer_map.rs
@@ -33,7 +33,7 @@ fn build_layer_map(filename_dump: PathBuf) -> LayerMap<LayerDescriptor> {
        min_lsn = min(min_lsn, lsn_range.start);
        max_lsn = max(max_lsn, Lsn(lsn_range.end.0 - 1));
-        updates.insert_historic(Arc::new(layer));
+        updates.insert_historic(layer.get_persistent_layer_desc(), Arc::new(layer));
    }
    println!("min: {min_lsn}, max: {max_lsn}");
@@ -215,7 +215,7 @@ fn bench_sequential(c: &mut Criterion) {
            is_incremental: false,
            short_id: format!("Layer {}", i),
        };
-        updates.insert_historic(Arc::new(layer));
+        updates.insert_historic(layer.get_persistent_layer_desc(), Arc::new(layer));
    }
    updates.flush();
    println!("Finished layer map init in {:?}", now.elapsed());
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -335,31 +335,116 @@ fn start_pageserver(
    // Set up remote storage client
    let remote_storage = create_remote_storage_client(conf)?;
-    // All tenant load operations carry this while they are ongoing; it will be dropped once those
+    // Startup staging or optimizing:
-    // operations finish either successfully or in some other manner. However, the initial load
+    //
-    // will be then done, and we can start the global background tasks.
+    // We want to minimize downtime for `page_service` connections, and trying not to overload
    // BACKGROUND_RUNTIME by doing initial compactions and initial logical sizes at the same time.
    //
    // init_done_rx will notify when all initial load operations have completed.
    //
    // background_jobs_can_start (same name used to hold off background jobs from starting at
    // consumer side) will be dropped once we can start the background jobs. Currently it is behind
    // completing all initial logical size calculations (init_logical_size_done_rx) and a timeout
    // (background_task_maximum_delay).
    let (init_done_tx, init_done_rx) = utils::completion::channel();
    let (init_logical_size_done_tx, init_logical_size_done_rx) = utils::completion::channel();
    let (background_jobs_can_start, background_jobs_barrier) = utils::completion::channel();
    let order = pageserver::InitializationOrder {
        initial_tenant_load: Some(init_done_tx),
        initial_logical_size_can_start: init_done_rx.clone(),
        initial_logical_size_attempt: init_logical_size_done_tx,
        background_jobs_can_start: background_jobs_barrier.clone(),
    };
    // Scan the local 'tenants/' directory and start loading the tenants
    let init_started_at = std::time::Instant::now();
    let shutdown_pageserver = tokio_util::sync::CancellationToken::new();
    BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
        conf,
        broker_client.clone(),
        remote_storage.clone(),
-        (init_done_tx, init_done_rx.clone()),
+        order,
    ))?;
    BACKGROUND_RUNTIME.spawn({
-        let init_done_rx = init_done_rx.clone();
+        let init_done_rx = init_done_rx;
-        async move {
+        let shutdown_pageserver = shutdown_pageserver.clone();
-            init_done_rx.wait().await;
+        let drive_init = async move {
            // NOTE: unlike many futures in pageserver, this one is cancellation-safe
            let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial load completed"));
-            let elapsed = init_started_at.elapsed();
+            init_done_rx.wait().await;
            // initial logical sizes can now start, as they were waiting on init_done_rx.
            scopeguard::ScopeGuard::into_inner(guard);
            let init_done = std::time::Instant::now();
            let elapsed = init_done - init_started_at;
            tracing::info!(
                elapsed_millis = elapsed.as_millis(),
-                "Initial load completed."
+                "Initial load completed"
            );
            let mut init_sizes_done = std::pin::pin!(init_logical_size_done_rx.wait());
            let timeout = conf.background_task_maximum_delay;
            let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial logical sizes completed"));
            let init_sizes_done = tokio::select! {
                _ = &mut init_sizes_done => {
                    let now = std::time::Instant::now();
                    tracing::info!(
                        from_init_done_millis = (now - init_done).as_millis(),
                        from_init_millis = (now - init_started_at).as_millis(),
                        "Initial logical sizes completed"
                    );
                    None
                }
                _ = tokio::time::sleep(timeout) => {
                    tracing::info!(
                        timeout_millis = timeout.as_millis(),
                        "Initial logical size timeout elapsed; starting background jobs"
                    );
                    Some(init_sizes_done)
                }
            };
            scopeguard::ScopeGuard::into_inner(guard);
            // allow background jobs to start
            drop(background_jobs_can_start);
            if let Some(init_sizes_done) = init_sizes_done {
                // ending up here is not a bug; at the latest logical sizes will be queried by
                // consumption metrics.
                let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial logical sizes completed"));
                init_sizes_done.await;
                scopeguard::ScopeGuard::into_inner(guard);
                let now = std::time::Instant::now();
                tracing::info!(
                    from_init_done_millis = (now - init_done).as_millis(),
                    from_init_millis = (now - init_started_at).as_millis(),
                    "Initial logical sizes completed after timeout (background jobs already started)"
                );
            }
        };
        async move {
            let mut drive_init = std::pin::pin!(drive_init);
            // just race these tasks
            tokio::select! {
                _ = shutdown_pageserver.cancelled() => {},
                _ = &mut drive_init => {},
            }
        }
    });
@@ -374,7 +459,7 @@ fn start_pageserver(
            conf,
            remote_storage.clone(),
            disk_usage_eviction_state.clone(),
-            init_done_rx.clone(),
+            background_jobs_barrier.clone(),
        )?;
    }
@@ -410,45 +495,50 @@ fn start_pageserver(
                Ok(())
            },
        );
    }
-        if let Some(metric_collection_endpoint) = &conf.metric_collection_endpoint {
+    if let Some(metric_collection_endpoint) = &conf.metric_collection_endpoint {
-            let init_done_rx = init_done_rx;
+        let background_jobs_barrier = background_jobs_barrier;
-            let metrics_ctx = RequestContext::todo_child(
+        let metrics_ctx = RequestContext::todo_child(
-                TaskKind::MetricsCollection,
+            TaskKind::MetricsCollection,
-                // This task itself shouldn't download anything.
+            // This task itself shouldn't download anything.
-                // The actual size calculation does need downloads, and
+            // The actual size calculation does need downloads, and
-                // creates a child context with the right DownloadBehavior.
+            // creates a child context with the right DownloadBehavior.
-                DownloadBehavior::Error,
+            DownloadBehavior::Error,
-            );
+        );
-            task_mgr::spawn(
+        task_mgr::spawn(
-                MGMT_REQUEST_RUNTIME.handle(),
+            crate::BACKGROUND_RUNTIME.handle(),
-                TaskKind::MetricsCollection,
+            TaskKind::MetricsCollection,
-                None,
+            None,
-                None,
+            None,
-                "consumption metrics collection",
+            "consumption metrics collection",
-                true,
+            true,
-                async move {
+            async move {
-                    // first wait for initial load to complete before first iteration.
+                // first wait until background jobs are cleared to launch.
-                    //
+                //
-                    // this is because we only process active tenants and timelines, and the
+                // this is because we only process active tenants and timelines, and the
-                    // Timeline::get_current_logical_size will spawn the logical size calculation,
+                // Timeline::get_current_logical_size will spawn the logical size calculation,
-                    // which will not be rate-limited.
+                // which will not be rate-limited.
-                    init_done_rx.wait().await;
+                let cancel = task_mgr::shutdown_token();
-                    pageserver::consumption_metrics::collect_metrics(
+                tokio::select! {
-                        metric_collection_endpoint,
+                    _ = cancel.cancelled() => { return Ok(()); },
-                        conf.metric_collection_interval,
+                    _ = background_jobs_barrier.wait() => {}
-                        conf.cached_metric_collection_interval,
+                };
-                        conf.synthetic_size_calculation_interval,
+
-                        conf.id,
+                pageserver::consumption_metrics::collect_metrics(
-                        metrics_ctx,
+                    metric_collection_endpoint,
-                    )
+                    conf.metric_collection_interval,
-                    .instrument(info_span!("metrics_collection"))
+                    conf.cached_metric_collection_interval,
-                    .await?;
+                    conf.synthetic_size_calculation_interval,
-                    Ok(())
+                    conf.id,
-                },
+                    metrics_ctx,
-            );
+                )
-        }
+                .instrument(info_span!("metrics_collection"))
                .await?;
                Ok(())
            },
        );
    }
    // Spawn a task to listen for libpq connections. It will spawn further tasks
@@ -483,6 +573,8 @@ fn start_pageserver(
        );
    }
    let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard());
    // All started up! Now just sit and wait for shutdown signal.
    ShutdownSignals::handle(|signal| match signal {
        Signal::Quit => {
@@ -498,6 +590,11 @@ fn start_pageserver(
                "Got {}. Terminating gracefully in fast shutdown mode",
                signal.name()
            );
            // This cancels the `shutdown_pageserver` cancellation tree.
            // Right now that tree doesn't reach very far, and `task_mgr` is used instead.
            // The plan is to change that over time.
            shutdown_pageserver.take();
            BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(0));
            unreachable!()
        }
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -63,6 +63,7 @@ pub mod defaults {
    pub const DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL: &str = "1 hour";
    pub const DEFAULT_METRIC_COLLECTION_ENDPOINT: Option<reqwest::Url> = None;
    pub const DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL: &str = "10 min";
    pub const DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY: &str = "10s";
    ///
    /// Default built-in configuration file.
@@ -91,15 +92,16 @@ pub mod defaults {
 #cached_metric_collection_interval = '{DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL}'
 #synthetic_size_calculation_interval = '{DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL}'
 #disk_usage_based_eviction = {{ max_usage_pct = .., min_avail_bytes = .., period = "10s"}}
-# [tenant_config]
+#background_task_maximum_delay = '{DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY}'
 [tenant_config]
 #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
 #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
 #compaction_target_size = {DEFAULT_COMPACTION_TARGET_SIZE} # in bytes
 #compaction_period = '{DEFAULT_COMPACTION_PERIOD}'
-#compaction_threshold = '{DEFAULT_COMPACTION_THRESHOLD}'
+#compaction_threshold = {DEFAULT_COMPACTION_THRESHOLD}
 #gc_period = '{DEFAULT_GC_PERIOD}'
 #gc_horizon = {DEFAULT_GC_HORIZON}
@@ -109,7 +111,8 @@ pub mod defaults {
 #min_resident_size_override = .. # in bytes
 #evictions_low_residence_duration_metric_threshold = '{DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD}'
 #gc_feedback = false
-# [remote_storage]
+
 [remote_storage]
 "###
    );
@@ -187,6 +190,15 @@ pub struct PageServerConf {
    pub test_remote_failures: u64,
    pub ondemand_download_behavior_treat_error_as_warn: bool,
    /// How long will background tasks be delayed at most after initial load of tenants.
    ///
    /// Our largest initialization completions are in the range of 100-200s, so perhaps 10s works
    /// as we now isolate initial loading, initial logical size calculation and background tasks.
    /// Smaller nodes will have background tasks "not running" for this long unless every timeline
    /// has its initial logical size calculated. Not running background tasks for some seconds is
    /// not terrible.
    pub background_task_maximum_delay: Duration,
 }
 /// We do not want to store this in a PageServerConf because the latter may be logged
@@ -259,6 +271,8 @@ struct PageServerConfigBuilder {
    test_remote_failures: BuilderValue<u64>,
    ondemand_download_behavior_treat_error_as_warn: BuilderValue<bool>,
    background_task_maximum_delay: BuilderValue<Duration>,
 }
 impl Default for PageServerConfigBuilder {
@@ -316,6 +330,11 @@ impl Default for PageServerConfigBuilder {
            test_remote_failures: Set(0),
            ondemand_download_behavior_treat_error_as_warn: Set(false),
            background_task_maximum_delay: Set(humantime::parse_duration(
                DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY,
            )
            .unwrap()),
        }
    }
 }
@@ -440,6 +459,10 @@ impl PageServerConfigBuilder {
            BuilderValue::Set(ondemand_download_behavior_treat_error_as_warn);
    }
    /// Sets the `background_task_maximum_delay` builder value to `delay`, replacing whatever
    /// value (default or previously set) the builder held for it.
    pub fn background_task_maximum_delay(&mut self, delay: Duration) {
        self.background_task_maximum_delay = BuilderValue::Set(delay);
    }
    pub fn build(self) -> anyhow::Result<PageServerConf> {
        let concurrent_tenant_size_logical_size_queries = self
            .concurrent_tenant_size_logical_size_queries
@@ -522,6 +545,9 @@ impl PageServerConfigBuilder {
                .ok_or(anyhow!(
                    "missing ondemand_download_behavior_treat_error_as_warn"
                ))?,
            background_task_maximum_delay: self
                .background_task_maximum_delay
                .ok_or(anyhow!("missing background_task_maximum_delay"))?,
        })
    }
 }
@@ -710,6 +736,7 @@ impl PageServerConf {
                    )
                },
                "ondemand_download_behavior_treat_error_as_warn" => builder.ondemand_download_behavior_treat_error_as_warn(parse_toml_bool(key, item)?),
                "background_task_maximum_delay" => builder.background_task_maximum_delay(parse_toml_duration(key, item)?),
                _ => bail!("unrecognized pageserver option '{key}'"),
            }
        }
@@ -877,6 +904,7 @@ impl PageServerConf {
            disk_usage_based_eviction: None,
            test_remote_failures: 0,
            ondemand_download_behavior_treat_error_as_warn: false,
            background_task_maximum_delay: Duration::ZERO,
        }
    }
 }
@@ -1036,6 +1064,7 @@ metric_collection_endpoint = 'http://localhost:80/metrics'
 synthetic_size_calculation_interval = '333 s'
 log_format = 'json'
 background_task_maximum_delay = '334 s'
 "#;
@@ -1094,6 +1123,9 @@ log_format = 'json'
                disk_usage_based_eviction: None,
                test_remote_failures: 0,
                ondemand_download_behavior_treat_error_as_warn: false,
                background_task_maximum_delay: humantime::parse_duration(
                    defaults::DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY
                )?,
            },
            "Correct defaults should be used when no config values are provided"
        );
@@ -1148,6 +1180,7 @@ log_format = 'json'
                disk_usage_based_eviction: None,
                test_remote_failures: 0,
                ondemand_download_behavior_treat_error_as_warn: false,
                background_task_maximum_delay: Duration::from_secs(334),
            },
            "Should be able to parse all basic config values correctly"
        );
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -83,7 +83,7 @@ pub fn launch_disk_usage_global_eviction_task(
    conf: &'static PageServerConf,
    storage: GenericRemoteStorage,
    state: Arc<State>,
-    init_done: completion::Barrier,
+    background_jobs_barrier: completion::Barrier,
 ) -> anyhow::Result<()> {
    let Some(task_config) = &conf.disk_usage_based_eviction else {
        info!("disk usage based eviction task not configured");
@@ -100,17 +100,16 @@ pub fn launch_disk_usage_global_eviction_task(
        "disk usage based eviction",
        false,
        async move {
-            // wait until initial load is complete, because we cannot evict from loading tenants.
+            let cancel = task_mgr::shutdown_token();
            init_done.wait().await;
-            disk_usage_eviction_task(
+            // wait until initial load is complete, because we cannot evict from loading tenants.
-                &state,
+            tokio::select! {
-                task_config,
+                _ = cancel.cancelled() => { return Ok(()); },
-                storage,
+                _ = background_jobs_barrier.wait() => { }
-                &conf.tenants_path(),
+            };
-                task_mgr::shutdown_token(),
+
-            )
+            disk_usage_eviction_task(&state, task_config, storage, &conf.tenants_path(), cancel)
-            .await;
+                .await;
            info!("disk usage based eviction task finishing");
            Ok(())
        },
@@ -517,7 +516,7 @@ async fn collect_eviction_candidates(
            if !tl.is_active() {
                continue;
            }
-            let info = tl.get_local_layers_for_disk_usage_eviction();
+            let info = tl.get_local_layers_for_disk_usage_eviction().await;
            debug!(tenant_id=%tl.tenant_id, timeline_id=%tl.timeline_id, "timeline resident layers count: {}", info.resident_layers.len());
            tenant_candidates.extend(
                info.resident_layers
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -186,10 +186,8 @@ paths:
              schema:
                $ref: "#/components/schemas/Error"
    delete:
-      description: "Attempts to delete specified timeline. On 500 errors should be retried"
+      description: "Attempts to delete specified timeline. 500 and 409 errors should be retried"
      responses:
        "200":
          description: Ok
        "400":
          description: Error when no tenant id found in path or no timeline id
          content:
@@ -214,8 +212,14 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/NotFoundError"
        "409":
          description: Deletion is already in progress, continue polling
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/ConflictError"
        "412":
-          description: Tenant is missing
+          description: Tenant is missing, or timeline has children
          content:
            application/json:
              schema:
@@ -386,6 +390,7 @@ paths:
        "202":
          description: Tenant attaching scheduled
        "400":
          description: Bad Request
          content:
            application/json:
              schema:
@@ -928,12 +933,28 @@ components:
              writing to the tenant's S3 state, so, DO NOT ATTACH the
              tenant to any other pageserver, or we risk split-brain.
            - `attached` means that the attach operation has completed,
-              maybe successfully, maybe not. Perform a health check at
+              successfully
-              the Postgres level to determine healthiness of the tenant.
+            - `failed` means that attach has failed. For the reason, check the corresponding `reason` field.
              `failed` is the terminal state; retrying the attach call won't resolve the issue.
              For example, this can be caused by S3 being unreachable. The retry may be implemented
              with a call to detach, though it would be better not to automate it and to inspect the failed state
              manually before proceeding with a retry.
            See the tenant `/attach` endpoint for more information.
-          type: string
+          type: object
-          enum: [ "maybe", "attached" ]
+          required:
            - slug
            - data
          properties:
            slug:
              type: string
              enum: [ "maybe", "attached", "failed" ]
            data:
              type: object
              properties:
                reason:
                  type: string
    TenantCreateRequest:
      allOf:
        - $ref: '#/components/schemas/TenantConfig'
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -142,7 +142,7 @@ impl From<TenantMapInsertError> for ApiError {
 impl From<TenantStateError> for ApiError {
    fn from(tse: TenantStateError) -> ApiError {
        match tse {
-            TenantStateError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid)),
+            TenantStateError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid).into()),
            _ => ApiError::InternalServerError(anyhow::Error::new(tse)),
        }
    }
@@ -151,7 +151,7 @@ impl From<TenantStateError> for ApiError {
 impl From<GetTenantError> for ApiError {
    fn from(tse: GetTenantError) -> ApiError {
        match tse {
-            GetTenantError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid)),
+            GetTenantError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid).into()),
            e @ GetTenantError::NotActive(_) => {
                // Why is this not `ApiError::NotFound`?
                // Because we must be careful to never return 404 for a tenant if it does
@@ -169,7 +169,7 @@ impl From<SetNewTenantConfigError> for ApiError {
    fn from(e: SetNewTenantConfigError) -> ApiError {
        match e {
            SetNewTenantConfigError::GetTenant(tid) => {
-                ApiError::NotFound(anyhow!("tenant {}", tid))
+                ApiError::NotFound(anyhow!("tenant {}", tid).into())
            }
            e @ SetNewTenantConfigError::Persist(_) => {
                ApiError::InternalServerError(anyhow::Error::new(e))
@@ -182,10 +182,12 @@ impl From<crate::tenant::DeleteTimelineError> for ApiError {
    fn from(value: crate::tenant::DeleteTimelineError) -> Self {
        use crate::tenant::DeleteTimelineError::*;
        match value {
-            NotFound => ApiError::NotFound(anyhow::anyhow!("timeline not found")),
+            NotFound => ApiError::NotFound(anyhow::anyhow!("timeline not found").into()),
-            HasChildren => ApiError::BadRequest(anyhow::anyhow!(
+            HasChildren(children) => ApiError::PreconditionFailed(
-                "Cannot delete timeline which has child timelines"
+                format!("Cannot delete timeline which has child timelines: {children:?}")
-            )),
+                    .into_boxed_str(),
            ),
            a @ AlreadyInProgress => ApiError::Conflict(a.to_string()),
            Other(e) => ApiError::InternalServerError(e),
        }
    }
@@ -197,9 +199,9 @@ impl From<crate::tenant::mgr::DeleteTimelineError> for ApiError {
        match value {
            // Report Precondition failed so client can distinguish between
            // "tenant is missing" case from "timeline is missing"
-            Tenant(GetTenantError::NotFound(..)) => {
+            Tenant(GetTenantError::NotFound(..)) => ApiError::PreconditionFailed(
-                ApiError::PreconditionFailed("Requested tenant is missing")
+                "Requested tenant is missing".to_owned().into_boxed_str(),
-            }
+            ),
            Tenant(t) => ApiError::from(t),
            Timeline(t) => ApiError::from(t),
        }
@@ -214,7 +216,7 @@ async fn build_timeline_info(
 ) -> anyhow::Result<TimelineInfo> {
    crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id();
-    let mut info = build_timeline_info_common(timeline, ctx)?;
+    let mut info = build_timeline_info_common(timeline, ctx).await?;
    if include_non_incremental_logical_size {
        // XXX we should be using spawn_ondemand_logical_size_calculation here.
        // Otherwise, if someone deletes the timeline / detaches the tenant while
@@ -232,7 +234,7 @@ async fn build_timeline_info(
    Ok(info)
 }
-fn build_timeline_info_common(
+async fn build_timeline_info_common(
    timeline: &Arc<Timeline>,
    ctx: &RequestContext,
 ) -> anyhow::Result<TimelineInfo> {
@@ -263,7 +265,7 @@ fn build_timeline_info_common(
            None
        }
    };
-    let current_physical_size = Some(timeline.layer_size_sum());
+    let current_physical_size = Some(timeline.layer_size_sum().await);
    let state = timeline.current_state();
    let remote_consistent_lsn = timeline.get_remote_consistent_lsn().unwrap_or(Lsn(0));
@@ -329,6 +331,7 @@ async fn timeline_create_handler(
            Ok(Some(new_timeline)) => {
                // Created. Construct a TimelineInfo for it.
                let timeline_info = build_timeline_info_common(&new_timeline, &ctx)
                    .await
                    .map_err(ApiError::InternalServerError)?;
                json_response(StatusCode::CREATED, timeline_info)
            }
@@ -395,7 +398,7 @@ async fn timeline_detail_handler(
        let timeline = tenant
            .get_timeline(timeline_id, false)
-            .map_err(ApiError::NotFound)?;
+            .map_err(|e| ApiError::NotFound(e.into()))?;
        let timeline_info = build_timeline_info(
            &timeline,
@@ -494,7 +497,8 @@ async fn timeline_delete_handler(
        .instrument(info_span!("timeline_delete", tenant = %tenant_id, timeline = %timeline_id))
        .await?;
-    json_response(StatusCode::OK, ())
+    // FIXME: needs to be an error for console to retry it. Ideally Accepted should be used and retried until 404.
    json_response(StatusCode::ACCEPTED, ())
 }
 async fn tenant_detach_handler(
@@ -589,7 +593,7 @@ async fn tenant_status(
        // Calculate total physical size of all timelines
        let mut current_physical_size = 0;
        for timeline in tenant.list_timelines().iter() {
-            current_physical_size += timeline.layer_size_sum();
+            current_physical_size += timeline.layer_size_sum().await;
        }
        let state = tenant.current_state();
@@ -699,7 +703,7 @@ async fn layer_map_info_handler(
    check_permission(&request, Some(tenant_id))?;
    let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
-    let layer_map_info = timeline.layer_map_info(reset);
+    let layer_map_info = timeline.layer_map_info(reset).await;
    json_response(StatusCode::OK, layer_map_info)
 }
@@ -1058,7 +1062,7 @@ async fn timeline_download_remote_layers_handler_get(
    let info = timeline
        .get_download_all_remote_layers_task_info()
        .context("task never started since last pageserver process start")
-        .map_err(ApiError::NotFound)?;
+        .map_err(|e| ApiError::NotFound(e.into()))?;
    json_response(StatusCode::OK, info)
 }
@@ -1069,7 +1073,7 @@ async fn active_timeline_of_active_tenant(
    let tenant = mgr::get_tenant(tenant_id, true).await?;
    tenant
        .get_timeline(timeline_id, true)
-        .map_err(ApiError::NotFound)
+        .map_err(|e| ApiError::NotFound(e.into()))
 }
 async fn always_panic_handler(
@@ -1125,8 +1129,6 @@ async fn disk_usage_eviction_run(
        freed_bytes: 0,
    };
    use crate::task_mgr::MGMT_REQUEST_RUNTIME;
    let (tx, rx) = tokio::sync::oneshot::channel();
    let state = get_state(&r);
@@ -1144,7 +1146,7 @@ async fn disk_usage_eviction_run(
    let _g = cancel.drop_guard();
    crate::task_mgr::spawn(
-        MGMT_REQUEST_RUNTIME.handle(),
+        crate::task_mgr::BACKGROUND_RUNTIME.handle(),
        TaskKind::DiskUsageEviction,
        None,
        None,
--- a/pageserver/src/import_datadir.rs
+++ b/pageserver/src/import_datadir.rs
@@ -75,12 +75,12 @@ pub async fn import_timeline_from_postgres_datadir(
            {
                pg_control = Some(control_file);
            }
-            modification.flush()?;
+            modification.flush().await?;
        }
    }
    // We're done importing all the data files.
-    modification.commit()?;
+    modification.commit().await?;
    // We expect the Postgres server to be shut down cleanly.
    let pg_control = pg_control.context("pg_control file not found")?;
@@ -148,17 +148,17 @@ async fn import_rel(
    // because there is no guarantee about the order in which we are processing segments.
    // ignore "relation already exists" error
    //
-    // FIXME: use proper error type for this, instead of parsing the error message.
+    // FIXME: Keep track of which relations we've already created?
    // Or better yet, keep track of which relations we've already created
    // https://github.com/neondatabase/neon/issues/3309
    if let Err(e) = modification
        .put_rel_creation(rel, nblocks as u32, ctx)
        .await
    {
-        if e.to_string().contains("already exists") {
+        match e {
-            debug!("relation {} already exists. we must be extending it", rel);
+            RelationError::AlreadyExists => {
-        } else {
+                debug!("Relation {} already exist. We must be extending it.", rel)
-            return Err(e);
+            }
            _ => return Err(e.into()),
        }
    }
@@ -359,7 +359,7 @@ pub async fn import_basebackup_from_tar(
                    // We found the pg_control file.
                    pg_control = Some(res);
                }
-                modification.flush()?;
+                modification.flush().await?;
            }
            tokio_tar::EntryType::Directory => {
                debug!("directory {:?}", file_path);
@@ -377,7 +377,7 @@ pub async fn import_basebackup_from_tar(
    // sanity check: ensure that pg_control is loaded
    let _pg_control = pg_control.context("pg_control file not found")?;
-    modification.commit()?;
+    modification.commit().await?;
    Ok(())
 }
@@ -594,7 +594,7 @@ async fn import_file(
        // zenith.signal is not necessarily the last file, that we handle
        // but it is ok to call `finish_write()`, because final `modification.commit()`
        // will update lsn once more to the final one.
-        let writer = modification.tline.writer();
+        let writer = modification.tline.writer().await;
        writer.finish_write(prev_lsn);
        debug!("imported zenith signal {}", prev_lsn);
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -58,12 +58,6 @@ pub async fn shutdown_pageserver(exit_code: i32) {
    // the checkpoint and GC tasks.
    tenant::mgr::shutdown_all_tenants().await;
    // Stop syncing with remote storage.
    //
    // FIXME: Does this wait for the sync tasks to finish syncing what's queued up?
    // Should it?
    task_mgr::shutdown_tasks(Some(TaskKind::RemoteUploadTask), None, None).await;
    // Shut down the HTTP endpoint last, so that you can still check the server's
    // status while it's shutting down.
    // FIXME: We should probably stop accepting commands like attach/detach earlier.
@@ -138,6 +132,29 @@ pub fn is_uninit_mark(path: &Path) -> bool {
    }
 }
 /// During pageserver startup, we need to order operations not to exhaust tokio worker threads by
 /// blocking.
 ///
 /// This struct groups the completion handles and barriers used to sequence the startup phases:
 /// initial tenant load, initial logical size calculation, and finally background jobs.
 ///
 /// The instances of this value exist only during startup, otherwise `None` is provided, meaning no
 /// delaying is needed.
 #[derive(Clone)]
 pub struct InitializationOrder {
    /// Each initial tenant load task carries this until completion.
    pub initial_tenant_load: Option<utils::completion::Completion>,

    /// Barrier for when we can start initial logical size calculations.
    pub initial_logical_size_can_start: utils::completion::Barrier,

    /// Each timeline owns a clone of this to be consumed on the initial logical size calculation
    /// attempt. It is important to drop this once the attempt has completed.
    pub initial_logical_size_attempt: utils::completion::Completion,

    /// Barrier for when we can start any background jobs.
    ///
    /// This can be broken up later on, but right now there is just one class of a background job.
    // NOTE(review): presumably the same kind of barrier that background tasks `wait()` on before
    // entering their main loop -- confirm against the task launch sites.
    pub background_jobs_can_start: utils::completion::Barrier,
 }
 #[cfg(test)]
 mod backoff_defaults_tests {
    use super::*;
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -1,4 +1,4 @@
-use metrics::core::{AtomicU64, GenericCounter};
+use metrics::metric_vec_duration::DurationResultObserver;
 use metrics::{
    register_counter_vec, register_histogram, register_histogram_vec, register_int_counter,
    register_int_counter_vec, register_int_gauge, register_int_gauge_vec, register_uint_gauge_vec,
@@ -95,21 +95,19 @@ static READ_NUM_FS_LAYERS: Lazy<HistogramVec> = Lazy::new(|| {
 });
 // Metrics collected on operations on the storage repository.
-static RECONSTRUCT_TIME: Lazy<HistogramVec> = Lazy::new(|| {
+pub static RECONSTRUCT_TIME: Lazy<Histogram> = Lazy::new(|| {
-    register_histogram_vec!(
+    register_histogram!(
        "pageserver_getpage_reconstruct_seconds",
-        "Time spent in reconstruct_value",
+        "Time spent in reconstruct_value (reconstruct a page from deltas)",
        &["tenant_id", "timeline_id"],
        CRITICAL_OP_BUCKETS.into(),
    )
    .expect("failed to define a metric")
 });
-static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy<IntCounterVec> = Lazy::new(|| {
+pub static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy<IntCounter> = Lazy::new(|| {
-    register_int_counter_vec!(
+    register_int_counter!(
        "pageserver_materialized_cache_hits_direct_total",
        "Number of cache hits from materialized page cache without redo",
        &["tenant_id", "timeline_id"]
    )
    .expect("failed to define a metric")
 });
@@ -124,11 +122,10 @@ static GET_RECONSTRUCT_DATA_TIME: Lazy<HistogramVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });
-static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounterVec> = Lazy::new(|| {
+pub static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounter> = Lazy::new(|| {
-    register_int_counter_vec!(
+    register_int_counter!(
        "pageserver_materialized_cache_hits_total",
        "Number of cache hits from materialized page cache",
        &["tenant_id", "timeline_id"]
    )
    .expect("failed to define a metric")
 });
@@ -428,6 +425,27 @@ pub static SMGR_QUERY_TIME: Lazy<HistogramVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });
 /// Newtype around the basebackup query duration histogram, so that
 /// [`DurationResultObserver`] can be implemented for it.
 pub struct BasebackupQueryTime(HistogramVec);
 /// Lazily registered histogram of basebackup query durations, labelled by `result`.
 pub static BASEBACKUP_QUERY_TIME: Lazy<BasebackupQueryTime> = Lazy::new(|| {
    BasebackupQueryTime({
        register_histogram_vec!(
            "pageserver_basebackup_query_seconds",
            "Histogram of basebackup queries durations, by result type",
            &["result"],
            CRITICAL_OP_BUCKETS.into(),
        )
        .expect("failed to define a metric")
    })
 });
 impl DurationResultObserver for BasebackupQueryTime {
    /// Records `duration` (in seconds) in the histogram, under the `result` label value
    /// `"ok"` when `res` is `Ok` and `"error"` otherwise.
    fn observe_result<T, E>(&self, res: &Result<T, E>, duration: std::time::Duration) {
        let label_value = if res.is_ok() { "ok" } else { "error" };
        // The histogram is registered with the single label `result`, so exactly one label value
        // is passed here; the `unwrap` panics if the lookup fails.
        let metric = self.0.get_metric_with_label_values(&[label_value]).unwrap();
        metric.observe(duration.as_secs_f64());
    }
 }
 pub static LIVE_CONNECTIONS_COUNT: Lazy<IntGaugeVec> = Lazy::new(|| {
    register_int_gauge_vec!(
        "pageserver_live_connections",
@@ -752,10 +770,7 @@ impl StorageTimeMetrics {
 pub struct TimelineMetrics {
    tenant_id: String,
    timeline_id: String,
    pub reconstruct_time_histo: Histogram,
    pub get_reconstruct_data_time_histo: Histogram,
    pub materialized_page_cache_hit_counter: GenericCounter<AtomicU64>,
    pub materialized_page_cache_hit_upon_request_counter: GenericCounter<AtomicU64>,
    pub flush_time_histo: StorageTimeMetrics,
    pub compact_time_histo: StorageTimeMetrics,
    pub create_images_time_histo: StorageTimeMetrics,
@@ -783,15 +798,9 @@ impl TimelineMetrics {
    ) -> Self {
        let tenant_id = tenant_id.to_string();
        let timeline_id = timeline_id.to_string();
        let reconstruct_time_histo = RECONSTRUCT_TIME
            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
            .unwrap();
        let get_reconstruct_data_time_histo = GET_RECONSTRUCT_DATA_TIME
            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
            .unwrap();
        let materialized_page_cache_hit_counter = MATERIALIZED_PAGE_CACHE_HIT
            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
            .unwrap();
        let flush_time_histo =
            StorageTimeMetrics::new(StorageTimeOperation::LayerFlush, &tenant_id, &timeline_id);
        let compact_time_histo =
@@ -833,19 +842,13 @@ impl TimelineMetrics {
        let read_num_fs_layers = READ_NUM_FS_LAYERS
            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
            .unwrap();
        let materialized_page_cache_hit_upon_request_counter = MATERIALIZED_PAGE_CACHE_HIT_DIRECT
            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
            .unwrap();
        let evictions_with_low_residence_duration =
            evictions_with_low_residence_duration_builder.build(&tenant_id, &timeline_id);
        TimelineMetrics {
            tenant_id,
            timeline_id,
            reconstruct_time_histo,
            get_reconstruct_data_time_histo,
            materialized_page_cache_hit_counter,
            materialized_page_cache_hit_upon_request_counter,
            flush_time_histo,
            compact_time_histo,
            create_images_time_histo,
@@ -872,10 +875,7 @@ impl Drop for TimelineMetrics {
    fn drop(&mut self) {
        let tenant_id = &self.tenant_id;
        let timeline_id = &self.timeline_id;
        let _ = RECONSTRUCT_TIME.remove_label_values(&[tenant_id, timeline_id]);
        let _ = GET_RECONSTRUCT_DATA_TIME.remove_label_values(&[tenant_id, timeline_id]);
        let _ = MATERIALIZED_PAGE_CACHE_HIT.remove_label_values(&[tenant_id, timeline_id]);
        let _ = MATERIALIZED_PAGE_CACHE_HIT_DIRECT.remove_label_values(&[tenant_id, timeline_id]);
        let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, timeline_id]);
        let _ = WAIT_LSN_TIME.remove_label_values(&[tenant_id, timeline_id]);
        let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
@@ -1319,4 +1319,8 @@ pub fn preinitialize_metrics() {
    // Same as above for this metric, but, it's a Vec-type metric for which we don't know all the labels.
    BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT.reset();
    // Python tests need these.
    MATERIALIZED_PAGE_CACHE_HIT_DIRECT.get();
    MATERIALIZED_PAGE_CACHE_HIT.get();
 }
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -390,7 +390,9 @@ impl PageServerHandler {
        };
        // Check that the timeline exists
-        let timeline = tenant.get_timeline(timeline_id, true)?;
+        let timeline = tenant
            .get_timeline(timeline_id, true)
            .map_err(|e| anyhow::anyhow!(e))?;
        // switch client to COPYBOTH
        pgb.write_message_noflush(&BeMessage::CopyBothResponse)?;
@@ -911,10 +913,24 @@ where
                None
            };
-            // Check that the timeline exists
+            metrics::metric_vec_duration::observe_async_block_duration_by_result(
-            self.handle_basebackup_request(pgb, tenant_id, timeline_id, lsn, None, false, ctx)
+                &*crate::metrics::BASEBACKUP_QUERY_TIME,
-                .await?;
+                async move {
-            pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
+                    self.handle_basebackup_request(
                        pgb,
                        tenant_id,
                        timeline_id,
                        lsn,
                        None,
                        false,
                        ctx,
                    )
                    .await?;
                    pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
                    anyhow::Ok(())
                },
            )
            .await?;
        }
        // return pair of prev_lsn and last_lsn
        else if query_string.starts_with("get_last_record_rlsn ") {
@@ -1230,6 +1246,6 @@ async fn get_active_tenant_timeline(
        .map_err(GetActiveTimelineError::Tenant)?;
    let timeline = tenant
        .get_timeline(timeline_id, true)
-        .map_err(GetActiveTimelineError::Timeline)?;
+        .map_err(|e| GetActiveTimelineError::Timeline(anyhow::anyhow!(e)))?;
    Ok(timeline)
 }
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -43,6 +43,16 @@ pub enum CalculateLogicalSizeError {
    Other(#[from] anyhow::Error),
 }
 /// Errors reported when an operation on a relation is rejected or fails.
 #[derive(Debug, thiserror::Error)]
 pub enum RelationError {
    /// The relation is already present in its relation directory.
    #[error("Relation Already Exists")]
    AlreadyExists,
    /// The relation tag carries `relnode == 0`, which the relation operations reject.
    #[error("invalid relnode")]
    InvalidRelnode,
    /// Any other failure; displayed transparently as the wrapped error. `#[from]`
    /// lets `?` convert an `anyhow::Error` into this variant.
    #[error(transparent)]
    Other(#[from] anyhow::Error),
 }
 ///
 /// This impl provides all the functionality to store PostgreSQL relations, SLRUs,
 /// and other special kinds of files, in a versioned key-value store. The
@@ -101,9 +111,9 @@ impl Timeline {
        ctx: &RequestContext,
    ) -> Result<Bytes, PageReconstructError> {
        if tag.relnode == 0 {
-            return Err(PageReconstructError::Other(anyhow::anyhow!(
+            return Err(PageReconstructError::Other(
-                "invalid relnode"
+                RelationError::InvalidRelnode.into(),
-            )));
+            ));
        }
        let nblocks = self.get_rel_size(tag, lsn, latest, ctx).await?;
@@ -148,9 +158,9 @@ impl Timeline {
        ctx: &RequestContext,
    ) -> Result<BlockNumber, PageReconstructError> {
        if tag.relnode == 0 {
-            return Err(PageReconstructError::Other(anyhow::anyhow!(
+            return Err(PageReconstructError::Other(
-                "invalid relnode"
+                RelationError::InvalidRelnode.into(),
-            )));
+            ));
        }
        if let Some(nblocks) = self.get_cached_rel_size(&tag, lsn) {
@@ -193,9 +203,9 @@ impl Timeline {
        ctx: &RequestContext,
    ) -> Result<bool, PageReconstructError> {
        if tag.relnode == 0 {
-            return Err(PageReconstructError::Other(anyhow::anyhow!(
+            return Err(PageReconstructError::Other(
-                "invalid relnode"
+                RelationError::InvalidRelnode.into(),
-            )));
+            ));
        }
        // first try to lookup relation in cache
@@ -699,6 +709,20 @@ impl<'a> DatadirModification<'a> {
        Ok(())
    }
    /// Test-only helper: runs `init_empty` and then stores placeholder
    /// control-file and checkpoint values.
    ///
    /// # Errors
    /// Propagates the failure of `init_empty`, or of either put call with the
    /// name of the failing step attached as context.
    #[cfg(test)]
    pub fn init_empty_test_timeline(&mut self) -> anyhow::Result<()> {
        self.init_empty()?;

        // The payloads are arbitrary placeholder bytes.
        let control_file = bytes::Bytes::from_static(b"control_file contents do not matter");
        self.put_control_file(control_file)
            .context("put_control_file")?;

        let checkpoint = bytes::Bytes::from_static(b"checkpoint_file contents do not matter");
        self.put_checkpoint(checkpoint)
            .context("put_checkpoint_file")?;

        Ok(())
    }
    /// Put a new page version that can be constructed from a WAL record
    ///
    /// NOTE: this will *not* implicitly extend the relation, if the page is beyond the
@@ -710,7 +734,7 @@ impl<'a> DatadirModification<'a> {
        blknum: BlockNumber,
        rec: NeonWalRecord,
    ) -> anyhow::Result<()> {
-        anyhow::ensure!(rel.relnode != 0, "invalid relnode");
+        anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
        self.put(rel_block_to_key(rel, blknum), Value::WalRecord(rec));
        Ok(())
    }
@@ -737,7 +761,7 @@ impl<'a> DatadirModification<'a> {
        blknum: BlockNumber,
        img: Bytes,
    ) -> anyhow::Result<()> {
-        anyhow::ensure!(rel.relnode != 0, "invalid relnode");
+        anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
        self.put(rel_block_to_key(rel, blknum), Value::Image(img));
        Ok(())
    }
@@ -861,32 +885,38 @@ impl<'a> DatadirModification<'a> {
        rel: RelTag,
        nblocks: BlockNumber,
        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), RelationError> {
-        anyhow::ensure!(rel.relnode != 0, "invalid relnode");
+        if rel.relnode == 0 {
            // A zero relnode is an invalid relation tag. Report it as such rather than
            // as `AlreadyExists`, which is used below for a duplicate directory entry.
            return Err(RelationError::InvalidRelnode);
        }
        // It's possible that this is the first rel for this db in this
        // tablespace.  Create the reldir entry for it if so.
-        let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY, ctx).await?)?;
+        let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY, ctx).await.context("read db")?)
            .context("deserialize db")?;
        let rel_dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode);
        let mut rel_dir = if dbdir.dbdirs.get(&(rel.spcnode, rel.dbnode)).is_none() {
            // Didn't exist. Update dbdir
            dbdir.dbdirs.insert((rel.spcnode, rel.dbnode), false);
-            let buf = DbDirectory::ser(&dbdir)?;
+            let buf = DbDirectory::ser(&dbdir).context("serialize db")?;
            self.put(DBDIR_KEY, Value::Image(buf.into()));
            // and create the RelDirectory
            RelDirectory::default()
        } else {
            // reldir already exists, fetch it
-            RelDirectory::des(&self.get(rel_dir_key, ctx).await?)?
+            RelDirectory::des(&self.get(rel_dir_key, ctx).await.context("read db")?)
                .context("deserialize db")?
        };
        // Add the new relation to the rel directory entry, and write it back
        if !rel_dir.rels.insert((rel.relnode, rel.forknum)) {
-            anyhow::bail!("rel {rel} already exists");
+            return Err(RelationError::AlreadyExists);
        }
        self.put(
            rel_dir_key,
-            Value::Image(Bytes::from(RelDirectory::ser(&rel_dir)?)),
+            Value::Image(Bytes::from(
                RelDirectory::ser(&rel_dir).context("serialize")?,
            )),
        );
        // Put size
@@ -911,7 +941,7 @@ impl<'a> DatadirModification<'a> {
        nblocks: BlockNumber,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
-        anyhow::ensure!(rel.relnode != 0, "invalid relnode");
+        anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
        let last_lsn = self.tline.get_last_record_lsn();
        if self.tline.get_rel_exists(rel, last_lsn, true, ctx).await? {
            let size_key = rel_size_to_key(rel);
@@ -942,7 +972,7 @@ impl<'a> DatadirModification<'a> {
        nblocks: BlockNumber,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
-        anyhow::ensure!(rel.relnode != 0, "invalid relnode");
+        anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
        // Put size
        let size_key = rel_size_to_key(rel);
@@ -963,7 +993,7 @@ impl<'a> DatadirModification<'a> {
    /// Drop a relation.
    pub async fn put_rel_drop(&mut self, rel: RelTag, ctx: &RequestContext) -> anyhow::Result<()> {
-        anyhow::ensure!(rel.relnode != 0, "invalid relnode");
+        anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
        // Remove it from the directory entry
        let dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode);
@@ -1108,7 +1138,7 @@ impl<'a> DatadirModification<'a> {
    /// retains all the metadata, but data pages are flushed. That's again OK
    /// for bulk import, where you are just loading data pages and won't try to
    /// modify the same pages twice.
-    pub fn flush(&mut self) -> anyhow::Result<()> {
+    pub async fn flush(&mut self) -> anyhow::Result<()> {
        // Unless we have accumulated a decent amount of changes, it's not worth it
        // to scan through the pending_updates list.
        let pending_nblocks = self.pending_nblocks;
@@ -1116,19 +1146,20 @@ impl<'a> DatadirModification<'a> {
            return Ok(());
        }
-        let writer = self.tline.writer();
+        let writer = self.tline.writer().await;
        // Flush relation and  SLRU data blocks, keep metadata.
-        let mut result: anyhow::Result<()> = Ok(());
+        let mut retained_pending_updates = HashMap::new();
-        self.pending_updates.retain(|&key, value| {
+        for (key, value) in self.pending_updates.drain() {
-            if result.is_ok() && (is_rel_block_key(key) || is_slru_block_key(key)) {
+            if is_rel_block_key(key) || is_slru_block_key(key) {
-                result = writer.put(key, self.lsn, value);
+                // This bails out on first error without modifying pending_updates.
-                false
+                // That's Ok, cf this function's doc comment.
                writer.put(key, self.lsn, &value).await?;
            } else {
-                true
+                retained_pending_updates.insert(key, value);
            }
-        });
+        }
-        result?;
+        self.pending_updates.extend(retained_pending_updates);
        if pending_nblocks != 0 {
            writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ));
@@ -1143,17 +1174,17 @@ impl<'a> DatadirModification<'a> {
    /// underlying timeline.
    /// All the modifications in this atomic update are stamped by the specified LSN.
    ///
-    pub fn commit(&mut self) -> anyhow::Result<()> {
+    pub async fn commit(&mut self) -> anyhow::Result<()> {
-        let writer = self.tline.writer();
+        let writer = self.tline.writer().await;
        let lsn = self.lsn;
        let pending_nblocks = self.pending_nblocks;
        self.pending_nblocks = 0;
        for (key, value) in self.pending_updates.drain() {
-            writer.put(key, lsn, &value)?;
+            writer.put(key, lsn, &value).await?;
        }
        for key_range in self.pending_deletions.drain(..) {
-            writer.delete(key_range, lsn)?;
+            writer.delete(key_range, lsn).await?;
        }
        writer.finish_write(lsn);
@@ -1593,20 +1624,6 @@ fn is_slru_block_key(key: Key) -> bool {
        && key.field6 != 0xffffffff // and not SlruSegSize
 }
 #[cfg(test)]
 pub fn create_test_timeline(
    tenant: &crate::tenant::Tenant,
    timeline_id: utils::id::TimelineId,
    pg_version: u32,
    ctx: &RequestContext,
 ) -> anyhow::Result<std::sync::Arc<Timeline>> {
    let tline = tenant.create_test_timeline(timeline_id, Lsn(8), pg_version, ctx)?;
    let mut m = tline.begin_modification(Lsn(8));
    m.init_empty()?;
    m.commit()?;
    Ok(tline)
 }
 #[allow(clippy::bool_assert_comparison)]
 #[cfg(test)]
 mod tests {
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -257,6 +257,9 @@ pub enum TaskKind {
    // task that handles attaching a tenant
    Attach,
    // Used mostly for background deletion from s3
    TimelineDeletionWorker,
    // task that handles metrics collection
    MetricsCollection,
@@ -476,18 +479,35 @@ pub async fn shutdown_tasks(
                && (timeline_id.is_none() || task_mut.timeline_id == timeline_id)
            {
                task.cancel.cancel();
-                victim_tasks.push(Arc::clone(task));
+                victim_tasks.push((
                    Arc::clone(task),
                    task.kind,
                    task_mut.tenant_id,
                    task_mut.timeline_id,
                ));
            }
        }
    }
-    for task in victim_tasks {
+    let log_all = kind.is_none() && tenant_id.is_none() && timeline_id.is_none();
    for (task, task_kind, tenant_id, timeline_id) in victim_tasks {
        let join_handle = {
            let mut task_mut = task.mutable.lock().unwrap();
            task_mut.join_handle.take()
        };
        if let Some(mut join_handle) = join_handle {
            if log_all {
                if tenant_id.is_none() {
                    // there are quite few of these
                    info!(name = task.name, kind = ?task_kind, "stopping global task");
                } else {
                    // warn to catch these in tests; there shouldn't be any
                    warn!(name = task.name, tenant_id = ?tenant_id, timeline_id = ?timeline_id, kind = ?task_kind, "stopping left-over");
                }
            }
            let completed = tokio::select! {
                biased;
                _ = &mut join_handle => { true },
                _ = tokio::time::sleep(std::time::Duration::from_secs(1)) => {
                    // allow some time to elapse before logging to cut down the number of log
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -38,8 +38,8 @@ pub mod defaults {
    pub const DEFAULT_GC_PERIOD: &str = "1 hr";
    pub const DEFAULT_IMAGE_CREATION_THRESHOLD: usize = 3;
    pub const DEFAULT_PITR_INTERVAL: &str = "7 days";
-    pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "2 seconds";
+    pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "10 seconds";
-    pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "3 seconds";
+    pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds";
    pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 10 * 1024 * 1024;
    pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour";
 }
--- a/pageserver/src/tenant/layer_map.rs
+++ b/pageserver/src/tenant/layer_map.rs
@@ -51,7 +51,9 @@ use crate::keyspace::KeyPartitioning;
 use crate::repository::Key;
 use crate::tenant::storage_layer::InMemoryLayer;
 use crate::tenant::storage_layer::Layer;
 use anyhow::Context;
 use anyhow::Result;
 use std::collections::HashMap;
 use std::collections::VecDeque;
 use std::ops::Range;
 use std::sync::Arc;
@@ -61,6 +63,8 @@ use historic_layer_coverage::BufferedHistoricLayerCoverage;
 pub use historic_layer_coverage::Replacement;
 use super::storage_layer::range_eq;
 use super::storage_layer::PersistentLayerDesc;
 use super::storage_layer::PersistentLayerKey;
 ///
 /// LayerMap tracks what layers exist on a timeline.
@@ -86,11 +90,16 @@ pub struct LayerMap<L: ?Sized> {
    pub frozen_layers: VecDeque<Arc<InMemoryLayer>>,
    /// Index of the historic layers optimized for search
-    historic: BufferedHistoricLayerCoverage<Arc<L>>,
+    historic: BufferedHistoricLayerCoverage<Arc<PersistentLayerDesc>>,
    /// L0 layers have key range Key::MIN..Key::MAX, and locating them using R-Tree search is very inefficient.
    /// So L0 layers are held in l0_delta_layers vector, in addition to the R-tree.
-    l0_delta_layers: Vec<Arc<L>>,
+    l0_delta_layers: Vec<Arc<PersistentLayerDesc>>,
    /// Mapping from persistent layer key to the actual layer object. Currently, it stores delta, image, and
    /// remote layers. In future refactors, this will be eventually moved out of LayerMap into Timeline, and
    /// RemoteLayer will be removed.
    mapping: HashMap<PersistentLayerKey, Arc<L>>,
 }
 impl<L: ?Sized> Default for LayerMap<L> {
@@ -101,6 +110,7 @@ impl<L: ?Sized> Default for LayerMap<L> {
            frozen_layers: VecDeque::default(),
            l0_delta_layers: Vec::default(),
            historic: BufferedHistoricLayerCoverage::default(),
            mapping: HashMap::default(),
        }
    }
 }
@@ -125,8 +135,9 @@ where
    ///
    /// Insert an on-disk layer.
    ///
-    pub fn insert_historic(&mut self, layer: Arc<L>) {
+    // TODO remove the `layer` argument when `mapping` is refactored out of `LayerMap`
-        self.layer_map.insert_historic_noflush(layer)
+    pub fn insert_historic(&mut self, layer_desc: PersistentLayerDesc, layer: Arc<L>) {
        self.layer_map.insert_historic_noflush(layer_desc, layer)
    }
    ///
@@ -134,8 +145,8 @@ where
    ///
    /// This should be called when the corresponding file on disk has been deleted.
    ///
-    pub fn remove_historic(&mut self, layer: Arc<L>) {
+    pub fn remove_historic(&mut self, layer_desc: PersistentLayerDesc, layer: Arc<L>) {
-        self.layer_map.remove_historic_noflush(layer)
+        self.layer_map.remove_historic_noflush(layer_desc, layer)
    }
    /// Replaces existing layer iff it is the `expected`.
@@ -150,12 +161,15 @@ where
    ///      that we can replace values only by updating a hashmap.
    pub fn replace_historic(
        &mut self,
        expected_desc: PersistentLayerDesc,
        expected: &Arc<L>,
        new_desc: PersistentLayerDesc,
        new: Arc<L>,
    ) -> anyhow::Result<Replacement<Arc<L>>> {
        fail::fail_point!("layermap-replace-notfound", |_| Ok(Replacement::NotFound));
-        self.layer_map.replace_historic_noflush(expected, new)
+        self.layer_map
            .replace_historic_noflush(expected_desc, expected, new_desc, new)
    }
    // We will flush on drop anyway, but this method makes it
@@ -230,6 +244,7 @@ where
            (None, None) => None,
            (None, Some(image)) => {
                let lsn_floor = image.get_lsn_range().start;
                let image = self.get_layer_from_mapping(&image.key()).clone();
                Some(SearchResult {
                    layer: image,
                    lsn_floor,
@@ -237,6 +252,7 @@ where
            }
            (Some(delta), None) => {
                let lsn_floor = delta.get_lsn_range().start;
                let delta = self.get_layer_from_mapping(&delta.key()).clone();
                Some(SearchResult {
                    layer: delta,
                    lsn_floor,
@@ -247,6 +263,7 @@ where
                let image_is_newer = image.get_lsn_range().end >= delta.get_lsn_range().end;
                let image_exact_match = img_lsn + 1 == end_lsn;
                if image_is_newer || image_exact_match {
                    let image = self.get_layer_from_mapping(&image.key()).clone();
                    Some(SearchResult {
                        layer: image,
                        lsn_floor: img_lsn,
@@ -254,6 +271,7 @@ where
                } else {
                    let lsn_floor =
                        std::cmp::max(delta.get_lsn_range().start, image.get_lsn_range().start + 1);
                    let delta = self.get_layer_from_mapping(&delta.key()).clone();
                    Some(SearchResult {
                        layer: delta,
                        lsn_floor,
@@ -273,16 +291,33 @@ where
    ///
    /// Helper function for BatchedUpdates::insert_historic
    ///
-    pub(self) fn insert_historic_noflush(&mut self, layer: Arc<L>) {
+    /// TODO(chi): remove L generic so that we do not need to pass layer object.
    pub(self) fn insert_historic_noflush(
        &mut self,
        layer_desc: PersistentLayerDesc,
        layer: Arc<L>,
    ) {
        self.mapping.insert(layer_desc.key(), layer.clone());
        // TODO: See #3869, resulting #4088, attempted fix and repro #4094
        self.historic.insert(
            historic_layer_coverage::LayerKey::from(&*layer),
            Arc::clone(&layer),
        );
        if Self::is_l0(&layer) {
-            self.l0_delta_layers.push(layer);
+            self.l0_delta_layers.push(layer_desc.clone().into());
        }
        self.historic.insert(
            historic_layer_coverage::LayerKey::from(&*layer),
            layer_desc.into(),
        );
    }
    /// Returns the layer object stored in `mapping` under `key`.
    ///
    /// # Panics
    /// Panics with "inconsistent layer mapping" (plus the debug-formatted key)
    /// when `mapping` has no entry for `key`.
    fn get_layer_from_mapping(&self, key: &PersistentLayerKey) -> &Arc<L> {
        self.mapping
            .get(key)
            .with_context(|| format!("{key:?}"))
            .expect("inconsistent layer mapping")
    }
    ///
@@ -290,14 +325,16 @@ where
    ///
    /// Helper function for BatchedUpdates::remove_historic
    ///
-    pub fn remove_historic_noflush(&mut self, layer: Arc<L>) {
+    pub fn remove_historic_noflush(&mut self, layer_desc: PersistentLayerDesc, layer: Arc<L>) {
        self.historic
            .remove(historic_layer_coverage::LayerKey::from(&*layer));
        if Self::is_l0(&layer) {
            let len_before = self.l0_delta_layers.len();
-            self.l0_delta_layers
+            let mut l0_delta_layers = std::mem::take(&mut self.l0_delta_layers);
-                .retain(|other| !Self::compare_arced_layers(other, &layer));
+            l0_delta_layers.retain(|other| {
                !Self::compare_arced_layers(self.get_layer_from_mapping(&other.key()), &layer)
            });
            self.l0_delta_layers = l0_delta_layers;
            // this assertion is related to use of Arc::ptr_eq in Self::compare_arced_layers,
            // there's a chance that the comparison fails at runtime due to it comparing (pointer,
            // vtable) pairs.
@@ -307,11 +344,14 @@ where
                "failed to locate removed historic layer from l0_delta_layers"
            );
        }
        self.mapping.remove(&layer_desc.key());
    }
    pub(self) fn replace_historic_noflush(
        &mut self,
        expected_desc: PersistentLayerDesc,
        expected: &Arc<L>,
        new_desc: PersistentLayerDesc,
        new: Arc<L>,
    ) -> anyhow::Result<Replacement<Arc<L>>> {
        let key = historic_layer_coverage::LayerKey::from(&**expected);
@@ -332,10 +372,9 @@ where
        let l0_index = if expected_l0 {
            // find the index in case replace worked, we need to replace that as well
-            let pos = self
+            let pos = self.l0_delta_layers.iter().position(|slot| {
-                .l0_delta_layers
+                Self::compare_arced_layers(self.get_layer_from_mapping(&slot.key()), expected)
-                .iter()
+            });
                .position(|slot| Self::compare_arced_layers(slot, expected));
            if pos.is_none() {
                return Ok(Replacement::NotFound);
@@ -345,16 +384,28 @@ where
            None
        };
-        let replaced = self.historic.replace(&key, new.clone(), |existing| {
+        let new_desc = Arc::new(new_desc);
-            Self::compare_arced_layers(existing, expected)
+        let replaced = self.historic.replace(&key, new_desc.clone(), |existing| {
            **existing == expected_desc
        });
        if let Replacement::Replaced { .. } = &replaced {
            self.mapping.remove(&expected_desc.key());
            self.mapping.insert(new_desc.key(), new);
            if let Some(index) = l0_index {
-                self.l0_delta_layers[index] = new;
+                self.l0_delta_layers[index] = new_desc;
            }
        }
        let replaced = match replaced {
            Replacement::Replaced { in_buffered } => Replacement::Replaced { in_buffered },
            Replacement::NotFound => Replacement::NotFound,
            Replacement::RemovalBuffered => Replacement::RemovalBuffered,
            Replacement::Unexpected(x) => {
                Replacement::Unexpected(self.get_layer_from_mapping(&x.key()).clone())
            }
        };
        Ok(replaced)
    }
@@ -383,7 +434,7 @@ where
        let start = key.start.to_i128();
        let end = key.end.to_i128();
-        let layer_covers = |layer: Option<Arc<L>>| match layer {
+        let layer_covers = |layer: Option<Arc<PersistentLayerDesc>>| match layer {
            Some(layer) => layer.get_lsn_range().start >= lsn.start,
            None => false,
        };
@@ -404,7 +455,9 @@ where
    }
    pub fn iter_historic_layers(&self) -> impl '_ + Iterator<Item = Arc<L>> {
-        self.historic.iter()
+        self.historic
            .iter()
            .map(|x| self.get_layer_from_mapping(&x.key()).clone())
    }
    ///
@@ -436,14 +489,24 @@ where
        // Loop through the change events and push intervals
        for (change_key, change_val) in version.image_coverage.range(start..end) {
            let kr = Key::from_i128(current_key)..Key::from_i128(change_key);
-            coverage.push((kr, current_val.take()));
+            coverage.push((
                kr,
                current_val
                    .take()
                    .map(|l| self.get_layer_from_mapping(&l.key()).clone()),
            ));
            current_key = change_key;
            current_val = change_val.clone();
        }
        // Add the final interval
        let kr = Key::from_i128(current_key)..Key::from_i128(end);
-        coverage.push((kr, current_val.take()));
+        coverage.push((
            kr,
            current_val
                .take()
                .map(|l| self.get_layer_from_mapping(&l.key()).clone()),
        ));
        Ok(coverage)
    }
@@ -532,7 +595,9 @@ where
                    let kr = Key::from_i128(current_key)..Key::from_i128(change_key);
                    let lr = lsn.start..val.get_lsn_range().start;
                    if !kr.is_empty() {
-                        let base_count = Self::is_reimage_worthy(&val, key) as usize;
+                        let base_count =
                            Self::is_reimage_worthy(self.get_layer_from_mapping(&val.key()), key)
                                as usize;
                        let new_limit = limit.map(|l| l - base_count);
                        let max_stacked_deltas_underneath =
                            self.count_deltas(&kr, &lr, new_limit)?;
@@ -555,7 +620,9 @@ where
                let lr = lsn.start..val.get_lsn_range().start;
                if !kr.is_empty() {
-                    let base_count = Self::is_reimage_worthy(&val, key) as usize;
+                    let base_count =
                        Self::is_reimage_worthy(self.get_layer_from_mapping(&val.key()), key)
                            as usize;
                    let new_limit = limit.map(|l| l - base_count);
                    let max_stacked_deltas_underneath = self.count_deltas(&kr, &lr, new_limit)?;
                    max_stacked_deltas = std::cmp::max(
@@ -706,7 +773,11 @@ where
    /// Return all L0 delta layers
    pub fn get_level0_deltas(&self) -> Result<Vec<Arc<L>>> {
-        Ok(self.l0_delta_layers.clone())
+        Ok(self
            .l0_delta_layers
            .iter()
            .map(|x| self.get_layer_from_mapping(&x.key()).clone())
            .collect())
    }
    /// debugging function to print out the contents of the layer map
@@ -809,12 +880,17 @@ mod tests {
            let layer = LayerDescriptor::from(layer);
            // same skeleton construction; see scenario below
-            let not_found: Arc<dyn Layer> = Arc::new(layer.clone());
+            let not_found = Arc::new(layer.clone());
-            let new_version: Arc<dyn Layer> = Arc::new(layer);
+            let new_version = Arc::new(layer);
            let mut map = LayerMap::default();
-            let res = map.batch_update().replace_historic(&not_found, new_version);
+            let res = map.batch_update().replace_historic(
                not_found.get_persistent_layer_desc(),
                &not_found,
                new_version.get_persistent_layer_desc(),
                new_version,
            );
            assert!(matches!(res, Ok(Replacement::NotFound)), "{res:?}");
        }
@@ -823,8 +899,8 @@ mod tests {
            let name = LayerFileName::from_str(layer_name).unwrap();
            let skeleton = LayerDescriptor::from(name);
-            let remote: Arc<dyn Layer> = Arc::new(skeleton.clone());
+            let remote = Arc::new(skeleton.clone());
-            let downloaded: Arc<dyn Layer> = Arc::new(skeleton);
+            let downloaded = Arc::new(skeleton);
            let mut map = LayerMap::default();
@@ -834,12 +910,18 @@ mod tests {
            let expected_in_counts = (1, usize::from(expected_l0));
-            map.batch_update().insert_historic(remote.clone());
+            map.batch_update()
                .insert_historic(remote.get_persistent_layer_desc(), remote.clone());
            assert_eq!(count_layer_in(&map, &remote), expected_in_counts);
            let replaced = map
                .batch_update()
-                .replace_historic(&remote, downloaded.clone())
+                .replace_historic(
                    remote.get_persistent_layer_desc(),
                    &remote,
                    downloaded.get_persistent_layer_desc(),
                    downloaded.clone(),
                )
                .expect("name derived attributes are the same");
            assert!(
                matches!(replaced, Replacement::Replaced { .. }),
@@ -847,11 +929,12 @@ mod tests {
            );
            assert_eq!(count_layer_in(&map, &downloaded), expected_in_counts);
-            map.batch_update().remove_historic(downloaded.clone());
+            map.batch_update()
                .remove_historic(downloaded.get_persistent_layer_desc(), downloaded.clone());
            assert_eq!(count_layer_in(&map, &downloaded), (0, 0));
        }
-        fn count_layer_in(map: &LayerMap<dyn Layer>, layer: &Arc<dyn Layer>) -> (usize, usize) {
+        fn count_layer_in<L: Layer + ?Sized>(map: &LayerMap<L>, layer: &Arc<L>) -> (usize, usize) {
            let historic = map
                .iter_historic_layers()
                .filter(|x| LayerMap::compare_arced_layers(x, layer))
--- a/pageserver/src/tenant/manifest.rs
+++ b/pageserver/src/tenant/manifest.rs
@@ -0,0 +1,325 @@
 //! This module contains the encoding and decoding of the local manifest file.
 //!
 //! MANIFEST is a write-ahead log which is stored locally to each timeline. It
 //! records the state of the storage engine. It contains a snapshot of the
 //! state and all operations following that snapshot. The file begins with a
 //! header recording MANIFEST version number. After that, it contains a snapshot.
 //! The snapshot is followed by a list of operations. Each operation is a list
 //! of records. Each record is either an addition or a removal of a layer.
 //!
 //! With MANIFEST, we can:
 //!
 //! 1. recover state quickly by reading the file, potentially boosting the
 //!    startup speed.
 //! 2. ensure all operations are atomic and avoid corruption, solving issues
 //!    like redundant image layer and preparing us for future compaction
 //!    strategies.
 //!
 //! There is also a format for storing all layer files on S3, called
 //! `index_part.json`. Compared with index_part, MANIFEST is a WAL which
 //! records all operations as logs, and therefore we can easily replay the
 //! operations when recovering from crash, while ensuring those operations
 //! are atomic upon restart.
 //!
 //! Currently, this is not used in the system. Future refactors will ensure
 //! the storage state will be recorded in this file, and the system can be
 //! recovered from this file. This is tracked in
 //! https://github.com/neondatabase/neon/issues/4418
 use std::io::{self, Read, Write};
 use crate::virtual_file::VirtualFile;
 use anyhow::Result;
 use bytes::{Buf, BufMut, Bytes, BytesMut};
 use crc32c::crc32c;
 use serde::{Deserialize, Serialize};
 use tracing::log::warn;
 use utils::lsn::Lsn;
 use super::storage_layer::PersistentLayerDesc;
 /// Handle to a timeline-local MANIFEST file.
 ///
 /// Owns the [`VirtualFile`] that [`Manifest::load`] reads from and that
 /// [`Manifest::append_operation`] appends records to.
 pub struct Manifest {
    /// Backing file; all reads and appends go through it.
    file: VirtualFile,
 }
 /// A snapshot of the recorded state: the full list of layer descriptors at
 /// the point the snapshot was taken.
 #[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Debug)]
 pub struct Snapshot {
    /// Descriptors of the layers captured by this snapshot.
    pub layers: Vec<PersistentLayerDesc>,
 }
 /// A single change to the set of layers: either an addition or a removal.
 ///
 /// By default serde encodes this as a tagged enum, so a serialized record
 /// looks like `{ "AddLayer": { ... } }`.
 #[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Debug)]
 pub enum Record {
    /// The described layer was added.
    AddLayer(PersistentLayerDesc),
    /// The described layer was removed.
    RemoveLayer(PersistentLayerDesc),
 }
 /// Magic number stored as the first field of the manifest header:
 /// `echo neon.manifest | sha1sum` and take the leading 8 bytes.
 const MANIFEST_MAGIC_NUMBER: u64 = 0xf5c44592b806109c;
 /// Manifest format version written by `Manifest::init`; `Manifest::load`
 /// rejects any other value with `ManifestLoadError::UnsupportedVersion`.
 const MANIFEST_VERSION: u64 = 1;
 /// Fixed-size header placed at the very beginning of a MANIFEST file.
 ///
 /// Encoded as two consecutive `u64` values (`MANIFEST_HEADER_LEN` bytes in
 /// total): the magic number followed by the format version.
 #[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Debug)]
 pub struct ManifestHeader {
    /// Identifies the file as a manifest; written as `MANIFEST_MAGIC_NUMBER`.
    magic_number: u64,
    /// Format version of the manifest; written as `MANIFEST_VERSION`.
    version: u64,
 }
 /// Size in bytes of an encoded [`ManifestHeader`]: two `u64` fields.
 const MANIFEST_HEADER_LEN: usize = 16;
 impl ManifestHeader {
    /// Serialize the header into a freshly allocated buffer, magic number
    /// first and version second.
    fn encode(&self) -> BytesMut {
        let Self {
            magic_number,
            version,
        } = self;
        let mut encoded = BytesMut::with_capacity(MANIFEST_HEADER_LEN);
        encoded.put_u64(*magic_number);
        encoded.put_u64(*version);
        encoded
    }
    /// Parse a header from exactly `MANIFEST_HEADER_LEN` bytes.
    ///
    /// # Panics
    ///
    /// Panics if `buf` is not exactly `MANIFEST_HEADER_LEN` bytes long.
    fn decode(mut buf: &[u8]) -> Self {
        assert!(buf.len() == MANIFEST_HEADER_LEN, "invalid header");
        // Fields are consumed in the same order `encode` wrote them.
        let magic_number = buf.get_u64();
        let version = buf.get_u64();
        Self {
            magic_number,
            version,
        }
    }
 }
 /// One entry of the manifest log. Each entry is serialized as JSON and
 /// appended to the file as a single checksummed record.
 #[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Debug)]
 pub enum Operation {
    /// A snapshot of the current state.
    ///
    /// Lsn field represents the LSN that is persisted to disk for this snapshot.
    Snapshot(Snapshot, Lsn),
    /// An atomic operation that changes the state.
    ///
    /// Lsn field represents the LSN that is persisted to disk after the operation is done.
    /// This will only change when new L0 is flushed to the disk.
    Operation(Vec<Record>, Lsn),
 }
 /// Fixed-size header written in front of every record payload in the
 /// manifest file.
 struct RecordHeader {
    /// Length of the record payload in bytes.
    size: u32,
    /// CRC32C checksum of the record payload.
    checksum: u32,
 }
 /// Size in bytes of an encoded [`RecordHeader`]: two `u32` fields.
 const RECORD_HEADER_LEN: usize = 8;
 impl RecordHeader {
    /// Serialize the header into a freshly allocated buffer, payload size
    /// first and checksum second.
    fn encode(&self) -> BytesMut {
        let Self { size, checksum } = self;
        let mut encoded = BytesMut::with_capacity(RECORD_HEADER_LEN);
        encoded.put_u32(*size);
        encoded.put_u32(*checksum);
        encoded
    }
    /// Parse a header from exactly `RECORD_HEADER_LEN` bytes.
    ///
    /// # Panics
    ///
    /// Panics if `buf` is not exactly `RECORD_HEADER_LEN` bytes long.
    fn decode(mut buf: &[u8]) -> Self {
        assert!(buf.len() == RECORD_HEADER_LEN, "invalid header");
        // Fields are consumed in the same order `encode` wrote them.
        let size = buf.get_u32();
        let checksum = buf.get_u32();
        Self { size, checksum }
    }
 }
 /// Unrecoverable failures reported by `Manifest::load`.
 ///
 /// Truncated or checksum-mismatched records are not reported here; they are
 /// signalled through `ManifestPartiallyCorrupted` instead.
 #[derive(Debug, thiserror::Error)]
 pub enum ManifestLoadError {
    /// The manifest header could not be accepted, e.g. the file is shorter
    /// than `MANIFEST_HEADER_LEN` bytes.
    #[error("manifest header is corrupted")]
    CorruptedManifestHeader,
    /// The header carries a version other than `MANIFEST_VERSION`.
    /// Fields are `(found, expected)`.
    #[error("unsupported manifest version: got {0}, expected {1}")]
    UnsupportedVersion(u64, u64),
    /// A record passed its checksum but its payload is not a valid
    /// JSON-encoded `Operation`.
    #[error("error when decoding record: {0}")]
    DecodeRecord(serde_json::Error),
    /// Reading the manifest file failed.
    #[error("I/O error: {0}")]
    Io(io::Error),
 }
 /// Flag returned by `Manifest::load`: `true` when decoding stopped early
 /// because a record header or payload was incomplete or a checksum did not
 /// match, `false` when the whole file was consumed.
 #[must_use = "Should check if the manifest is partially corrupted"]
 pub struct ManifestPartiallyCorrupted(bool);
 impl Manifest {
    /// Create a new manifest by writing the manifest header and a snapshot record to the given file.
    pub fn init(file: VirtualFile, snapshot: Snapshot, lsn: Lsn) -> Result<Self> {
        let mut manifest = Self { file };
        manifest.append_manifest_header(ManifestHeader {
            magic_number: MANIFEST_MAGIC_NUMBER,
            version: MANIFEST_VERSION,
        })?;
        manifest.append_operation(Operation::Snapshot(snapshot, lsn))?;
        Ok(manifest)
    }
    /// Load a manifest. Returns the manifest and a list of operations. If the manifest is corrupted,
    /// the bool flag will be set to true and the user is responsible to reconstruct a new manifest and
    /// backup the current one.
    pub fn load(
        mut file: VirtualFile,
    ) -> Result<(Self, Vec<Operation>, ManifestPartiallyCorrupted), ManifestLoadError> {
        let mut buf = vec![];
        file.read_to_end(&mut buf).map_err(ManifestLoadError::Io)?;
        // Read manifest header
        let mut buf = Bytes::from(buf);
        if buf.remaining() < MANIFEST_HEADER_LEN {
            return Err(ManifestLoadError::CorruptedManifestHeader);
        }
        let header = ManifestHeader::decode(&buf[..MANIFEST_HEADER_LEN]);
        buf.advance(MANIFEST_HEADER_LEN);
        if header.version != MANIFEST_VERSION {
            return Err(ManifestLoadError::UnsupportedVersion(
                header.version,
                MANIFEST_VERSION,
            ));
        }
        // Read operations
        let mut operations = Vec::new();
        let corrupted = loop {
            if buf.remaining() == 0 {
                break false;
            }
            if buf.remaining() < RECORD_HEADER_LEN {
                warn!("incomplete header when decoding manifest, could be corrupted");
                break true;
            }
            let RecordHeader { size, checksum } = RecordHeader::decode(&buf[..RECORD_HEADER_LEN]);
            let size = size as usize;
            buf.advance(RECORD_HEADER_LEN);
            if buf.remaining() < size {
                warn!("incomplete data when decoding manifest, could be corrupted");
                break true;
            }
            let data = &buf[..size];
            if crc32c(data) != checksum {
                warn!("checksum mismatch when decoding manifest, could be corrupted");
                break true;
            }
            // if the following decode fails, we cannot use the manifest or safely ignore any record.
            operations.push(serde_json::from_slice(data).map_err(ManifestLoadError::DecodeRecord)?);
            buf.advance(size);
        };
        Ok((
            Self { file },
            operations,
            ManifestPartiallyCorrupted(corrupted),
        ))
    }
    fn append_data(&mut self, data: &[u8]) -> Result<()> {
        if data.len() >= u32::MAX as usize {
            panic!("data too large");
        }
        let header = RecordHeader {
            size: data.len() as u32,
            checksum: crc32c(data),
        };
        let header = header.encode();
        self.file.write_all(&header)?;
        self.file.write_all(data)?;
        self.file.sync_all()?;
        Ok(())
    }
    fn append_manifest_header(&mut self, header: ManifestHeader) -> Result<()> {
        let encoded = header.encode();
        self.file.write_all(&encoded)?;
        Ok(())
    }
    /// Add an operation to the manifest. The operation will be appended to the end of the file,
    /// and the file will fsync.
    pub fn append_operation(&mut self, operation: Operation) -> Result<()> {
        let encoded = Vec::from(serde_json::to_string(&operation)?);
        self.append_data(&encoded)
    }
 }
 #[cfg(test)]
 mod tests {
    use std::fs::OpenOptions;
    use crate::repository::Key;
    use super::*;

    /// Writes a manifest, reopens it twice, and checks after each reopen that
    /// every operation appended so far is read back intact and in order.
    #[test]
    fn test_read_manifest() {
        let testdir = crate::config::PageServerConf::test_repo_dir("test_read_manifest");
        std::fs::create_dir_all(&testdir).unwrap();
        let manifest_path = testdir.join("MANIFEST");
        let file = VirtualFile::create(&manifest_path).unwrap();

        // Reopen the already-created manifest for reading and writing without truncating it.
        let reopen = || {
            VirtualFile::open_with_options(
                &manifest_path,
                OpenOptions::new()
                    .read(true)
                    .write(true)
                    .create_new(false)
                    .truncate(false),
            )
            .unwrap()
        };
        let test_layer = |start: i128, end: i128| {
            PersistentLayerDesc::new_test(Key::from_i128(start)..Key::from_i128(end))
        };

        let layer1 = test_layer(0, 233);
        let layer2 = test_layer(233, 2333);
        let layer3 = test_layer(2333, 23333);
        let layer4 = test_layer(23333, 233333);

        // The three log entries we expect to find in the file, in order.
        let snapshot = Snapshot {
            layers: vec![layer1, layer2],
        };
        let snapshot_op = Operation::Snapshot(snapshot.clone(), Lsn::from(0));
        let add_layer3 = Operation::Operation(vec![Record::AddLayer(layer3.clone())], Lsn::from(1));
        let swap_layer3_for_layer4 = Operation::Operation(
            vec![Record::RemoveLayer(layer3), Record::AddLayer(layer4)],
            Lsn::from(2),
        );

        // Write a manifest with a snapshot and some operations
        let mut manifest = Manifest::init(file, snapshot, Lsn::from(0)).unwrap();
        manifest.append_operation(add_layer3.clone()).unwrap();
        drop(manifest);

        // Open the second time and write
        let (mut manifest, operations, corrupted) = Manifest::load(reopen()).unwrap();
        assert!(!corrupted.0);
        assert_eq!(operations, [snapshot_op.clone(), add_layer3.clone()]);
        manifest
            .append_operation(swap_layer3_for_layer4.clone())
            .unwrap();
        drop(manifest);

        // Open the third time and verify
        let (_manifest, operations, corrupted) = Manifest::load(reopen()).unwrap();
        assert!(!corrupted.0);
        assert_eq!(
            operations,
            [snapshot_op, add_layer3, swap_layer3_for_layer4]
        );
    }
 }
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -20,12 +20,9 @@ use crate::config::PageServerConf;
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::task_mgr::{self, TaskKind};
 use crate::tenant::config::TenantConfOpt;
-use crate::tenant::{
+use crate::tenant::{create_tenant_files, CreateTenantFilesMode, Tenant, TenantState};
-    create_tenant_files, CreateTenantFilesMode, SetStoppingError, Tenant, TenantState,
+use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME};
 };
 use crate::IGNORED_TENANT_FILE_NAME;
 use utils::completion;
 use utils::fs_ext::PathExt;
 use utils::id::{TenantId, TimelineId};
@@ -67,7 +64,7 @@ pub async fn init_tenant_mgr(
    conf: &'static PageServerConf,
    broker_client: storage_broker::BrokerClientChannel,
    remote_storage: Option<GenericRemoteStorage>,
-    init_done: (completion::Completion, completion::Barrier),
+    init_order: InitializationOrder,
 ) -> anyhow::Result<()> {
    // Scan local filesystem for attached tenants
    let tenants_dir = conf.tenants_path();
@@ -124,7 +121,7 @@ pub async fn init_tenant_mgr(
                        &tenant_dir_path,
                        broker_client.clone(),
                        remote_storage.clone(),
-                        Some(init_done.clone()),
+                        Some(init_order.clone()),
                        &ctx,
                    ) {
                        Ok(tenant) => {
@@ -160,7 +157,7 @@ pub fn schedule_local_tenant_processing(
    tenant_path: &Path,
    broker_client: storage_broker::BrokerClientChannel,
    remote_storage: Option<GenericRemoteStorage>,
-    init_done: Option<(completion::Completion, completion::Barrier)>,
+    init_order: Option<InitializationOrder>,
    ctx: &RequestContext,
 ) -> anyhow::Result<Arc<Tenant>> {
    anyhow::ensure!(
@@ -219,7 +216,7 @@ pub fn schedule_local_tenant_processing(
            tenant_id,
            broker_client,
            remote_storage,
-            init_done,
+            init_order,
            ctx,
        )
    };
@@ -253,46 +250,28 @@ pub async fn shutdown_all_tenants() {
                tenants_clone
            }
            TenantsMap::ShuttingDown(_) => {
                // TODO: it is possible that detach and shutdown happen at the same time. as a
                // result, during shutdown we do not wait for detach.
                error!("already shutting down, this function isn't supposed to be called more than once");
                return;
            }
        }
    };
    // Set tenant (and its timelines) to Stopping state.
    //
    // Since we can only transition into Stopping state after activation is complete,
    // run it in a JoinSet so all tenants have a chance to stop before we get SIGKILLed.
    //
    // Transitioning tenants to Stopping state has a couple of non-obvious side effects:
    // 1. Lock out any new requests to the tenants.
    // 2. Signal cancellation to WAL receivers (we wait on it below).
    // 3. Signal cancellation for other tenant background loops.
    // 4. ???
    //
    // The waiting for the cancellation is not done uniformly.
    // We certainly wait for WAL receivers to shut down.
    // That is necessary so that no new data comes in before the freeze_and_flush.
    // But the tenant background loops are joined-on in our caller.
    // It's messed up.
    let mut join_set = JoinSet::new();
    let mut tenants_to_freeze_and_flush = Vec::with_capacity(tenants_to_shut_down.len());
    for (tenant_id, tenant) in tenants_to_shut_down {
        join_set.spawn(
            async move {
-                match tenant.set_stopping().await {
+                let freeze_and_flush = true;
                match tenant.shutdown(freeze_and_flush).await {
                    Ok(()) => debug!("tenant successfully stopped"),
-                    Err(SetStoppingError::Broken) => {
+                    Err(super::ShutdownError::AlreadyStopping) => {
-                        info!("tenant is broken, so stopping failed, freeze_and_flush is likely going to make noise as well");
+                        warn!("tenant was already shutting down")
                    },
                    Err(SetStoppingError::AlreadyStopping) => {
                        // our task_mgr::shutdown_tasks are going to coalesce on that just fine
                    }
                }
                tenant
            }
-            .instrument(info_span!("set_stopping", %tenant_id)),
+            .instrument(info_span!("shutdown", %tenant_id)),
        );
    }
@@ -300,6 +279,7 @@ pub async fn shutdown_all_tenants() {
    while let Some(res) = join_set.join_next().await {
        match res {
            Ok(()) => {}
            Err(join_error) if join_error.is_cancelled() => {
                unreachable!("we are not cancelling any of the futures");
            }
@@ -310,50 +290,11 @@ pub async fn shutdown_all_tenants() {
            Err(join_error) => {
                warn!("unknown kind of JoinError: {join_error}");
            }
            Ok(tenant) => tenants_to_freeze_and_flush.push(tenant),
        }
    }
    if panicked > 0 {
-        warn!(panicked, "observed panicks while stopping tenants");
+        warn!(panicked, "observed panicks while shutting down tenants");
    }
    // Shut down all existing walreceiver connections and stop accepting the new ones.
    task_mgr::shutdown_tasks(Some(TaskKind::WalReceiverManager), None, None).await;
    // Ok, no background tasks running anymore. Flush any remaining data in
    // memory to disk.
    //
    // We assume that any incoming connections that might request pages from
    // the tenant have already been terminated by the caller, so there
    // should be no more activity in any of the repositories.
    //
    // On error, log it but continue with the shutdown for other tenants.
    let mut join_set = tokio::task::JoinSet::new();
    for tenant in tenants_to_freeze_and_flush {
        let tenant_id = tenant.tenant_id();
        join_set.spawn(
            async move {
                if let Err(err) = tenant.freeze_and_flush().await {
                    warn!("Could not checkpoint tenant during shutdown: {err:?}");
                }
            }
            .instrument(info_span!("freeze_and_flush", %tenant_id)),
        );
    }
    while let Some(next) = join_set.join_next().await {
        match next {
            Ok(()) => {}
            Err(join_error) if join_error.is_cancelled() => {
                unreachable!("no cancelling")
            }
            Err(join_error) if join_error.is_panic() => { /* reported already */ }
            Err(join_error) => warn!("unknown kind of JoinError: {join_error}"),
        }
    }
 }
@@ -455,7 +396,9 @@ pub async fn delete_timeline(
    ctx: &RequestContext,
 ) -> Result<(), DeleteTimelineError> {
    let tenant = get_tenant(tenant_id, true).await?;
-    tenant.delete_timeline(timeline_id, ctx).await?;
+    tenant
        .prepare_and_schedule_delete_timeline(timeline_id, ctx)
        .await?;
    Ok(())
 }
@@ -669,35 +612,26 @@ where
    // The exclusive lock here ensures we don't miss the tenant state updates before trying another removal.
    // tenant-wide cleanup operations may take some time (removing the entire tenant directory), we want to
    // avoid holding the lock for the entire process.
-    {
+    let tenant = {
-        let tenants_accessor = TENANTS.write().await;
+        TENANTS
-        match tenants_accessor.get(&tenant_id) {
+            .write()
-            Some(tenant) => {
+            .await
-                let tenant = Arc::clone(tenant);
+            .get(&tenant_id)
-                // don't hold TENANTS lock while set_stopping waits for activation to finish
+            .cloned()
-                drop(tenants_accessor);
+            .ok_or(TenantStateError::NotFound(tenant_id))?
-                match tenant.set_stopping().await {
+    };
-                    Ok(()) => {
+
-                        // we won, continue stopping procedure
+    let freeze_and_flush = false;
-                    }
+
-                    Err(SetStoppingError::Broken) => {
+    // shutdown is sure to transition tenant to stopping, and wait for all tasks to complete, so
-                        // continue the procedure, let's hope the closure can deal with broken tenants
+    // that we can continue safely to cleanup.
-                    }
+    match tenant.shutdown(freeze_and_flush).await {
-                    Err(SetStoppingError::AlreadyStopping) => {
+        Ok(()) => {}
-                        // the tenant is already stopping or broken, don't do anything
+        Err(super::ShutdownError::AlreadyStopping) => {
-                        return Err(TenantStateError::IsStopping(tenant_id));
+            return Err(TenantStateError::IsStopping(tenant_id))
                    }
                }
            }
            None => return Err(TenantStateError::NotFound(tenant_id)),
        }
    }
    // shutdown all tenant and timeline tasks: gc, compaction, page service)
    // No new tasks will be started for this tenant because it's in `Stopping` state.
    // Hence, once we're done here, the `tenant_cleanup` callback can mutate tenant on-disk state freely.
    task_mgr::shutdown_tasks(None, Some(tenant_id), None).await;
    match tenant_cleanup
        .await
        .with_context(|| format!("Failed to run cleanup for tenant {tenant_id}"))
@@ -741,7 +675,7 @@ pub async fn immediate_gc(
        .get(&tenant_id)
        .map(Arc::clone)
        .with_context(|| format!("tenant {tenant_id}"))
-        .map_err(ApiError::NotFound)?;
+        .map_err(|e| ApiError::NotFound(e.into()))?;
    let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| tenant.get_gc_horizon());
    // Use tenant's pitr setting
@@ -790,11 +724,11 @@ pub async fn immediate_compact(
        .get(&tenant_id)
        .map(Arc::clone)
        .with_context(|| format!("tenant {tenant_id}"))
-        .map_err(ApiError::NotFound)?;
+        .map_err(|e| ApiError::NotFound(e.into()))?;
    let timeline = tenant
        .get_timeline(timeline_id, true)
-        .map_err(ApiError::NotFound)?;
+        .map_err(|e| ApiError::NotFound(e.into()))?;
    // Run in task_mgr to avoid race with tenant_detach operation
    let ctx = ctx.detached_child(TaskKind::Compaction, DownloadBehavior::Download);
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -210,13 +210,15 @@ use chrono::{NaiveDateTime, Utc};
 pub use download::{is_temp_download_file, list_remote_timelines};
 use scopeguard::ScopeGuard;
 use std::collections::{HashMap, VecDeque};
 use std::path::Path;
 use std::sync::atomic::{AtomicU32, Ordering};
 use std::sync::{Arc, Mutex};
-use remote_storage::{DownloadError, GenericRemoteStorage};
+use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
 use std::ops::DerefMut;
 use tokio::runtime::Runtime;
-use tracing::{debug, error, info, warn};
+use tracing::{debug, error, info, instrument, warn};
 use tracing::{info_span, Instrument};
 use utils::lsn::Lsn;
@@ -225,7 +227,9 @@ use crate::metrics::{
    RemoteTimelineClientMetricsCallTrackSize, REMOTE_ONDEMAND_DOWNLOADED_BYTES,
    REMOTE_ONDEMAND_DOWNLOADED_LAYERS,
 };
 use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
 use crate::tenant::upload_queue::Delete;
 use crate::{
    config::PageServerConf,
    task_mgr,
@@ -259,7 +263,7 @@ const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3;
 pub enum MaybeDeletedIndexPart {
    IndexPart(IndexPart),
-    Deleted,
+    Deleted(IndexPart),
 }
 /// Errors that can arise when calling [`RemoteTimelineClient::stop`].
@@ -361,11 +365,42 @@ impl RemoteTimelineClient {
        Ok(())
    }
    /// Initialize the queue in stopped state. Used in startup path
    /// to continue deletion operation interrupted by pageserver crash or restart.
    ///
    /// The queue is first initialized from `index_part`, then stopped, and finally its
    /// `deleted_at` progress is recorded as `Successful` with the timestamp taken from
    /// `index_part`.
    ///
    /// # Errors
    ///
    /// Fails if `index_part.deleted_at` is `None`, or if the upload queue cannot be
    /// initialized from `index_part`.
    ///
    /// # Panics
    ///
    /// Panics if stopping the freshly initialized queue fails, or if the queue is not in
    /// the stopped state afterwards.
    pub fn init_upload_queue_stopped_to_continue_deletion(
        &self,
        index_part: &IndexPart,
    ) -> anyhow::Result<()> {
        // FIXME: consider newtype for DeletedIndexPart.
        // Build the error lazily so nothing is constructed on the success path.
        let deleted_at = index_part.deleted_at.ok_or_else(|| {
            anyhow::anyhow!(
                "bug: it is responsibility of the caller to provide index part from MaybeDeletedIndexPart::Deleted"
            )
        })?;
        {
            let mut upload_queue = self.upload_queue.lock().unwrap();
            upload_queue.initialize_with_current_remote_index_part(index_part)?;
            self.update_remote_physical_size_gauge(Some(index_part));
        }
        // also locks upload queue, without dropping the guard above it will be a deadlock
        self.stop().expect("initialized line above");
        let mut upload_queue = self.upload_queue.lock().unwrap();
        upload_queue
            .stopped_mut()
            .expect("stopped above")
            .deleted_at = SetDeletedFlagProgress::Successful(deleted_at);
        Ok(())
    }
    pub fn last_uploaded_consistent_lsn(&self) -> Option<Lsn> {
        match &*self.upload_queue.lock().unwrap() {
            UploadQueue::Uninitialized => None,
            UploadQueue::Initialized(q) => Some(q.last_uploaded_consistent_lsn),
-            UploadQueue::Stopped(q) => Some(q.last_uploaded_consistent_lsn),
+            UploadQueue::Stopped(q) => {
                Some(q.upload_queue_for_deletion.last_uploaded_consistent_lsn)
            }
        }
    }
@@ -420,7 +455,7 @@ impl RemoteTimelineClient {
        .await?;
        if index_part.deleted_at.is_some() {
-            Ok(MaybeDeletedIndexPart::Deleted)
+            Ok(MaybeDeletedIndexPart::Deleted(index_part))
        } else {
            Ok(MaybeDeletedIndexPart::IndexPart(index_part))
        }
@@ -622,7 +657,11 @@ impl RemoteTimelineClient {
            // schedule the actual deletions
            for name in names {
-                let op = UploadOp::Delete(RemoteOpFileKind::Layer, name.clone());
+                let op = UploadOp::Delete(Delete {
                    file_kind: RemoteOpFileKind::Layer,
                    layer_file_name: name.clone(),
                    scheduled_from_timeline_delete: false,
                });
                self.calls_unfinished_metric_begin(&op);
                upload_queue.queued_operations.push_back(op);
                info!("scheduled layer file deletion {}", name.file_name());
@@ -639,18 +678,11 @@ impl RemoteTimelineClient {
    /// Wait for all previously scheduled uploads/deletions to complete
    ///
    pub async fn wait_completion(self: &Arc<Self>) -> anyhow::Result<()> {
-        let (sender, mut receiver) = tokio::sync::watch::channel(());
+        let mut receiver = {
        let barrier_op = UploadOp::Barrier(sender);
        {
            let mut guard = self.upload_queue.lock().unwrap();
            let upload_queue = guard.initialized_mut()?;
-            upload_queue.queued_operations.push_back(barrier_op);
+            self.schedule_barrier(upload_queue)
-            // Don't count this kind of operation!
+        };
            // Launch the task immediately, if possible
            self.launch_queued_tasks(upload_queue);
        }
        if receiver.changed().await.is_err() {
            anyhow::bail!("wait_completion aborted because upload queue was stopped");
@@ -658,6 +690,22 @@ impl RemoteTimelineClient {
        Ok(())
    }
    /// Enqueue a barrier operation at the back of `upload_queue` and return the
    /// receiving half of the watch channel whose sender is carried by the barrier.
    fn schedule_barrier(
        self: &Arc<Self>,
        upload_queue: &mut UploadQueueInitialized,
    ) -> tokio::sync::watch::Receiver<()> {
        let (barrier_tx, barrier_rx) = tokio::sync::watch::channel(());
        // Don't count this kind of operation!
        upload_queue
            .queued_operations
            .push_back(UploadOp::Barrier(barrier_tx));
        // Launch the task immediately, if possible
        self.launch_queued_tasks(upload_queue);
        barrier_rx
    }
    /// Set the deleted_at field in the remote index file.
    ///
    /// This fails if the upload queue has not been `stop()`ed.
@@ -665,6 +713,7 @@ impl RemoteTimelineClient {
    /// The caller is responsible for calling `stop()` AND for waiting
    /// for any ongoing upload tasks to finish after `stop()` has succeeded.
    /// Check method [`RemoteTimelineClient::stop`] for details.
    #[instrument(skip_all)]
    pub(crate) async fn persist_index_part_with_deleted_flag(
        self: &Arc<Self>,
    ) -> Result<(), PersistIndexPartWithDeletedFlagError> {
@@ -674,15 +723,7 @@ impl RemoteTimelineClient {
            // We must be in stopped state because otherwise
            // we can have inprogress index part upload that can overwrite the file
            // with missing is_deleted flag that we going to set below
-            let stopped = match &mut *locked {
+            let stopped = locked.stopped_mut()?;
                UploadQueue::Uninitialized => {
                    return Err(anyhow::anyhow!("is not Stopped but Uninitialized").into())
                }
                UploadQueue::Initialized(_) => {
                    return Err(anyhow::anyhow!("is not Stopped but Initialized").into())
                }
                UploadQueue::Stopped(stopped) => stopped,
            };
            match stopped.deleted_at {
                SetDeletedFlagProgress::NotRunning => (), // proceed
@@ -696,48 +737,34 @@ impl RemoteTimelineClient {
            let deleted_at = Utc::now().naive_utc();
            stopped.deleted_at = SetDeletedFlagProgress::InProgress(deleted_at);
-            let mut index_part = IndexPart::new(
+            let mut index_part = IndexPart::try_from(&stopped.upload_queue_for_deletion)
-                stopped.latest_files.clone(),
+                .context("IndexPart serialize")?;
                stopped.last_uploaded_consistent_lsn,
                stopped
                    .latest_metadata
                    .to_bytes()
                    .context("serialize metadata")?,
            );
            index_part.deleted_at = Some(deleted_at);
            index_part
        };
        let undo_deleted_at = scopeguard::guard(Arc::clone(self), |self_clone| {
            let mut locked = self_clone.upload_queue.lock().unwrap();
-            let stopped = match &mut *locked {
+            let stopped = locked
-                UploadQueue::Uninitialized | UploadQueue::Initialized(_) => unreachable!(
+                .stopped_mut()
-                    "there's no way out of Stopping, and we checked it's Stopping above: {:?}",
+                .expect("there's no way out of Stopping, and we checked it's Stopping above");
                    locked.as_str(),
                ),
                UploadQueue::Stopped(stopped) => stopped,
            };
            stopped.deleted_at = SetDeletedFlagProgress::NotRunning;
        });
        // Have a failpoint that can use the `pause` failpoint action.
        // We don't want to block the executor thread, hence, spawn_blocking + await.
-        #[cfg(feature = "testing")]
+        if cfg!(feature = "testing") {
-        tokio::task::spawn_blocking({
+            tokio::task::spawn_blocking({
-            let current = tracing::Span::current();
+                let current = tracing::Span::current();
-            move || {
+                move || {
-                let _entered = current.entered();
+                    let _entered = current.entered();
-                tracing::info!(
+                    tracing::info!("at failpoint persist_deleted_index_part");
-                    "at failpoint persist_index_part_with_deleted_flag_after_set_before_upload_pause"
+                    fail::fail_point!("persist_deleted_index_part");
-                );
+                }
-                fail::fail_point!(
+            })
-                    "persist_index_part_with_deleted_flag_after_set_before_upload_pause"
+            .await
-                );
+            .expect("spawn_blocking");
-            }
+        }
        })
        .await
        .expect("spawn_blocking");
        upload::upload_index_part(
            self.conf,
            &self.storage_impl,
@@ -751,13 +778,10 @@ impl RemoteTimelineClient {
        ScopeGuard::into_inner(undo_deleted_at);
        {
            let mut locked = self.upload_queue.lock().unwrap();
-            let stopped = match &mut *locked {
+
-                UploadQueue::Uninitialized | UploadQueue::Initialized(_) => unreachable!(
+            let stopped = locked
-                    "there's no way out of Stopping, and we checked it's Stopping above: {:?}",
+                .stopped_mut()
-                    locked.as_str(),
+                .expect("there's no way out of Stopping, and we checked it's Stopping above");
                ),
                UploadQueue::Stopped(stopped) => stopped,
            };
            stopped.deleted_at = SetDeletedFlagProgress::Successful(
                index_part_with_deleted_at
                    .deleted_at
@@ -768,6 +792,90 @@ impl RemoteTimelineClient {
        Ok(())
    }
    /// Deletes everything this timeline has in remote storage, removing the index part last.
    ///
    /// Prerequisites: UploadQueue should be in stopped state and deleted_at should be successfully set.
    /// An error is returned, before anything is scheduled, if `stopped_mut()` fails or if
    /// `deleted_at` is not `SetDeletedFlagProgress::Successful`.
    ///
    /// The function deletes layer files one by one, then lists the prefix to see if we leaked something,
    /// deletes leaked files if any and proceeds with deletion of index file at the end.
    pub(crate) async fn delete_all(self: &Arc<Self>) -> anyhow::Result<()> {
        debug_assert_current_span_has_tenant_and_timeline_id();
        // Schedule the deletions while holding the upload queue lock. The guard lives only
        // inside this block, so it is released before the first `.await` below.
        let (mut receiver, deletions_queued) = {
            let mut deletions_queued = 0;
            let mut locked = self.upload_queue.lock().unwrap();
            let stopped = locked.stopped_mut()?;
            if !matches!(stopped.deleted_at, SetDeletedFlagProgress::Successful(_)) {
                anyhow::bail!("deleted_at is not set")
            }
            // Debug builds only: check `no_pending_work()` on the queue reserved for deletions.
            debug_assert!(stopped.upload_queue_for_deletion.no_pending_work());
            // Make room for one queued operation per known layer file.
            stopped
                .upload_queue_for_deletion
                .queued_operations
                .reserve(stopped.upload_queue_for_deletion.latest_files.len());
            // schedule the actual deletions
            for name in stopped.upload_queue_for_deletion.latest_files.keys() {
                // Mark the operation as coming from timeline deletion so that it can be told
                // apart from deletions that were scheduled before the queue was stopped.
                let op = UploadOp::Delete(Delete {
                    file_kind: RemoteOpFileKind::Layer,
                    layer_file_name: name.clone(),
                    scheduled_from_timeline_delete: true,
                });
                self.calls_unfinished_metric_begin(&op);
                stopped
                    .upload_queue_for_deletion
                    .queued_operations
                    .push_back(op);
                info!("scheduled layer file deletion {}", name.file_name());
                deletions_queued += 1;
            }
            // Kick off the queued deletions, then schedule a barrier and hand its receiver
            // (together with the number of queued deletions) out of the locked block.
            self.launch_queued_tasks(&mut stopped.upload_queue_for_deletion);
            (
                self.schedule_barrier(&mut stopped.upload_queue_for_deletion),
                deletions_queued,
            )
        };
        // Wait for the barrier receiver to report a change.
        // NOTE(review): presumably this fires once the deletions queued ahead of the barrier
        // have completed; confirm against `schedule_barrier`.
        receiver.changed().await?;
        // Do not delete the index part yet, it is needed for a possible retry. If we removed it first
        // and the retry arrived at a different pageserver, there wouldn't be any traces of it on remote storage.
        let timeline_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id);
        let timeline_storage_path = self.conf.remote_path(&timeline_path)?;
        let remaining = self
            .storage_impl
            .list_prefixes(Some(&timeline_storage_path))
            .await?;
        // Keep the index part out of the bulk deletion; it is removed last, below.
        let remaining: Vec<RemotePath> = remaining
            .into_iter()
            .filter(|p| p.object_name() != Some(IndexPart::FILE_NAME))
            .collect();
        if !remaining.is_empty() {
            warn!(
                "Found {} files not bound to index_file.json, proceeding with their deletion",
                remaining.len()
            );
            warn!("About to remove {} files", remaining.len());
            // Remove all leftovers with a single `delete_objects` call.
            self.storage_impl.delete_objects(&remaining).await?;
        }
        // Everything else is gone; only now remove the index part itself.
        let index_file_path = timeline_storage_path.join(Path::new(IndexPart::FILE_NAME));
        debug!("deleting index part");
        self.storage_impl.delete(&index_file_path).await?;
        info!(deletions_queued, "done deleting, including index_part.json");
        Ok(())
    }
    ///
    /// Pick next tasks from the queue, and start as many of them as possible without violating
    /// the ordering constraints.
@@ -786,7 +894,7 @@ impl RemoteTimelineClient {
                    // have finished.
                    upload_queue.inprogress_tasks.is_empty()
                }
-                UploadOp::Delete(_, _) => {
+                UploadOp::Delete(_) => {
                    // Wait for preceding uploads to finish. Concurrent deletions are OK, though.
                    upload_queue.num_inprogress_deletions == upload_queue.inprogress_tasks.len()
                }
@@ -817,7 +925,7 @@ impl RemoteTimelineClient {
                UploadOp::UploadMetadata(_, _) => {
                    upload_queue.num_inprogress_metadata_uploads += 1;
                }
-                UploadOp::Delete(_, _) => {
+                UploadOp::Delete(_) => {
                    upload_queue.num_inprogress_deletions += 1;
                }
                UploadOp::Barrier(sender) => {
@@ -891,7 +999,6 @@ impl RemoteTimelineClient {
                        unreachable!("we never launch an upload task if the queue is uninitialized, and once it is initialized, we never go back")
                    }
                }
                self.calls_unfinished_metric_end(&task.op);
                return;
            }
@@ -937,16 +1044,16 @@ impl RemoteTimelineClient {
                    }
                    res
                }
-                UploadOp::Delete(metric_file_kind, ref layer_file_name) => {
+                UploadOp::Delete(delete) => {
                    let path = &self
                        .conf
                        .timeline_path(&self.timeline_id, &self.tenant_id)
-                        .join(layer_file_name.file_name());
+                        .join(delete.layer_file_name.file_name());
                    delete::delete_layer(self.conf, &self.storage_impl, path)
                        .measure_remote_op(
                            self.tenant_id,
                            self.timeline_id,
-                            *metric_file_kind,
+                            delete.file_kind,
                            RemoteOpKind::Delete,
                            Arc::clone(&self.metrics),
                        )
@@ -1012,11 +1119,24 @@ impl RemoteTimelineClient {
            let mut upload_queue_guard = self.upload_queue.lock().unwrap();
            let upload_queue = match upload_queue_guard.deref_mut() {
                UploadQueue::Uninitialized => panic!("callers are responsible for ensuring this is only called on an initialized queue"),
-                UploadQueue::Stopped(_) => {
+                UploadQueue::Stopped(stopped) => {
                    // Special care is needed for deletions, if it was an earlier deletion (not scheduled from deletion)
                    // then stop() took care of it so we just return.
                    // For deletions that come from delete_all we still want to maintain metrics, launch following tasks, etc.
                    match &task.op {
                        UploadOp::Delete(delete) if delete.scheduled_from_timeline_delete => Some(&mut stopped.upload_queue_for_deletion),
                        _ => None
                    }
                },
                UploadQueue::Initialized(qi) => { Some(qi) }
            };
            let upload_queue = match upload_queue {
                Some(upload_queue) => upload_queue,
                None => {
                    info!("another concurrent task already stopped the queue");
                    return;
-                }, // nothing to do
+                }
                UploadQueue::Initialized(qi) => { qi }
            };
            upload_queue.inprogress_tasks.remove(&task.task_id);
@@ -1029,7 +1149,7 @@ impl RemoteTimelineClient {
                    upload_queue.num_inprogress_metadata_uploads -= 1;
                    upload_queue.last_uploaded_consistent_lsn = lsn; // XXX monotonicity check?
                }
-                UploadOp::Delete(_, _) => {
+                UploadOp::Delete(_) => {
                    upload_queue.num_inprogress_deletions -= 1;
                }
                UploadOp::Barrier(_) => unreachable!(),
@@ -1063,8 +1183,8 @@ impl RemoteTimelineClient {
                    reason: "metadata uploads are tiny",
                },
            ),
-            UploadOp::Delete(file_kind, _) => (
+            UploadOp::Delete(delete) => (
-                *file_kind,
+                delete.file_kind,
                RemoteOpKind::Delete,
                DontTrackSize {
                    reason: "should we track deletes? positive or negative sign?",
@@ -1111,32 +1231,36 @@ impl RemoteTimelineClient {
                info!("another concurrent task already shut down the queue");
                Ok(())
            }
-            UploadQueue::Initialized(UploadQueueInitialized {
+            UploadQueue::Initialized(initialized) => {
                latest_files,
                latest_metadata,
                last_uploaded_consistent_lsn,
                ..
            }) => {
                info!("shutting down upload queue");
                // Replace the queue with the Stopped state, taking ownership of the old
                // Initialized queue. We will do some checks on it, and then drop it.
                let qi = {
-                    // take or clone what we need
+                    // Here we preserve working version of the upload queue for possible use during deletions.
-                    let latest_files = std::mem::take(latest_files);
+                    // In-place replace of Initialized to Stopped can be done with the help of https://github.com/Sgeo/take_mut
-                    let last_uploaded_consistent_lsn = *last_uploaded_consistent_lsn;
+                    // but for this use case it doesn't really make sense to bring in unsafe code just for this one usage point.
-                    // this could be Copy
+                    // Deletion is not really perf sensitive so there shouldn't be any problems with cloning a fraction of it.
-                    let latest_metadata = latest_metadata.clone();
+                    let upload_queue_for_deletion = UploadQueueInitialized {
-
+                        task_counter: 0,
-                    let stopped = UploadQueueStopped {
+                        latest_files: initialized.latest_files.clone(),
-                        latest_files,
+                        latest_files_changes_since_metadata_upload_scheduled: 0,
-                        last_uploaded_consistent_lsn,
+                        latest_metadata: initialized.latest_metadata.clone(),
-                        latest_metadata,
+                        last_uploaded_consistent_lsn: initialized.last_uploaded_consistent_lsn,
-                        deleted_at: SetDeletedFlagProgress::NotRunning,
+                        num_inprogress_layer_uploads: 0,
                        num_inprogress_metadata_uploads: 0,
                        num_inprogress_deletions: 0,
                        inprogress_tasks: HashMap::default(),
                        queued_operations: VecDeque::default(),
                    };
-                    let upload_queue =
+                    let upload_queue = std::mem::replace(
-                        std::mem::replace(&mut *guard, UploadQueue::Stopped(stopped));
+                        &mut *guard,
                        UploadQueue::Stopped(UploadQueueStopped {
                            upload_queue_for_deletion,
                            deleted_at: SetDeletedFlagProgress::NotRunning,
                        }),
                    );
                    if let UploadQueue::Initialized(qi) = upload_queue {
                        qi
                    } else {
@@ -1144,8 +1268,6 @@ impl RemoteTimelineClient {
                    }
                };
                assert!(qi.latest_files.is_empty(), "do not use this anymore");
                // consistency check
                assert_eq!(
                    qi.num_inprogress_layer_uploads
@@ -1243,7 +1365,7 @@ mod tests {
    struct TestSetup {
        runtime: &'static tokio::runtime::Runtime,
        entered_runtime: EnterGuard<'static>,
-        harness: TenantHarness<'static>,
+        harness: TenantHarness,
        tenant: Arc<Tenant>,
        tenant_ctx: RequestContext,
        remote_fs_dir: PathBuf,
@@ -1264,7 +1386,12 @@ mod tests {
            let harness = TenantHarness::create(test_name)?;
            let (tenant, ctx) = runtime.block_on(harness.load());
            // create an empty timeline directory
-            let _ = tenant.create_test_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?;
+            let _ = runtime.block_on(tenant.create_test_timeline(
                TIMELINE_ID,
                Lsn(8),
                DEFAULT_PG_VERSION,
                &ctx,
            ))?;
            let remote_fs_dir = harness.conf.workdir.join("remote_fs");
            std::fs::create_dir_all(remote_fs_dir)?;
@@ -1408,7 +1535,7 @@ mod tests {
        // Download back the index.json, and check that the list of files is correct
        let index_part = match runtime.block_on(client.download_index_file())? {
            MaybeDeletedIndexPart::IndexPart(index_part) => index_part,
-            MaybeDeletedIndexPart::Deleted => panic!("unexpectedly got deleted index part"),
+            MaybeDeletedIndexPart::Deleted(_) => panic!("unexpectedly got deleted index part"),
        };
        assert_file_list(
--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -7,9 +7,11 @@ use std::collections::{HashMap, HashSet};
 use chrono::NaiveDateTime;
 use serde::{Deserialize, Serialize};
 use serde_with::{serde_as, DisplayFromStr};
 use utils::bin_ser::SerializeError;
 use crate::tenant::metadata::TimelineMetadata;
 use crate::tenant::storage_layer::LayerFileName;
 use crate::tenant::upload_queue::UploadQueueInitialized;
 use utils::lsn::Lsn;
@@ -115,6 +117,21 @@ impl IndexPart {
    }
 }
 impl TryFrom<&UploadQueueInitialized> for IndexPart {
    type Error = SerializeError;

    /// Builds an `IndexPart` from the current state of an initialized upload queue.
    ///
    /// Uses the queue's `latest_files` (cloned) together with the disk-consistent LSN and the
    /// serialized bytes of `latest_metadata`. Fails only if serializing the metadata fails.
    fn try_from(upload_queue: &UploadQueueInitialized) -> Result<Self, Self::Error> {
        let metadata = &upload_queue.latest_metadata;
        let lsn = metadata.disk_consistent_lsn();
        // Serialize before cloning the file map so that a serialization error returns early.
        let serialized = metadata.to_bytes()?;
        let layers = upload_queue.latest_files.clone();
        Ok(Self::new(layers, lsn, serialized))
    }
 }
 /// Serialized form of [`LayerFileMetadata`].
 #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, Default)]
 pub struct IndexLayerMetadata {
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -38,7 +38,7 @@ pub use delta_layer::{DeltaLayer, DeltaLayerWriter};
 pub use filename::{DeltaFileName, ImageFileName, LayerFileName};
 pub use image_layer::{ImageLayer, ImageLayerWriter};
 pub use inmemory_layer::InMemoryLayer;
-pub use layer_desc::PersistentLayerDesc;
+pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey};
 pub use remote_layer::RemoteLayer;
 use super::layer_map::BatchedUpdates;
@@ -389,10 +389,10 @@ pub trait Layer: std::fmt::Debug + Send + Sync {
 }
 /// Returned by [`Layer::iter`]
-pub type LayerIter<'i> = Box<dyn Iterator<Item = Result<(Key, Lsn, Value)>> + 'i>;
+pub type LayerIter<'i> = Box<dyn Iterator<Item = Result<(Key, Lsn, Value)>> + 'i + Send>;
 /// Returned by [`Layer::key_iter`]
-pub type LayerKeyIter<'i> = Box<dyn Iterator<Item = (Key, Lsn, u64)> + 'i>;
+pub type LayerKeyIter<'i> = Box<dyn Iterator<Item = (Key, Lsn, u64)> + 'i + Send>;
 /// A Layer contains all data in a "rectangle" consisting of a range of keys and
 /// range of LSNs.
@@ -454,7 +454,9 @@ pub trait PersistentLayer: Layer {
    ///
    /// Should not change over the lifetime of the layer object because
    /// current_physical_size is computed as the sum of this value.
-    fn file_size(&self) -> u64;
+    fn file_size(&self) -> u64 {
        self.layer_desc().file_size
    }
    fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo;
@@ -483,6 +485,20 @@ pub struct LayerDescriptor {
    pub short_id: String,
 }
 impl LayerDescriptor {
    /// `LayerDescriptor` is only used for testing purpose so it does not matter whether it is image / delta,
    /// and the tenant / timeline id does not matter.
    ///
    /// Builds a [`PersistentLayerDesc`] via `new_delta` from clones of this descriptor's `key`
    /// and `lsn` fields, using all-zero tenant and timeline ids.
    pub fn get_persistent_layer_desc(&self) -> PersistentLayerDesc {
        PersistentLayerDesc::new_delta(
            // Placeholder ids: every byte is zero.
            TenantId::from_array([0; 16]),
            TimelineId::from_array([0; 16]),
            self.key.clone(),
            self.lsn.clone(),
            // NOTE(review): presumably an arbitrary placeholder file size; confirm against `new_delta`.
            233,
        )
    }
 }
 impl Layer for LayerDescriptor {
    fn get_key_range(&self) -> Range<Key> {
        self.key.clone()
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -37,6 +37,7 @@ use crate::virtual_file::VirtualFile;
 use crate::{walrecord, TEMP_FILE_SUFFIX};
 use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
 use anyhow::{bail, ensure, Context, Result};
 use once_cell::sync::OnceCell;
 use pageserver_api::models::{HistoricLayerInfo, LayerAccessKind};
 use rand::{distributions::Alphanumeric, Rng};
 use serde::{Deserialize, Serialize};
@@ -46,7 +47,6 @@ use std::io::{Seek, SeekFrom};
 use std::ops::Range;
 use std::os::unix::fs::FileExt;
 use std::path::{Path, PathBuf};
 use std::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};
 use tracing::*;
 use utils::{
@@ -182,11 +182,9 @@ pub struct DeltaLayer {
    pub desc: PersistentLayerDesc,
    pub file_size: u64,
    access_stats: LayerAccessStats,
-    inner: RwLock<DeltaLayerInner>,
+    inner: OnceCell<DeltaLayerInner>,
 }
 impl std::fmt::Debug for DeltaLayer {
@@ -196,28 +194,24 @@ impl std::fmt::Debug for DeltaLayer {
        f.debug_struct("DeltaLayer")
            .field("key_range", &RangeDisplayDebug(&self.desc.key_range))
            .field("lsn_range", &self.desc.lsn_range)
-            .field("file_size", &self.file_size)
+            .field("file_size", &self.desc.file_size)
            .field("inner", &self.inner)
            .finish()
    }
 }
 pub struct DeltaLayerInner {
    /// If false, the fields below have not been loaded into memory yet.
    loaded: bool,
    // values copied from summary
    index_start_blk: u32,
    index_root_blk: u32,
-    /// Reader object for reading blocks from the file. (None if not loaded yet)
+    /// Reader object for reading blocks from the file.
-    file: Option<FileBlockReader<VirtualFile>>,
+    file: FileBlockReader<VirtualFile>,
 }
 impl std::fmt::Debug for DeltaLayerInner {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("DeltaLayerInner")
            .field("loaded", &self.loaded)
            .field("index_start_blk", &self.index_start_blk)
            .field("index_root_blk", &self.index_root_blk)
            .finish()
@@ -248,7 +242,7 @@ impl Layer for DeltaLayer {
            inner.index_start_blk, inner.index_root_blk
        );
-        let file = inner.file.as_ref().unwrap();
+        let file = &inner.file;
        let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
            inner.index_start_blk,
            inner.index_root_blk,
@@ -317,7 +311,7 @@ impl Layer for DeltaLayer {
            let inner = self.load(LayerAccessKind::GetValueReconstructData, ctx)?;
            // Scan the page versions backwards, starting from `lsn`.
-            let file = inner.file.as_ref().unwrap();
+            let file = &inner.file;
            let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
                inner.index_start_blk,
                inner.index_root_blk,
@@ -439,10 +433,6 @@ impl PersistentLayer for DeltaLayer {
        Ok(())
    }
    fn file_size(&self) -> u64 {
        self.file_size
    }
    fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
        let layer_file_name = self.filename().file_name();
        let lsn_range = self.get_lsn_range();
@@ -451,7 +441,7 @@ impl PersistentLayer for DeltaLayer {
        HistoricLayerInfo::Delta {
            layer_file_name,
-            layer_file_size: self.file_size,
+            layer_file_size: self.desc.file_size,
            lsn_start: lsn_range.start,
            lsn_end: lsn_range.end,
            remote: false,
@@ -506,51 +496,22 @@ impl DeltaLayer {
    /// Open the underlying file and read the metadata into memory, if it's
    /// not loaded already.
    ///
-    fn load(
+    fn load(&self, access_kind: LayerAccessKind, ctx: &RequestContext) -> Result<&DeltaLayerInner> {
        &self,
        access_kind: LayerAccessKind,
        ctx: &RequestContext,
    ) -> Result<RwLockReadGuard<DeltaLayerInner>> {
        self.access_stats
            .record_access(access_kind, ctx.task_kind());
-        loop {
+        // Quick exit if already loaded
-            // Quick exit if already loaded
+        self.inner
-            let inner = self.inner.read().unwrap();
+            .get_or_try_init(|| self.load_inner())
-            if inner.loaded {
+            .with_context(|| format!("Failed to load delta layer {}", self.path().display()))
                return Ok(inner);
            }
            // Need to open the file and load the metadata. Upgrade our lock to
            // a write lock. (Or rather, release and re-lock in write mode.)
            drop(inner);
            let inner = self.inner.write().unwrap();
            if !inner.loaded {
                self.load_inner(inner).with_context(|| {
                    format!("Failed to load delta layer {}", self.path().display())
                })?;
            } else {
                // Another thread loaded it while we were not holding the lock.
            }
            // We now have the file open and loaded. There's no function to do
            // that in the std library RwLock, so we have to release and re-lock
            // in read mode. (To be precise, the lock guard was moved in the
            // above call to `load_inner`, so it's already been released). And
            // while we do that, another thread could unload again, so we have
            // to re-check and retry if that happens.
        }
    }
-    fn load_inner(&self, mut inner: RwLockWriteGuard<DeltaLayerInner>) -> Result<()> {
+    fn load_inner(&self) -> Result<DeltaLayerInner> {
        let path = self.path();
-        // Open the file if it's not open already.
+        let file = VirtualFile::open(&path)
-        if inner.file.is_none() {
+            .with_context(|| format!("Failed to open file '{}'", path.display()))?;
-            let file = VirtualFile::open(&path)
+        let file = FileBlockReader::new(file);
-                .with_context(|| format!("Failed to open file '{}'", path.display()))?;
+
            inner.file = Some(FileBlockReader::new(file));
        }
        let file = inner.file.as_mut().unwrap();
        let summary_blk = file.read_blk(0)?;
        let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
@@ -577,13 +538,13 @@ impl DeltaLayer {
            }
        }
        inner.index_start_blk = actual_summary.index_start_blk;
        inner.index_root_blk = actual_summary.index_root_blk;
        debug!("loaded from {}", &path.display());
-        inner.loaded = true;
+        Ok(DeltaLayerInner {
-        Ok(())
+            file,
            index_start_blk: actual_summary.index_start_blk,
            index_root_blk: actual_summary.index_root_blk,
        })
    }
    /// Create a DeltaLayer struct representing an existing file on disk.
@@ -602,15 +563,10 @@ impl DeltaLayer {
                timeline_id,
                filename.key_range.clone(),
                filename.lsn_range.clone(),
                file_size,
            ),
            file_size,
            access_stats,
-            inner: RwLock::new(DeltaLayerInner {
+            inner: once_cell::sync::OnceCell::new(),
                loaded: false,
                file: None,
                index_start_blk: 0,
                index_root_blk: 0,
            }),
        }
    }
@@ -634,15 +590,10 @@ impl DeltaLayer {
                summary.timeline_id,
                summary.key_range,
                summary.lsn_range,
                metadata.len(),
            ),
            file_size: metadata.len(),
            access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
-            inner: RwLock::new(DeltaLayerInner {
+            inner: once_cell::sync::OnceCell::new(),
                loaded: false,
                file: None,
                index_start_blk: 0,
                index_root_blk: 0,
            }),
        })
    }
@@ -803,15 +754,10 @@ impl DeltaLayerWriterInner {
                self.timeline_id,
                self.key_start..key_end,
                self.lsn_range.clone(),
                metadata.len(),
            ),
            file_size: metadata.len(),
            access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
-            inner: RwLock::new(DeltaLayerInner {
+            inner: once_cell::sync::OnceCell::new(),
                loaded: false,
                file: None,
                index_start_blk,
                index_root_blk,
            }),
        };
        // fsync the file
@@ -946,13 +892,13 @@ struct DeltaValueIter<'a> {
    reader: BlockCursor<Adapter<'a>>,
 }
-struct Adapter<'a>(RwLockReadGuard<'a, DeltaLayerInner>);
+struct Adapter<'a>(&'a DeltaLayerInner);
 impl<'a> BlockReader for Adapter<'a> {
    type BlockLease = PageReadGuard<'static>;
    fn read_blk(&self, blknum: u32) -> Result<Self::BlockLease, std::io::Error> {
-        self.0.file.as_ref().unwrap().read_blk(blknum)
+        self.0.file.read_blk(blknum)
    }
 }
@@ -965,8 +911,8 @@ impl<'a> Iterator for DeltaValueIter<'a> {
 }
 impl<'a> DeltaValueIter<'a> {
-    fn new(inner: RwLockReadGuard<'a, DeltaLayerInner>) -> Result<Self> {
+    fn new(inner: &'a DeltaLayerInner) -> Result<Self> {
-        let file = inner.file.as_ref().unwrap();
+        let file = &inner.file;
        let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
            inner.index_start_blk,
            inner.index_root_blk,
@@ -1039,8 +985,8 @@ impl Iterator for DeltaKeyIter {
 }
 impl<'a> DeltaKeyIter {
-    fn new(inner: RwLockReadGuard<'a, DeltaLayerInner>) -> Result<Self> {
+    fn new(inner: &'a DeltaLayerInner) -> Result<Self> {
-        let file = inner.file.as_ref().unwrap();
+        let file = &inner.file;
        let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
            inner.index_start_blk,
            inner.index_root_blk,
@@ -1080,3 +1026,21 @@ impl<'a> DeltaKeyIter {
        Ok(iter)
    }
 }
 #[cfg(test)]
 mod test {
    use super::DeltaKeyIter;
    use super::DeltaLayer;
    use super::DeltaValueIter;
    // We will soon need the iterators to be `Send` in the compaction code.
    // Cf https://github.com/neondatabase/neon/pull/4462#issuecomment-1587398883
    // Cf https://github.com/neondatabase/neon/issues/4471
    /// Compile-time check: this test only builds if the listed types implement `Send`.
    #[test]
    fn is_send() {
        // Never does anything at runtime; the `T: Send` bound is what is being checked.
        fn assert_send<T: Send>() {}
        assert_send::<DeltaLayer>();
        assert_send::<DeltaValueIter>();
        assert_send::<DeltaKeyIter>();
    }
 }
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -109,8 +109,6 @@ pub struct ImageLayer {
    // This entry contains an image of all pages as of this LSN, should be the same as desc.lsn
    pub lsn: Lsn,
    pub file_size: u64,
    access_stats: LayerAccessStats,
    inner: RwLock<ImageLayerInner>,
@@ -122,7 +120,7 @@ impl std::fmt::Debug for ImageLayer {
        f.debug_struct("ImageLayer")
            .field("key_range", &RangeDisplayDebug(&self.desc.key_range))
-            .field("file_size", &self.file_size)
+            .field("file_size", &self.desc.file_size)
            .field("lsn", &self.lsn)
            .field("inner", &self.inner)
            .finish()
@@ -258,17 +256,13 @@ impl PersistentLayer for ImageLayer {
        Ok(())
    }
    fn file_size(&self) -> u64 {
        self.file_size
    }
    fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
        let layer_file_name = self.filename().file_name();
        let lsn_range = self.get_lsn_range();
        HistoricLayerInfo::Image {
            layer_file_name,
-            layer_file_size: self.file_size,
+            layer_file_size: self.desc.file_size,
            lsn_start: lsn_range.start,
            remote: false,
            access_stats: self.access_stats.as_api_model(reset),
@@ -411,9 +405,9 @@ impl ImageLayer {
                filename.key_range.clone(),
                filename.lsn,
                false,
                file_size,
            ), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
            lsn: filename.lsn,
            file_size,
            access_stats,
            inner: RwLock::new(ImageLayerInner {
                loaded: false,
@@ -443,9 +437,9 @@ impl ImageLayer {
                summary.key_range,
                summary.lsn,
                false,
                metadata.len(),
            ), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
            lsn: summary.lsn,
            file_size: metadata.len(),
            access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
            inner: RwLock::new(ImageLayerInner {
                file: None,
@@ -578,14 +572,6 @@ impl ImageLayerWriterInner {
            file.write_all(buf.as_ref())?;
        }
        let desc = PersistentLayerDesc::new_img(
            self.tenant_id,
            self.timeline_id,
            self.key_range.clone(),
            self.lsn,
            self.is_incremental, // for now, image layer ALWAYS covers the full range
        );
        // Fill in the summary on blk 0
        let summary = Summary {
            magic: IMAGE_FILE_MAGIC,
@@ -604,6 +590,15 @@ impl ImageLayerWriterInner {
            .metadata()
            .context("get metadata to determine file size")?;
        let desc = PersistentLayerDesc::new_img(
            self.tenant_id,
            self.timeline_id,
            self.key_range.clone(),
            self.lsn,
            self.is_incremental, // for now, image layer ALWAYS covers the full range
            metadata.len(),
        );
        // Note: Because we open the file in write-only mode, we cannot
        // reuse the same VirtualFile for reading later. That's why we don't
        // set inner.file here. The first read will have to re-open it.
@@ -611,7 +606,6 @@ impl ImageLayerWriterInner {
            path_or_conf: PathOrConf::Conf(self.conf),
            desc,
            lsn: self.lsn,
            file_size: metadata.len(),
            access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
            inner: RwLock::new(ImageLayerInner {
                loaded: false,
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -304,7 +304,7 @@ impl InMemoryLayer {
        Ok(())
    }
-    pub fn put_tombstone(&self, _key_range: Range<Key>, _lsn: Lsn) -> Result<()> {
+    pub async fn put_tombstone(&self, _key_range: Range<Key>, _lsn: Lsn) -> Result<()> {
        // TODO: Currently, we just leak the storage for any deleted keys
        Ok(())
--- a/pageserver/src/tenant/storage_layer/layer_desc.rs
+++ b/pageserver/src/tenant/storage_layer/layer_desc.rs
@@ -1,17 +1,20 @@
 use anyhow::Result;
 use std::ops::Range;
 use utils::{
    id::{TenantId, TimelineId},
    lsn::Lsn,
 };
-use crate::repository::Key;
+use crate::{context::RequestContext, repository::Key};
 use super::{DeltaFileName, ImageFileName, LayerFileName};
 use serde::{Deserialize, Serialize};
 /// A unique identifier of a persistent layer. This is different from `LayerDescriptor`, which is only used in the
 /// benchmarks. This struct contains all necessary information to find the image / delta layer. It also provides
 /// a unified way to generate layer information like file name.
-#[derive(Debug, PartialEq, Eq, Clone)]
+#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
 pub struct PersistentLayerDesc {
    pub tenant_id: TenantId,
    pub timeline_id: TimelineId,
@@ -24,19 +27,51 @@ pub struct PersistentLayerDesc {
    /// always be equal to `is_delta`. If we land the partial image layer PR someday, image layer could also be
    /// incremental.
    pub is_incremental: bool,
    /// File size
    pub file_size: u64,
 }
 /// A unique identifier of a persistent layer within the context of one timeline.
 ///
 /// Unlike `PersistentLayerDesc`, this carries no tenant or timeline id: only the key range,
 /// the LSN range and the delta flag. It derives `Hash` in addition to the equality traits.
 #[derive(Debug, PartialEq, Eq, Clone, Hash)]
 pub struct PersistentLayerKey {
    /// Range of keys covered by the layer.
    pub key_range: Range<Key>,
    /// Range of LSNs covered by the layer.
    pub lsn_range: Range<Lsn>,
    /// `true` for a delta layer, `false` for an image layer.
    pub is_delta: bool,
 }
 impl PersistentLayerDesc {
    pub fn key(&self) -> PersistentLayerKey {
        PersistentLayerKey {
            key_range: self.key_range.clone(),
            lsn_range: self.lsn_range.clone(),
            is_delta: self.is_delta,
        }
    }
    /// Returns a short identifier for this layer: the string produced by
    /// `self.filename().file_name()`.
    pub fn short_id(&self) -> String {
        self.filename().file_name()
    }
    /// Test-only constructor.
    ///
    /// Produces an image-layer descriptor (neither delta nor incremental) over `key_range`,
    /// with freshly generated tenant and timeline ids, the LSN range `Lsn(0)..Lsn(1)` and a
    /// file size of zero.
    #[cfg(test)]
    pub fn new_test(key_range: Range<Key>) -> Self {
        let tenant_id = TenantId::generate();
        let timeline_id = TimelineId::generate();
        let lsn_range = Lsn(0)..Lsn(1);
        Self {
            tenant_id,
            timeline_id,
            key_range,
            lsn_range,
            is_delta: false,
            is_incremental: false,
            file_size: 0,
        }
    }
    pub fn new_img(
        tenant_id: TenantId,
        timeline_id: TimelineId,
        key_range: Range<Key>,
        lsn: Lsn,
        is_incremental: bool,
        file_size: u64,
    ) -> Self {
        Self {
            tenant_id,
@@ -45,6 +80,7 @@ impl PersistentLayerDesc {
            lsn_range: Self::image_layer_lsn_range(lsn),
            is_delta: false,
            is_incremental,
            file_size,
        }
    }
@@ -53,6 +89,7 @@ impl PersistentLayerDesc {
        timeline_id: TimelineId,
        key_range: Range<Key>,
        lsn_range: Range<Lsn>,
        file_size: u64,
    ) -> Self {
        Self {
            tenant_id,
@@ -61,6 +98,7 @@ impl PersistentLayerDesc {
            lsn_range,
            is_delta: true,
            is_incremental: true,
            file_size,
        }
    }
@@ -106,4 +144,48 @@ impl PersistentLayerDesc {
            self.image_file_name().into()
        }
    }
    // TODO: remove this in the future once we refactor timeline APIs.
    /// Returns a clone of the LSN range stored in this descriptor.
    pub fn get_lsn_range(&self) -> Range<Lsn> {
        self.lsn_range.clone()
    }
    /// Returns a clone of the key range stored in this descriptor.
    pub fn get_key_range(&self) -> Range<Key> {
        self.key_range.clone()
    }
    /// Returns the id of the timeline this layer belongs to.
    pub fn get_timeline_id(&self) -> TimelineId {
        self.timeline_id
    }
    /// Returns the id of the tenant this layer belongs to.
    pub fn get_tenant_id(&self) -> TenantId {
        self.tenant_id
    }
    /// Returns the `is_incremental` flag stored in this descriptor.
    pub fn is_incremental(&self) -> bool {
        self.is_incremental
    }
    /// Returns `true` if this descriptor refers to a delta layer, `false` for an image layer.
    pub fn is_delta(&self) -> bool {
        self.is_delta
    }
    /// Prints a one-line summary of this layer (tenant, timeline, key range, LSN range)
    /// to stdout.
    ///
    /// Both `_verbose` and `_ctx` are currently unused, and the function always
    /// returns `Ok(())`.
    pub fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
        let keys = &self.key_range;
        let lsns = &self.lsn_range;
        println!(
            "----- layer for ten {} tli {} keys {}-{} lsn {}-{} ----",
            self.tenant_id, self.timeline_id, keys.start, keys.end, lsns.start, lsns.end
        );
        Ok(())
    }
    /// Returns the file size recorded in this descriptor.
    pub fn file_size(&self) -> u64 {
        self.file_size
    }
 }
--- a/pageserver/src/tenant/storage_layer/remote_layer.rs
+++ b/pageserver/src/tenant/storage_layer/remote_layer.rs
@@ -142,10 +142,6 @@ impl PersistentLayer for RemoteLayer {
        true
    }
    fn file_size(&self) -> u64 {
        self.layer_metadata.file_size()
    }
    fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
        let layer_file_name = self.filename().file_name();
        let lsn_range = self.get_lsn_range();
@@ -190,6 +186,7 @@ impl RemoteLayer {
                fname.key_range.clone(),
                fname.lsn,
                false,
                layer_metadata.file_size(),
            ),
            layer_metadata: layer_metadata.clone(),
            ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)),
@@ -211,6 +208,7 @@ impl RemoteLayer {
                timelineid,
                fname.key_range.clone(),
                fname.lsn_range.clone(),
                layer_metadata.file_size(),
            ),
            layer_metadata: layer_metadata.clone(),
            ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)),
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -14,7 +14,11 @@ use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::completion;
-pub fn start_background_loops(tenant: &Arc<Tenant>, init_done: Option<&completion::Barrier>) {
+/// Start per tenant background loops: compaction and gc.
 pub fn start_background_loops(
    tenant: &Arc<Tenant>,
    background_jobs_can_start: Option<&completion::Barrier>,
 ) {
    let tenant_id = tenant.tenant_id;
    task_mgr::spawn(
        BACKGROUND_RUNTIME.handle(),
@@ -25,10 +29,14 @@ pub fn start_background_loops(tenant: &Arc<Tenant>, init_done: Option<&completio
        false,
        {
            let tenant = Arc::clone(tenant);
-            let init_done = init_done.cloned();
+            let background_jobs_can_start = background_jobs_can_start.cloned();
            async move {
-                completion::Barrier::maybe_wait(init_done).await;
+                let cancel = task_mgr::shutdown_token();
-                compaction_loop(tenant)
+                tokio::select! {
                    _ = cancel.cancelled() => { return Ok(()) },
                    _ = completion::Barrier::maybe_wait(background_jobs_can_start) => {}
                };
                compaction_loop(tenant, cancel)
                    .instrument(info_span!("compaction_loop", tenant_id = %tenant_id))
                    .await;
                Ok(())
@@ -44,10 +52,14 @@ pub fn start_background_loops(tenant: &Arc<Tenant>, init_done: Option<&completio
        false,
        {
            let tenant = Arc::clone(tenant);
-            let init_done = init_done.cloned();
+            let background_jobs_can_start = background_jobs_can_start.cloned();
            async move {
-                completion::Barrier::maybe_wait(init_done).await;
+                let cancel = task_mgr::shutdown_token();
-                gc_loop(tenant)
+                tokio::select! {
                    _ = cancel.cancelled() => { return Ok(()) },
                    _ = completion::Barrier::maybe_wait(background_jobs_can_start) => {}
                };
                gc_loop(tenant, cancel)
                    .instrument(info_span!("gc_loop", tenant_id = %tenant_id))
                    .await;
                Ok(())
@@ -59,12 +71,11 @@ pub fn start_background_loops(tenant: &Arc<Tenant>, init_done: Option<&completio
 ///
 /// Compaction task's main loop
 ///
-async fn compaction_loop(tenant: Arc<Tenant>) {
+async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
    let wait_duration = Duration::from_secs(2);
    info!("starting");
    TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
    async {
        let cancel = task_mgr::shutdown_token();
        let ctx = RequestContext::todo_child(TaskKind::Compaction, DownloadBehavior::Download);
        let mut first = true;
        loop {
@@ -129,12 +140,11 @@ async fn compaction_loop(tenant: Arc<Tenant>) {
 ///
 /// GC task's main loop
 ///
-async fn gc_loop(tenant: Arc<Tenant>) {
+async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
    let wait_duration = Duration::from_secs(2);
    info!("starting");
    TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
    async {
        let cancel = task_mgr::shutdown_token();
        // GC might require downloading, to find the cutoff LSN that corresponds to the
        // cutoff specified as time.
        let ctx =
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -34,6 +34,8 @@ use crate::{
    },
 };
 use utils::completion;
 use super::Timeline;
 #[derive(Default)]
@@ -47,8 +49,12 @@ pub struct EvictionTaskTenantState {
 }
 impl Timeline {
-    pub(super) fn launch_eviction_task(self: &Arc<Self>) {
+    pub(super) fn launch_eviction_task(
        self: &Arc<Self>,
        background_tasks_can_start: Option<&completion::Barrier>,
    ) {
        let self_clone = Arc::clone(self);
        let background_tasks_can_start = background_tasks_can_start.cloned();
        task_mgr::spawn(
            BACKGROUND_RUNTIME.handle(),
            TaskKind::Eviction,
@@ -57,7 +63,13 @@ impl Timeline {
            &format!("layer eviction for {}/{}", self.tenant_id, self.timeline_id),
            false,
            async move {
-                self_clone.eviction_task(task_mgr::shutdown_token()).await;
+                let cancel = task_mgr::shutdown_token();
                tokio::select! {
                    _ = cancel.cancelled() => { return Ok(()); }
                    _ = completion::Barrier::maybe_wait(background_tasks_can_start) => {}
                };
                self_clone.eviction_task(cancel).await;
                info!("eviction task finishing");
                Ok(())
            },
@@ -185,7 +197,7 @@ impl Timeline {
        // We don't want to hold the layer map lock during eviction.
        // So, we just need to deal with this.
        let candidates: Vec<Arc<dyn PersistentLayer>> = {
-            let layers = self.layers.read().unwrap();
+            let layers = self.layers.read().await;
            let mut candidates = Vec::new();
            for hist_layer in layers.iter_historic_layers() {
                if hist_layer.is_remote_layer() {
--- a/pageserver/src/tenant/timeline/walreceiver.rs
+++ b/pageserver/src/tenant/timeline/walreceiver.rs
@@ -25,6 +25,7 @@ mod walreceiver_connection;
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::task_mgr::{self, TaskKind, WALRECEIVER_RUNTIME};
 use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::tenant::timeline::walreceiver::connection_manager::{
    connection_manager_loop_step, ConnectionManagerState,
 };
@@ -85,7 +86,8 @@ impl WalReceiver {
            &format!("walreceiver for timeline {tenant_id}/{timeline_id}"),
            false,
            async move {
-                info!("WAL receiver manager started, connecting to broker");
+                debug_assert_current_span_has_tenant_and_timeline_id();
                debug!("WAL receiver manager started, connecting to broker");
                let mut connection_manager_state = ConnectionManagerState::new(
                    timeline,
                    conf,
@@ -93,7 +95,7 @@ impl WalReceiver {
                loop {
                    select! {
                        _ = task_mgr::shutdown_watcher() => {
-                            info!("WAL receiver shutdown requested, shutting down");
+                            trace!("WAL receiver shutdown requested, shutting down");
                            break;
                        },
                        loop_step_result = connection_manager_loop_step(
@@ -104,7 +106,7 @@ impl WalReceiver {
                        ) => match loop_step_result {
                            ControlFlow::Continue(()) => continue,
                            ControlFlow::Break(()) => {
-                                info!("Connection manager loop ended, shutting down");
+                                trace!("Connection manager loop ended, shutting down");
                                break;
                            }
                        },
@@ -115,7 +117,7 @@ impl WalReceiver {
                *loop_status.write().unwrap() = None;
                Ok(())
            }
-            .instrument(info_span!(parent: None, "wal_connection_manager", tenant = %tenant_id, timeline = %timeline_id))
+            .instrument(info_span!(parent: None, "wal_connection_manager", tenant_id = %tenant_id, timeline_id = %timeline_id))
        );
        Self {
@@ -198,29 +200,19 @@ impl<E: Clone> TaskHandle<E> {
                TaskEvent::End(match self.join_handle.as_mut() {
                    Some(jh) => {
                        if !jh.is_finished() {
-                            // Barring any implementation errors in this module, we can
+                            // See: https://github.com/neondatabase/neon/issues/2885
-                            // only arrive here while the task that executes the future
+                            trace!("sender is dropped while join handle is still alive");
                            // passed to `Self::spawn()` is still executing. Cf. the comment
                            // in Self::spawn().
                            //
                            // This was logging at warning level in earlier versions, presumably
                            // to leave some breadcrumbs in case we had an implementation
                            // error that would make us get stuck in `jh.await`.
                            //
                            // There hasn't been such a bug so far.
                            // But in a busy system, e.g., during pageserver restart,
                            // we arrive here often enough that the warning-level logs
                            // became a distraction.
                            // So, tone them down to info-level.
                            //
                            // XXX: rewrite this module to eliminate the race condition.
                            info!("sender is dropped while join handle is still alive");
                        }
-                        let res = jh
+                        let res = match jh.await {
-                            .await
+                            Ok(res) => res,
-                            .map_err(|e| anyhow::anyhow!("Failed to join task: {e}"))
+                            Err(je) if je.is_cancelled() => unreachable!("not used"),
-                            .and_then(|x| x);
+                            Err(je) if je.is_panic() => {
                                // already logged
                                Ok(())
                            }
                            Err(je) => Err(anyhow::Error::new(je).context("join walreceiver task")),
                        };
                        // For cancellation-safety, drop join_handle only after successful .await.
                        self.join_handle = None;
@@ -243,12 +235,12 @@ impl<E: Clone> TaskHandle<E> {
            match jh.await {
                Ok(Ok(())) => debug!("Shutdown success"),
                Ok(Err(e)) => error!("Shutdown task error: {e:?}"),
-                Err(join_error) => {
+                Err(je) if je.is_cancelled() => unreachable!("not used"),
-                    if join_error.is_cancelled() {
+                Err(je) if je.is_panic() => {
-                        error!("Shutdown task was cancelled");
+                    // already logged
-                    } else {
+                }
-                        error!("Shutdown task join error: {join_error}")
+                Err(je) => {
-                    }
+                    error!("Shutdown task join error: {je}")
                }
            }
        }
--- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
@@ -18,7 +18,7 @@ use crate::metrics::{
    WALRECEIVER_CANDIDATES_REMOVED, WALRECEIVER_SWITCHES,
 };
 use crate::task_mgr::TaskKind;
-use crate::tenant::Timeline;
+use crate::tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline};
 use anyhow::Context;
 use chrono::{NaiveDateTime, Utc};
 use pageserver_api::models::TimelineState;
@@ -55,8 +55,11 @@ pub(super) async fn connection_manager_loop_step(
        .await
    {
        Ok(()) => {}
-        Err(_) => {
+        Err(new_state) => {
-            info!("Timeline dropped state updates sender before becoming active, stopping wal connection manager loop");
+            debug!(
                ?new_state,
                "state changed, stopping wal connection manager loop"
            );
            return ControlFlow::Break(());
        }
    }
@@ -79,7 +82,7 @@ pub(super) async fn connection_manager_loop_step(
    // with other streams on this client (other connection managers). When
    // object goes out of scope, stream finishes in drop() automatically.
    let mut broker_subscription = subscribe_for_timeline_updates(broker_client, id).await;
-    info!("Subscribed for broker timeline updates");
+    debug!("Subscribed for broker timeline updates");
    loop {
        let time_until_next_retry = connection_manager_state.time_until_next_retry();
@@ -150,13 +153,13 @@ pub(super) async fn connection_manager_loop_step(
                            match new_state {
                                // we're already active as walreceiver, no need to reactivate
                                TimelineState::Active => continue,
-                                TimelineState::Broken | TimelineState::Stopping => {
+                                TimelineState::Broken { .. } | TimelineState::Stopping => {
-                                    info!("timeline entered terminal state {new_state:?}, stopping wal connection manager loop");
+                                    debug!("timeline entered terminal state {new_state:?}, stopping wal connection manager loop");
                                    return ControlFlow::Break(());
                                }
                                TimelineState::Loading => {
                                    warn!("timeline transitioned back to Loading state, that should not happen");
-                                    return ControlFlow::Continue(new_state);
+                                    return ControlFlow::Continue(());
                                }
                            }
                        }
@@ -164,12 +167,11 @@ pub(super) async fn connection_manager_loop_step(
                    }
                }
            } => match new_event {
-                ControlFlow::Continue(new_state) => {
+                ControlFlow::Continue(()) => {
                    info!("observed timeline state change, new state is {new_state:?}");
                    return ControlFlow::Continue(());
                }
                ControlFlow::Break(()) => {
-                    info!("Timeline dropped state updates sender, stopping wal connection manager loop");
+                    debug!("Timeline is no longer active, stopping wal connection manager loop");
                    return ControlFlow::Break(());
                }
            },
@@ -390,7 +392,6 @@ impl ConnectionManagerState {
        self.drop_old_connection(true).await;
        let id = self.id;
        let node_id = new_sk.safekeeper_id;
        let connect_timeout = self.conf.wal_connect_timeout;
        let timeline = Arc::clone(&self.timeline);
@@ -398,9 +399,13 @@ impl ConnectionManagerState {
            TaskKind::WalReceiverConnectionHandler,
            DownloadBehavior::Download,
        );
        let span = info_span!("connection", %node_id);
        let connection_handle = TaskHandle::spawn(move |events_sender, cancellation| {
            async move {
-                super::walreceiver_connection::handle_walreceiver_connection(
+                debug_assert_current_span_has_tenant_and_timeline_id();
                let res = super::walreceiver_connection::handle_walreceiver_connection(
                    timeline,
                    new_sk.wal_source_connconf,
                    events_sender,
@@ -409,12 +414,23 @@ impl ConnectionManagerState {
                    ctx,
                    node_id,
                )
-                .await
+                .await;
-                .context("walreceiver connection handling failure")
+
                match res {
                    Ok(()) => Ok(()),
                    Err(e) => {
                        use super::walreceiver_connection::ExpectedError;
                        if e.is_expected() {
                            info!("walreceiver connection handling ended: {e:#}");
                            Ok(())
                        } else {
                            // give out an error to have task_mgr give it a really verbose logging
                            Err(e).context("walreceiver connection handling failure")
                        }
                    }
                }
            }
-            .instrument(
+            .instrument(span)
                info_span!("walreceiver_connection", tenant_id = %id.tenant_id, timeline_id = %id.timeline_id, %node_id),
            )
        });
        let now = Utc::now().naive_utc();
@@ -1305,10 +1321,11 @@ mod tests {
    const DUMMY_SAFEKEEPER_HOST: &str = "safekeeper_connstr";
-    async fn dummy_state(harness: &TenantHarness<'_>) -> ConnectionManagerState {
+    async fn dummy_state(harness: &TenantHarness) -> ConnectionManagerState {
        let (tenant, ctx) = harness.load().await;
        let timeline = tenant
-            .create_test_timeline(TIMELINE_ID, Lsn(0), crate::DEFAULT_PG_VERSION, &ctx)
+            .create_test_timeline(TIMELINE_ID, Lsn(0x8), crate::DEFAULT_PG_VERSION, &ctx)
            .await
            .expect("Failed to create an empty timeline for dummy wal connection manager");
        ConnectionManagerState {
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -21,16 +21,16 @@ use postgres_types::PgLsn;
 use tokio::{select, sync::watch, time};
 use tokio_postgres::{replication::ReplicationStream, Client};
 use tokio_util::sync::CancellationToken;
-use tracing::{debug, error, info, trace, warn};
+use tracing::{debug, error, info, trace, warn, Instrument};
 use super::TaskStateUpdate;
 use crate::metrics::LIVE_CONNECTIONS_COUNT;
 use crate::{context::RequestContext, metrics::WALRECEIVER_STARTED_CONNECTIONS};
 use crate::{
    context::RequestContext,
    metrics::{LIVE_CONNECTIONS_COUNT, WALRECEIVER_STARTED_CONNECTIONS},
    task_mgr,
    task_mgr::TaskKind,
    task_mgr::WALRECEIVER_RUNTIME,
-    tenant::{Timeline, WalReceiverInfo},
+    tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline, WalReceiverInfo},
    walingest::WalIngest,
    walrecord::DecodedWALRecord,
 };
@@ -81,13 +81,8 @@ pub(super) async fn handle_walreceiver_connection(
        config.application_name("pageserver");
        config.replication_mode(tokio_postgres::config::ReplicationMode::Physical);
        match time::timeout(connect_timeout, config.connect(postgres::NoTls)).await {
-            Ok(Ok(client_and_conn)) => client_and_conn,
+            Ok(client_and_conn) => client_and_conn?,
-            Ok(Err(conn_err)) => {
+            Err(_elapsed) => {
                let expected_error = ignore_expected_errors(conn_err)?;
                info!("DB connection stream finished: {expected_error}");
                return Ok(());
            }
            Err(_) => {
                // Connecting to a safekeeper node can time out, for many
                // reasons that the pageserver cannot control.
                // Do not produce an error, but make it visible that timeouts happen by logging the `event`.
@@ -97,7 +92,7 @@ pub(super) async fn handle_walreceiver_connection(
        }
    };
-    info!("connected!");
+    debug!("connected!");
    let mut connection_status = WalConnectionStatus {
        is_connected: true,
        has_processed_wal: false,
@@ -127,20 +122,25 @@ pub(super) async fn handle_walreceiver_connection(
        "walreceiver connection",
        false,
        async move {
            debug_assert_current_span_has_tenant_and_timeline_id();
            select! {
                connection_result = connection => match connection_result {
-                    Ok(()) => info!("Walreceiver db connection closed"),
+                    Ok(()) => debug!("Walreceiver db connection closed"),
                    Err(connection_error) => {
-                        if let Err(e) = ignore_expected_errors(connection_error) {
+                        if connection_error.is_expected() {
-                            warn!("Connection aborted: {e:#}")
+                            // silence, because most likely we've already exited the outer call
                            // with a similar error.
                        } else {
                            warn!("Connection aborted: {connection_error:#}")
                        }
                    }
                },
-                // Future: replace connection_cancellation with connection_ctx cancellation
+                _ = connection_cancellation.cancelled() => debug!("Connection cancelled"),
                _ = connection_cancellation.cancelled() => info!("Connection cancelled"),
            }
            Ok(())
-        },
+        }
        .instrument(tracing::info_span!("poller")),
    );
    // Immediately increment the gauge, then create a job to decrement it on task exit.
@@ -203,20 +203,13 @@ pub(super) async fn handle_walreceiver_connection(
    while let Some(replication_message) = {
        select! {
            _ = cancellation.cancelled() => {
-                info!("walreceiver interrupted");
+                debug!("walreceiver interrupted");
                None
            }
            replication_message = physical_stream.next() => replication_message,
        }
    } {
-        let replication_message = match replication_message {
+        let replication_message = replication_message?;
            Ok(message) => message,
            Err(replication_error) => {
                let expected_error = ignore_expected_errors(replication_error)?;
                info!("Replication stream finished: {expected_error}");
                return Ok(());
            }
        };
        let now = Utc::now().naive_utc();
        let last_rec_lsn_before_msg = last_rec_lsn;
@@ -261,8 +254,6 @@ pub(super) async fn handle_walreceiver_connection(
                    let mut decoded = DecodedWALRecord::default();
                    let mut modification = timeline.begin_modification(endlsn);
                    while let Some((lsn, recdata)) = waldecoder.poll_decode()? {
                        // let _enter = info_span!("processing record", lsn = %lsn).entered();
                        // It is important to deal with the aligned records as lsn in getPage@LSN is
                        // aligned and can be several bytes bigger. Without this alignment we are
                        // at risk of hitting a deadlock.
@@ -313,12 +304,15 @@ pub(super) async fn handle_walreceiver_connection(
            }
        }
-        timeline.check_checkpoint_distance().with_context(|| {
+        timeline
-            format!(
+            .check_checkpoint_distance()
-                "Failed to check checkpoint distance for timeline {}",
+            .await
-                timeline.timeline_id
+            .with_context(|| {
-            )
+                format!(
-        })?;
+                    "Failed to check checkpoint distance for timeline {}",
                    timeline.timeline_id
                )
            })?;
        if let Some(last_lsn) = status_update {
            let timeline_remote_consistent_lsn =
@@ -421,31 +415,50 @@ async fn identify_system(client: &mut Client) -> anyhow::Result<IdentifySystem>
    }
 }
-/// We don't want to report connectivity problems as real errors towards connection manager because
+/// Trait to avoid reporting walreceiver-specific expected or "normal" or "ok" errors.
-/// 1. they happen frequently enough to make server logs hard to read and
+pub(super) trait ExpectedError {
-/// 2. the connection manager can retry other safekeeper.
+    /// Test if this error is an ok error.
-///
+    ///
-/// If this function returns `Ok(pg_error)`, it's such an error.
+    /// We don't want to report connectivity problems as real errors towards connection manager because
-/// The caller should log it at info level and then report to connection manager that we're done handling this connection.
+    /// 1. they happen frequently enough to make server logs hard to read and
-/// Connection manager will then handle reconnections.
+    /// 2. the connection manager can retry other safekeeper.
-///
+    ///
-/// If this function returns an `Err()`, the caller can bubble it up using `?`.
+    /// If this function returns `true`, it's such an error.
-/// The connection manager will log the error at ERROR level.
+    /// The caller should log it at info level and then report to connection manager that we're done handling this connection.
-fn ignore_expected_errors(pg_error: postgres::Error) -> anyhow::Result<postgres::Error> {
+    /// Connection manager will then handle reconnections.
-    if pg_error.is_closed()
+    ///
-        || pg_error
+    /// If this function returns `false`, the error should be propagated and the connection manager
-            .source()
+    /// will log the error at ERROR level.
-            .and_then(|source| source.downcast_ref::<std::io::Error>())
+    fn is_expected(&self) -> bool;
-            .map(is_expected_io_error)
+}
-            .unwrap_or(false)
+
-    {
+impl ExpectedError for postgres::Error {
-        return Ok(pg_error);
+    fn is_expected(&self) -> bool {
-    } else if let Some(db_error) = pg_error.as_db_error() {
+        self.is_closed()
-        if db_error.code() == &SqlState::SUCCESSFUL_COMPLETION
+            || self
-            && db_error.message().contains("ending streaming")
+                .source()
-        {
+                .and_then(|source| source.downcast_ref::<std::io::Error>())
-            return Ok(pg_error);
+                .map(is_expected_io_error)
-        }
+                .unwrap_or(false)
-    }
+            || self
-    Err(pg_error).context("connection error")
+                .as_db_error()
                .filter(|db_error| {
                    db_error.code() == &SqlState::SUCCESSFUL_COMPLETION
                        && db_error.message().contains("ending streaming")
                })
                .is_some()
    }
 }
 impl ExpectedError for anyhow::Error {
    fn is_expected(&self) -> bool {
        let head = self.downcast_ref::<postgres::Error>();
        let tail = self
            .chain()
            .filter_map(|e| e.downcast_ref::<postgres::Error>());
        // check if self or any of the chained/sourced errors are expected
        head.into_iter().chain(tail).any(|e| e.is_expected())
    }
 }
--- a/pageserver/src/tenant/upload_queue.rs
+++ b/pageserver/src/tenant/upload_queue.rs
@@ -76,6 +76,12 @@ pub(crate) struct UploadQueueInitialized {
    pub(crate) queued_operations: VecDeque<UploadOp>,
 }
 impl UploadQueueInitialized {
    /// Returns `true` when there are neither in-progress tasks nor queued operations.
    pub(super) fn no_pending_work(&self) -> bool {
        let nothing_in_progress = self.inprogress_tasks.is_empty();
        let nothing_queued = self.queued_operations.is_empty();
        nothing_in_progress && nothing_queued
    }
 }
 #[derive(Clone, Copy)]
 pub(super) enum SetDeletedFlagProgress {
    NotRunning,
@@ -84,9 +90,7 @@ pub(super) enum SetDeletedFlagProgress {
 }
 pub(super) struct UploadQueueStopped {
-    pub(super) latest_files: HashMap<LayerFileName, LayerFileMetadata>,
+    pub(super) upload_queue_for_deletion: UploadQueueInitialized,
    pub(super) last_uploaded_consistent_lsn: Lsn,
    pub(super) latest_metadata: TimelineMetadata,
    pub(super) deleted_at: SetDeletedFlagProgress,
 }
@@ -187,6 +191,15 @@ impl UploadQueue {
            UploadQueue::Initialized(x) => Ok(x),
        }
    }
    /// Returns a mutable reference to the inner state of a `Stopped` queue.
    ///
    /// # Errors
    ///
    /// Fails with "queue is in state ..." when the queue is `Initialized` or
    /// `Uninitialized`.
    pub(crate) fn stopped_mut(&mut self) -> anyhow::Result<&mut UploadQueueStopped> {
        match self {
            UploadQueue::Initialized(_) | UploadQueue::Uninitialized => {
                anyhow::bail!("queue is in state {}", self.as_str())
            }
            UploadQueue::Stopped(stopped) => Ok(stopped),
        }
    }
 }
 /// An in-progress upload or delete task.
@@ -199,6 +212,13 @@ pub(crate) struct UploadTask {
    pub(crate) op: UploadOp,
 }
 /// Payload of `UploadOp::Delete`: describes one layer file deletion.
 #[derive(Debug)]
 pub(crate) struct Delete {
    /// Kind of remote file the operation applies to.
    pub(crate) file_kind: RemoteOpFileKind,
    /// Name of the layer file to delete; its `file_name()` is what `Display` prints.
    pub(crate) layer_file_name: LayerFileName,
    /// NOTE(review): presumably set when the deletion was queued as part of
    /// deleting the whole timeline -- confirm against the scheduling code.
    pub(crate) scheduled_from_timeline_delete: bool,
 }
 #[derive(Debug)]
 pub(crate) enum UploadOp {
    /// Upload a layer file
@@ -207,8 +227,8 @@ pub(crate) enum UploadOp {
    /// Upload the metadata file
    UploadMetadata(IndexPart, Lsn),
-    /// Delete a file.
+    /// Delete a layer file
-    Delete(RemoteOpFileKind, LayerFileName),
+    Delete(Delete),
    /// Barrier. When the barrier operation is reached,
    Barrier(tokio::sync::watch::Sender<()>),
@@ -226,7 +246,12 @@ impl std::fmt::Display for UploadOp {
                )
            }
            UploadOp::UploadMetadata(_, lsn) => write!(f, "UploadMetadata(lsn: {})", lsn),
-            UploadOp::Delete(_, path) => write!(f, "Delete({})", path.file_name()),
+            UploadOp::Delete(delete) => write!(
                f,
                "Delete(path: {}, scheduled_from_timeline_delete: {})",
                delete.layer_file_name.file_name(),
                delete.scheduled_from_timeline_delete
            ),
            UploadOp::Barrier(_) => write!(f, "Barrier"),
        }
    }
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -25,7 +25,7 @@ use postgres_ffi::v14::nonrelfile_utils::clogpage_precedes;
 use postgres_ffi::v14::nonrelfile_utils::slru_may_delete_clogsegment;
 use postgres_ffi::{fsm_logical_to_physical, page_is_new, page_set_lsn};
-use anyhow::Result;
+use anyhow::{Context, Result};
 use bytes::{Buf, Bytes, BytesMut};
 use tracing::*;
@@ -333,7 +333,7 @@ impl<'a> WalIngest<'a> {
        // Now that this record has been fully handled, including updating the
        // checkpoint data, let the repository know that it is up-to-date to this LSN
-        modification.commit()?;
+        modification.commit().await?;
        Ok(())
    }
@@ -1082,7 +1082,10 @@ impl<'a> WalIngest<'a> {
            .await?
        {
            // create it with 0 size initially, the logic below will extend it
-            modification.put_rel_creation(rel, 0, ctx).await?;
+            modification
                .put_rel_creation(rel, 0, ctx)
                .await
                .context("Relation Error")?;
            0
        } else {
            self.timeline.get_rel_size(rel, last_lsn, true, ctx).await?
@@ -1171,7 +1174,6 @@ impl<'a> WalIngest<'a> {
 #[cfg(test)]
 mod tests {
    use super::*;
    use crate::pgdatadir_mapping::create_test_timeline;
    use crate::tenant::harness::*;
    use crate::tenant::Timeline;
    use postgres_ffi::v14::xlog_utils::SIZEOF_CHECKPOINT;
@@ -1200,7 +1202,7 @@ mod tests {
        let mut m = tline.begin_modification(Lsn(0x10));
        m.put_checkpoint(ZERO_CHECKPOINT.clone())?;
        m.put_relmap_file(0, 111, Bytes::from(""), ctx).await?; // dummy relmapper file
-        m.commit()?;
+        m.commit().await?;
        let walingest = WalIngest::new(tline, Lsn(0x10), ctx).await?;
        Ok(walingest)
@@ -1209,7 +1211,9 @@ mod tests {
    #[tokio::test]
    async fn test_relsize() -> Result<()> {
        let (tenant, ctx) = TenantHarness::create("test_relsize")?.load().await;
-        let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION, &ctx)?;
+        let tline = tenant
            .create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx)
            .await?;
        let mut walingest = init_walingest_test(&tline, &ctx).await?;
        let mut m = tline.begin_modification(Lsn(0x20));
@@ -1217,22 +1221,22 @@ mod tests {
        walingest
            .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"), &ctx)
            .await?;
-        m.commit()?;
+        m.commit().await?;
        let mut m = tline.begin_modification(Lsn(0x30));
        walingest
            .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 3"), &ctx)
            .await?;
-        m.commit()?;
+        m.commit().await?;
        let mut m = tline.begin_modification(Lsn(0x40));
        walingest
            .put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1 at 4"), &ctx)
            .await?;
-        m.commit()?;
+        m.commit().await?;
        let mut m = tline.begin_modification(Lsn(0x50));
        walingest
            .put_rel_page_image(&mut m, TESTREL_A, 2, TEST_IMG("foo blk 2 at 5"), &ctx)
            .await?;
-        m.commit()?;
+        m.commit().await?;
        assert_current_logical_size(&tline, Lsn(0x50));
@@ -1318,7 +1322,7 @@ mod tests {
        walingest
            .put_rel_truncation(&mut m, TESTREL_A, 2, &ctx)
            .await?;
-        m.commit()?;
+        m.commit().await?;
        assert_current_logical_size(&tline, Lsn(0x60));
        // Check reported size and contents after truncation
@@ -1360,7 +1364,7 @@ mod tests {
        walingest
            .put_rel_truncation(&mut m, TESTREL_A, 0, &ctx)
            .await?;
-        m.commit()?;
+        m.commit().await?;
        assert_eq!(
            tline
                .get_rel_size(TESTREL_A, Lsn(0x68), false, &ctx)
@@ -1373,7 +1377,7 @@ mod tests {
        walingest
            .put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1"), &ctx)
            .await?;
-        m.commit()?;
+        m.commit().await?;
        assert_eq!(
            tline
                .get_rel_size(TESTREL_A, Lsn(0x70), false, &ctx)
@@ -1398,7 +1402,7 @@ mod tests {
        walingest
            .put_rel_page_image(&mut m, TESTREL_A, 1500, TEST_IMG("foo blk 1500"), &ctx)
            .await?;
-        m.commit()?;
+        m.commit().await?;
        assert_eq!(
            tline
                .get_rel_size(TESTREL_A, Lsn(0x80), false, &ctx)
@@ -1428,14 +1432,16 @@ mod tests {
    #[tokio::test]
    async fn test_drop_extend() -> Result<()> {
        let (tenant, ctx) = TenantHarness::create("test_drop_extend")?.load().await;
-        let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION, &ctx)?;
+        let tline = tenant
            .create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx)
            .await?;
        let mut walingest = init_walingest_test(&tline, &ctx).await?;
        let mut m = tline.begin_modification(Lsn(0x20));
        walingest
            .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"), &ctx)
            .await?;
-        m.commit()?;
+        m.commit().await?;
        // Check that rel exists and size is correct
        assert_eq!(
@@ -1454,7 +1460,7 @@ mod tests {
        // Drop rel
        let mut m = tline.begin_modification(Lsn(0x30));
        walingest.put_rel_drop(&mut m, TESTREL_A, &ctx).await?;
-        m.commit()?;
+        m.commit().await?;
        // Check that rel is not visible anymore
        assert_eq!(
@@ -1472,7 +1478,7 @@ mod tests {
        walingest
            .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 4"), &ctx)
            .await?;
-        m.commit()?;
+        m.commit().await?;
        // Check that rel exists and size is correct
        assert_eq!(
@@ -1497,7 +1503,9 @@ mod tests {
    #[tokio::test]
    async fn test_truncate_extend() -> Result<()> {
        let (tenant, ctx) = TenantHarness::create("test_truncate_extend")?.load().await;
-        let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION, &ctx)?;
+        let tline = tenant
            .create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx)
            .await?;
        let mut walingest = init_walingest_test(&tline, &ctx).await?;
        // Create a 20 MB relation (the size is arbitrary)
@@ -1509,7 +1517,7 @@ mod tests {
                .put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data), &ctx)
                .await?;
        }
-        m.commit()?;
+        m.commit().await?;
        // The relation was created at LSN 20, not visible at LSN 1 yet.
        assert_eq!(
@@ -1554,7 +1562,7 @@ mod tests {
        walingest
            .put_rel_truncation(&mut m, TESTREL_A, 1, &ctx)
            .await?;
-        m.commit()?;
+        m.commit().await?;
        // Check reported size and contents after truncation
        assert_eq!(
@@ -1603,7 +1611,7 @@ mod tests {
                .put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data), &ctx)
                .await?;
        }
-        m.commit()?;
+        m.commit().await?;
        assert_eq!(
            tline
@@ -1637,7 +1645,9 @@ mod tests {
    #[tokio::test]
    async fn test_large_rel() -> Result<()> {
        let (tenant, ctx) = TenantHarness::create("test_large_rel")?.load().await;
-        let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION, &ctx)?;
+        let tline = tenant
            .create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx)
            .await?;
        let mut walingest = init_walingest_test(&tline, &ctx).await?;
        let mut lsn = 0x10;
@@ -1648,7 +1658,7 @@ mod tests {
            walingest
                .put_rel_page_image(&mut m, TESTREL_A, blknum as BlockNumber, img, &ctx)
                .await?;
-            m.commit()?;
+            m.commit().await?;
        }
        assert_current_logical_size(&tline, Lsn(lsn));
@@ -1664,7 +1674,7 @@ mod tests {
        walingest
            .put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE, &ctx)
            .await?;
-        m.commit()?;
+        m.commit().await?;
        assert_eq!(
            tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?,
            RELSEG_SIZE
@@ -1677,7 +1687,7 @@ mod tests {
        walingest
            .put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE - 1, &ctx)
            .await?;
-        m.commit()?;
+        m.commit().await?;
        assert_eq!(
            tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?,
            RELSEG_SIZE - 1
@@ -1693,7 +1703,7 @@ mod tests {
            walingest
                .put_rel_truncation(&mut m, TESTREL_A, size as BlockNumber, &ctx)
                .await?;
-            m.commit()?;
+            m.commit().await?;
            assert_eq!(
                tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?,
                size as BlockNumber
--- a/pgxn/hnsw/Makefile
+++ b/pgxn/hnsw/Makefile
@@ -0,0 +1,26 @@
 EXTENSION = hnsw
 EXTVERSION = 0.1.0
 MODULE_big = hnsw
 DATA = $(wildcard *--*.sql)
 OBJS = hnsw.o hnswalg.o
 TESTS = $(wildcard test/sql/*.sql)
 REGRESS = $(patsubst test/sql/%.sql,%,$(TESTS))
 REGRESS_OPTS = --inputdir=test --load-extension=hnsw
 # For auto-vectorization:
 # - GCC (needs -ftree-vectorize OR -O3) - https://gcc.gnu.org/projects/tree-ssa/vectorization.html
 PG_CFLAGS += -O3
 PG_CXXFLAGS +=  -O3 -std=c++11
 PG_LDFLAGS += -lstdc++
 all: $(EXTENSION)--$(EXTVERSION).sql
 PG_CONFIG ?= pg_config
 PGXS := $(shell $(PG_CONFIG) --pgxs)
 include $(PGXS)
 dist:
 	mkdir -p dist
 	git archive --format zip --prefix=$(EXTENSION)-$(EXTVERSION)/ --output dist/$(EXTENSION)-$(EXTVERSION).zip master
--- a/pgxn/hnsw/README.md
+++ b/pgxn/hnsw/README.md
@@ -0,0 +1,25 @@
 # Revisiting the Inverted Indices for Billion-Scale Approximate Nearest Neighbors
 This ANN extension of Postgres is based
 on [ivf-hnsw](https://github.com/dbaranchuk/ivf-hnsw.git) implementation of [HNSW](https://www.pinecone.io/learn/hnsw),
 the code for the current state-of-the-art billion-scale nearest neighbor search system presented in the paper:
 [Revisiting the Inverted Indices for Billion-Scale Approximate Nearest Neighbors](http://openaccess.thecvf.com/content_ECCV_2018/html/Dmitry_Baranchuk_Revisiting_the_Inverted_ECCV_2018_paper.html),
 <br>
 Dmitry Baranchuk, Artem Babenko, Yury Malkov
 # Postgres extension
 The HNSW index is held in memory (built on demand) and its maximal size is limited
 by the `maxelements` index parameter. Another required parameter is the number of dimensions, `dims` (if it is not specified in the column type).
 The optional parameters `efconstruction` and `efsearch` specify the number of neighbors which are considered during index construction and search respectively (they correspond to the `efConstruction` and `efSearch` parameters
 described in the article).
 # Example of usage:
 ```
 create extension hnsw;
 create table embeddings(id integer primary key, payload real[]);
 create index on embeddings using hnsw(payload) with (maxelements=1000000, dims=100, m=32);
 select id from embeddings order by payload <-> array[1.0, 2.0,...] limit 100;
 ```
--- a/pgxn/hnsw/hnsw--0.1.0.sql
+++ b/pgxn/hnsw/hnsw--0.1.0.sql
@@ -0,0 +1,29 @@
 -- complain if script is sourced in psql, rather than via CREATE EXTENSION
 \echo Use "CREATE EXTENSION hnsw" to load this file. \quit
 -- functions
 CREATE FUNCTION l2_distance(real[], real[]) RETURNS real
 	AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;
 -- operators
 CREATE OPERATOR <-> (
 	LEFTARG = real[], RIGHTARG = real[], PROCEDURE = l2_distance,
 	COMMUTATOR = '<->'
 );
 -- access method
 CREATE FUNCTION hnsw_handler(internal) RETURNS index_am_handler
 	AS 'MODULE_PATHNAME' LANGUAGE C;
 CREATE ACCESS METHOD hnsw TYPE INDEX HANDLER hnsw_handler;
 COMMENT ON ACCESS METHOD hnsw IS 'hnsw index access method';
 -- opclasses
 CREATE OPERATOR CLASS knn_ops
 	DEFAULT FOR TYPE real[] USING hnsw AS
 	OPERATOR 1 <-> (real[], real[]) FOR ORDER BY float_ops;
--- a/pgxn/hnsw/hnsw.c
+++ b/pgxn/hnsw/hnsw.c
@@ -0,0 +1,551 @@
 #include "postgres.h"
 #include "access/amapi.h"
 #include "access/generic_xlog.h"
 #include "access/relation.h"
 #include "access/reloptions.h"
 #include "access/tableam.h"
 #include "catalog/index.h"
 #include "commands/vacuum.h"
 #include "nodes/execnodes.h"
 #include "storage/bufmgr.h"
 #include "utils/guc.h"
 #include "utils/selfuncs.h"
 #include <math.h>
 #include <float.h>
 #include "hnsw.h"
 PG_MODULE_MAGIC;
 /*
  * Parsed reloptions of an HNSW index. The field offsets are referenced by the
  * parse table in hnsw_options().
  */
 typedef struct {
 	int32 vl_len_;		/* varlena header (do not touch directly!) */
 	int dims;			/* "dims" reloption: number of dimensions */
 	int maxelements;	/* "maxelements" reloption: maximal number of elements */
 	int efConstruction;	/* "efconstruction" reloption */
 	int efSearch;		/* "efsearch" reloption */
 	int M;				/* "m" reloption: number of neighbors of each vertex */
 } HnswOptions;
 static relopt_kind hnsw_relopt_kind;
 /*
  * Per-scan state stored in IndexScanDesc->opaque.
  */
 typedef struct {
 	HierarchicalNSW* hnsw;	/* index graph obtained from hnsw_get_index() */
 	size_t curr;			/* position of the next entry of 'results' to return */
 	size_t n_results;		/* number of entries in 'results' */
 	ItemPointer results;	/* palloc'd array of heap TIDs found by the search, or NULL */
 } HnswScanOpaqueData;
 typedef HnswScanOpaqueData* HnswScanOpaque;
 /*
  * Entry of the backend-local hash table (hnsw_indexes) that maps an index
  * relation OID to the HNSW structure already mapped into this backend.
  */
 typedef struct {
 	Oid relid;				/* hash key: OID of the index relation */
 	uint32 status;			/* entry status field required by lib/simplehash.h */
 	HierarchicalNSW* hnsw;	/* address of the mapped HNSW structure */
 } HnswHashEntry;
 #define SH_PREFIX			 hnsw_index
 #define SH_ELEMENT_TYPE		 HnswHashEntry
 #define SH_KEY_TYPE			 Oid
 #define SH_KEY				 relid
 #define SH_STORE_HASH
 #define SH_GET_HASH(tb, a)	 ((a)->relid)
 #define SH_HASH_KEY(tb, key) (key)
 #define SH_EQUAL(tb, a, b)	((a) == (b))
 #define SH_SCOPE			static inline
 #define SH_DEFINE
 #define SH_DECLARE
 #include "lib/simplehash.h"
 #define INDEX_HASH_SIZE     11
 #define DEFAULT_EF_SEARCH   64
 PGDLLEXPORT void _PG_init(void);
 static hnsw_index_hash *hnsw_indexes;
 /*
  * Module load hook: register the HNSW reloption kind with its integer
  * options and create the backend-local table of already mapped indexes.
  */
 void
 _PG_init(void)
 {
 	/* name, description, default and minimum of every integer reloption */
 	static const struct
 	{
 		const char *name;
 		const char *desc;
 		int			default_val;
 		int			min_val;
 	}			int_options[] = {
 		{"dims", "Number of dimensions", 0, 0},
 		{"maxelements", "Maximal number of elements", 0, 0},
 		{"m", "Number of neighbors of each vertex", 100, 0},
 		{"efconstruction", "Number of inspected neighbors during index construction", 16, 1},
 		{"efsearch", "Number of inspected neighbors during index search", 64, 1}
 	};
 	hnsw_relopt_kind = add_reloption_kind();
 	/* All options share the same upper bound and lock level */
 	for (size_t i = 0; i < lengthof(int_options); i++)
 	{
 		add_int_reloption(hnsw_relopt_kind,
 						  int_options[i].name, int_options[i].desc,
 						  int_options[i].default_val, int_options[i].min_val,
 						  INT_MAX, AccessExclusiveLock);
 	}
 	hnsw_indexes = hnsw_index_create(TopMemoryContext, INDEX_HASH_SIZE, NULL);
 }
 /*
  * Per-tuple callback of table_index_build_scan(): add one heap tuple to the
  * HNSW structure passed in 'state'.
  *
  * Tuples whose indexed column is NULL are skipped. An error is raised when
  * the array has the wrong number of elements, contains NULL elements, or
  * cannot be added to the index.
  */
 static void
 hnsw_build_callback(Relation index, ItemPointer tid, Datum *values,
 					bool *isnull, bool tupleIsAlive, void *state)
 {
 	HierarchicalNSW* hnsw = (HierarchicalNSW*) state;
 	ArrayType* array;
 	int n_items;
 	label_t label = 0;
 	/* Skip nulls */
 	if (isnull[0])
 		return;
 	array = DatumGetArrayTypeP(values[0]);
 	n_items = ArrayGetNItems(ARR_NDIM(array), ARR_DIMS(array));
 	if (n_items != hnsw_dimensions(hnsw))
 	{
 		elog(ERROR, "Wrong number of dimensions: %d instead of %d expected",
 			 n_items, hnsw_dimensions(hnsw));
 	}
 	/*
 	 * NULL elements are counted by ArrayGetNItems() but are not stored in the
 	 * data area, so reading n_items coordinates would run past the array.
 	 */
 	if (array_contains_nulls(array))
 	{
 		ereport(ERROR,
 				(errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
 				 errmsg("array must not contain nulls")));
 	}
 	/* The heap TID is used as the point label; unused label bytes stay zero */
 	memcpy(&label, tid, sizeof(*tid));
 	/* Report a failed insertion like hnsw_insert() does instead of dropping the row */
 	if (!hnsw_add_point(hnsw, (coord_t*)ARR_DATA_PTR(array), label))
 		elog(ERROR, "HNSW index insert failed");
 }
 /*
  * Fill 'hnsw' from the heap: scan heapRel and pass every tuple to
  * hnsw_build_callback(). Only single-column indexes are expected.
  */
 static void
 hnsw_populate(HierarchicalNSW* hnsw, Relation indexRel, Relation heapRel)
 {
 	IndexInfo* info = BuildIndexInfo(indexRel);
 	void* callback_state = hnsw;
 	Assert(info->ii_NumIndexAttrs == 1);
 	table_index_build_scan(heapRel,
 						   indexRel,
 						   info,
 						   true,
 						   true,
 						   hnsw_build_callback,
 						   callback_state,
 						   NULL);
 }
 /*
  * Return the HNSW structure of the given index, mapping it into this backend
  * on first use.
  *
  * The backend-local hash table hnsw_indexes is consulted first. On a miss the
  * segment identified by a handle derived from the index OID is attached; if
  * it does not exist yet it is created, initialized from the index reloptions
  * and populated from heapRel. Raises an error when 'maxelements' or 'dims'
  * is not specified.
  *
  * NOTE(review): if hnsw_populate() raises an error, the segment has already
  * been created; it looks like a later attach would then see a partially
  * populated index -- confirm.
  */
 static HierarchicalNSW*
 hnsw_get_index(Relation indexRel, Relation heapRel)
 {
 	HierarchicalNSW* hnsw;
 	Oid indexoid = RelationGetRelid(indexRel);
 	HnswHashEntry* entry = hnsw_index_lookup(hnsw_indexes, indexoid);
 	if (entry == NULL)
 	{
 		size_t dims, maxelements;
 		size_t M;
 		size_t maxM;
 		size_t size_links_level0;
 		size_t size_data_per_element;
 		size_t data_size;
 		dsm_handle handle = indexoid << 1; /* make it even */
 		void* impl_private = NULL;
 		void* mapped_address = NULL;
 		Size  mapped_size = 0;
 		Size  shmem_size;
 		bool exists = true;
 		bool found;
 		HnswOptions *opts = (HnswOptions *) indexRel->rd_options;
 		if (opts == NULL || opts->maxelements == 0 || opts->dims == 0) {
 			elog(ERROR, "HNSW index requires 'maxelements' and 'dims' to be specified");
 		}
 		dims = opts->dims;
 		maxelements = opts->maxelements;
 		M = opts->M;
 		maxM = M * 2;
 		/* Per element: link list (count + maxM links), coordinates and label */
 		data_size = dims * sizeof(coord_t);
 		size_links_level0 = (maxM + 1) * sizeof(idx_t);
 		size_data_per_element = size_links_level0 + data_size + sizeof(label_t);
 		shmem_size =  hnsw_sizeof() + maxelements * size_data_per_element;
 		/* first try to attach to an existing index */
 		if (!dsm_impl_op(DSM_OP_ATTACH, handle, 0, &impl_private,
 						 &mapped_address, &mapped_size, DEBUG1))
 		{
 			/* index doesn't exist: try to create it */
 			if (!dsm_impl_op(DSM_OP_CREATE, handle, shmem_size, &impl_private,
 							 &mapped_address, &mapped_size, DEBUG1))
 			{
 				/* We can do it under shared lock, so some other backend may
 				 * try to initialize index. If create failed because the index was
 				 * already created by somebody else, then try to attach to it once again
 				 */
 				if (!dsm_impl_op(DSM_OP_ATTACH, handle, 0, &impl_private,
 								 &mapped_address, &mapped_size, ERROR))
 				{
 					return NULL;
 				}
 			}
 			else
 			{
 				/* this backend created the segment, so it has to fill it */
 				exists = false;
 			}
 		}
 		Assert(mapped_size == shmem_size);
 		hnsw = (HierarchicalNSW*)mapped_address;
 		if (!exists)
 		{
 			hnsw_init(hnsw, dims, maxelements, M, maxM, opts->efConstruction);
 			hnsw_populate(hnsw, indexRel, heapRel);
 		}
 		/* remember the mapping so that later calls take the fast path */
 		entry = hnsw_index_insert(hnsw_indexes, indexoid, &found);
 		Assert(!found);
 		entry->hnsw = hnsw;
 	}
 	else
 	{
 		hnsw = entry->hnsw;
 	}
 	return hnsw;
 }
 /*
  * Prepare for an index scan: allocate the scan descriptor and the per-scan
  * state, and look up the HNSW structure of the index.
  */
 static IndexScanDesc
 hnsw_beginscan(Relation index, int nkeys, int norderbys)
 {
 	IndexScanDesc desc = RelationGetIndexScan(index, nkeys, norderbys);
 	HnswScanOpaque state = (HnswScanOpaque) palloc(sizeof(HnswScanOpaqueData));
 	Relation heapRel = relation_open(index->rd_index->indrelid, NoLock);
 	/* the heap relation is only needed while the index is looked up */
 	state->hnsw = hnsw_get_index(index, heapRel);
 	relation_close(heapRel, NoLock);
 	/* no search has been run yet, so there are no buffered results */
 	state->results = NULL;
 	state->n_results = 0;
 	state->curr = 0;
 	desc->opaque = state;
 	return desc;
 }
 /*
  * Start or restart an index scan: drop the results buffered by a previous
  * search, rewind the position and install the new ORDER BY keys.
  */
 static void
 hnsw_rescan(IndexScanDesc scan, ScanKey keys, int nkeys, ScanKey orderbys, int norderbys)
 {
 	HnswScanOpaque state = (HnswScanOpaque) scan->opaque;
 	if (state->results != NULL)
 	{
 		pfree(state->results);
 		state->results = NULL;
 	}
 	state->curr = 0;
 	/* nothing to copy unless the caller supplied ordering keys */
 	if (orderbys == NULL || scan->numberOfOrderBys <= 0)
 		return;
 	memmove(scan->orderByData, orderbys, scan->numberOfOrderBys * sizeof(ScanKeyData));
 }
 /*
  * Fetch the next tuple in the given scan.
  *
  * On the first call after (re)starting the scan the search is executed with
  * the ORDER BY argument as the query vector and all found heap TIDs are
  * buffered in the scan state; each call then returns the next buffered TID.
  * Returns false when the query is NULL or the results are exhausted. Raises
  * an error when there is no ORDER BY key, when the query vector has the wrong
  * number of elements or contains NULL elements, or when the search fails.
  */
 static bool
 hnsw_gettuple(IndexScanDesc scan, ScanDirection dir)
 {
 	HnswScanOpaque so = (HnswScanOpaque) scan->opaque;
 	/*
 	 * Index can be used to scan backward, but Postgres doesn't support
 	 * backward scan on operators
 	 */
 	Assert(ScanDirectionIsForward(dir));
 	if (so->curr == 0)
 	{
 		Datum		value;
 		ArrayType*	array;
 		int         n_items;
 		size_t      n_results;
 		label_t*    results;
 		HnswOptions *opts = (HnswOptions *) scan->indexRelation->rd_options;
 		size_t      efSearch = opts ? opts->efSearch : DEFAULT_EF_SEARCH;
 		/* Safety check */
 		if (scan->orderByData == NULL)
 			elog(ERROR, "cannot scan HNSW index without order");
 		/* No items will match if null */
 		if (scan->orderByData->sk_flags & SK_ISNULL)
 			return false;
 		value = scan->orderByData->sk_argument;
 		array = DatumGetArrayTypeP(value);
 		n_items = ArrayGetNItems(ARR_NDIM(array), ARR_DIMS(array));
 		if (n_items != hnsw_dimensions(so->hnsw))
 		{
 			elog(ERROR, "Wrong number of dimensions: %d instead of %d expected",
 				 n_items, hnsw_dimensions(so->hnsw));
 		}
 		/*
 		 * NULL elements are counted by ArrayGetNItems() but are not stored in
 		 * the data area, so the search would read past the end of the array.
 		 */
 		if (array_contains_nulls(array))
 		{
 			ereport(ERROR,
 					(errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
 					 errmsg("array must not contain nulls")));
 		}
 		if (!hnsw_search(so->hnsw, (coord_t*)ARR_DATA_PTR(array), efSearch, &n_results, &results))
 			elog(ERROR, "HNSW index search failed");
 		/*
 		 * The position stays at 0 when a previous search found nothing, so this
 		 * branch can run again with a buffer still allocated: release it first.
 		 */
 		if (so->results)
 			pfree(so->results);
 		so->results = (ItemPointer)palloc(n_results*sizeof(ItemPointerData));
 		so->n_results = n_results;
 		/* A label holds the TID bytes; copy them element by element since the sizes differ */
 		for (size_t i = 0; i < n_results; i++)
 		{
 			memcpy(&so->results[i], &results[i], sizeof(so->results[i]));
 		}
 		/* the label array is released with free(), not pfree() */
 		free(results);
 	}
 	if (so->curr >= so->n_results)
 	{
 		return false;
 	}
 	else
 	{
 		scan->xs_heaptid = so->results[so->curr++];
 		scan->xs_recheckorderby = false;
 		return true;
 	}
 }
 /*
  * End a scan: free the buffered results (if any) and the per-scan state.
  */
 static void
 hnsw_endscan(IndexScanDesc scan)
 {
 	HnswScanOpaque state = (HnswScanOpaque) scan->opaque;
 	ItemPointer buffered = state->results;
 	if (buffered != NULL)
 		pfree(buffered);
 	pfree(state);
 	scan->opaque = NULL;
 }
 /*
  * Estimate the cost of an index scan.
  *
  * A path without ORDER BY operators gets the maximal cost so that it is never
  * chosen; otherwise the generic estimate is used, with the whole cost charged
  * as startup cost.
  */
 static void
 hnsw_costestimate(PlannerInfo *root, IndexPath *path, double loop_count,
 				 Cost *indexStartupCost, Cost *indexTotalCost,
 				 Selectivity *indexSelectivity, double *indexCorrelation,
 				 double *indexPages)
 {
 	GenericCosts generic;
 	if (path->indexorderbys != NULL)
 	{
 		MemSet(&generic, 0, sizeof(generic));
 		genericcostestimate(root, path, loop_count, &generic);
 		/* Startup cost and total cost are same */
 		*indexStartupCost = generic.indexTotalCost;
 		*indexTotalCost = generic.indexTotalCost;
 		*indexSelectivity = generic.indexSelectivity;
 		*indexCorrelation = generic.indexCorrelation;
 		*indexPages = generic.numIndexPages;
 	}
 	else
 	{
 		/* Never use index without order */
 		*indexStartupCost = DBL_MAX;
 		*indexTotalCost = DBL_MAX;
 		*indexSelectivity = 0;
 		*indexCorrelation = 0;
 		*indexPages = 0;
 	}
 }
 /*
  * Parse and validate the reloptions into an HnswOptions structure.
  */
 static bytea *
 hnsw_options(Datum reloptions, bool validate)
 {
 	/* maps every option name to the HnswOptions field that receives its value */
 	static const relopt_parse_elt option_layout[] = {
 		{"dims", RELOPT_TYPE_INT, offsetof(HnswOptions, dims)},
 		{"maxelements", RELOPT_TYPE_INT, offsetof(HnswOptions, maxelements)},
 		{"efconstruction", RELOPT_TYPE_INT, offsetof(HnswOptions, efConstruction)},
 		{"efsearch", RELOPT_TYPE_INT, offsetof(HnswOptions, efSearch)},
 		{"m", RELOPT_TYPE_INT, offsetof(HnswOptions, M)}
 	};
 	void *parsed = build_reloptions(reloptions, validate,
 									hnsw_relopt_kind,
 									sizeof(HnswOptions),
 									option_layout, lengthof(option_layout));
 	return (bytea *) parsed;
 }
 /*
  * Validate catalog entries for the specified operator class.
  *
  * No checks are performed: every operator class is reported as valid.
  */
 static bool
 hnsw_validate(Oid opclassoid)
 {
 	return true;
 }
 /*
  * Build the index: obtaining the HNSW structure creates and populates it when
  * it does not exist yet. The number of stored points is reported both as the
  * heap tuple count and as the index tuple count.
  */
 static IndexBuildResult *
 hnsw_build(Relation heap, Relation index, IndexInfo *indexInfo)
 {
 	HierarchicalNSW* graph = hnsw_get_index(index, heap);
 	IndexBuildResult* stats = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
 	double n_points = hnsw_count(graph);
 	stats->index_tuples = n_points;
 	stats->heap_tuples = n_points;
 	return stats;
 }
 /*
  * Insert a tuple into the index.
  *
  * Returns false without indexing anything when the indexed column is NULL and
  * true after the point has been added. Raises an error when the array has the
  * wrong number of elements, contains NULL elements, or cannot be added.
  */
 static bool
 hnsw_insert(Relation index, Datum *values, bool *isnull, ItemPointer heap_tid,
 			  Relation heap, IndexUniqueCheck checkUnique,
 			  bool indexUnchanged,
 			  IndexInfo *indexInfo)
 {
 	HierarchicalNSW* hnsw = hnsw_get_index(index, heap);
 	Datum value;
 	ArrayType* array;
 	int n_items;
 	label_t label = 0;
 	/* Skip nulls */
 	if (isnull[0])
 		return false;
 	/* Detoast value */
 	value = PointerGetDatum(PG_DETOAST_DATUM(values[0]));
 	array = DatumGetArrayTypeP(value);
 	n_items = ArrayGetNItems(ARR_NDIM(array), ARR_DIMS(array));
 	if (n_items != hnsw_dimensions(hnsw))
 	{
 		elog(ERROR, "Wrong number of dimensions: %d instead of %d expected",
 			 n_items, hnsw_dimensions(hnsw));
 	}
 	/*
 	 * NULL elements are counted by ArrayGetNItems() but are not stored in the
 	 * data area, so reading n_items coordinates would run past the array.
 	 */
 	if (array_contains_nulls(array))
 	{
 		ereport(ERROR,
 				(errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
 				 errmsg("array must not contain nulls")));
 	}
 	/* The heap TID is used as the point label; unused label bytes stay zero */
 	memcpy(&label, heap_tid, sizeof(*heap_tid));
 	if (!hnsw_add_point(hnsw, (coord_t*)ARR_DATA_PTR(array), label))
 		elog(ERROR, "HNSW index insert failed");
 	return true;
 }
 /*
  * Build the index for an unlogged table
  */
 static void
 hnsw_buildempty(Relation index)
 {
 	/* nothing to do here: the index will be constructed on demand when accessed */
 }
 /*
  * Clean up after a VACUUM operation: when statistics were collected, record
  * the current number of blocks of the index relation in them.
  */
 static IndexBulkDeleteResult *
 hnsw_vacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 {
 	if (stats != NULL)
 		stats->num_pages = RelationGetNumberOfBlocks(info->index);
 	return stats;
 }
 /*
  * Bulk delete tuples from the index.
  *
  * Nothing is removed and the callback is never invoked; the function only
  * makes sure that a (zeroed) statistics structure is returned.
  */
 static IndexBulkDeleteResult *
 hnsw_bulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 				IndexBulkDeleteCallback callback, void *callback_state)
 {
 	return stats != NULL
 		? stats
 		: (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
 }
 /*
  * Define index handler: build and return the IndexAmRoutine that describes
  * the capabilities of the HNSW access method and its callbacks.
  *
  * See https://www.postgresql.org/docs/current/index-api.html
  */
 PGDLLEXPORT PG_FUNCTION_INFO_V1(hnsw_handler);
 Datum
 hnsw_handler(PG_FUNCTION_ARGS)
 {
 	IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);
 	/* Capabilities: the only supported access is ordering by an operator */
 	amroutine->amstrategies = 0;
 	amroutine->amsupport = 0;
 	amroutine->amoptsprocnum = 0;
 	amroutine->amcanorder = false;
 	amroutine->amcanorderbyop = true;
 	amroutine->amcanbackward = false;	/* can change direction mid-scan */
 	amroutine->amcanunique = false;
 	amroutine->amcanmulticol = false;
 	amroutine->amoptionalkey = true;
 	amroutine->amsearcharray = false;
 	amroutine->amsearchnulls = false;
 	amroutine->amstorage = false;
 	amroutine->amclusterable = false;
 	amroutine->ampredlocks = false;
 	amroutine->amcanparallel = false;
 	amroutine->amcaninclude = false;
 	amroutine->amusemaintenanceworkmem = false; /* not used during VACUUM */
 	amroutine->amparallelvacuumoptions = VACUUM_OPTION_PARALLEL_BULKDEL;
 	amroutine->amkeytype = InvalidOid;
 	/* Interface functions */
 	amroutine->ambuild = hnsw_build;
 	amroutine->ambuildempty = hnsw_buildempty;
 	amroutine->aminsert = hnsw_insert;
 	amroutine->ambulkdelete = hnsw_bulkdelete;
 	amroutine->amvacuumcleanup = hnsw_vacuumcleanup;
 	amroutine->amcanreturn = NULL;	/* tuple not included in heapsort */
 	amroutine->amcostestimate = hnsw_costestimate;
 	amroutine->amoptions = hnsw_options;
 	amroutine->amproperty = NULL;	/* TODO AMPROP_DISTANCE_ORDERABLE */
 	amroutine->ambuildphasename = NULL;
 	amroutine->amvalidate = hnsw_validate;
 	amroutine->amadjustmembers = NULL;
 	amroutine->ambeginscan = hnsw_beginscan;
 	amroutine->amrescan = hnsw_rescan;
 	amroutine->amgettuple = hnsw_gettuple;
 	amroutine->amgetbitmap = NULL;
 	amroutine->amendscan = hnsw_endscan;
 	amroutine->ammarkpos = NULL;
 	amroutine->amrestrpos = NULL;
 	/* Interface functions to support parallel index scans */
 	amroutine->amestimateparallelscan = NULL;
 	amroutine->aminitparallelscan = NULL;
 	amroutine->amparallelrescan = NULL;
 	PG_RETURN_POINTER(amroutine);
 }
 /*
 * Get the L2 distance between vectors
 */
 PGDLLEXPORT PG_FUNCTION_INFO_V1(l2_distance);
 Datum
 l2_distance(PG_FUNCTION_ARGS)
 {
 	ArrayType  *a = PG_GETARG_ARRAYTYPE_P(0);
 	ArrayType  *b = PG_GETARG_ARRAYTYPE_P(1);
 	int         a_dim = ArrayGetNItems(ARR_NDIM(a), ARR_DIMS(a));
 	int         b_dim = ArrayGetNItems(ARR_NDIM(b), ARR_DIMS(b));
 	dist_t 		distance = 0.0;
 	dist_t		diff;
 	coord_t	   *ax = (coord_t*)ARR_DATA_PTR(a);
 	coord_t	   *bx = (coord_t*)ARR_DATA_PTR(b);
 	if (a_dim != b_dim)
 	{
 		ereport(ERROR,
 				(errcode(ERRCODE_DATA_EXCEPTION),
 				 errmsg("different array dimensions %d and %d", a_dim, b_dim)));
 	}
 	for (int i = 0; i < a_dim; i++)
 	{
 		diff = ax[i] - bx[i];
 		distance += diff * diff;
 	}
 	PG_RETURN_FLOAT4((dist_t)sqrt(distance));
 }
--- a/pgxn/hnsw/hnsw.control
+++ b/pgxn/hnsw/hnsw.control
@@ -0,0 +1,5 @@
 comment = 'hNsw index'
 default_version = '0.1.0'
 module_pathname = '$libdir/hnsw'
 relocatable = true
 trusted = true
--- a/pgxn/hnsw/hnsw.h
+++ b/pgxn/hnsw/hnsw.h
@@ -0,0 +1,15 @@
 #pragma once
 /* One coordinate of an indexed vector */
 typedef float    coord_t;
 /* Distance between two vectors */
 typedef float    dist_t;
 /* Element of a per-element link list */
 typedef uint32_t idx_t;
 /* Caller-supplied label attached to every point (hnsw.c stores the heap TID bytes in it) */
 typedef uint64_t label_t;
 /* Opaque index structure; its size is reported by hnsw_sizeof() */
 typedef struct HierarchicalNSW HierarchicalNSW;
 /*
  * Search for 'point'. On success the labels of the found points are returned
  * in '*results' ('*n_results' entries); hnsw.c releases that array with
  * free(). A false result is reported by the caller as a failed search.
  */
 bool hnsw_search(HierarchicalNSW* hnsw, const coord_t *point, size_t efSearch, size_t* n_results, label_t** results);
 /* Add 'point' with the given label; a false result is reported by hnsw_insert() as a failed insert */
 bool hnsw_add_point(HierarchicalNSW* hnsw, const coord_t *point, label_t label);
 /* Initialize the index structure located at 'hnsw' with the given parameters */
 void hnsw_init(HierarchicalNSW* hnsw, size_t dim, size_t maxelements, size_t M, size_t maxM, size_t efConstruction);
 /* Number of coordinates every vector of the index must have */
 int  hnsw_dimensions(HierarchicalNSW* hnsw);
 /* Number of points in the index (used by hnsw_build() for its tuple counts) */
 size_t hnsw_count(HierarchicalNSW* hnsw);
 /* Size of the index structure itself, excluding the per-element storage that hnsw.c adds to it */
 size_t hnsw_sizeof(void);
--- a/pgxn/hnsw/hnswalg.cpp
+++ b/pgxn/hnsw/hnswalg.cpp
@@ -0,0 +1,379 @@
 #include "hnswalg.h"
 #if defined(__GNUC__)
 #define PORTABLE_ALIGN32 __attribute__((aligned(32)))
 #define PREFETCH(addr,hint) __builtin_prefetch(addr, 0, hint)
 #else
 #define PORTABLE_ALIGN32 __declspec(align(32))
 #define PREFETCH(addr,hint)
 #endif
 // Set up the index header. Each element record is laid out as
 //   [link list: (maxM + 1) idx_t][vector: dim coord_t][label: label_t]
 // and the byte offsets of the vector and the label inside a record are
 // precomputed here. The index starts empty with element 0 as entry point.
 HierarchicalNSW::HierarchicalNSW(size_t dim_, size_t maxelements_, size_t M_, size_t maxM_, size_t efConstruction_)
 {
     const size_t vector_bytes = dim_ * sizeof(coord_t);
     const size_t links_bytes = (maxM_ + 1) * sizeof(idx_t);
     // Tunables, copied as given.
     maxelements = maxelements_;
     efConstruction = efConstruction_;
     dim = dim_;
     M = M_;
     maxM = maxM_;
     // Record layout.
     data_size = vector_bytes;
     size_links_level0 = links_bytes;
     offset_data = links_bytes;
     offset_label = links_bytes + vector_bytes;
     size_data_per_element = links_bytes + vector_bytes + sizeof(label_t);
     // Empty index.
     cur_element_count = 0;
     enterpoint_node = 0;
 #ifdef __x86_64__
     // Pick the distance kernel once, based on the CPU we are running on.
     use_avx2 = __builtin_cpu_supports("avx2");
 #endif
 }
 std::priority_queue<std::pair<dist_t, idx_t>> HierarchicalNSW::searchBaseLayer(const coord_t *point, size_t ef)
 {
 	std::vector<uint32_t> visited;
 	visited.resize((cur_element_count + 31) >> 5);
    std::priority_queue<std::pair<dist_t, idx_t >> topResults;
    std::priority_queue<std::pair<dist_t, idx_t >> candidateSet;
    dist_t dist = fstdistfunc(point, getDataByInternalId(enterpoint_node));
    topResults.emplace(dist, enterpoint_node);
    candidateSet.emplace(-dist, enterpoint_node);
    visited[enterpoint_node >> 5] = 1 << (enterpoint_node & 31);
    dist_t lowerBound = dist;
    while (!candidateSet.empty())
    {
        std::pair<dist_t, idx_t> curr_el_pair = candidateSet.top();
        if (-curr_el_pair.first > lowerBound)
            break;
        candidateSet.pop();
        idx_t curNodeNum = curr_el_pair.second;
        idx_t* data = get_linklist0(curNodeNum);
        size_t size = *data++;
        PREFETCH(getDataByInternalId(*data), 0);
        for (size_t j = 0; j < size; ++j) {
            size_t tnum = *(data + j);
            PREFETCH(getDataByInternalId(*(data + j + 1)), 0);
            if (!(visited[tnum >> 5] & (1 << (tnum & 31)))) {
 				visited[tnum >> 5] |= 1 << (tnum & 31);
                dist = fstdistfunc(point, getDataByInternalId(tnum));
                if (topResults.top().first > dist || topResults.size() < ef) {
                    candidateSet.emplace(-dist, tnum);
                    PREFETCH(get_linklist0(candidateSet.top().second), 0);
                    topResults.emplace(dist, tnum);
                    if (topResults.size() > ef)
                        topResults.pop();
                    lowerBound = topResults.top().first;
                }
            }
        }
    }
    return topResults;
 }
 void HierarchicalNSW::getNeighborsByHeuristic(std::priority_queue<std::pair<dist_t, idx_t>> &topResults, size_t NN)
 {
    if (topResults.size() < NN)
        return;
    std::priority_queue<std::pair<dist_t, idx_t>> resultSet;
    std::vector<std::pair<dist_t, idx_t>> returnlist;
    while (topResults.size() > 0) {
        resultSet.emplace(-topResults.top().first, topResults.top().second);
        topResults.pop();
    }
    while (resultSet.size()) {
        if (returnlist.size() >= NN)
            break;
        std::pair<dist_t, idx_t> curen = resultSet.top();
        dist_t dist_to_query = -curen.first;
        resultSet.pop();
        bool good = true;
        for (std::pair<dist_t, idx_t> curen2 : returnlist) {
            dist_t curdist = fstdistfunc(getDataByInternalId(curen2.second),
                                         getDataByInternalId(curen.second));
            if (curdist < dist_to_query) {
                good = false;
                break;
            }
        }
        if (good) returnlist.push_back(curen);
    }
    for (std::pair<dist_t, idx_t> elem : returnlist)
        topResults.emplace(-elem.first, elem.second);
 }
 void HierarchicalNSW::mutuallyConnectNewElement(const coord_t *point, idx_t cur_c,
                               std::priority_queue<std::pair<dist_t, idx_t>> topResults)
 {
    getNeighborsByHeuristic(topResults, M);
    std::vector<idx_t> res;
    res.reserve(M);
    while (topResults.size() > 0) {
        res.push_back(topResults.top().second);
        topResults.pop();
    }
    {
        idx_t* data = get_linklist0(cur_c);
        if (*data)
            throw std::runtime_error("Should be blank");
        *data++ = res.size();
        for (size_t idx = 0; idx < res.size(); idx++) {
            if (data[idx])
                throw std::runtime_error("Should be blank");
            data[idx] = res[idx];
        }
    }
    for (size_t idx = 0; idx < res.size(); idx++) {
        if (res[idx] == cur_c)
            throw std::runtime_error("Connection to the same element");
        size_t resMmax = maxM;
        idx_t *ll_other = get_linklist0(res[idx]);
        idx_t sz_link_list_other = *ll_other;
        if (sz_link_list_other > resMmax || sz_link_list_other < 0)
            throw std::runtime_error("Bad sz_link_list_other");
        if (sz_link_list_other < resMmax) {
            idx_t *data = ll_other + 1;
            data[sz_link_list_other] = cur_c;
            *ll_other = sz_link_list_other + 1;
        } else {
            // finding the "weakest" element to replace it with the new one
            idx_t *data = ll_other + 1;
            dist_t d_max = fstdistfunc(getDataByInternalId(cur_c), getDataByInternalId(res[idx]));
            // Heuristic:
            std::priority_queue<std::pair<dist_t, idx_t>> candidates;
            candidates.emplace(d_max, cur_c);
            for (size_t j = 0; j < sz_link_list_other; j++)
                candidates.emplace(fstdistfunc(getDataByInternalId(data[j]), getDataByInternalId(res[idx])), data[j]);
            getNeighborsByHeuristic(candidates, resMmax);
            size_t indx = 0;
            while (!candidates.empty()) {
                data[indx] = candidates.top().second;
                candidates.pop();
                indx++;
            }
            *ll_other = indx;
        }
    }
 }
 void HierarchicalNSW::addPoint(const coord_t *point, label_t label)
 {
    if (cur_element_count >= maxelements) {
        throw std::runtime_error("The number of elements exceeds the specified limit");
    }
    idx_t cur_c = cur_element_count++;
    memset((char *) get_linklist0(cur_c), 0, size_data_per_element);
    memcpy(getDataByInternalId(cur_c), point, data_size);
    memcpy(getExternalLabel(cur_c), &label, sizeof label);
    // Do nothing for the first element
    if (cur_c != 0) {
        std::priority_queue <std::pair<dist_t, idx_t>> topResults = searchBaseLayer(point, efConstruction);
        mutuallyConnectNewElement(point, cur_c, topResults);
    }
 };
 std::priority_queue<std::pair<dist_t, label_t>> HierarchicalNSW::searchKnn(const coord_t *query, size_t k)
 {
 	std::priority_queue<std::pair<dist_t, label_t>> topResults;
 	auto topCandidates = searchBaseLayer(query, k);
    while (topCandidates.size() > k) {
        topCandidates.pop();
 	}
 	while (!topCandidates.empty()) {
 		std::pair<dist_t, idx_t> rez = topCandidates.top();
 		label_t label;
 		memcpy(&label, getExternalLabel(rez.second), sizeof(label));
 		topResults.push(std::pair<dist_t, label_t>(rez.first, label));
 		topCandidates.pop();
 	}
    return topResults;
 };
 // Portable squared L2 distance between two n-component vectors
 // (sum of squared component differences; no square root is taken).
 dist_t fstdistfunc_scalar(const coord_t *x, const coord_t *y, size_t n)
 {
     dist_t sum_sq = 0.0;
     const coord_t *const x_end = x + n;
     while (x != x_end)
     {
         const dist_t delta = *x++ - *y++;
         sum_sq += delta * delta;
     }
     return sum_sq;
 }
 #ifdef __x86_64__
 #include <immintrin.h>
 // Squared L2 distance between two n-component vectors using AVX2.
 //
 // Components are processed eight at a time; whatever is left over
 // (n % 8 components) is accumulated with scalar arithmetic so that vectors
 // of any dimension are handled, not only multiples of the SIMD width.
 __attribute__((target("avx2")))
 dist_t fstdistfunc_avx2(const coord_t *x, const coord_t *y, size_t n)
 {
     const size_t TmpResSz = sizeof(__m256) / sizeof(float);
     float PORTABLE_ALIGN32 TmpRes[TmpResSz];
     const float *pEnd = x + n;
     const float *pEndSimd = x + (n / TmpResSz) * TmpResSz;
     __m256 diff, v1, v2;
     __m256 sum = _mm256_set1_ps(0);
     while (x < pEndSimd) {
         v1 = _mm256_loadu_ps(x);
         x += TmpResSz;
         v2 = _mm256_loadu_ps(y);
         y += TmpResSz;
         diff = _mm256_sub_ps(v1, v2);
         sum = _mm256_add_ps(sum, _mm256_mul_ps(diff, diff));
     }
     // Horizontal sum of the eight lanes.
     _mm256_store_ps(TmpRes, sum);
     float res = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3] + TmpRes[4] + TmpRes[5] + TmpRes[6] + TmpRes[7];
     // Scalar tail for the components that did not fill a whole SIMD register.
     for (; x < pEnd; x++, y++) {
         float d = *x - *y;
         res += d * d;
     }
     return (res);
 }
 // Squared L2 distance between two n-component vectors using SSE.
 //
 // Components are processed four at a time; whatever is left over
 // (n % 4 components) is accumulated with scalar arithmetic so that vectors
 // of any dimension are handled, not only multiples of the SIMD width.
 dist_t fstdistfunc_sse(const coord_t *x, const coord_t *y, size_t n)
 {
     const size_t TmpResSz = sizeof(__m128) / sizeof(float);
     float PORTABLE_ALIGN32 TmpRes[TmpResSz];
     const float *pEnd = x + n;
     const float *pEndSimd = x + (n / TmpResSz) * TmpResSz;
     __m128 diff, v1, v2;
     __m128 sum = _mm_set1_ps(0);
     while (x < pEndSimd) {
         v1 = _mm_loadu_ps(x);
         x += TmpResSz;
         v2 = _mm_loadu_ps(y);
         y += TmpResSz;
         diff = _mm_sub_ps(v1, v2);
         sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff));
     }
     // Horizontal sum of the four lanes.
     _mm_store_ps(TmpRes, sum);
     float res = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3];
     // Scalar tail for the components that did not fill a whole SIMD register.
     for (; x < pEnd; x++, y++) {
         float d = *x - *y;
         res += d * d;
     }
     return res;
 }
 #endif
 // Distance between two vectors of `dim` components, dispatched to the
 // kernel matching the build target and, on x86-64, the CPU capability
 // detected in the constructor.
 dist_t HierarchicalNSW::fstdistfunc(const coord_t *x, const coord_t *y)
 {
 #ifdef __x86_64__
     return use_avx2 ? fstdistfunc_avx2(x, y, dim)
                     : fstdistfunc_sse(x, y, dim);
 #else
     return fstdistfunc_scalar(x, y, dim);
 #endif
 }
 // C entry point: search the index for neighbours of `point`.
 //
 // On success returns true, stores a malloc()-allocated array of labels in
 // *results (ordered by increasing distance) and its length in *n_results.
 // Returns false, leaving both outputs untouched, if the search throws or
 // the result array cannot be allocated. No C++ exception escapes to the
 // caller.
 bool hnsw_search(HierarchicalNSW* hnsw, const coord_t *point, size_t efSearch, size_t* n_results, label_t** results)
 {
 	try
 	{
 		auto result = hnsw->searchKnn(point, efSearch);
 		size_t nResults = result.size();
 		label_t *labels = (label_t*)malloc(nResults*sizeof(label_t));
 		// malloc(0) may legitimately return NULL; only a non-empty request
 		// that yields NULL is an allocation failure.
 		if (labels == NULL && nResults != 0)
 			return false;
 		// The heap pops farthest-first, so fill the array back to front.
 		for (size_t i = nResults; i-- != 0;)
 		{
 			labels[i] = result.top().second;
 			result.pop();
 		}
 		*results = labels;
 		*n_results = nResults;
 		return true;
 	}
 	catch (const std::exception&)
 	{
 		return false;
 	}
 }
 // C entry point: insert `point` with `label` into the index.
 // Returns true on success; if the insertion throws, the exception message
 // is written to stderr and false is returned.
 bool hnsw_add_point(HierarchicalNSW* hnsw, const coord_t *point, label_t label)
 {
 	bool added = true;
 	try
 	{
 		hnsw->addPoint(point, label);
 	}
 	catch (const std::exception& err)
 	{
 		fprintf(stderr, "Catch %s\n", err.what());
 		added = false;
 	}
 	return added;
 }
 // C entry point: construct an index header in place, in the memory that
 // `hnsw` points to (no allocation happens here).
 void hnsw_init(HierarchicalNSW* hnsw, size_t dims, size_t maxelements, size_t M, size_t maxM, size_t efConstruction)
 {
 	void *storage = static_cast<void*>(hnsw);
 	new (storage) HierarchicalNSW(dims, maxelements, M, maxM, efConstruction);
 }
 // C entry point: number of components per vector, narrowed to int.
 int hnsw_dimensions(HierarchicalNSW* hnsw)
 {
 	const size_t dims = hnsw->dim;
 	return static_cast<int>(dims);
 }
 // C entry point: number of elements stored in the index so far.
 size_t hnsw_count(HierarchicalNSW* hnsw)
 {
 	const size_t stored = hnsw->cur_element_count;
 	return stored;
 }
 // C entry point: size in bytes of the HierarchicalNSW structure itself.
 size_t hnsw_sizeof(void)
 {
 	const size_t header_bytes = sizeof(HierarchicalNSW);
 	return header_bytes;
 }
--- a/pgxn/hnsw/hnswalg.h
+++ b/pgxn/hnsw/hnswalg.h
@@ -0,0 +1,69 @@
 #pragma once
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <stdint.h>
 #include <unordered_map>
 #include <unordered_set>
 #include <map>
 #include <cmath>
 #include <queue>
 #include <stdexcept>
 extern "C" {
 #include "hnsw.h"
 }
 // Single-layer HNSW index header followed directly by the element records.
 //
 // Each record is size_data_per_element bytes long and laid out as
 //   [link list: count + up to maxM neighbour ids][vector][label]
 // The accessors below compute addresses inside data_level0_memory from an
 // internal element id and the precomputed offsets.
 struct HierarchicalNSW
 {
 	size_t maxelements;				// capacity; addPoint() throws once this many are stored
 	size_t cur_element_count;		// number of records in use
 	idx_t  enterpoint_node;			// element every search starts from
 	size_t dim;						// components per vector
 	size_t data_size;				// bytes per vector (dim * sizeof(coord_t))
 	size_t offset_data;				// byte offset of the vector inside a record
 	size_t offset_label;			// byte offset of the label inside a record
 	size_t size_data_per_element;	// total bytes per record
 	size_t M;						// neighbours selected for a newly added element
 	size_t maxM;					// maximum links kept per element
 	size_t size_links_level0;		// bytes of the link list ((maxM + 1) * sizeof(idx_t))
 	size_t efConstruction;			// candidate list size used while inserting
 #ifdef __x86_64__
 	bool	use_avx2;				// true when the CPU reports AVX2 support
 #endif
 	// Zero-length trailing array marking where the records begin.
 	// NOTE(review): the storage behind it is presumably provided by whoever
 	// allocates the structure - confirm against the caller of hnsw_init().
 	char   data_level0_memory[0]; // varying size
  public:
 	HierarchicalNSW(size_t dim, size_t maxelements, size_t M, size_t maxM, size_t efConstruction);
 	// NOTE(review): declared only; no definition is visible in hnswalg.cpp - confirm it is never needed.
 	~HierarchicalNSW();
 	// Address of the vector stored in record `internal_id`.
 	inline coord_t *getDataByInternalId(idx_t internal_id) const {
 		return (coord_t *)&data_level0_memory[internal_id * size_data_per_element + offset_data];
 	}
 	// Address of the link list of record `internal_id`: first the link count, then the neighbour ids.
 	inline idx_t *get_linklist0(idx_t internal_id) const {
 		return (idx_t*)&data_level0_memory[internal_id * size_data_per_element];
 	}
 	// Address of the external label stored in record `internal_id`.
 	inline label_t *getExternalLabel(idx_t internal_id) const {
 		return (label_t *)&data_level0_memory[internal_id * size_data_per_element + offset_label];
 	}
 	// Best-first graph search; returns up to `ef` closest (distance, internal id) pairs.
 	std::priority_queue<std::pair<dist_t, idx_t>> searchBaseLayer(const coord_t *x, size_t ef);
 	// Prunes `topResults` in place to at most NN neighbours.
 	void getNeighborsByHeuristic(std::priority_queue<std::pair<dist_t, idx_t>> &topResults, size_t NN);
 	// Writes links between new element `id` and the neighbours chosen from `topResults`.
 	void mutuallyConnectNewElement(const coord_t *x, idx_t id, std::priority_queue<std::pair<dist_t, idx_t>> topResults);
 	// Stores a new vector with its label and links it into the graph.
 	void addPoint(const coord_t *point, label_t label);
 	// Returns up to `k` nearest elements as (distance, label) pairs.
 	std::priority_queue<std::pair<dist_t, label_t>> searchKnn(const coord_t *query_data, size_t k);
 	// Distance between two vectors of `dim` components.
 	dist_t fstdistfunc(const coord_t *x, const coord_t *y);
 };
--- a/pgxn/hnsw/test/expected/knn.out
+++ b/pgxn/hnsw/test/expected/knn.out
@@ -0,0 +1,28 @@
 SET enable_seqscan = off;
 CREATE TABLE t (val real[]);
 INSERT INTO t (val) VALUES ('{0,0,0}'), ('{1,2,3}'), ('{1,1,1}'), (NULL);
 CREATE INDEX ON t USING hnsw (val) WITH (maxelements = 10, dims=3, m=3);
 INSERT INTO t (val) VALUES (array[1,2,4]);
 explain SELECT * FROM t ORDER BY val <-> array[3,3,3];
                             QUERY PLAN                             
 --------------------------------------------------------------------
 Index Scan using t_val_idx on t  (cost=4.02..8.06 rows=3 width=36)
   Order By: (val <-> '{3,3,3}'::real[])
 (2 rows)
 SELECT * FROM t ORDER BY val <-> array[3,3,3];
   val   
 ---------
 {1,2,3}
 {1,2,4}
 {1,1,1}
 {0,0,0}
 (4 rows)
 SELECT COUNT(*) FROM t;
 count 
 -------
     5
 (1 row)
 DROP TABLE t;
--- a/pgxn/hnsw/test/sql/knn.sql
+++ b/pgxn/hnsw/test/sql/knn.sql
@@ -0,0 +1,13 @@
 SET enable_seqscan = off;
 CREATE TABLE t (val real[]);
 INSERT INTO t (val) VALUES ('{0,0,0}'), ('{1,2,3}'), ('{1,1,1}'), (NULL);
 CREATE INDEX ON t USING hnsw (val) WITH (maxelements = 10, dims=3, m=3);
 INSERT INTO t (val) VALUES (array[1,2,4]);
 explain SELECT * FROM t ORDER BY val <-> array[3,3,3];
 SELECT * FROM t ORDER BY val <-> array[3,3,3];
 SELECT COUNT(*) FROM t;
 DROP TABLE t;
--- a/pgxn/neon/walproposer.c
+++ b/pgxn/neon/walproposer.c
@@ -254,7 +254,7 @@ nwp_register_gucs(void)
 	DefineCustomIntVariable(
 							"neon.safekeeper_reconnect_timeout",
-							"Timeout for reconnecting to offline wal acceptor.",
+							"Walproposer reconnects to offline safekeepers once in this interval.",
 							NULL,
 							&wal_acceptor_reconnect_timeout,
 							1000, 0, INT_MAX,	/* default, min, max */
@@ -264,10 +264,10 @@ nwp_register_gucs(void)
 	DefineCustomIntVariable(
 							"neon.safekeeper_connect_timeout",
-							"Timeout for connection establishement and it's maintenance against safekeeper",
+							"Connection or connection attempt to safekeeper is terminated if no message is received (or connection attempt doesn't finish) within this period.",
 							NULL,
 							&wal_acceptor_connection_timeout,
-							5000, 0, INT_MAX,
+							10000, 0, INT_MAX,
 							PGC_SIGHUP,
 							GUC_UNIT_MS,
 							NULL, NULL, NULL);
@@ -441,7 +441,7 @@ WalProposerPoll(void)
 				if (TimestampDifferenceExceeds(sk->latestMsgReceivedAt, now,
 											   wal_acceptor_connection_timeout))
 				{
-					elog(WARNING, "failed to connect to node '%s:%s' in '%s' state: exceeded connection timeout %dms",
+					elog(WARNING, "terminating connection to safekeeper '%s:%s' in '%s' state: no messages received during the last %dms or connection attempt took longer than that",
 						 sk->host, sk->port, FormatSafekeeperState(sk->state), wal_acceptor_connection_timeout);
 					ShutdownConnection(sk);
 				}
@@ -1035,9 +1035,16 @@ RecvAcceptorGreeting(Safekeeper *sk)
 	if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) & sk->greetResponse))
 		return;
 	elog(LOG, "received AcceptorGreeting from safekeeper %s:%s", sk->host, sk->port);
 	/* Protocol is all good, move to voting. */
 	sk->state = SS_VOTING;
 	/* 
 	 * Note: it would be better to track the counter on per safekeeper basis,
 	 * but at worst walproposer would restart with 'term rejected', so leave as
 	 * is for now.
 	 */
 	++n_connected;
 	if (n_connected <= quorum)
 	{
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.4.2 and should not be changed by hand.
+# This file is automatically @generated by Poetry and should not be changed by hand.
 [[package]]
 name = "aiohttp"
@@ -855,35 +855,31 @@ files = [
 [[package]]
 name = "cryptography"
-version = "39.0.1"
+version = "41.0.0"
 description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers."
 category = "main"
 optional = false
-python-versions = ">=3.6"
+python-versions = ">=3.7"
 files = [
-    {file = "cryptography-39.0.1-cp36-abi3-macosx_10_12_universal2.whl", hash = "sha256:6687ef6d0a6497e2b58e7c5b852b53f62142cfa7cd1555795758934da363a965"},
+    {file = "cryptography-41.0.0-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:3c5ef25d060c80d6d9f7f9892e1d41bb1c79b78ce74805b8cb4aa373cb7d5ec8"},
-    {file = "cryptography-39.0.1-cp36-abi3-macosx_10_12_x86_64.whl", hash = "sha256:706843b48f9a3f9b9911979761c91541e3d90db1ca905fd63fee540a217698bc"},
+    {file = "cryptography-41.0.0-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:8362565b3835ceacf4dc8f3b56471a2289cf51ac80946f9087e66dc283a810e0"},
-    {file = "cryptography-39.0.1-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:5d2d8b87a490bfcd407ed9d49093793d0f75198a35e6eb1a923ce1ee86c62b41"},
+    {file = "cryptography-41.0.0-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3680248309d340fda9611498a5319b0193a8dbdb73586a1acf8109d06f25b92d"},
-    {file = "cryptography-39.0.1-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:83e17b26de248c33f3acffb922748151d71827d6021d98c70e6c1a25ddd78505"},
+    {file = "cryptography-41.0.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:84a165379cb9d411d58ed739e4af3396e544eac190805a54ba2e0322feb55c46"},
-    {file = "cryptography-39.0.1-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e124352fd3db36a9d4a21c1aa27fd5d051e621845cb87fb851c08f4f75ce8be6"},
+    {file = "cryptography-41.0.0-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:4ab14d567f7bbe7f1cdff1c53d5324ed4d3fc8bd17c481b395db224fb405c237"},
-    {file = "cryptography-39.0.1-cp36-abi3-manylinux_2_24_x86_64.whl", hash = "sha256:5aa67414fcdfa22cf052e640cb5ddc461924a045cacf325cd164e65312d99502"},
+    {file = "cryptography-41.0.0-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:9f65e842cb02550fac96536edb1d17f24c0a338fd84eaf582be25926e993dde4"},
-    {file = "cryptography-39.0.1-cp36-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:35f7c7d015d474f4011e859e93e789c87d21f6f4880ebdc29896a60403328f1f"},
+    {file = "cryptography-41.0.0-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:b7f2f5c525a642cecad24ee8670443ba27ac1fab81bba4cc24c7b6b41f2d0c75"},
-    {file = "cryptography-39.0.1-cp36-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:f24077a3b5298a5a06a8e0536e3ea9ec60e4c7ac486755e5fb6e6ea9b3500106"},
+    {file = "cryptography-41.0.0-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:7d92f0248d38faa411d17f4107fc0bce0c42cae0b0ba5415505df72d751bf62d"},
-    {file = "cryptography-39.0.1-cp36-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:f0c64d1bd842ca2633e74a1a28033d139368ad959872533b1bab8c80e8240a0c"},
+    {file = "cryptography-41.0.0-cp37-abi3-win32.whl", hash = "sha256:34d405ea69a8b34566ba3dfb0521379b210ea5d560fafedf9f800a9a94a41928"},
-    {file = "cryptography-39.0.1-cp36-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:0f8da300b5c8af9f98111ffd512910bc792b4c77392a9523624680f7956a99d4"},
+    {file = "cryptography-41.0.0-cp37-abi3-win_amd64.whl", hash = "sha256:344c6de9f8bda3c425b3a41b319522ba3208551b70c2ae00099c205f0d9fd3be"},
-    {file = "cryptography-39.0.1-cp36-abi3-win32.whl", hash = "sha256:fe913f20024eb2cb2f323e42a64bdf2911bb9738a15dba7d3cce48151034e3a8"},
+    {file = "cryptography-41.0.0-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:88ff107f211ea696455ea8d911389f6d2b276aabf3231bf72c8853d22db755c5"},
-    {file = "cryptography-39.0.1-cp36-abi3-win_amd64.whl", hash = "sha256:ced4e447ae29ca194449a3f1ce132ded8fcab06971ef5f618605aacaa612beac"},
+    {file = "cryptography-41.0.0-pp38-pypy38_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:b846d59a8d5a9ba87e2c3d757ca019fa576793e8758174d3868aecb88d6fc8eb"},
-    {file = "cryptography-39.0.1-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:807ce09d4434881ca3a7594733669bd834f5b2c6d5c7e36f8c00f691887042ad"},
+    {file = "cryptography-41.0.0-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:f5d0bf9b252f30a31664b6f64432b4730bb7038339bd18b1fafe129cfc2be9be"},
-    {file = "cryptography-39.0.1-pp38-pypy38_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:c5caeb8188c24888c90b5108a441c106f7faa4c4c075a2bcae438c6e8ca73cef"},
+    {file = "cryptography-41.0.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:5c1f7293c31ebc72163a9a0df246f890d65f66b4a40d9ec80081969ba8c78cc9"},
-    {file = "cryptography-39.0.1-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:4789d1e3e257965e960232345002262ede4d094d1a19f4d3b52e48d4d8f3b885"},
+    {file = "cryptography-41.0.0-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:bf8fc66012ca857d62f6a347007e166ed59c0bc150cefa49f28376ebe7d992a2"},
-    {file = "cryptography-39.0.1-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:96f1157a7c08b5b189b16b47bc9db2332269d6680a196341bf30046330d15388"},
+    {file = "cryptography-41.0.0-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:a4fc68d1c5b951cfb72dfd54702afdbbf0fb7acdc9b7dc4301bbf2225a27714d"},
-    {file = "cryptography-39.0.1-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:e422abdec8b5fa8462aa016786680720d78bdce7a30c652b7fadf83a4ba35336"},
+    {file = "cryptography-41.0.0-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:14754bcdae909d66ff24b7b5f166d69340ccc6cb15731670435efd5719294895"},
-    {file = "cryptography-39.0.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:b0afd054cd42f3d213bf82c629efb1ee5f22eba35bf0eec88ea9ea7304f511a2"},
+    {file = "cryptography-41.0.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:0ddaee209d1cf1f180f1efa338a68c4621154de0afaef92b89486f5f96047c55"},
-    {file = "cryptography-39.0.1-pp39-pypy39_pp73-manylinux_2_24_x86_64.whl", hash = "sha256:6f8ba7f0328b79f08bdacc3e4e66fb4d7aab0c3584e0bd41328dce5262e26b2e"},
+    {file = "cryptography-41.0.0.tar.gz", hash = "sha256:6b71f64beeea341c9b4f963b48ee3b62d62d57ba93eb120e1196b31dc1025e78"},
    {file = "cryptography-39.0.1-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:ef8b72fa70b348724ff1218267e7f7375b8de4e8194d1636ee60510aae104cd0"},
    {file = "cryptography-39.0.1-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:aec5a6c9864be7df2240c382740fcf3b96928c46604eaa7f3091f58b878c0bb6"},
    {file = "cryptography-39.0.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:fdd188c8a6ef8769f148f88f859884507b954cc64db6b52f66ef199bb9ad660a"},
    {file = "cryptography-39.0.1.tar.gz", hash = "sha256:d1f6198ee6d9148405e49887803907fe8962a23e6c6f83ea7d98f1c0de375695"},
 ]
 [package.dependencies]
@@ -892,12 +888,12 @@ cffi = ">=1.12"
 [package.extras]
 docs = ["sphinx (>=5.3.0)", "sphinx-rtd-theme (>=1.1.1)"]
 docstest = ["pyenchant (>=1.6.11)", "sphinxcontrib-spelling (>=4.0.1)", "twine (>=1.12.0)"]
-pep8test = ["black", "check-manifest", "mypy", "ruff", "types-pytz", "types-requests"]
+nox = ["nox"]
-sdist = ["setuptools-rust (>=0.11.4)"]
+pep8test = ["black", "check-sdist", "mypy", "ruff"]
 sdist = ["build"]
 ssh = ["bcrypt (>=3.1.5)"]
-test = ["hypothesis (>=1.11.4,!=3.79.2)", "iso8601", "pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-shard (>=0.1.2)", "pytest-subtests", "pytest-xdist", "pytz"]
+test = ["pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-xdist"]
 test-randomorder = ["pytest-randomly"]
 tox = ["tox"]
 [[package]]
 name = "docker"
--- a/proxy/README.md
+++ b/proxy/README.md
@@ -93,6 +93,15 @@ With the current approach we made the following design decisions:
   and column oids. Command tag capturing was added to the rust-postgres
   functionality as part of this change.
 ### Output options
 User can pass several optional headers that will affect resulting json.
 1. `Neon-Raw-Text-Output: true`. Return postgres values as text, without parsing them. So numbers, objects, booleans, nulls and arrays will be returned as text. That can be useful in cases when client code wants to implement its own parsing or reuse parsing libraries from e.g. node-postgres.
 2. `Neon-Array-Mode: true`. Return postgres rows as arrays instead of objects. That is a more compact representation and also helps in some edge
 cases where it is hard to use rows represented as objects (e.g. when several fields have the same name).
 ## Using SNI-based routing on localhost
 Now proxy determines project name from the subdomain, request to the `round-rice-566201.somedomain.tld` will be routed to the project named `round-rice-566201`. Unfortunately, `/etc/hosts` does not support domain wildcards, so I usually use `*.localtest.me` which resolves to `127.0.0.1`. Now we can create self-signed certificate and play with proxy:
--- a/proxy/src/console.rs
+++ b/proxy/src/console.rs
@@ -1,5 +1,5 @@
-///! Various stuff for dealing with the Neon Console.
+//! Various stuff for dealing with the Neon Console.
-///! Later we might move some API wrappers here.
+//! Later we might move some API wrappers here.
 /// Payloads used in the console's APIs.
 pub mod messages;
--- a/proxy/src/http/sql_over_http.rs
+++ b/proxy/src/http/sql_over_http.rs
@@ -1,6 +1,9 @@
 use futures::pin_mut;
 use futures::StreamExt;
 use futures::TryFutureExt;
 use hyper::body::HttpBody;
 use hyper::http::HeaderName;
 use hyper::http::HeaderValue;
 use hyper::{Body, HeaderMap, Request};
 use pq_proto::StartupMessageParams;
 use serde_json::json;
@@ -9,8 +12,13 @@ use serde_json::Value;
 use tokio_postgres::types::Kind;
 use tokio_postgres::types::Type;
 use tokio_postgres::Row;
 use tracing::error;
 use tracing::info;
 use tracing::instrument;
 use url::Url;
 use crate::proxy::invalidate_cache;
 use crate::proxy::NUM_RETRIES_WAKE_COMPUTE;
 use crate::{auth, config::ProxyConfig, console};
 #[derive(serde::Deserialize)]
@@ -23,21 +31,28 @@ const APP_NAME: &str = "sql_over_http";
 const MAX_RESPONSE_SIZE: usize = 1024 * 1024; // 1 MB
 const MAX_REQUEST_SIZE: u64 = 1024 * 1024; // 1 MB
 /// Request header that switches on raw text output; `handle` treats it as
 /// enabled only when its value is exactly `true`.
 static RAW_TEXT_OUTPUT: HeaderName = HeaderName::from_static("neon-raw-text-output");
 /// Request header that switches on array mode; `handle` treats it as enabled
 /// only when its value is exactly `true`.
 static ARRAY_MODE: HeaderName = HeaderName::from_static("neon-array-mode");
 /// The only header value that enables one of the output options above.
 static HEADER_VALUE_TRUE: HeaderValue = HeaderValue::from_static("true");
 //
 // Convert json non-string types to strings, so that they can be passed to Postgres
 // as parameters.
 //
-fn json_to_pg_text(json: Vec<Value>) -> Result<Vec<String>, serde_json::Error> {
+fn json_to_pg_text(json: Vec<Value>) -> Result<Vec<Option<String>>, serde_json::Error> {
    json.iter()
        .map(|value| {
            match value {
-                Value::Null => serde_json::to_string(value),
+                // special care for nulls
-                Value::Bool(_) => serde_json::to_string(value),
+                Value::Null => Ok(None),
                Value::Number(_) => serde_json::to_string(value),
                Value::Object(_) => serde_json::to_string(value),
-                // no need to escape
+                // convert to text with escaping
-                Value::String(s) => Ok(s.to_string()),
+                Value::Bool(_) => serde_json::to_string(value).map(Some),
                Value::Number(_) => serde_json::to_string(value).map(Some),
                Value::Object(_) => serde_json::to_string(value).map(Some),
                // avoid escaping here, as we pass this as a parameter
                Value::String(s) => Ok(Some(s.to_string())),
                // special care for arrays
                Value::Array(_) => json_array_to_pg_array(value),
@@ -54,33 +69,44 @@ fn json_to_pg_text(json: Vec<Value>) -> Result<Vec<String>, serde_json::Error> {
 //
 // Example of the same escaping in node-postgres: packages/pg/lib/utils.js
 //
-fn json_array_to_pg_array(value: &Value) -> Result<String, serde_json::Error> {
+fn json_array_to_pg_array(value: &Value) -> Result<Option<String>, serde_json::Error> {
    match value {
-        // same
+        // special care for nulls
-        Value::Null => serde_json::to_string(value),
+        Value::Null => Ok(None),
        Value::Bool(_) => serde_json::to_string(value),
        Value::Number(_) => serde_json::to_string(value),
        Value::Object(_) => serde_json::to_string(value),
-        // now needs to be escaped, as it is part of the array
+        // convert to text with escaping
-        Value::String(_) => serde_json::to_string(value),
+        Value::Bool(_) => serde_json::to_string(value).map(Some),
        Value::Number(_) => serde_json::to_string(value).map(Some),
        Value::Object(_) => serde_json::to_string(value).map(Some),
        // here string needs to be escaped, as it is part of the array
        Value::String(_) => serde_json::to_string(value).map(Some),
        // recurse into array
        Value::Array(arr) => {
            let vals = arr
                .iter()
                .map(json_array_to_pg_array)
                .map(|r| r.map(|v| v.unwrap_or_else(|| "NULL".to_string())))
                .collect::<Result<Vec<_>, _>>()?
                .join(",");
-            Ok(format!("{{{}}}", vals))
+
            Ok(Some(format!("{{{}}}", vals)))
        }
    }
 }
 /// Connection parameters returned by `get_conn_info`.
 struct ConnInfo {
     /// Sent as the `user` startup parameter.
     username: String,
     /// Sent as the `database` startup parameter.
     dbname: String,
     /// Passed to `auth::ClientCredentials::parse` alongside the startup parameters.
     hostname: String,
     /// NOTE(review): presumably used to authenticate against the compute node - confirm in `connect_to_compute`.
     password: String,
 }
 fn get_conn_info(
    headers: &HeaderMap,
    sni_hostname: Option<String>,
-) -> Result<(String, String, String, String), anyhow::Error> {
+) -> Result<ConnInfo, anyhow::Error> {
    let connection_string = headers
        .get("Neon-Connection-String")
        .ok_or(anyhow::anyhow!("missing connection string"))?
@@ -133,12 +159,12 @@ fn get_conn_info(
        }
    }
-    Ok((
+    Ok(ConnInfo {
-        username.to_owned(),
+        username: username.to_owned(),
-        dbname.to_owned(),
+        dbname: dbname.to_owned(),
-        hostname.to_owned(),
+        hostname: hostname.to_owned(),
-        password.to_owned(),
+        password: password.to_owned(),
-    ))
+    })
 }
 // TODO: return different http error codes
@@ -151,13 +177,18 @@ pub async fn handle(
    // Determine the destination and connection params
    //
    let headers = request.headers();
-    let (username, dbname, hostname, password) = get_conn_info(headers, sni_hostname)?;
+    let conn_info = get_conn_info(headers, sni_hostname)?;
    let credential_params = StartupMessageParams::new([
-        ("user", &username),
+        ("user", &conn_info.username),
-        ("database", &dbname),
+        ("database", &conn_info.dbname),
        ("application_name", APP_NAME),
    ]);
    // Determine the output options. Default behaviour is 'false'. Anything that is not
    // strictly 'true' assumed to be false.
    let raw_output = headers.get(&RAW_TEXT_OUTPUT) == Some(&HEADER_VALUE_TRUE);
    let array_mode = headers.get(&ARRAY_MODE) == Some(&HEADER_VALUE_TRUE);
    //
    // Wake up the destination if needed. Code here is a bit involved because
    // we reuse the code from the usual proxy and we need to prepare few structures
@@ -168,21 +199,20 @@ pub async fn handle(
    let creds = config
        .auth_backend
        .as_ref()
-        .map(|_| auth::ClientCredentials::parse(&credential_params, Some(&hostname), common_names))
+        .map(|_| {
            auth::ClientCredentials::parse(
                &credential_params,
                Some(&conn_info.hostname),
                common_names,
            )
        })
        .transpose()?;
    let extra = console::ConsoleReqExtra {
        session_id: uuid::Uuid::new_v4(),
        application_name: Some(APP_NAME),
    };
-    let node = creds.wake_compute(&extra).await?.expect("msg");
+
-    let conf = node.value.config;
+    let mut node_info = creds.wake_compute(&extra).await?.expect("msg");
    let port = *conf.get_ports().first().expect("no port");
    let host = match conf.get_hosts().first().expect("no host") {
        tokio_postgres::config::Host::Tcp(host) => host,
        tokio_postgres::config::Host::Unix(_) => {
            return Err(anyhow::anyhow!("unix socket is not supported"));
        }
    };
    let request_content_length = match request.body().size_hint().upper() {
        Some(v) => v,
@@ -202,28 +232,10 @@ pub async fn handle(
    let QueryData { query, params } = serde_json::from_slice(&body)?;
    let query_params = json_to_pg_text(params)?;
    //
    // Connect to the destination
    //
    let (client, connection) = tokio_postgres::Config::new()
        .host(host)
        .port(port)
        .user(&username)
        .password(&password)
        .dbname(&dbname)
        .max_backend_message_size(MAX_RESPONSE_SIZE)
        .connect(tokio_postgres::NoTls)
        .await?;
    tokio::spawn(async move {
        if let Err(e) = connection.await {
            eprintln!("connection error: {}", e);
        }
    });
    //
    // Now execute the query and return the result
    //
    let client = connect_to_compute(&mut node_info, &extra, &creds, &conn_info).await?;
    let row_stream = client.query_raw_txt(query, query_params).await?;
    // Manually drain the stream into a vector to leave row_stream hanging
@@ -262,6 +274,11 @@ pub async fn handle(
                json!({
                    "name": Value::String(c.name().to_owned()),
                    "dataTypeID": Value::Number(c.type_().oid().into()),
                    "tableID": c.table_oid(),
                    "columnID": c.column_id(),
                    "dataTypeSize": c.type_size(),
                    "dataTypeModifier": c.type_modifier(),
                    "format": "text",
                })
            })
            .collect::<Vec<_>>()
@@ -272,7 +289,7 @@ pub async fn handle(
    // convert rows to JSON
    let rows = rows
        .iter()
-        .map(pg_text_row_to_json)
+        .map(|row| pg_text_row_to_json(row, raw_output, array_mode))
        .collect::<Result<Vec<_>, _>>()?;
    // resulting JSON format is based on the format of node-postgres result
@@ -281,26 +298,106 @@ pub async fn handle(
        "rowCount": command_tag_count,
        "rows": rows,
        "fields": fields,
        "rowAsArray": array_mode,
    }))
 }
 /// Connect to the compute node, asking for a wake-up and retrying when an
 /// attempt fails.
 ///
 /// This is a copy of `connect_to_compute` from `src/proxy.rs`; the difference
 /// is that the connection is made with `tokio_postgres`.
 ///
 /// At most `NUM_RETRIES_WAKE_COMPUTE` retries are made. On every retry
 /// `node_info` is overwritten with the freshly returned node info.
 #[instrument(skip_all)]
 async fn connect_to_compute(
    node_info: &mut console::CachedNodeInfo,
    extra: &console::ConsoleReqExtra<'_>,
    creds: &auth::BackendType<'_, auth::ClientCredentials<'_>>,
    conn_info: &ConnInfo,
 ) -> anyhow::Result<tokio_postgres::Client> {
    let mut num_retries: usize = NUM_RETRIES_WAKE_COMPUTE;
    loop {
        // A success, or a failure with no retries left, is returned as is.
        let err = match connect_to_compute_once(node_info, conn_info).await {
            Ok(client) => return Ok(client),
            Err(e) if num_retries == 0 => return Err(e),
            Err(e) => e,
        };

        info!("compute node's state has changed; requesting a wake-up");
        let new = match creds.wake_compute(extra).await? {
            Some(new) => new,
            // Link auth doesn't work that way, so we just exit.
            None => return Err(err),
        };

        // Update `node_info` and try one more time.
        *node_info = new;
        num_retries -= 1;
        info!("retrying after wake-up ({num_retries} attempts left)");
    }
 }
 /// Make a single attempt to open a `tokio_postgres` connection to the compute
 /// node described by `node_info`, using the credentials from `conn_info`.
 ///
 /// On a connection error the failure is logged and `invalidate_cache` is
 /// called for `node_info` before the error is returned. On success the
 /// connection half is spawned onto a tokio task and the client is returned.
 async fn connect_to_compute_once(
    node_info: &console::CachedNodeInfo,
    conn_info: &ConnInfo,
 ) -> anyhow::Result<tokio_postgres::Client> {
    // Work on a private copy of the node's config so that the per-request
    // user/password/dbname do not end up in `node_info`.
    let mut config = (*node_info.config).clone();
    let (client, connection) = config
        .user(&conn_info.username)
        .password(&conn_info.password)
        .dbname(&conn_info.dbname)
        // NOTE(review): presumably caps the size of a single backend message
        // at MAX_RESPONSE_SIZE; confirm against the tokio_postgres fork in use.
        .max_backend_message_size(MAX_RESPONSE_SIZE)
        .connect(tokio_postgres::NoTls)
        .inspect_err(|e: &tokio_postgres::Error| {
            error!(
                "failed to connect to compute node hosts={:?} ports={:?}: {}",
                node_info.config.get_hosts(),
                node_info.config.get_ports(),
                e
            );
            invalidate_cache(node_info)
        })
        .await?;
    // Drive the connection half on its own task; an error it ends with is
    // only logged.
    tokio::spawn(async move {
        if let Err(e) = connection.await {
            error!("connection error: {}", e);
        }
    });
    Ok(client)
 }
 //
 // Convert postgres row with text-encoded values to JSON object
 //
-pub fn pg_text_row_to_json(row: &Row) -> Result<Value, anyhow::Error> {
+pub fn pg_text_row_to_json(
-    let res = row
+    row: &Row,
-        .columns()
+    raw_output: bool,
-        .iter()
+    array_mode: bool,
-        .enumerate()
+) -> Result<Value, anyhow::Error> {
-        .map(|(i, column)| {
+    let iter = row.columns().iter().enumerate().map(|(i, column)| {
-            let name = column.name();
+        let name = column.name();
-            let pg_value = row.as_text(i)?;
+        let pg_value = row.as_text(i)?;
-            let json_value = pg_text_to_json(pg_value, column.type_())?;
+        let json_value = if raw_output {
-            Ok((name.to_string(), json_value))
+            match pg_value {
-        })
+                Some(v) => Value::String(v.to_string()),
-        .collect::<Result<Map<String, Value>, anyhow::Error>>()?;
+                None => Value::Null,
            }
        } else {
            pg_text_to_json(pg_value, column.type_())?
        };
        Ok((name.to_string(), json_value))
    });
-    Ok(Value::Object(res))
+    if array_mode {
        // drop keys and aggregate into array
        let arr = iter
            .map(|r| r.map(|(_key, val)| val))
            .collect::<Result<Vec<Value>, anyhow::Error>>()?;
        Ok(Value::Array(arr))
    } else {
        let obj = iter.collect::<Result<Map<String, Value>, anyhow::Error>>()?;
        Ok(Value::Object(obj))
    }
 }
 //
@@ -308,10 +405,6 @@ pub fn pg_text_row_to_json(row: &Row) -> Result<Value, anyhow::Error> {
 //
 pub fn pg_text_to_json(pg_value: Option<&str>, pg_type: &Type) -> Result<Value, anyhow::Error> {
    if let Some(val) = pg_value {
        if val == "NULL" {
            return Ok(Value::Null);
        }
        if let Kind::Array(elem_type) = pg_type.kind() {
            return pg_array_parse(val, elem_type);
        }
@@ -373,6 +466,27 @@ fn _pg_array_parse(
        }
    }
    /// Convert the accumulated `entry` to JSON, append it to `entries` and
    /// clear `entry`. An empty `entry` is ignored.
    fn push_checked(
        entry: &mut String,
        entries: &mut Vec<Value>,
        elem_type: &Type,
    ) -> Result<(), anyhow::Error> {
        if entry.is_empty() {
            return Ok(());
        }

        // In a usual postgres response nulls arrive as None and everything
        // else as Some(&str), but inside arrays NULL arrives as the unquoted
        // string 'NULL' (a string with the value 'NULL' is represented by
        // '"NULL"'). Quotation info is still available here, so catch NULLs
        // and convert them to None.
        let pg_value = match entry.as_str() {
            "NULL" => None,
            text => Some(text),
        };
        entries.push(pg_text_to_json(pg_value, elem_type)?);
        entry.clear();
        Ok(())
    }
    while let Some((mut i, mut c)) = pg_array_chr.next() {
        let mut escaped = false;
@@ -395,9 +509,7 @@ fn _pg_array_parse(
            '}' => {
                level -= 1;
                if level == 0 {
-                    if !entry.is_empty() {
+                    push_checked(&mut entry, &mut entries, elem_type)?;
                        entries.push(pg_text_to_json(Some(&entry), elem_type)?);
                    }
                    if nested {
                        return Ok((Value::Array(entries), i));
                    }
@@ -405,17 +517,15 @@ fn _pg_array_parse(
            }
            '"' if !escaped => {
                if quote {
-                    // push even if empty
+                    // end of quoted string, so push it manually without any checks
                    // for emptiness or nulls
                    entries.push(pg_text_to_json(Some(&entry), elem_type)?);
-                    entry = String::new();
+                    entry.clear();
                }
                quote = !quote;
            }
            ',' if !quote => {
-                if !entry.is_empty() {
+                push_checked(&mut entry, &mut entries, elem_type)?;
                    entries.push(pg_text_to_json(Some(&entry), elem_type)?);
                    entry = String::new();
                }
            }
            _ => {
                entry.push(c);
@@ -439,30 +549,35 @@ mod tests {
    fn test_atomic_types_to_pg_params() {
        let json = vec![Value::Bool(true), Value::Bool(false)];
        let pg_params = json_to_pg_text(json).unwrap();
-        assert_eq!(pg_params, vec!["true", "false"]);
+        assert_eq!(
            pg_params,
            vec![Some("true".to_owned()), Some("false".to_owned())]
        );
        let json = vec![Value::Number(serde_json::Number::from(42))];
        let pg_params = json_to_pg_text(json).unwrap();
-        assert_eq!(pg_params, vec!["42"]);
+        assert_eq!(pg_params, vec![Some("42".to_owned())]);
        let json = vec![Value::String("foo\"".to_string())];
        let pg_params = json_to_pg_text(json).unwrap();
-        assert_eq!(pg_params, vec!["foo\""]);
+        assert_eq!(pg_params, vec![Some("foo\"".to_owned())]);
        let json = vec![Value::Null];
        let pg_params = json_to_pg_text(json).unwrap();
-        assert_eq!(pg_params, vec!["null"]);
+        assert_eq!(pg_params, vec![None]);
    }
    #[test]
    fn test_json_array_to_pg_array() {
        // atoms and escaping
-        let json = "[true, false, null, 42, \"foo\", \"bar\\\"-\\\\\"]";
+        let json = "[true, false, null, \"NULL\", 42, \"foo\", \"bar\\\"-\\\\\"]";
        let json: Value = serde_json::from_str(json).unwrap();
        let pg_params = json_to_pg_text(vec![json]).unwrap();
        assert_eq!(
            pg_params,
-            vec!["{true,false,null,42,\"foo\",\"bar\\\"-\\\\\"}"]
+            vec![Some(
                "{true,false,NULL,\"NULL\",42,\"foo\",\"bar\\\"-\\\\\"}".to_owned()
            )]
        );
        // nested arrays
@@ -471,7 +586,9 @@ mod tests {
        let pg_params = json_to_pg_text(vec![json]).unwrap();
        assert_eq!(
            pg_params,
-            vec!["{{true,false},{null,42},{\"foo\",\"bar\\\"-\\\\\"}}"]
+            vec![Some(
                "{{true,false},{NULL,42},{\"foo\",\"bar\\\"-\\\\\"}}".to_owned()
            )]
        );
    }
--- a/proxy/src/http/websocket.rs
+++ b/proxy/src/http/websocket.rs
@@ -26,7 +26,6 @@ use tls_listener::TlsListener;
 use tokio::{
    io::{self, AsyncBufRead, AsyncRead, AsyncWrite, ReadBuf},
    net::TcpListener,
    select,
 };
 use tokio_util::sync::CancellationToken;
 use tracing::{error, info, info_span, warn, Instrument};
@@ -193,14 +192,9 @@ async fn ws_handler(
    // TODO: that deserves a refactor as now this function also handles http json client besides websockets.
    // Right now I don't want to blow up sql-over-http patch with file renames and do that as a follow up instead.
    } else if request.uri().path() == "/sql" && request.method() == Method::POST {
-        let result = select! {
+        let result = sql_over_http::handle(config, request, sni_hostname)
-            _ = tokio::time::sleep(std::time::Duration::from_secs(10)) => {
+            .instrument(info_span!("sql-over-http"))
-                Err(anyhow::anyhow!("Query timed out"))
+            .await;
            }
            response = sql_over_http::handle(config, request, sni_hostname) => {
                response
            }
        };
        let status_code = match result {
            Ok(_) => StatusCode::OK,
            Err(_) => StatusCode::BAD_REQUEST,
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -22,7 +22,7 @@ use tracing::{error, info, warn};
 use utils::measured_stream::MeasuredStream;
 /// Number of times we should retry the `/proxy_wake_compute` http request.
-const NUM_RETRIES_WAKE_COMPUTE: usize = 1;
+pub const NUM_RETRIES_WAKE_COMPUTE: usize = 1;
 const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)";
 const ERR_PROTO_VIOLATION: &str = "protocol violation";
@@ -283,34 +283,35 @@ async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
    }
 }
 /// Handle a failed connection attempt to a compute node.
 ///
 /// Cached connection info might be to blame for the failure (e.g. the compute
 /// node's address might've changed at the wrong time), so a cached entry is
 /// invalidated to prevent subsequent errors. The failure is then counted in
 /// `NUM_CONNECTION_FAILURES`, labelled by whether the entry was cached.
 #[tracing::instrument(name = "invalidate_cache", skip_all)]
 pub fn invalidate_cache(node_info: &console::CachedNodeInfo) {
    let label = if node_info.cached() {
        warn!("invalidating stalled compute node info cache entry");
        node_info.invalidate();
        "compute_cached"
    } else {
        "compute_uncached"
    };

    NUM_CONNECTION_FAILURES.with_label_values(&[label]).inc();
 }
 /// Try to connect to the compute node once.
 #[tracing::instrument(name = "connect_once", skip_all)]
 async fn connect_to_compute_once(
    node_info: &console::CachedNodeInfo,
 ) -> Result<PostgresConnection, compute::ConnectionError> {
    // If we couldn't connect, a cached connection info might be to blame
    // (e.g. the compute node's address might've changed at the wrong time).
    // Invalidate the cache entry (if any) to prevent subsequent errors.
    let invalidate_cache = |_: &compute::ConnectionError| {
        let is_cached = node_info.cached();
        if is_cached {
            warn!("invalidating stalled compute node info cache entry");
            node_info.invalidate();
        }
        let label = match is_cached {
            true => "compute_cached",
            false => "compute_uncached",
        };
        NUM_CONNECTION_FAILURES.with_label_values(&[label]).inc();
    };
    let allow_self_signed_compute = node_info.allow_self_signed_compute;
    node_info
        .config
        .connect(allow_self_signed_compute)
-        .inspect_err(invalidate_cache)
+        .inspect_err(|_: &compute::ConnectionError| invalidate_cache(node_info))
        .await
 }
--- a/proxy/src/proxy/tests.rs
+++ b/proxy/src/proxy/tests.rs
@@ -1,4 +1,4 @@
-///! A group of high-level tests for connection establishing logic and auth.
+//! A group of high-level tests for connection establishing logic and auth.
 use super::*;
 use crate::{auth, sasl, scram};
 use async_trait::async_trait;
--- a/rust-toolchain.toml
+++ b/rust-toolchain.toml
@@ -1,5 +1,5 @@
 [toolchain]
-channel = "1.68.2"
+channel = "1.70.0"
 profile = "default"
 # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy.
 # https://rust-lang.github.io/rustup/concepts/profiles.html
--- a/safekeeper/src/bin/safekeeper.rs
+++ b/safekeeper/src/bin/safekeeper.rs
@@ -3,15 +3,19 @@
 //
 use anyhow::{bail, Context, Result};
 use clap::Parser;
 use futures::future::BoxFuture;
 use futures::stream::FuturesUnordered;
 use futures::{FutureExt, StreamExt};
 use remote_storage::RemoteStorageConfig;
 use tokio::runtime::Handle;
 use tokio::signal::unix::{signal, SignalKind};
 use tokio::task::JoinError;
 use toml_edit::Document;
 use utils::signals::ShutdownSignals;
 use std::fs::{self, File};
 use std::io::{ErrorKind, Write};
 use std::path::{Path, PathBuf};
 use std::sync::Arc;
 use std::thread;
 use std::time::Duration;
 use storage_broker::Uri;
 use tokio::sync::mpsc;
@@ -20,22 +24,21 @@ use tracing::*;
 use utils::pid_file;
 use metrics::set_build_info_metric;
 use safekeeper::broker;
 use safekeeper::control_file;
 use safekeeper::defaults::{
    DEFAULT_HEARTBEAT_TIMEOUT, DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_MAX_OFFLOADER_LAG_BYTES,
    DEFAULT_PG_LISTEN_ADDR,
 };
 use safekeeper::http;
 use safekeeper::remove_wal;
 use safekeeper::wal_backup;
 use safekeeper::wal_service;
 use safekeeper::GlobalTimelines;
 use safekeeper::SafeKeeperConf;
 use safekeeper::{broker, WAL_SERVICE_RUNTIME};
 use safekeeper::{control_file, BROKER_RUNTIME};
 use safekeeper::{http, WAL_REMOVER_RUNTIME};
 use safekeeper::{remove_wal, WAL_BACKUP_RUNTIME};
 use safekeeper::{wal_backup, HTTP_RUNTIME};
 use storage_broker::DEFAULT_ENDPOINT;
 use utils::auth::JwtAuth;
 use utils::{
    http::endpoint,
    id::NodeId,
    logging::{self, LogFormat},
    project_git_version,
@@ -104,10 +107,6 @@ struct Args {
    /// Safekeeper won't be elected for WAL offloading if it is lagging for more than this value in bytes
    #[arg(long, default_value_t = DEFAULT_MAX_OFFLOADER_LAG_BYTES)]
    max_offloader_lag: u64,
    /// Number of threads for wal backup runtime, by default number of cores
    /// available to the system.
    #[arg(long)]
    wal_backup_threads: Option<usize>,
    /// Number of max parallel WAL segments to be offloaded to remote storage.
    #[arg(long, default_value = "5")]
    wal_backup_parallel_jobs: usize,
@@ -121,9 +120,14 @@ struct Args {
    /// Format for logging, either 'plain' or 'json'.
    #[arg(long, default_value = "plain")]
    log_format: String,
    /// Run everything in single threaded current thread runtime, might be
    /// useful for debugging.
    #[arg(long)]
    current_thread_runtime: bool,
 }
-fn main() -> anyhow::Result<()> {
+#[tokio::main(flavor = "current_thread")]
 async fn main() -> anyhow::Result<()> {
    let args = Args::parse();
    if let Some(addr) = args.dump_control_file {
@@ -183,10 +187,10 @@ fn main() -> anyhow::Result<()> {
        heartbeat_timeout: args.heartbeat_timeout,
        remote_storage: args.remote_storage,
        max_offloader_lag_bytes: args.max_offloader_lag,
        backup_runtime_threads: args.wal_backup_threads,
        wal_backup_enabled: !args.disable_wal_backup,
        backup_parallel_jobs: args.wal_backup_parallel_jobs,
        auth,
        current_thread_runtime: args.current_thread_runtime,
    };
    // initialize sentry if SENTRY_DSN is provided
@@ -194,10 +198,14 @@ fn main() -> anyhow::Result<()> {
        Some(GIT_VERSION.into()),
        &[("node_id", &conf.my_id.to_string())],
    );
-    start_safekeeper(conf)
+    start_safekeeper(conf).await
 }
-fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
+/// Result of joining any of main tasks: upper error means task failed to
 /// complete, e.g. panicked, inner is error produced by task itself.
 type JoinTaskRes = Result<anyhow::Result<()>, JoinError>;
 async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
    // Prevent running multiple safekeepers on the same directory
    let lock_file_path = conf.workdir.join(PID_FILE_NAME);
    let lock_file =
@@ -208,14 +216,18 @@ fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
    // we need to release the lock file only when the current process is gone
    std::mem::forget(lock_file);
-    let http_listener = tcp_listener::bind(conf.listen_http_addr.clone()).map_err(|e| {
+    info!("starting safekeeper WAL service on {}", conf.listen_pg_addr);
-        error!("failed to bind to address {}: {}", conf.listen_http_addr, e);
+    let pg_listener = tcp_listener::bind(conf.listen_pg_addr.clone()).map_err(|e| {
        error!("failed to bind to address {}: {}", conf.listen_pg_addr, e);
        e
    })?;
-    info!("starting safekeeper on {}", conf.listen_pg_addr);
+    info!(
-    let pg_listener = tcp_listener::bind(conf.listen_pg_addr.clone()).map_err(|e| {
+        "starting safekeeper HTTP service on {}",
-        error!("failed to bind to address {}: {}", conf.listen_pg_addr, e);
+        conf.listen_http_addr
    );
    let http_listener = tcp_listener::bind(conf.listen_http_addr.clone()).map_err(|e| {
        error!("failed to bind to address {}: {}", conf.listen_http_addr, e);
        e
    })?;
@@ -224,71 +236,88 @@ fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
    let timeline_collector = safekeeper::metrics::TimelineCollector::new();
    metrics::register_internal(Box::new(timeline_collector))?;
    let mut threads = vec![];
    let (wal_backup_launcher_tx, wal_backup_launcher_rx) = mpsc::channel(100);
    // Load all timelines from disk to memory.
    GlobalTimelines::init(conf.clone(), wal_backup_launcher_tx)?;
-    let conf_ = conf.clone();
+    // Keep handles to main tasks to die if any of them disappears.
-    threads.push(
+    let mut tasks_handles: FuturesUnordered<BoxFuture<(String, JoinTaskRes)>> =
-        thread::Builder::new()
+        FuturesUnordered::new();
            .name("http_endpoint_thread".into())
            .spawn(|| {
                let router = http::make_router(conf_);
                endpoint::serve_thread_main(
                    router,
                    http_listener,
                    std::future::pending(), // never shut down
                )
                .unwrap();
            })?,
    );
    let conf_cloned = conf.clone();
    let safekeeper_thread = thread::Builder::new()
        .name("WAL service thread".into())
        .spawn(|| wal_service::thread_main(conf_cloned, pg_listener))
        .unwrap();
    threads.push(safekeeper_thread);
    let conf_ = conf.clone();
-    threads.push(
+    // Run everything in current thread rt, if asked.
-        thread::Builder::new()
+    if conf.current_thread_runtime {
-            .name("broker thread".into())
+        info!("running in current thread runtime");
-            .spawn(|| {
+    }
-                broker::thread_main(conf_);
+    let current_thread_rt = conf
-            })?,
+        .current_thread_runtime
-    );
+        .then(|| Handle::try_current().expect("no runtime in main"));
    let wal_service_handle = current_thread_rt
        .as_ref()
        .unwrap_or_else(|| WAL_SERVICE_RUNTIME.handle())
        .spawn(wal_service::task_main(conf_, pg_listener))
        // wrap with task name for error reporting
        .map(|res| ("WAL service main".to_owned(), res));
    tasks_handles.push(Box::pin(wal_service_handle));
    let conf_ = conf.clone();
-    threads.push(
+    let http_handle = current_thread_rt
-        thread::Builder::new()
+        .as_ref()
-            .name("WAL removal thread".into())
+        .unwrap_or_else(|| HTTP_RUNTIME.handle())
-            .spawn(|| {
+        .spawn(http::task_main(conf_, http_listener))
-                remove_wal::thread_main(conf_);
+        .map(|res| ("HTTP service main".to_owned(), res));
-            })?,
+    tasks_handles.push(Box::pin(http_handle));
    );
-    threads.push(
+    let conf_ = conf.clone();
-        thread::Builder::new()
+    let broker_task_handle = current_thread_rt
-            .name("WAL backup launcher thread".into())
+        .as_ref()
-            .spawn(move || {
+        .unwrap_or_else(|| BROKER_RUNTIME.handle())
-                wal_backup::wal_backup_launcher_thread_main(conf, wal_backup_launcher_rx);
+        .spawn(broker::task_main(conf_).instrument(info_span!("broker")))
-            })?,
+        .map(|res| ("broker main".to_owned(), res));
-    );
+    tasks_handles.push(Box::pin(broker_task_handle));
    let conf_ = conf.clone();
    let wal_remover_handle = current_thread_rt
        .as_ref()
        .unwrap_or_else(|| WAL_REMOVER_RUNTIME.handle())
        .spawn(remove_wal::task_main(conf_))
        .map(|res| ("WAL remover".to_owned(), res));
    tasks_handles.push(Box::pin(wal_remover_handle));
    let conf_ = conf.clone();
    let wal_backup_handle = current_thread_rt
        .as_ref()
        .unwrap_or_else(|| WAL_BACKUP_RUNTIME.handle())
        .spawn(wal_backup::wal_backup_launcher_task_main(
            conf_,
            wal_backup_launcher_rx,
        ))
        .map(|res| ("WAL backup launcher".to_owned(), res));
    tasks_handles.push(Box::pin(wal_backup_handle));
    set_build_info_metric(GIT_VERSION);
    // TODO: put more thoughts into handling of failed threads
    // We should catch & die if they are in trouble.
-    // On any shutdown signal, log receival and exit. Additionally, handling
+    // TODO: update tokio-stream, convert to real async Stream with
-    // SIGQUIT prevents coredump.
+    // SignalStream, map it to obtain missing signal name, combine streams into
-    ShutdownSignals::handle(|signal| {
+    // single stream we can easily sit on.
-        info!("received {}, terminating", signal.name());
+    let mut sigquit_stream = signal(SignalKind::quit())?;
-        std::process::exit(0);
+    let mut sigint_stream = signal(SignalKind::interrupt())?;
-    })
+    let mut sigterm_stream = signal(SignalKind::terminate())?;
    tokio::select! {
        Some((task_name, res)) = tasks_handles.next()=> {
            error!("{} task failed: {:?}, exiting", task_name, res);
            std::process::exit(1);
        }
        // On any shutdown signal, log receival and exit. Additionally, handling
        // SIGQUIT prevents coredump.
        _ = sigquit_stream.recv() => info!("received SIGQUIT, terminating"),
        _ = sigint_stream.recv() => info!("received SIGINT, terminating"),
        _ = sigterm_stream.recv() => info!("received SIGTERM, terminating")
    };
    std::process::exit(0);
 }
 /// Determine safekeeper id.
--- a/safekeeper/src/broker.rs
+++ b/safekeeper/src/broker.rs
@@ -8,7 +8,7 @@ use anyhow::Error;
 use anyhow::Result;
 use storage_broker::parse_proto_ttid;
-use storage_broker::proto::broker_service_client::BrokerServiceClient;
+
 use storage_broker::proto::subscribe_safekeeper_info_request::SubscriptionKey as ProtoSubscriptionKey;
 use storage_broker::proto::SubscribeSafekeeperInfoRequest;
 use storage_broker::Request;
@@ -16,7 +16,7 @@ use storage_broker::Request;
 use std::time::Duration;
 use std::time::Instant;
 use tokio::task::JoinHandle;
-use tokio::{runtime, time::sleep};
+use tokio::time::sleep;
 use tracing::*;
 use crate::metrics::BROKER_ITERATION_TIMELINES;
@@ -29,23 +29,10 @@ use crate::SafeKeeperConf;
 const RETRY_INTERVAL_MSEC: u64 = 1000;
 const PUSH_INTERVAL_MSEC: u64 = 1000;
 /// Blocking entry point of the broker thread: builds a current-thread tokio
 /// runtime and runs `main_loop` on it inside the "broker" span.
 pub fn thread_main(conf: SafeKeeperConf) {
    let rt = runtime::Builder::new_current_thread()
        .enable_all()
        .build()
        .unwrap();

    // Keep the span entered for as long as the loop runs.
    let _enter = info_span!("broker").entered();
    info!("started, broker endpoint {:?}", conf.broker_endpoint);

    rt.block_on(main_loop(conf));
 }
 /// Push once in a while data about all active timelines to the broker.
 async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> {
-    let mut client = BrokerServiceClient::connect(conf.broker_endpoint.clone()).await?;
+    let mut client =
        storage_broker::connect(conf.broker_endpoint.clone(), conf.broker_keepalive_interval)?;
    let push_interval = Duration::from_millis(PUSH_INTERVAL_MSEC);
    let outbound = async_stream::stream! {
@@ -55,20 +42,27 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> {
            // sensitive and there is no risk of deadlock as we don't await while
            // lock is held.
            let now = Instant::now();
-            let mut active_tlis = GlobalTimelines::get_all();
+            let all_tlis = GlobalTimelines::get_all();
-            active_tlis.retain(|tli| tli.is_active());
+            let mut n_pushed_tlis = 0;
-            for tli in &active_tlis {
+            for tli in &all_tlis {
-                let sk_info = tli.get_safekeeper_info(&conf);
+                // filtering alternative futures::stream::iter(all_tlis)
                //   .filter(|tli| {let tli = tli.clone(); async move { tli.is_active().await}}).collect::<Vec<_>>().await;
                // doesn't look better, and I'm not sure how to do that without collect.
                if !tli.is_active().await {
                    continue;
                }
                let sk_info = tli.get_safekeeper_info(&conf).await;
                yield sk_info;
                BROKER_PUSHED_UPDATES.inc();
                n_pushed_tlis += 1;
            }
            let elapsed = now.elapsed();
            BROKER_PUSH_ALL_UPDATES_SECONDS.observe(elapsed.as_secs_f64());
-            BROKER_ITERATION_TIMELINES.observe(active_tlis.len() as f64);
+            BROKER_ITERATION_TIMELINES.observe(n_pushed_tlis as f64);
            if elapsed > push_interval / 2 {
-                info!("broker push is too long, pushed {} timeline updates to broker in {:?}", active_tlis.len(), elapsed);
+                info!("broker push is too long, pushed {} timeline updates to broker in {:?}", n_pushed_tlis, elapsed);
            }
            sleep(push_interval).await;
@@ -125,10 +119,13 @@ async fn pull_loop(conf: SafeKeeperConf) -> Result<()> {
    bail!("end of stream");
 }
-async fn main_loop(conf: SafeKeeperConf) {
+pub async fn task_main(conf: SafeKeeperConf) -> anyhow::Result<()> {
    info!("started, broker endpoint {:?}", conf.broker_endpoint);
    let mut ticker = tokio::time::interval(Duration::from_millis(RETRY_INTERVAL_MSEC));
    let mut push_handle: Option<JoinHandle<Result<(), Error>>> = None;
    let mut pull_handle: Option<JoinHandle<Result<(), Error>>> = None;
    // Selecting on JoinHandles requires some squats; is there a better way to
    // reap tasks individually?
--- a/safekeeper/src/control_file.rs
+++ b/safekeeper/src/control_file.rs
@@ -2,11 +2,13 @@
 use anyhow::{bail, ensure, Context, Result};
 use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
 use tokio::fs::{self, File};
 use tokio::io::AsyncWriteExt;
-use std::fs::{self, File, OpenOptions};
+use std::io::Read;
 use std::io::{Read, Write};
 use std::ops::Deref;
 use std::path::{Path, PathBuf};
 use std::time::Instant;
 use crate::control_file_upgrade::upgrade_control_file;
 use crate::metrics::PERSIST_CONTROL_FILE_SECONDS;
@@ -25,9 +27,13 @@ pub const CHECKSUM_SIZE: usize = std::mem::size_of::<u32>();
 /// Storage should keep actual state inside of it. It should implement Deref
 /// trait to access state fields and have persist method for updating that state.
 #[async_trait::async_trait]
 pub trait Storage: Deref<Target = SafeKeeperState> {
    /// Persist safekeeper state on disk and update internal state.
-    fn persist(&mut self, s: &SafeKeeperState) -> Result<()>;
+    async fn persist(&mut self, s: &SafeKeeperState) -> Result<()>;
    /// Timestamp of last persist.
    fn last_persist_at(&self) -> Instant;
 }
 #[derive(Debug)]
@@ -38,6 +44,8 @@ pub struct FileStorage {
    /// Last state persisted to disk.
    state: SafeKeeperState,
    /// Not preserved across restarts.
    last_persist_at: Instant,
 }
 impl FileStorage {
@@ -51,6 +59,7 @@ impl FileStorage {
            timeline_dir,
            conf: conf.clone(),
            state,
            last_persist_at: Instant::now(),
        })
    }
@@ -66,6 +75,7 @@ impl FileStorage {
            timeline_dir,
            conf: conf.clone(),
            state,
            last_persist_at: Instant::now(),
        };
        Ok(store)
@@ -74,7 +84,7 @@ impl FileStorage {
    /// Check the magic/version in the on-disk data and deserialize it, if possible.
    fn deser_sk_state(buf: &mut &[u8]) -> Result<SafeKeeperState> {
        // Read the version independent part
-        let magic = buf.read_u32::<LittleEndian>()?;
+        let magic = ReadBytesExt::read_u32::<LittleEndian>(buf)?;
        if magic != SK_MAGIC {
            bail!(
                "bad control file magic: {:X}, expected {:X}",
@@ -82,7 +92,7 @@ impl FileStorage {
                SK_MAGIC
            );
        }
-        let version = buf.read_u32::<LittleEndian>()?;
+        let version = ReadBytesExt::read_u32::<LittleEndian>(buf)?;
        if version == SK_FORMAT_VERSION {
            let res = SafeKeeperState::des(buf)?;
            return Ok(res);
@@ -102,7 +112,7 @@ impl FileStorage {
    /// Read in the control file.
    pub fn load_control_file<P: AsRef<Path>>(control_file_path: P) -> Result<SafeKeeperState> {
-        let mut control_file = OpenOptions::new()
+        let mut control_file = std::fs::OpenOptions::new()
            .read(true)
            .write(true)
            .open(&control_file_path)
@@ -151,30 +161,31 @@ impl Deref for FileStorage {
    }
 }
 #[async_trait::async_trait]
 impl Storage for FileStorage {
    /// persists state durably to underlying storage
    /// for description see https://lwn.net/Articles/457667/
-    fn persist(&mut self, s: &SafeKeeperState) -> Result<()> {
+    async fn persist(&mut self, s: &SafeKeeperState) -> Result<()> {
        let _timer = PERSIST_CONTROL_FILE_SECONDS.start_timer();
        // write data to safekeeper.control.partial
        let control_partial_path = self.timeline_dir.join(CONTROL_FILE_NAME_PARTIAL);
-        let mut control_partial = File::create(&control_partial_path).with_context(|| {
+        let mut control_partial = File::create(&control_partial_path).await.with_context(|| {
            format!(
                "failed to create partial control file at: {}",
                &control_partial_path.display()
            )
        })?;
        let mut buf: Vec<u8> = Vec::new();
-        buf.write_u32::<LittleEndian>(SK_MAGIC)?;
+        WriteBytesExt::write_u32::<LittleEndian>(&mut buf, SK_MAGIC)?;
-        buf.write_u32::<LittleEndian>(SK_FORMAT_VERSION)?;
+        WriteBytesExt::write_u32::<LittleEndian>(&mut buf, SK_FORMAT_VERSION)?;
        s.ser_into(&mut buf)?;
        // calculate checksum before resize
        let checksum = crc32c::crc32c(&buf);
        buf.extend_from_slice(&checksum.to_le_bytes());
-        control_partial.write_all(&buf).with_context(|| {
+        control_partial.write_all(&buf).await.with_context(|| {
            format!(
                "failed to write safekeeper state into control file at: {}",
                control_partial_path.display()
@@ -183,7 +194,7 @@ impl Storage for FileStorage {
        // fsync the file
        if !self.conf.no_sync {
-            control_partial.sync_all().with_context(|| {
+            control_partial.sync_all().await.with_context(|| {
                format!(
                    "failed to sync partial control file at {}",
                    control_partial_path.display()
@@ -194,21 +205,22 @@ impl Storage for FileStorage {
        let control_path = self.timeline_dir.join(CONTROL_FILE_NAME);
        // rename should be atomic
-        fs::rename(&control_partial_path, &control_path)?;
+        fs::rename(&control_partial_path, &control_path).await?;
        // this sync is not required by any standard but postgres does this (see durable_rename)
        if !self.conf.no_sync {
-            File::open(&control_path)
+            let new_f = File::open(&control_path).await?;
-                .and_then(|f| f.sync_all())
+            new_f.sync_all().await.with_context(|| {
-                .with_context(|| {
+                format!(
-                    format!(
+                    "failed to sync control file at: {}",
-                        "failed to sync control file at: {}",
+                    &control_path.display()
-                        &control_path.display()
+                )
-                    )
+            })?;
                })?;
            // fsync the directory (linux specific)
-            File::open(&self.timeline_dir)
+            let tli_dir = File::open(&self.timeline_dir).await?;
-                .and_then(|f| f.sync_all())
+            tli_dir
                .sync_all()
                .await
                .context("failed to sync control file directory")?;
        }
@@ -216,6 +228,10 @@ impl Storage for FileStorage {
        self.state = s.clone();
        Ok(())
    }
    fn last_persist_at(&self) -> Instant {
        self.last_persist_at
    }
 }
 #[cfg(test)]
@@ -224,7 +240,6 @@ mod test {
    use super::*;
    use crate::{safekeeper::SafeKeeperState, SafeKeeperConf};
    use anyhow::Result;
    use std::fs;
    use utils::{id::TenantTimelineId, lsn::Lsn};
    fn stub_conf() -> SafeKeeperConf {
@@ -235,59 +250,75 @@ mod test {
        }
    }
-    fn load_from_control_file(
+    async fn load_from_control_file(
        conf: &SafeKeeperConf,
        ttid: &TenantTimelineId,
    ) -> Result<(FileStorage, SafeKeeperState)> {
-        fs::create_dir_all(conf.timeline_dir(ttid)).expect("failed to create timeline dir");
+        fs::create_dir_all(conf.timeline_dir(ttid))
            .await
            .expect("failed to create timeline dir");
        Ok((
            FileStorage::restore_new(ttid, conf)?,
            FileStorage::load_control_file_conf(conf, ttid)?,
        ))
    }
-    fn create(
+    async fn create(
        conf: &SafeKeeperConf,
        ttid: &TenantTimelineId,
    ) -> Result<(FileStorage, SafeKeeperState)> {
-        fs::create_dir_all(conf.timeline_dir(ttid)).expect("failed to create timeline dir");
+        fs::create_dir_all(conf.timeline_dir(ttid))
            .await
            .expect("failed to create timeline dir");
        let state = SafeKeeperState::empty();
        let storage = FileStorage::create_new(ttid, conf, state.clone())?;
        Ok((storage, state))
    }
-    #[test]
+    #[tokio::test]
-    fn test_read_write_safekeeper_state() {
+    async fn test_read_write_safekeeper_state() {
        let conf = stub_conf();
        let ttid = TenantTimelineId::generate();
        {
-            let (mut storage, mut state) = create(&conf, &ttid).expect("failed to create state");
+            let (mut storage, mut state) =
                create(&conf, &ttid).await.expect("failed to create state");
            // change something
            state.commit_lsn = Lsn(42);
-            storage.persist(&state).expect("failed to persist state");
+            storage
                .persist(&state)
                .await
                .expect("failed to persist state");
        }
-        let (_, state) = load_from_control_file(&conf, &ttid).expect("failed to read state");
+        let (_, state) = load_from_control_file(&conf, &ttid)
            .await
            .expect("failed to read state");
        assert_eq!(state.commit_lsn, Lsn(42));
    }
-    #[test]
+    #[tokio::test]
-    fn test_safekeeper_state_checksum_mismatch() {
+    async fn test_safekeeper_state_checksum_mismatch() {
        let conf = stub_conf();
        let ttid = TenantTimelineId::generate();
        {
-            let (mut storage, mut state) = create(&conf, &ttid).expect("failed to read state");
+            let (mut storage, mut state) =
                create(&conf, &ttid).await.expect("failed to read state");
            // change something
            state.commit_lsn = Lsn(42);
-            storage.persist(&state).expect("failed to persist state");
+            storage
                .persist(&state)
                .await
                .expect("failed to persist state");
        }
        let control_path = conf.timeline_dir(&ttid).join(CONTROL_FILE_NAME);
-        let mut data = fs::read(&control_path).unwrap();
+        let mut data = fs::read(&control_path).await.unwrap();
        data[0] += 1; // change the first byte of the file to fail checksum validation
-        fs::write(&control_path, &data).expect("failed to write control file");
+        fs::write(&control_path, &data)
            .await
            .expect("failed to write control file");
-        match load_from_control_file(&conf, &ttid) {
+        match load_from_control_file(&conf, &ttid).await {
            Err(err) => assert!(err
                .to_string()
                .contains("safekeeper control file checksum mismatch")),
--- a/safekeeper/src/debug_dump.rs
+++ b/safekeeper/src/debug_dump.rs
@@ -121,7 +121,7 @@ pub struct FileInfo {
 }
 /// Build debug dump response, using the provided [`Args`] filters.
-pub fn build(args: Args) -> Result<Response> {
+pub async fn build(args: Args) -> Result<Response> {
    let start_time = Utc::now();
    let timelines_count = GlobalTimelines::timelines_count();
@@ -155,7 +155,7 @@ pub fn build(args: Args) -> Result<Response> {
        }
        let control_file = if args.dump_control_file {
-            let mut state = tli.get_state().1;
+            let mut state = tli.get_state().await.1;
            if !args.dump_term_history {
                state.acceptor_state.term_history = TermHistory(vec![]);
            }
@@ -165,7 +165,7 @@ pub fn build(args: Args) -> Result<Response> {
        };
        let memory = if args.dump_memory {
-            Some(tli.memory_dump())
+            Some(tli.memory_dump().await)
        } else {
            None
        };
--- a/safekeeper/src/handler.rs
+++ b/safekeeper/src/handler.rs
@@ -256,14 +256,14 @@ impl SafekeeperPostgresHandler {
        let lsn = if self.is_walproposer_recovery() {
            // walproposer should get all local WAL until flush_lsn
-            tli.get_flush_lsn()
+            tli.get_flush_lsn().await
        } else {
            // other clients shouldn't get any uncommitted WAL
-            tli.get_state().0.commit_lsn
+            tli.get_state().await.0.commit_lsn
        }
        .to_string();
-        let sysid = tli.get_state().1.server.system_id.to_string();
+        let sysid = tli.get_state().await.1.server.system_id.to_string();
        let lsn_bytes = lsn.as_bytes();
        let tli = PG_TLI.to_string();
        let tli_bytes = tli.as_bytes();
--- a/safekeeper/src/http/mod.rs
+++ b/safekeeper/src/http/mod.rs
@@ -2,3 +2,18 @@ pub mod routes;
 pub use routes::make_router;
 pub use safekeeper_api::models;
 use crate::SafeKeeperConf;
 /// Serves the safekeeper HTTP API on `http_listener`.
 ///
 /// Builds the router from `conf`, wraps it in a `RouterService` and hands it
 /// to a hyper server created from the supplied listener.
 ///
 /// # Errors
 ///
 /// Returns an error if the router or the service cannot be built, if hyper
 /// cannot take over the listener, or if serving fails.
 pub async fn task_main(
    conf: SafeKeeperConf,
    http_listener: std::net::TcpListener,
 ) -> anyhow::Result<()> {
    let router = make_router(conf)
        .build()
        .map_err(|err| anyhow::anyhow!(err))?;
    // Propagate a service construction failure the same way as the router
    // build failure above instead of panicking inside the task.
    let service = utils::http::RouterService::new(router).map_err(|err| anyhow::anyhow!(err))?;
    let server = hyper::Server::from_tcp(http_listener)?;
    server.serve(service).await?;
    Ok(()) // only reached if the server stops serving without an error
 }
--- a/Show More
+++ b/Show More