try: catch all bad tests by removing the implicit endpoint creation

test: do not start two primary endpoints on same branch
test: allow passing branch-name to endpoint_start
2026-01-24 13:50:37 +00:00 · 2023-07-27 16:47:46 +03:00 · 2023-07-27 14:32:13 +03:00 · 2023-07-27 14:31:53 +03:00 · 2023-07-27 14:31:04 +03:00 · 2023-07-26 15:55:55 -04:00
161 changed files with 4637 additions and 2252 deletions
--- a/.dockerignore
+++ b/.dockerignore
@@ -18,8 +18,8 @@
 !trace/
 !vendor/postgres-v14/
 !vendor/postgres-v15/
-!vendor/postgres-v16/
 !workspace_hack/
 !neon_local/
 !scripts/ninstall.sh
+!scripts/combine_control_files.py
 !vm-cgconfig.conf
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -150,6 +150,14 @@ runs:
          EXTRA_PARAMS="--flaky-tests-json $TEST_OUTPUT/flaky.json $EXTRA_PARAMS"
        fi

+        # We use pytest-split plugin to run benchmarks in parallel on different CI runners
+        if [ "${TEST_SELECTION}" = "test_runner/performance" ] && [ "${{ inputs.build_type }}" != "remote" ]; then
+          mkdir -p $TEST_OUTPUT
+          poetry run ./scripts/benchmark_durations.py "${TEST_RESULT_CONNSTR}" --days 10 --output "$TEST_OUTPUT/benchmark_durations.json"
+
+          EXTRA_PARAMS="--durations-path $TEST_OUTPUT/benchmark_durations.json $EXTRA_PARAMS"
+        fi
+
        if [[ "${{ inputs.build_type }}" == "debug" ]]; then
          cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
        elif [[ "${{ inputs.build_type }}" == "release" ]]; then
@@ -201,4 +209,4 @@ runs:
      uses: ./.github/actions/allure-report-store
      with:
        report-dir: /tmp/test_output/allure/results
-        unique-key: ${{ inputs.build_type }}
+        unique-key: ${{ inputs.build_type }}-${{ inputs.pg_version }}
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -396,13 +396,11 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
+        pytest_split_group: [ 1, 2, 3, 4 ]
        build_type: [ release ]
    steps:
      - name: Checkout
        uses: actions/checkout@v3
-        with:
-          submodules: true
-          fetch-depth: 1

      - name: Pytest benchmarks
        uses: ./.github/actions/run-python-test-set
@@ -411,9 +409,11 @@ jobs:
          test_selection: performance
          run_in_parallel: false
          save_perf_report: ${{ github.ref_name == 'main' }}
+          extra_params: --splits ${{ strategy.job-total }} --group ${{ matrix.pytest_split_group }}
        env:
          VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
          PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
+          TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}"
      # XXX: no coverage data handling here, since benchmarks are run on release builds,
      # while coverage is currently collected for the debug ones

@@ -955,22 +955,15 @@ jobs:
        version: [ v14, v15 ]

    env:
-      # While on transition period we extract public extensions from compute-node image and custom extensions from extensions image.
-      # Later all the extensions will be moved to extensions image.
-      EXTENSIONS_IMAGE: ${{ github.ref_name == 'release' && '093970136003' || '369495373322'}}.dkr.ecr.eu-central-1.amazonaws.com/extensions-${{ matrix.version }}:latest
-      COMPUTE_NODE_IMAGE: ${{ github.ref_name == 'release' && '093970136003' || '369495373322'}}.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:latest
+      EXTENSIONS_IMAGE: ${{ github.ref_name == 'release' && '093970136003' || '369495373322'}}.dkr.ecr.eu-central-1.amazonaws.com/extensions-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
      AWS_ACCESS_KEY_ID: ${{ github.ref_name == 'release' && secrets.AWS_ACCESS_KEY_PROD || secrets.AWS_ACCESS_KEY_DEV }}
      AWS_SECRET_ACCESS_KEY: ${{ github.ref_name == 'release' && secrets.AWS_SECRET_KEY_PROD || secrets.AWS_SECRET_KEY_DEV }}
-      S3_BUCKETS: |
-        ${{ github.ref_name == 'release' &&
-          'neon-prod-extensions-ap-southeast-1 neon-prod-extensions-eu-central-1 neon-prod-extensions-us-east-1 neon-prod-extensions-us-east-2 neon-prod-extensions-us-west-2' ||
-          'neon-dev-extensions-eu-central-1 neon-dev-extensions-eu-west-1 neon-dev-extensions-us-east-2' }}
+      S3_BUCKETS: ${{ github.ref_name == 'release' && vars.S3_EXTENSIONS_BUCKETS_PROD || vars.S3_EXTENSIONS_BUCKETS_DEV }}

    steps:
      - name: Pull postgres-extensions image
        run: |
          docker pull ${EXTENSIONS_IMAGE}
-          docker pull ${COMPUTE_NODE_IMAGE}

      - name: Create postgres-extensions container
        id: create-container
@@ -978,44 +971,23 @@ jobs:
          EID=$(docker create ${EXTENSIONS_IMAGE} true)
          echo "EID=${EID}" >> $GITHUB_OUTPUT

-          CID=$(docker create ${COMPUTE_NODE_IMAGE} true)
-          echo "CID=${CID}" >> $GITHUB_OUTPUT
-
      - name: Extract postgres-extensions from container
        run: |
-          rm -rf ./extensions-to-upload ./custom-extensions # Just in case
+          rm -rf ./extensions-to-upload # Just in case
+          mkdir -p extensions-to-upload

-          # In compute image we have a bit different directory layout
-          mkdir -p extensions-to-upload/share
-          docker cp ${{ steps.create-container.outputs.CID }}:/usr/local/share/extension ./extensions-to-upload/share/extension
-          docker cp ${{ steps.create-container.outputs.CID }}:/usr/local/lib             ./extensions-to-upload/lib
-
-          # Delete Neon extensitons (they always present on compute-node image)
-          rm -rf ./extensions-to-upload/share/extension/neon*
-          rm -rf ./extensions-to-upload/lib/neon*
-
-          # Delete leftovers from the extension build step
-          rm -rf ./extensions-to-upload/lib/pgxs
-          rm -rf ./extensions-to-upload/lib/pkgconfig
-
-          docker cp ${{ steps.create-container.outputs.EID }}:/extensions ./custom-extensions
-          for EXT_NAME in $(ls ./custom-extensions); do
-            mkdir -p ./extensions-to-upload/${EXT_NAME}/share
-
-            mv ./custom-extensions/${EXT_NAME}/share/extension ./extensions-to-upload/${EXT_NAME}/share/extension
-            mv ./custom-extensions/${EXT_NAME}/lib             ./extensions-to-upload/${EXT_NAME}/lib
-          done
+          docker cp ${{ steps.create-container.outputs.EID }}:/extensions/ ./extensions-to-upload/
+          docker cp ${{ steps.create-container.outputs.EID }}:/ext_index.json ./extensions-to-upload/

      - name: Upload postgres-extensions to S3
        run: |
-          for BUCKET in $(echo ${S3_BUCKETS}); do
+          for BUCKET in $(echo "${S3_BUCKETS:-[]}" | jq --raw-output '.[]'); do
            aws s3 cp --recursive --only-show-errors ./extensions-to-upload s3://${BUCKET}/${{ needs.tag.outputs.build-tag }}/${{ matrix.version }}
          done

      - name: Cleanup
-        if: ${{ always() && (steps.create-container.outputs.CID || steps.create-container.outputs.EID) }}
+        if: ${{ always() && steps.create-container.outputs.EID }}
        run: |
-          docker rm ${{ steps.create-container.outputs.CID }} || true
          docker rm ${{ steps.create-container.outputs.EID }} || true

  deploy:
--- a/.gitmodules
+++ b/.gitmodules
@@ -6,7 +6,3 @@
 	path = vendor/postgres-v15
 	url = https://github.com/neondatabase/postgres.git
 	branch = REL_15_STABLE_neon
-[submodule "vendor/postgres-v16"]
-	path = vendor/postgres-v16
-	url = https://github.com/neondatabase/postgres.git
-	branch = REL_16_STABLE_neon
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2506,6 +2506,7 @@ dependencies = [
 "pageserver",
 "postgres_ffi",
 "svg_fmt",
+ "tokio",
 "utils",
 "workspace_hack",
 ]
@@ -2544,6 +2545,7 @@ dependencies = [
 "metrics",
 "nix",
 "num-traits",
+ "num_cpus",
 "once_cell",
 "pageserver_api",
 "pin-project-lite",
@@ -2780,7 +2782,7 @@ dependencies = [
 [[package]]
 name = "postgres"
 version = "0.19.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=9011f7110db12b5e15afaf98f8ac834501d50ddc#9011f7110db12b5e15afaf98f8ac834501d50ddc"
 dependencies = [
 "bytes",
 "fallible-iterator",
@@ -2793,7 +2795,7 @@ dependencies = [
 [[package]]
 name = "postgres-native-tls"
 version = "0.5.0"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=9011f7110db12b5e15afaf98f8ac834501d50ddc#9011f7110db12b5e15afaf98f8ac834501d50ddc"
 dependencies = [
 "native-tls",
 "tokio",
@@ -2804,7 +2806,7 @@ dependencies = [
 [[package]]
 name = "postgres-protocol"
 version = "0.6.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=9011f7110db12b5e15afaf98f8ac834501d50ddc#9011f7110db12b5e15afaf98f8ac834501d50ddc"
 dependencies = [
 "base64 0.20.0",
 "byteorder",
@@ -2822,7 +2824,7 @@ dependencies = [
 [[package]]
 name = "postgres-types"
 version = "0.2.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=9011f7110db12b5e15afaf98f8ac834501d50ddc#9011f7110db12b5e15afaf98f8ac834501d50ddc"
 dependencies = [
 "bytes",
 "fallible-iterator",
@@ -3854,7 +3856,8 @@ dependencies = [
 [[package]]
 name = "sharded-slab"
 version = "0.1.4"
-source = "git+https://github.com/neondatabase/sharded-slab.git?rev=98d16753ab01c61f0a028de44167307a00efea00#98d16753ab01c61f0a028de44167307a00efea00"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "900fba806f70c630b0a382d0d825e17a0f19fcd059a2ade1ff237bcddf446b31"
 dependencies = [
 "lazy_static",
 ]
@@ -4098,7 +4101,7 @@ checksum = "4b55807c0344e1e6c04d7c965f5289c39a8d94ae23ed5c0b57aabac549f871c6"
 dependencies = [
 "filetime",
 "libc",
- "xattr",
+ "xattr 0.2.3",
 ]

 [[package]]
@@ -4311,7 +4314,7 @@ dependencies = [
 [[package]]
 name = "tokio-postgres"
 version = "0.7.7"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=9011f7110db12b5e15afaf98f8ac834501d50ddc#9011f7110db12b5e15afaf98f8ac834501d50ddc"
 dependencies = [
 "async-trait",
 "byteorder",
@@ -4379,16 +4382,17 @@ dependencies = [

 [[package]]
 name = "tokio-tar"
-version = "0.3.0"
-source = "git+https://github.com/neondatabase/tokio-tar.git?rev=404df61437de0feef49ba2ccdbdd94eb8ad6e142#404df61437de0feef49ba2ccdbdd94eb8ad6e142"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9d5714c010ca3e5c27114c1cdeb9d14641ace49874aa5626d7149e47aedace75"
 dependencies = [
 "filetime",
 "futures-core",
 "libc",
- "redox_syscall 0.2.16",
+ "redox_syscall 0.3.5",
 "tokio",
 "tokio-stream",
- "xattr",
+ "xattr 1.0.0",
 ]

 [[package]]
@@ -4865,6 +4869,7 @@ dependencies = [
 "tempfile",
 "thiserror",
 "tokio",
+ "tokio-stream",
 "tracing",
 "tracing-error",
 "tracing-subscriber",
@@ -5362,6 +5367,15 @@ dependencies = [
 "libc",
 ]

+[[package]]
+name = "xattr"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ea263437ca03c1522846a4ddafbca2542d0ad5ed9b784909d4b27b76f62bc34a"
+dependencies = [
+ "libc",
+]
+
 [[package]]
 name = "xmlparser"
 version = "0.13.5"
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -124,6 +124,7 @@ tokio-io-timeout = "1.2.0"
 tokio-postgres-rustls = "0.9.0"
 tokio-rustls = "0.23"
 tokio-stream = "0.1"
+tokio-tar = "0.3"
 tokio-util = { version = "0.7", features = ["io"] }
 toml = "0.7"
 toml_edit = "0.19"
@@ -143,12 +144,11 @@ env_logger = "0.10"
 log = "0.4"

 ## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
-postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
-postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
-postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
-tokio-tar = { git = "https://github.com/neondatabase/tokio-tar.git", rev="404df61437de0feef49ba2ccdbdd94eb8ad6e142" }
+postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
+postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
+postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
+postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }

 ## Other git libraries
 heapless = { default-features=false, features=[], git = "https://github.com/japaric/heapless.git", rev = "644653bf3b831c6bb4963be2de24804acf5e5001" } # upstream release pending
@@ -183,12 +183,7 @@ tonic-build = "0.9"

 # This is only needed for proxy's tests.
 # TODO: we should probably fork `tokio-postgres-rustls` instead.
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
-
-# Changes the MAX_THREADS limit from 4096 to 32768.
-# This is a temporary workaround for using tracing from many threads in safekeepers code,
-# until async safekeepers patch is merged to the main.
-sharded-slab = { git = "https://github.com/neondatabase/sharded-slab.git", rev="98d16753ab01c61f0a028de44167307a00efea00" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }

 ################# Binary contents sections

--- a/3
+++ b/3
@@ -12,7 +12,6 @@ WORKDIR /home/nonroot

 COPY --chown=nonroot vendor/postgres-v14 vendor/postgres-v14
 COPY --chown=nonroot vendor/postgres-v15 vendor/postgres-v15
-COPY --chown=nonroot vendor/postgres-v16 vendor/postgres-v16
 COPY --chown=nonroot pgxn pgxn
 COPY --chown=nonroot Makefile Makefile
 COPY --chown=nonroot scripts/ninstall.sh scripts/ninstall.sh
@@ -40,7 +39,6 @@ ARG CACHEPOT_BUCKET=neon-github-dev

 COPY --from=pg-build /home/nonroot/pg_install/v14/include/postgresql/server pg_install/v14/include/postgresql/server
 COPY --from=pg-build /home/nonroot/pg_install/v15/include/postgresql/server pg_install/v15/include/postgresql/server
-COPY --from=pg-build /home/nonroot/pg_install/v16/include/postgresql/server pg_install/v16/include/postgresql/server
 COPY --chown=nonroot . .

 # Show build caching stats to check if it was used in the end.
@@ -81,7 +79,6 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy

 COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/
 COPY --from=pg-build /home/nonroot/pg_install/v15 /usr/local/v15/
-COPY --from=pg-build /home/nonroot/pg_install/v16 /usr/local/v16/
 COPY --from=pg-build /home/nonroot/postgres_install.tar.gz /data/

 # By default, pageserver uses `.neon/` working directory in WORKDIR, so create one and fill it with the dummy config.
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -13,7 +13,7 @@ FROM debian:bullseye-slim AS build-deps
 RUN apt update &&  \
    apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \
    zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libssl-dev \
-    libicu-dev libxslt1-dev liblz4-dev libzstd-dev
+    libicu-dev libxslt1-dev liblz4-dev libzstd-dev zstd

 #########################################################################################
 #
@@ -77,6 +77,7 @@ ENV PATH "/usr/local/pgsql/bin:$PATH"
 RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.2.tar.gz -O postgis.tar.gz && \
    echo "9a2a219da005a1730a39d1959a1c7cec619b1efb009b65be80ffc25bad299068 postgis.tar.gz" | sha256sum --check && \
    mkdir postgis-src && cd postgis-src && tar xvzf ../postgis.tar.gz --strip-components=1 -C . && \
+    find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
    ./autogen.sh && \
    ./configure --with-sfcgal=/usr/local/bin/sfcgal-config && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
@@ -89,17 +90,28 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.2.tar.gz -O postg
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_tiger_geocoder.control && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_topology.control && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer.control && \
-    echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer_data_us.control
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer_data_us.control && \
+    mkdir -p /extensions/postgis && \
+    cp /usr/local/pgsql/share/extension/postgis.control /extensions/postgis && \
+    cp /usr/local/pgsql/share/extension/postgis_raster.control /extensions/postgis && \
+    cp /usr/local/pgsql/share/extension/postgis_sfcgal.control /extensions/postgis && \
+    cp /usr/local/pgsql/share/extension/postgis_tiger_geocoder.control /extensions/postgis && \
+    cp /usr/local/pgsql/share/extension/postgis_topology.control /extensions/postgis && \
+    cp /usr/local/pgsql/share/extension/address_standardizer.control /extensions/postgis && \
+    cp /usr/local/pgsql/share/extension/address_standardizer_data_us.control /extensions/postgis

 RUN wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouting.tar.gz && \
    echo "cac297c07d34460887c4f3b522b35c470138760fe358e351ad1db4edb6ee306e pgrouting.tar.gz" | sha256sum --check && \
    mkdir pgrouting-src && cd pgrouting-src && tar xvzf ../pgrouting.tar.gz --strip-components=1 -C . && \
-    mkdir build && \
-    cd build && \
+    mkdir build && cd build && \
    cmake -DCMAKE_BUILD_TYPE=Release .. && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
-    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrouting.control
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrouting.control && \
+    find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\
+    cp /usr/local/pgsql/share/extension/pgrouting.control /extensions/postgis && \
+    sort -o /before.txt /before.txt && sort -o /after.txt /after.txt && \
+    comm -13 /before.txt /after.txt | tar --directory=/usr/local/pgsql --zstd -cf /extensions/postgis.tar.zst -T -

 #########################################################################################
 #
@@ -419,12 +431,16 @@ RUN apt-get update && \
    wget https://github.com/ketteq-neon/postgres-exts/archive/e0bd1a9d9313d7120c1b9c7bb15c48c0dede4c4e.tar.gz -O kq_imcx.tar.gz && \
    echo "dc93a97ff32d152d32737ba7e196d9687041cda15e58ab31344c2f2de8855336 kq_imcx.tar.gz" | sha256sum --check && \
    mkdir kq_imcx-src && cd kq_imcx-src && tar xvzf ../kq_imcx.tar.gz --strip-components=1 -C . && \
-    mkdir build && \
-    cd build && \
+    find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
+    mkdir build && cd build && \
    cmake -DCMAKE_BUILD_TYPE=Release .. && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
-    echo 'trusted = true' >> /usr/local/pgsql/share/extension/kq_imcx.control
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/kq_imcx.control && \
+    find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\
+    mkdir -p /extensions/kq_imcx && cp /usr/local/pgsql/share/extension/kq_imcx.control /extensions/kq_imcx && \
+    sort -o /before.txt /before.txt && sort -o /after.txt /after.txt && \
+    comm -13 /before.txt /after.txt | tar --directory=/usr/local/pgsql --zstd -cf /extensions/kq_imcx.tar.zst -T -

 #########################################################################################
 #
@@ -535,10 +551,10 @@ FROM build-deps AS pg-embedding-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 ENV PATH "/usr/local/pgsql/bin/:$PATH"
-# 2465f831ea1f8d49c1d74f8959adb7fc277d70cd made on 05/07/2023
+# eeb3ba7c3a60c95b2604dd543c64b2f1bb4a3703 made on 15/07/2023
 # There is no release tag yet
-RUN wget https://github.com/neondatabase/pg_embedding/archive/2465f831ea1f8d49c1d74f8959adb7fc277d70cd.tar.gz -O pg_embedding.tar.gz && \
-    echo "047af2b1f664a1e6e37867bd4eeaf5934fa27d6ba3d6c4461efa388ddf7cd1d5 pg_embedding.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/neondatabase/pg_embedding/archive/eeb3ba7c3a60c95b2604dd543c64b2f1bb4a3703.tar.gz -O pg_embedding.tar.gz && \
+    echo "030846df723652f99a8689ce63b66fa0c23477a7fd723533ab8a6b28ab70730f pg_embedding.tar.gz" | sha256sum --check && \
    mkdir pg_embedding-src && cd pg_embedding-src && tar xvzf ../pg_embedding.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
@@ -553,16 +569,17 @@ RUN wget https://github.com/neondatabase/pg_embedding/archive/2465f831ea1f8d49c1
 FROM build-deps AS pg-anon-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-# Kaniko doesn't allow to do `${from#/usr/local/pgsql/}`, so we use `${from:17}` instead
 ENV PATH "/usr/local/pgsql/bin/:$PATH"
 RUN wget https://gitlab.com/dalibo/postgresql_anonymizer/-/archive/1.1.0/postgresql_anonymizer-1.1.0.tar.gz -O pg_anon.tar.gz && \
    echo "08b09d2ff9b962f96c60db7e6f8e79cf7253eb8772516998fc35ece08633d3ad pg_anon.tar.gz" | sha256sum --check && \
    mkdir pg_anon-src && cd pg_anon-src && tar xvzf ../pg_anon.tar.gz --strip-components=1 -C . && \
-    find /usr/local/pgsql -type f | sort  > /before.txt && \
+    find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control && \
-    find /usr/local/pgsql -type f | sort  > /after.txt && \
-    /bin/bash -c 'for from in $(comm -13 /before.txt /after.txt); do to=/extensions/anon/${from:17} && mkdir -p $(dirname ${to}) && cp -a ${from} ${to}; done'
+    find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\
+    mkdir -p /extensions/anon && cp /usr/local/pgsql/share/extension/anon.control /extensions/anon && \
+    sort -o /before.txt /before.txt && sort -o /after.txt /after.txt && \
+    comm -13 /before.txt /after.txt | tar --directory=/usr/local/pgsql --zstd -cf /extensions/anon.tar.zst -T -

 #########################################################################################
 #
@@ -754,16 +771,23 @@ RUN rm /usr/local/pgsql/lib/lib*.a
 # Extenstion only
 #
 #########################################################################################
+FROM python:3.9-slim-bullseye AS generate-ext-index
+ARG PG_VERSION
+ARG BUILD_TAG
+RUN apt update && apt install -y zstd
+
+# copy the control files here
+COPY --from=kq-imcx-pg-build /extensions/ /extensions/
+COPY --from=pg-anon-pg-build /extensions/ /extensions/
+COPY --from=postgis-build /extensions/ /extensions/
+COPY scripts/combine_control_files.py ./combine_control_files.py
+RUN python3 ./combine_control_files.py ${PG_VERSION} ${BUILD_TAG} --public_extensions="anon,postgis"
+
 FROM scratch AS postgres-extensions
 # After the transition this layer will include all extensitons.
-# As for now, it's only for new custom ones
-#
-# # Default extensions
-# COPY --from=postgres-cleanup-layer /usr/local/pgsql/share/extension /usr/local/pgsql/share/extension
-# COPY --from=postgres-cleanup-layer /usr/local/pgsql/lib             /usr/local/pgsql/lib
-# Custom extensions
-COPY --from=pg-anon-pg-build /extensions/anon/lib/ /extensions/anon/lib
-COPY --from=pg-anon-pg-build /extensions/anon/share/extension /extensions/anon/share/extension
+# As for now, it's only a couple for testing purposes
+COPY --from=generate-ext-index /extensions/*.tar.zst /extensions/
+COPY --from=generate-ext-index /ext_index.json /ext_index.json

 #########################################################################################
 #
--- a/17
+++ b/17
@@ -83,8 +83,6 @@ $(POSTGRES_INSTALL_DIR)/build/%/config.status:
 # I'm not sure why it wouldn't work, but this is the only place (apart from
 # the "build-all-versions" entry points) where direct mention of PostgreSQL
 # versions is used.
-.PHONY: postgres-configure-v16
-postgres-configure-v16: $(POSTGRES_INSTALL_DIR)/build/v16/config.status
 .PHONY: postgres-configure-v15
 postgres-configure-v15: $(POSTGRES_INSTALL_DIR)/build/v15/config.status
 .PHONY: postgres-configure-v14
@@ -167,33 +165,28 @@ neon-pg-ext-clean-%:
 .PHONY: neon-pg-ext
 neon-pg-ext: \
 	neon-pg-ext-v14 \
-	neon-pg-ext-v15 \
-	neon-pg-ext-v16
+	neon-pg-ext-v15

 .PHONY: neon-pg-ext-clean
 neon-pg-ext-clean: \
 	neon-pg-ext-clean-v14 \
-	neon-pg-ext-clean-v15 \
-	neon-pg-ext-clean-v16
+	neon-pg-ext-clean-v15

 # shorthand to build all Postgres versions
 .PHONY: postgres
 postgres: \
 	postgres-v14 \
-	postgres-v15 \
-	postgres-v16
+	postgres-v15

 .PHONY: postgres-headers
 postgres-headers: \
 	postgres-headers-v14 \
-	postgres-headers-v15 \
-	postgres-headers-v16
+	postgres-headers-v15

 .PHONY: postgres-clean
 postgres-clean: \
 	postgres-clean-v14 \
-	postgres-clean-v15 \
-	postgres-clean-v16
+	postgres-clean-v15

 # This doesn't remove the effects of 'configure'.
 .PHONY: clean
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -223,9 +223,8 @@ fn main() -> Result<()> {
    drop(state);

    // Launch remaining service threads
-    let _monitor_handle = launch_monitor(&compute).expect("cannot launch compute monitor thread");
-    let _configurator_handle =
-        launch_configurator(&compute).expect("cannot launch configurator thread");
+    let _monitor_handle = launch_monitor(&compute);
+    let _configurator_handle = launch_configurator(&compute);

    // Start Postgres
    let mut delay_exit = false;
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -8,9 +8,11 @@ use std::sync::{Condvar, Mutex};

 use anyhow::{Context, Result};
 use chrono::{DateTime, Utc};
+use futures::stream::FuturesUnordered;
+use futures::StreamExt;
 use postgres::{Client, NoTls};
 use tokio_postgres;
-use tracing::{info, instrument, warn};
+use tracing::{error, info, instrument, warn};
 use utils::id::{TenantId, TimelineId};
 use utils::lsn::Lsn;

@@ -21,6 +23,7 @@ use utils::measured_stream::MeasuredReader;
 use crate::config;
 use crate::pg_helpers::*;
 use crate::spec::*;
+use crate::sync_sk::{check_if_synced, ping_safekeeper};

 /// Compute node info shared across several `compute_ctl` threads.
 pub struct ComputeNode {
@@ -86,6 +89,7 @@ pub struct ParsedSpec {
    pub tenant_id: TenantId,
    pub timeline_id: TimelineId,
    pub pageserver_connstr: String,
+    pub safekeeper_connstrings: Vec<String>,
    pub storage_auth_token: Option<String>,
 }

@@ -103,6 +107,21 @@ impl TryFrom<ComputeSpec> for ParsedSpec {
            .clone()
            .or_else(|| spec.cluster.settings.find("neon.pageserver_connstring"))
            .ok_or("pageserver connstr should be provided")?;
+        let safekeeper_connstrings = if spec.safekeeper_connstrings.is_empty() {
+            if matches!(spec.mode, ComputeMode::Primary) {
+                spec.cluster
+                    .settings
+                    .find("neon.safekeepers")
+                    .ok_or("safekeeper connstrings should be provided")?
+                    .split(',')
+                    .map(|str| str.to_string())
+                    .collect()
+            } else {
+                vec![]
+            }
+        } else {
+            spec.safekeeper_connstrings.clone()
+        };
        let storage_auth_token = spec.storage_auth_token.clone();
        let tenant_id: TenantId = if let Some(tenant_id) = spec.tenant_id {
            tenant_id
@@ -128,6 +147,7 @@ impl TryFrom<ComputeSpec> for ParsedSpec {
        Ok(ParsedSpec {
            spec,
            pageserver_connstr,
+            safekeeper_connstrings,
            storage_auth_token,
            tenant_id,
            timeline_id,
@@ -309,6 +329,102 @@ impl ComputeNode {
        Ok(())
    }

+    pub async fn check_safekeepers_synced_async(
+        &self,
+        compute_state: &ComputeState,
+    ) -> Result<Option<Lsn>> {
+        // Build one (id, connection config) pair per safekeeper in the parsed spec.
+        let pspec: ParsedSpec = compute_state
+            .pspec
+            .as_ref()
+            .expect("spec must be set")
+            .clone();
+        let sk_connstrs: Vec<String> = pspec.safekeeper_connstrings.clone();
+        let sk_configs = sk_connstrs.into_iter().map(|connstr| {
+            // The raw connstring doubles as the id handed to ping_safekeeper.
+            let id = connstr.clone();
+            let connstr = format!("postgresql://no_user@{}", connstr);
+            let options = format!(
+                "-c timeline_id={} tenant_id={}",
+                pspec.timeline_id, pspec.tenant_id
+            );
+
+            // Build the connection config; the unwrap panics if the URL does not parse.
+            let mut config = tokio_postgres::Config::from_str(&connstr).unwrap();
+            config.options(&options);
+            if let Some(storage_auth_token) = pspec.storage_auth_token.clone() {
+                config.password(storage_auth_token);
+            }
+
+            (id, config)
+        });
+
+        // Spawn one ping task per safekeeper, each capped by a 100 ms timeout.
+        let mut tasks = FuturesUnordered::new();
+        let quorum = sk_configs.len() / 2 + 1; // more than half of the safekeepers
+        for (id, config) in sk_configs {
+            let timeout = tokio::time::Duration::from_millis(100);
+            let task = tokio::time::timeout(timeout, ping_safekeeper(id, config));
+            tasks.push(tokio::spawn(task));
+        }
+
+        // Collect results until a quorum of responses or a quorum of failures.
+        let mut responses = Vec::new();
+        let mut join_errors = Vec::new();
+        let mut task_errors = Vec::new();
+        let mut timeout_errors = Vec::new();
+        while let Some(response) = tasks.next().await {
+            match response {
+                Ok(Ok(Ok(r))) => responses.push(r), // ping succeeded
+                Ok(Ok(Err(e))) => task_errors.push(e), // ping_safekeeper returned an error
+                Ok(Err(e)) => timeout_errors.push(e), // the timeout elapsed first
+                Err(e) => join_errors.push(e), // the spawned task could not be joined
+            };
+            if responses.len() >= quorum {
+                break;
+            }
+            if join_errors.len() + task_errors.len() + timeout_errors.len() >= quorum {
+                break;
+            }
+        }
+
+        // In case of error, log and fail the check, but don't crash.
+        // We're playing it safe because these errors could be transient
+        // and we don't yet retry. Also being careful here allows us to
+        // be backwards compatible with safekeepers that don't have the
+        // TIMELINE_STATUS API yet.
+        if responses.len() < quorum {
+            error!(
+                "failed sync safekeepers check {:?} {:?} {:?}",
+                join_errors, task_errors, timeout_errors
+            );
+            return Ok(None);
+        }
+
+        Ok(check_if_synced(responses))
+    }
+
+    // Fast path for sync_safekeepers. If they're already synced we get the lsn
+    // in one roundtrip. If not, we should do a full sync_safekeepers.
+    // Also records the duration of the check in metrics.sync_sk_check_ms.
+    pub fn check_safekeepers_synced(&self, compute_state: &ComputeState) -> Result<Option<Lsn>> {
+        // Monotonic clock: a wall-clock step backwards would give a negative
+        // duration, and converting that with to_std().unwrap() panics.
+        let start_time = std::time::Instant::now();
+
+        // Run actual work with new tokio runtime
+        let rt = tokio::runtime::Builder::new_current_thread()
+            .enable_all()
+            .build()
+            .expect("failed to create rt");
+        let result = rt.block_on(self.check_safekeepers_synced_async(compute_state));
+
+        // Record runtime
+        self.state.lock().unwrap().metrics.sync_sk_check_ms =
+            start_time.elapsed().as_millis() as u64;
+        result
+    }
+
    // Run `postgres` in a special mode with `--sync-safekeepers` argument
    // and return the reported LSN back to the caller.
    #[instrument(skip_all)]
@@ -371,10 +487,14 @@ impl ComputeNode {
        // cannot sync safekeepers.
        let lsn = match spec.mode {
            ComputeMode::Primary => {
-                info!("starting safekeepers syncing");
-                let lsn = self
-                    .sync_safekeepers(pspec.storage_auth_token.clone())
-                    .with_context(|| "failed to sync safekeepers")?;
+                info!("checking if safekeepers are synced");
+                let lsn = if let Ok(Some(lsn)) = self.check_safekeepers_synced(compute_state) {
+                    lsn
+                } else {
+                    info!("starting safekeepers syncing");
+                    self.sync_safekeepers(pspec.storage_auth_token.clone())
+                        .with_context(|| "failed to sync safekeepers")?
+                };
                info!("safekeepers synced at LSN {}", lsn);
                lsn
            }
--- a/compute_tools/src/configurator.rs
+++ b/compute_tools/src/configurator.rs
@@ -1,7 +1,6 @@
 use std::sync::Arc;
 use std::thread;

-use anyhow::Result;
 use tracing::{error, info, instrument};

 use compute_api::responses::ComputeStatus;
@@ -42,13 +41,14 @@ fn configurator_main_loop(compute: &Arc<ComputeNode>) {
    }
 }

-pub fn launch_configurator(compute: &Arc<ComputeNode>) -> Result<thread::JoinHandle<()>> {
+pub fn launch_configurator(compute: &Arc<ComputeNode>) -> thread::JoinHandle<()> {
    let compute = Arc::clone(compute);

-    Ok(thread::Builder::new()
+    thread::Builder::new()
        .name("compute-configurator".into())
        .spawn(move || {
            configurator_main_loop(&compute);
            info!("configurator thread is exited");
-        })?)
+        })
+        .expect("cannot launch configurator thread")
 }
--- a/compute_tools/src/lib.rs
+++ b/compute_tools/src/lib.rs
@@ -13,3 +13,4 @@ pub mod monitor;
 pub mod params;
 pub mod pg_helpers;
 pub mod spec;
+pub mod sync_sk;
--- a/compute_tools/src/monitor.rs
+++ b/compute_tools/src/monitor.rs
@@ -1,7 +1,6 @@
 use std::sync::Arc;
 use std::{thread, time};

-use anyhow::Result;
 use chrono::{DateTime, Utc};
 use postgres::{Client, NoTls};
 use tracing::{debug, info};
@@ -105,10 +104,11 @@ fn watch_compute_activity(compute: &ComputeNode) {
 }

 /// Launch a separate compute monitor thread and return its `JoinHandle`.
-pub fn launch_monitor(state: &Arc<ComputeNode>) -> Result<thread::JoinHandle<()>> {
+pub fn launch_monitor(state: &Arc<ComputeNode>) -> thread::JoinHandle<()> {
    let state = Arc::clone(state);

-    Ok(thread::Builder::new()
+    thread::Builder::new()
        .name("compute-monitor".into())
-        .spawn(move || watch_compute_activity(&state))?)
+        .spawn(move || watch_compute_activity(&state))
+        .expect("cannot launch compute monitor thread")
 }
--- a/compute_tools/src/sync_sk.rs
+++ b/compute_tools/src/sync_sk.rs
@@ -0,0 +1,98 @@
+// Utils for running sync_safekeepers
+use anyhow::Result;
+use tracing::info;
+use utils::lsn::Lsn;
+
+#[derive(Copy, Clone, Debug)]
+pub enum TimelineStatusResponse {
+    NotFound,
+    Ok(TimelineStatusOkResponse),
+}
+
+#[derive(Copy, Clone, Debug)]
+pub struct TimelineStatusOkResponse {
+    flush_lsn: Lsn,
+    commit_lsn: Lsn,
+}
+
+/// Get a safekeeper's metadata for our timeline. The id is only used for logging.
+/// A malformed reply (missing or NULL LSN column) is an error, never a panic.
+pub async fn ping_safekeeper(
+    id: String,
+    config: tokio_postgres::Config,
+) -> Result<TimelineStatusResponse> {
+    // TODO add retries
+    info!("connecting to {}", id);
+    let (client, conn) = config.connect(tokio_postgres::NoTls).await?;
+    tokio::spawn(async move {
+        if let Err(e) = conn.await {
+            tracing::error!("connection error: {}", e);
+        }
+    });
+
+    // Query
+    info!("querying {}", id);
+    let result = client.simple_query("TIMELINE_STATUS").await?;
+
+    // Parse result; `first()` instead of `[0]` so an empty reply cannot panic
+    info!("done with {}", id);
+    if let Some(postgres::SimpleQueryMessage::Row(row)) = result.first() {
+        use std::str::FromStr;
+        let lsn = |col: &str| -> Result<Lsn> {
+            let text = row.try_get(col)?.ok_or_else(|| anyhow::anyhow!("{col} is NULL"))?;
+            Ok(Lsn::from_str(text)?)
+        };
+        let (flush_lsn, commit_lsn) = (lsn("flush_lsn")?, lsn("commit_lsn")?);
+        Ok(TimelineStatusResponse::Ok(TimelineStatusOkResponse { flush_lsn, commit_lsn }))
+    } else {
+        // Timeline doesn't exist
+        Ok(TimelineStatusResponse::NotFound)
+    }
+}
+
+/// Given a quorum of responses, check if safekeepers are synced at some Lsn.
+/// Yields that Lsn only when every response is `Ok` and all commit and flush
+/// LSNs agree; anything else, including an empty `responses`, yields `None`.
+pub fn check_if_synced(responses: Vec<TimelineStatusResponse>) -> Option<Lsn> {
+    // Check if all responses are ok
+    let ok_responses: Vec<TimelineStatusOkResponse> = responses
+        .iter()
+        .filter_map(|r| match r {
+            TimelineStatusResponse::Ok(ok_response) => Some(ok_response),
+            _ => None,
+        })
+        .cloned()
+        .collect();
+    if ok_responses.len() < responses.len() {
+        info!(
+            "not synced. Only {} out of {} know about this timeline",
+            ok_responses.len(),
+            responses.len()
+        );
+        return None;
+    }
+
+    // Min and max of everything; `?` turns an empty input into `None`, not a panic
+    let commit_max = ok_responses.iter().map(|r| r.commit_lsn).max()?;
+    let commit_min = ok_responses.iter().map(|r| r.commit_lsn).min()?;
+    let flush_max = ok_responses.iter().map(|r| r.flush_lsn).max()?;
+    let flush_min = ok_responses.iter().map(|r| r.flush_lsn).min()?;
+
+    // Check that all values are equal
+    if commit_min != commit_max {
+        info!("not synced. {:?} {:?}", commit_min, commit_max);
+        return None;
+    }
+    if flush_min != flush_max {
+        info!("not synced. {:?} {:?}", flush_min, flush_max);
+        return None;
+    }
+
+    // Check that commit == flush
+    if commit_max != flush_max {
+        info!("not synced. {:?} {:?}", commit_max, flush_max);
+        return None;
+    }
+
+    Some(commit_max)
+}
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -652,8 +652,8 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
            )?;
        }
        "start" => {
-            let pg_port: Option<u16> = sub_args.get_one::<u16>("pg-port").copied();
-            let http_port: Option<u16> = sub_args.get_one::<u16>("http-port").copied();
+            // let pg_port: Option<u16> = sub_args.get_one::<u16>("pg-port").copied();
+            // let http_port: Option<u16> = sub_args.get_one::<u16>("http-port").copied();
            let endpoint_id = sub_args
                .get_one::<String>("endpoint_id")
                .ok_or_else(|| anyhow!("No endpoint ID was provided to start"))?;
@@ -673,7 +673,10 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                    env.safekeepers.iter().map(|sk| sk.id).collect()
                };

-            let endpoint = cplane.endpoints.get(endpoint_id.as_str());
+            let endpoint = cplane
+                .endpoints
+                .get(endpoint_id.as_str())
+                .ok_or_else(|| anyhow::anyhow!("endpoint {endpoint_id} not found"))?;

            let auth_token = if matches!(env.pageserver.pg_auth_type, AuthType::NeonJWT) {
                let claims = Claims::new(Some(tenant_id), Scope::Tenant);
@@ -688,63 +691,17 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                .copied()
                .unwrap_or(false);

-            if let Some(endpoint) = endpoint {
-                match (&endpoint.mode, hot_standby) {
-                    (ComputeMode::Static(_), true) => {
-                        bail!("Cannot start a node in hot standby mode when it is already configured as a static replica")
-                    }
-                    (ComputeMode::Primary, true) => {
-                        bail!("Cannot start a node as a hot standby replica, it is already configured as primary node")
-                    }
-                    _ => {}
+            match (&endpoint.mode, hot_standby) {
+                (ComputeMode::Static(_), true) => {
+                    bail!("Cannot start a node in hot standby mode when it is already configured as a static replica")
                }
-                println!("Starting existing endpoint {endpoint_id}...");
-                endpoint.start(&auth_token, safekeepers)?;
-            } else {
-                let branch_name = sub_args
-                    .get_one::<String>("branch-name")
-                    .map(|s| s.as_str())
-                    .unwrap_or(DEFAULT_BRANCH_NAME);
-                let timeline_id = env
-                    .get_branch_timeline_id(branch_name, tenant_id)
-                    .ok_or_else(|| {
-                        anyhow!("Found no timeline id for branch name '{branch_name}'")
-                    })?;
-                let lsn = sub_args
-                    .get_one::<String>("lsn")
-                    .map(|lsn_str| Lsn::from_str(lsn_str))
-                    .transpose()
-                    .context("Failed to parse Lsn from the request")?;
-                let pg_version = sub_args
-                    .get_one::<u32>("pg-version")
-                    .copied()
-                    .context("Failed to `pg-version` from the argument string")?;
-
-                let mode = match (lsn, hot_standby) {
-                    (Some(lsn), false) => ComputeMode::Static(lsn),
-                    (None, true) => ComputeMode::Replica,
-                    (None, false) => ComputeMode::Primary,
-                    (Some(_), true) => anyhow::bail!("cannot specify both lsn and hot-standby"),
-                };
-
-                // when used with custom port this results in non obvious behaviour
-                // port is remembered from first start command, i e
-                // start --port X
-                // stop
-                // start <-- will also use port X even without explicit port argument
-                println!("Starting new endpoint {endpoint_id} (PostgreSQL v{pg_version}) on timeline {timeline_id} ...");
-
-                let ep = cplane.new_endpoint(
-                    endpoint_id,
-                    tenant_id,
-                    timeline_id,
-                    pg_port,
-                    http_port,
-                    pg_version,
-                    mode,
-                )?;
-                ep.start(&auth_token, safekeepers)?;
+                (ComputeMode::Primary, true) => {
+                    bail!("Cannot start a node as a hot standby replica, it is already configured as primary node")
+                }
+                _ => {}
            }
+            println!("Starting existing endpoint {endpoint_id}...");
+            endpoint.start(&auth_token, safekeepers)?;
        }
        "stop" => {
            let endpoint_id = sub_args
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -128,6 +128,20 @@ impl ComputeControlPlane {
    ) -> Result<Arc<Endpoint>> {
        let pg_port = pg_port.unwrap_or_else(|| self.get_port());
        let http_port = http_port.unwrap_or_else(|| self.get_port() + 1);
+
+        if matches!(mode, ComputeMode::Primary) {
+            // this check is not complete, as you could have a concurrent attempt at
+            // creating another primary, both reading the state before checking it here,
+            // but it's better than nothing.
+            let mut duplicates = self.endpoints.iter().filter(|(_k, v)| {
+                v.tenant_id == tenant_id && v.timeline_id == timeline_id && v.mode == mode
+            });
+
+            if let Some((key, _)) = duplicates.next() {
+                bail!("attempting to create a duplicate primary endpoint on tenant {tenant_id}, timeline {timeline_id}: endpoint {key:?} exists already. please don't do this, it is not supported.");
+            }
+        }
+
        let ep = Arc::new(Endpoint {
            endpoint_id: endpoint_id.to_owned(),
            pg_address: SocketAddr::new("127.0.0.1".parse().unwrap(), pg_port),
@@ -289,7 +303,7 @@ impl Endpoint {
                        .env
                        .safekeepers
                        .iter()
-                        .map(|sk| format!("localhost:{}", sk.pg_port))
+                        .map(|sk| format!("localhost:{}", sk.get_compute_port()))
                        .collect::<Vec<String>>()
                        .join(",");
                    conf.append("neon.safekeepers", &safekeepers);
@@ -318,7 +332,7 @@ impl Endpoint {
                    .env
                    .safekeepers
                    .iter()
-                    .map(|x| x.pg_port.to_string())
+                    .map(|x| x.get_compute_port().to_string())
                    .collect::<Vec<_>>()
                    .join(",");
                let sk_hosts = vec!["localhost"; self.env.safekeepers.len()].join(",");
@@ -463,7 +477,7 @@ impl Endpoint {
                    .iter()
                    .find(|node| node.id == sk_id)
                    .ok_or_else(|| anyhow!("safekeeper {sk_id} does not exist"))?;
-                safekeeper_connstrings.push(format!("127.0.0.1:{}", sk.pg_port));
+                safekeeper_connstrings.push(format!("127.0.0.1:{}", sk.get_compute_port()));
            }
        }

--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -137,6 +137,7 @@ impl Default for PageServerConf {
 pub struct SafekeeperConf {
    pub id: NodeId,
    pub pg_port: u16,
+    pub pg_tenant_only_port: Option<u16>,
    pub http_port: u16,
    pub sync: bool,
    pub remote_storage: Option<String>,
@@ -149,6 +150,7 @@ impl Default for SafekeeperConf {
        Self {
            id: NodeId(0),
            pg_port: 0,
+            pg_tenant_only_port: None,
            http_port: 0,
            sync: true,
            remote_storage: None,
@@ -158,6 +160,14 @@ impl Default for SafekeeperConf {
    }
 }

+impl SafekeeperConf {
+    /// Port that compute should connect to: the tenant-only port (on which only
+    /// tenant scoped tokens are allowed) if it is configured, else `pg_port`.
+    pub fn get_compute_port(&self) -> u16 {
+        self.pg_tenant_only_port.unwrap_or(self.pg_port)
+    }
+}
+
 impl LocalEnv {
    pub fn pg_distrib_dir_raw(&self) -> PathBuf {
        self.pg_distrib_dir.clone()
@@ -169,7 +179,6 @@ impl LocalEnv {
        match pg_version {
            14 => Ok(path.join(format!("v{pg_version}"))),
            15 => Ok(path.join(format!("v{pg_version}"))),
-            16 => Ok(path.join(format!("v{pg_version}"))),
            _ => bail!("Unsupported postgres version: {}", pg_version),
        }
    }
@@ -178,7 +187,6 @@ impl LocalEnv {
        match pg_version {
            14 => Ok(self.pg_distrib_dir(pg_version)?.join("bin")),
            15 => Ok(self.pg_distrib_dir(pg_version)?.join("bin")),
-            16 => Ok(self.pg_distrib_dir(pg_version)?.join("bin")),
            _ => bail!("Unsupported postgres version: {}", pg_version),
        }
    }
@@ -186,7 +194,6 @@ impl LocalEnv {
        match pg_version {
            14 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")),
            15 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")),
-            16 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")),
            _ => bail!("Unsupported postgres version: {}", pg_version),
        }
    }
--- a/control_plane/src/safekeeper.rs
+++ b/control_plane/src/safekeeper.rs
@@ -120,45 +120,55 @@ impl SafekeeperNode {
        let availability_zone = format!("sk-{}", id_string);

        let mut args = vec![
-            "-D",
-            datadir.to_str().with_context(|| {
-                format!("Datadir path {datadir:?} cannot be represented as a unicode string")
-            })?,
-            "--id",
-            &id_string,
-            "--listen-pg",
-            &listen_pg,
-            "--listen-http",
-            &listen_http,
-            "--availability-zone",
-            &availability_zone,
+            "-D".to_owned(),
+            datadir
+                .to_str()
+                .with_context(|| {
+                    format!("Datadir path {datadir:?} cannot be represented as a unicode string")
+                })?
+                .to_owned(),
+            "--id".to_owned(),
+            id_string,
+            "--listen-pg".to_owned(),
+            listen_pg,
+            "--listen-http".to_owned(),
+            listen_http,
+            "--availability-zone".to_owned(),
+            availability_zone,
        ];
+        if let Some(pg_tenant_only_port) = self.conf.pg_tenant_only_port {
+            let listen_pg_tenant_only = format!("127.0.0.1:{}", pg_tenant_only_port);
+            args.extend(["--listen-pg-tenant-only".to_owned(), listen_pg_tenant_only]);
+        }
        if !self.conf.sync {
-            args.push("--no-sync");
+            args.push("--no-sync".to_owned());
        }

        let broker_endpoint = format!("{}", self.env.broker.client_url());
-        args.extend(["--broker-endpoint", &broker_endpoint]);
+        args.extend(["--broker-endpoint".to_owned(), broker_endpoint]);

        let mut backup_threads = String::new();
        if let Some(threads) = self.conf.backup_threads {
            backup_threads = threads.to_string();
-            args.extend(["--backup-threads", &backup_threads]);
+            args.extend(["--backup-threads".to_owned(), backup_threads]);
        } else {
            drop(backup_threads);
        }

        if let Some(ref remote_storage) = self.conf.remote_storage {
-            args.extend(["--remote-storage", remote_storage]);
+            args.extend(["--remote-storage".to_owned(), remote_storage.clone()]);
        }

        let key_path = self.env.base_data_dir.join("auth_public_key.pem");
        if self.conf.auth_enabled {
            args.extend([
-                "--auth-validation-public-key-path",
-                key_path.to_str().with_context(|| {
-                    format!("Key path {key_path:?} cannot be represented as a unicode string")
-                })?,
+                "--auth-validation-public-key-path".to_owned(),
+                key_path
+                    .to_str()
+                    .with_context(|| {
+                        format!("Key path {key_path:?} cannot be represented as a unicode string")
+                    })?
+                    .to_owned(),
            ]);
        }

--- a/docs/pageserver-thread-mgmt.md
+++ b/docs/pageserver-thread-mgmt.md
@@ -30,8 +30,8 @@ or similar, to wake up on shutdown.

 In async Rust, futures can be "cancelled" at any await point, by
 dropping the Future. For example, `tokio::select!` returns as soon as
-one of the Futures returns, and drops the others. `tokio::timeout!` is
-another example. In the Rust ecosystem, some functions are
+one of the Futures returns, and drops the others. `tokio::time::timeout`
+is another example. In the Rust ecosystem, some functions are
 cancellation-safe, meaning they can be safely dropped without
 side-effects, while others are not. See documentation of
 `tokio::select!` for examples.
@@ -42,9 +42,9 @@ function that you call cannot be assumed to be async
 cancellation-safe, and must be polled to completion.

 The downside of non-cancellation safe code is that you have to be very
-careful when using `tokio::select!`, `tokio::timeout!`, and other such
-functions that can cause a Future to be dropped. They can only be used
-with functions that are explicitly documented to be cancellation-safe,
+careful when using `tokio::select!`, `tokio::time::timeout`, and other
+such functions that can cause a Future to be dropped. They can only be
+used with functions that are explicitly documented to be cancellation-safe,
 or you need to spawn a separate task to shield from the cancellation.

 At the entry points to the code, we also take care to poll futures to
--- a/libs/compute_api/src/responses.rs
+++ b/libs/compute_api/src/responses.rs
@@ -70,6 +70,7 @@ where
 pub struct ComputeMetrics {
    pub wait_for_spec_ms: u64,
    pub sync_safekeepers_ms: u64,
+    pub sync_sk_check_ms: u64,
    pub basebackup_ms: u64,
    pub basebackup_bytes: u64,
    pub start_postgres_ms: u64,
--- a/libs/metrics/src/lib.rs
+++ b/libs/metrics/src/lib.rs
@@ -6,6 +6,7 @@ use once_cell::sync::Lazy;
 use prometheus::core::{AtomicU64, Collector, GenericGauge, GenericGaugeVec};
 pub use prometheus::opts;
 pub use prometheus::register;
+pub use prometheus::Error;
 pub use prometheus::{core, default_registry, proto};
 pub use prometheus::{exponential_buckets, linear_buckets};
 pub use prometheus::{register_counter_vec, Counter, CounterVec};
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -9,6 +9,7 @@ use serde::{Deserialize, Serialize};
 use serde_with::{serde_as, DisplayFromStr};
 use strum_macros;
 use utils::{
+    completion,
    history_buffer::HistoryBufferWithDropCounter,
    id::{NodeId, TenantId, TimelineId},
    lsn::Lsn,
@@ -76,7 +77,12 @@ pub enum TenantState {
    /// system is being shut down.
    ///
    /// Transitions out of this state are possible through `set_broken()`.
-    Stopping,
+    Stopping {
+        // Because of https://github.com/serde-rs/serde/issues/2105 this has to be a named field,
+        // otherwise it will not be skipped during deserialization
+        #[serde(skip)]
+        progress: completion::Barrier,
+    },
    /// The tenant is recognized by the pageserver, but can no longer be used for
    /// any operations.
    ///
@@ -118,7 +124,7 @@ impl TenantState {
            // Why is Stopping a Maybe case? Because, during pageserver shutdown,
            // we set the Stopping state irrespective of whether the tenant
            // has finished attaching or not.
-            Self::Stopping => Maybe,
+            Self::Stopping { .. } => Maybe,
        }
    }

@@ -928,7 +934,13 @@ mod tests {
                "Activating",
            ),
            (line!(), TenantState::Active, "Active"),
-            (line!(), TenantState::Stopping, "Stopping"),
+            (
+                line!(),
+                TenantState::Stopping {
+                    progress: utils::completion::Barrier::default(),
+                },
+                "Stopping",
+            ),
            (
                line!(),
                TenantState::Broken {
--- a/libs/postgres_ffi/build.rs
+++ b/libs/postgres_ffi/build.rs
@@ -56,7 +56,7 @@ fn main() -> anyhow::Result<()> {
        PathBuf::from("pg_install")
    };

-    for pg_version in &["v14", "v15", "v16"] {
+    for pg_version in &["v14", "v15"] {
        let mut pg_install_dir_versioned = pg_install_dir.join(pg_version);
        if pg_install_dir_versioned.is_relative() {
            let cwd = env::current_dir().context("Failed to get current_dir")?;
--- a/libs/postgres_ffi/src/lib.rs
+++ b/libs/postgres_ffi/src/lib.rs
@@ -51,7 +51,6 @@ macro_rules! for_all_postgres_versions {
    ($macro:tt) => {
        $macro!(v14);
        $macro!(v15);
-        $macro!(v16);
    };
 }

@@ -93,10 +92,9 @@ pub use v14::bindings::DBState_DB_SHUTDOWNED;
 pub fn bkpimage_is_compressed(bimg_info: u8, version: u32) -> anyhow::Result<bool> {
    match version {
        14 => Ok(bimg_info & v14::bindings::BKPIMAGE_IS_COMPRESSED != 0),
-        15 | 16 => Ok(bimg_info & v15::bindings::BKPIMAGE_COMPRESS_PGLZ != 0
+        15 => Ok(bimg_info & v15::bindings::BKPIMAGE_COMPRESS_PGLZ != 0
            || bimg_info & v15::bindings::BKPIMAGE_COMPRESS_LZ4 != 0
            || bimg_info & v15::bindings::BKPIMAGE_COMPRESS_ZSTD != 0),
-
        _ => anyhow::bail!("Unknown version {}", version),
    }
 }
@@ -112,7 +110,6 @@ pub fn generate_wal_segment(
    match pg_version {
        14 => v14::xlog_utils::generate_wal_segment(segno, system_id, lsn),
        15 => v15::xlog_utils::generate_wal_segment(segno, system_id, lsn),
-        16 => v16::xlog_utils::generate_wal_segment(segno, system_id, lsn),
        _ => Err(SerializeError::BadInput),
    }
 }
@@ -126,7 +123,6 @@ pub fn generate_pg_control(
    match pg_version {
        14 => v14::xlog_utils::generate_pg_control(pg_control_bytes, checkpoint_bytes, lsn),
        15 => v15::xlog_utils::generate_pg_control(pg_control_bytes, checkpoint_bytes, lsn),
-        16 => v16::xlog_utils::generate_pg_control(pg_control_bytes, checkpoint_bytes, lsn),
        _ => anyhow::bail!("Unknown version {}", pg_version),
    }
 }
@@ -201,7 +197,7 @@ pub fn fsm_logical_to_physical(addr: BlockNumber) -> BlockNumber {

 pub mod waldecoder {

-    use crate::{v14, v15, v16};
+    use crate::{v14, v15};
    use bytes::{Buf, Bytes, BytesMut};
    use std::num::NonZeroU32;
    use thiserror::Error;
@@ -263,10 +259,6 @@ pub mod waldecoder {
                    use self::v15::waldecoder_handler::WalStreamDecoderHandler;
                    self.poll_decode_internal()
                }
-                16 => {
-                    use self::v16::waldecoder_handler::WalStreamDecoderHandler;
-                    self.poll_decode_internal()
-                }
                _ => Err(WalDecodeError {
                    msg: format!("Unknown version {}", self.pg_version),
                    lsn: self.lsn,
--- a/libs/postgres_ffi/src/nonrelfile_utils.rs
+++ b/libs/postgres_ffi/src/nonrelfile_utils.rs
@@ -57,9 +57,9 @@ pub fn slru_may_delete_clogsegment(segpage: u32, cutoff_page: u32) -> bool {
 // Multixact utils

 pub fn mx_offset_to_flags_offset(xid: MultiXactId) -> usize {
-    ((xid / pg_constants::MULTIXACT_MEMBERS_PER_MEMBERGROUP as u32) as u16
-        % pg_constants::MULTIXACT_MEMBERGROUPS_PER_PAGE
-        * pg_constants::MULTIXACT_MEMBERGROUP_SIZE) as usize
+    ((xid / pg_constants::MULTIXACT_MEMBERS_PER_MEMBERGROUP as u32)
+        % pg_constants::MULTIXACT_MEMBERGROUPS_PER_PAGE as u32
+        * pg_constants::MULTIXACT_MEMBERGROUP_SIZE as u32) as usize
 }

 pub fn mx_offset_to_flags_bitshift(xid: MultiXactId) -> u16 {
@@ -81,3 +81,41 @@ fn mx_offset_to_member_page(xid: u32) -> u32 {
 pub fn mx_offset_to_member_segment(xid: u32) -> i32 {
    (mx_offset_to_member_page(xid) / pg_constants::SLRU_PAGES_PER_SEGMENT) as i32
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_multixid_calc() {
+        // Check that the mx_offset_* functions produce the same values as the
+        // corresponding PostgreSQL C macros (MXOffsetTo*). These test values
+        // were generated by calling the PostgreSQL macros with a little C
+        // program.
+        assert_eq!(mx_offset_to_member_segment(0), 0);
+        assert_eq!(mx_offset_to_member_page(0), 0);
+        assert_eq!(mx_offset_to_flags_offset(0), 0);
+        assert_eq!(mx_offset_to_flags_bitshift(0), 0);
+        assert_eq!(mx_offset_to_member_offset(0), 4);
+        assert_eq!(mx_offset_to_member_segment(1), 0);
+        assert_eq!(mx_offset_to_member_page(1), 0);
+        assert_eq!(mx_offset_to_flags_offset(1), 0);
+        assert_eq!(mx_offset_to_flags_bitshift(1), 8);
+        assert_eq!(mx_offset_to_member_offset(1), 8);
+        assert_eq!(mx_offset_to_member_segment(123456789), 2358);
+        assert_eq!(mx_offset_to_member_page(123456789), 75462);
+        assert_eq!(mx_offset_to_flags_offset(123456789), 4780);
+        assert_eq!(mx_offset_to_flags_bitshift(123456789), 8);
+        assert_eq!(mx_offset_to_member_offset(123456789), 4788);
+        assert_eq!(mx_offset_to_member_segment(u32::MAX - 1), 82040);
+        assert_eq!(mx_offset_to_member_page(u32::MAX - 1), 2625285);
+        assert_eq!(mx_offset_to_flags_offset(u32::MAX - 1), 5160);
+        assert_eq!(mx_offset_to_flags_bitshift(u32::MAX - 1), 16);
+        assert_eq!(mx_offset_to_member_offset(u32::MAX - 1), 5172);
+        assert_eq!(mx_offset_to_member_segment(u32::MAX), 82040);
+        assert_eq!(mx_offset_to_member_page(u32::MAX), 2625285);
+        assert_eq!(mx_offset_to_flags_offset(u32::MAX), 5160);
+        assert_eq!(mx_offset_to_flags_bitshift(u32::MAX), 24);
+        assert_eq!(mx_offset_to_member_offset(u32::MAX), 5176);
+    }
+}
--- a/libs/postgres_ffi/src/pg_constants_v16.rs
+++ b/libs/postgres_ffi/src/pg_constants_v16.rs
@@ -1 +0,0 @@
-
--- a/libs/postgres_ffi/wal_craft/src/lib.rs
+++ b/libs/postgres_ffi/wal_craft/src/lib.rs
@@ -52,7 +52,6 @@ impl Conf {
        match self.pg_version {
            14 => Ok(path.join(format!("v{}", self.pg_version))),
            15 => Ok(path.join(format!("v{}", self.pg_version))),
-            16 => Ok(path.join(format!("v{}", self.pg_version))),
            _ => bail!("Unsupported postgres version: {}", self.pg_version),
        }
    }
--- a/libs/pq_proto/src/lib.rs
+++ b/libs/pq_proto/src/lib.rs
@@ -179,7 +179,7 @@ pub struct FeExecuteMessage {
 #[derive(Debug)]
 pub struct FeCloseMessage;

-/// An error occured while parsing or serializing raw stream into Postgres
+/// An error occurred while parsing or serializing raw stream into Postgres
 /// messages.
 #[derive(thiserror::Error, Debug)]
 pub enum ProtocolError {
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -42,6 +42,10 @@ workspace_hack.workspace = true

 const_format.workspace = true

+# to use tokio channels as streams, this is faster to compile than async_stream
+# why is it only here? no other crate should use it, streams are rarely needed.
+tokio-stream = { version = "0.1.14" }
+
 [dev-dependencies]
 byteorder.workspace = true
 bytes.workspace = true
--- a/libs/utils/src/auth.rs
+++ b/libs/utils/src/auth.rs
@@ -16,7 +16,7 @@ use crate::id::TenantId;
 /// Algorithm to use. We require EdDSA.
 const STORAGE_TOKEN_ALGORITHM: Algorithm = Algorithm::EdDSA;

-#[derive(Debug, Serialize, Deserialize, Clone, PartialEq)]
+#[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq)]
 #[serde(rename_all = "lowercase")]
 pub enum Scope {
    // Provides access to all data for a specific tenant (specified in `struct Claims` below)
--- a/libs/utils/src/completion.rs
+++ b/libs/utils/src/completion.rs
@@ -12,6 +12,13 @@ pub struct Completion(mpsc::Sender<()>);
 #[derive(Clone)]
 pub struct Barrier(Arc<Mutex<mpsc::Receiver<()>>>);

+impl Default for Barrier {
+    fn default() -> Self {
+        // Keep only the `Barrier` half; the `Completion` half is dropped at once.
+        channel().1
+    }
+}
+
 impl Barrier {
    pub async fn wait(self) {
        self.0.lock().await.recv().await;
@@ -24,6 +31,15 @@ impl Barrier {
    }
 }

+impl PartialEq for Barrier {
+    fn eq(&self, other: &Self) -> bool {
+        // identity comparison; no `dyn` inside the Arc, so `ptr_eq` is dependable
+        Arc::ptr_eq(&self.0, &other.0)
+    }
+}
+
+impl Eq for Barrier {}
+
 /// Create new Guard and Barrier pair.
 pub fn channel() -> (Completion, Barrier) {
    let (tx, rx) = mpsc::channel::<()>(1);
--- a/libs/utils/src/error.rs
+++ b/libs/utils/src/error.rs
@@ -0,0 +1,111 @@
+/// Wrap an error so that its [`std::fmt::Display`] output also lists the whole source chain.
+///
+/// The error's own message is written first, followed by each [`std::error::Error::source`] in
+/// turn, every one prefixed with `": "`. The result is similar to formatting an
+/// [`anyhow::Error`] with the alternate (`{:#}`) setting.
+///
+/// Errors which have an `anyhow::Error` somewhere in their source chain can be reported as well.
+///
+/// Why would one use this instead of converting to `anyhow::Error` on the spot? Because
+/// anyhow::Error would also capture a stacktrace on the spot, which you would later discard after
+/// formatting.
+///
+/// The argument is only borrowed and the returned value keeps that borrow.
+///
+/// ## Usage
+///
+/// ```rust
+/// #[derive(Debug, thiserror::Error)]
+/// enum MyCoolError {
+///   #[error("should never happen")]
+///   Bad(#[source] std::io::Error),
+/// }
+///
+/// # fn failing_call() -> Result<(), MyCoolError> { Err(MyCoolError::Bad(std::io::ErrorKind::PermissionDenied.into())) }
+///
+/// # fn main() {
+/// use utils::error::report_compact_sources;
+///
+/// if let Err(e) = failing_call() {
+///     let e = report_compact_sources(&e);
+///     assert_eq!(format!("{e}"), "should never happen: permission denied");
+/// }
+/// # }
+/// ```
+///
+/// ## TODO
+///
+/// When we are able to describe return position impl trait in traits, this should of course be an
+/// extension trait. Until then avoid boxing with this more awkward interface.
+pub fn report_compact_sources<E: std::error::Error>(e: &E) -> impl std::fmt::Display + '_ {
+    struct AnyhowDisplayAlternateAlike<'a, E>(&'a E);
+
+    impl<E: std::error::Error> std::fmt::Display for AnyhowDisplayAlternateAlike<'_, E> {
+        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+            // the reported error itself goes first, without any prefix
+            write!(f, "{}", self.0)?;
+
+            // E is kept as a generic parameter in the hope that rustc sees through a default
+            // Error::source implementation and leaves the loop out if there cannot be any
+            // sources.
+            let mut cause = self.0.source();
+            while let Some(err) = cause {
+                // every source is appended as ": <message>"
+                write!(f, ": {}", err)?;
+                cause = err.source();
+            }
+
+            Ok(())
+        }
+    }
+
+    AnyhowDisplayAlternateAlike(e)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::report_compact_sources;
+
+    #[test]
+    fn report_compact_sources_examples() {
+        use std::fmt::Write;
+
+        #[derive(Debug, thiserror::Error)]
+        enum EvictionError {
+            #[error("cannot evict a remote layer")]
+            CannotEvictRemoteLayer,
+            #[error("stat failed")]
+            StatFailed(#[source] std::io::Error),
+            #[error("layer was no longer part of LayerMap")]
+            LayerNotFound(#[source] anyhow::Error),
+        }
+
+        let examples = [
+            (
+                line!(),
+                EvictionError::CannotEvictRemoteLayer,
+                "cannot evict a remote layer",
+            ),
+            (
+                line!(),
+                EvictionError::StatFailed(std::io::ErrorKind::PermissionDenied.into()),
+                "stat failed: permission denied",
+            ),
+            (
+                line!(),
+                EvictionError::LayerNotFound(anyhow::anyhow!("foobar")),
+                "layer was no longer part of LayerMap: foobar",
+            ),
+        ];
+
+        let mut s = String::new();
+
+        for (line, example, expected) in examples {
+            s.clear();
+
+            write!(s, "{}", report_compact_sources(&example)).expect("string grows");
+
+            assert_eq!(s, expected, "example on line {line}");
+        }
+    }
+}
--- a/libs/utils/src/fs_ext.rs
+++ b/libs/utils/src/fs_ext.rs
@@ -24,12 +24,29 @@ pub async fn is_directory_empty(path: impl AsRef<Path>) -> anyhow::Result<bool>
    Ok(dir.next_entry().await?.is_none())
 }

+/// Turns an [`io::ErrorKind::NotFound`] error into `Ok(())`; other errors are returned as is.
+pub fn ignore_not_found(e: io::Error) -> io::Result<()> {
+    match e.kind() {
+        io::ErrorKind::NotFound => Ok(()),
+        _ => Err(e),
+    }
+}
+
+/// Runs `fs_operation` once and treats an [`io::ErrorKind::NotFound`] failure as success; any
+/// other error is returned unchanged. `FnOnce` is enough for the single call and also admits
+/// closures that consume what they capture.
+pub fn ignore_absent_files<F: FnOnce() -> io::Result<()>>(fs_operation: F) -> io::Result<()> {
+    fs_operation().or_else(ignore_not_found)
+}
+
 #[cfg(test)]
 mod test {
    use std::path::PathBuf;

    use crate::fs_ext::is_directory_empty;

+    use super::ignore_absent_files;
+
    #[test]
    fn is_empty_dir() {
        use super::PathExt;
@@ -75,4 +92,21 @@ mod test {
        std::fs::remove_file(&file_path).unwrap();
        assert!(is_directory_empty(file_path).await.is_err());
    }
+
+    #[test]
+    fn ignore_absent_files_works() {
+        let dir = tempfile::tempdir().unwrap();
+        let dir_path = dir.path();
+
+        let file_path: PathBuf = dir_path.join("testfile");
+
+        ignore_absent_files(|| std::fs::remove_file(&file_path)).expect("should execute normally");
+
+        let f = std::fs::File::create(&file_path).unwrap();
+        drop(f);
+
+        ignore_absent_files(|| std::fs::remove_file(&file_path)).expect("should execute normally");
+
+        assert!(!file_path.exists());
+    }
 }
--- a/libs/utils/src/http/endpoint.rs
+++ b/libs/utils/src/http/endpoint.rs
@@ -9,7 +9,6 @@ use metrics::{register_int_counter, Encoder, IntCounter, TextEncoder};
 use once_cell::sync::Lazy;
 use routerify::ext::RequestExt;
 use routerify::{Middleware, RequestInfo, Router, RouterBuilder};
-use tokio::task::JoinError;
 use tracing::{self, debug, info, info_span, warn, Instrument};

 use std::future::Future;
@@ -148,26 +147,140 @@ impl Drop for RequestCancelled {
 }

 async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    use bytes::{Bytes, BytesMut};
+    use std::io::Write as _;
+    use tokio::sync::mpsc;
+    use tokio_stream::wrappers::ReceiverStream;
+
    SERVE_METRICS_COUNT.inc();

-    let mut buffer = vec![];
-    let encoder = TextEncoder::new();
+    /// An [`std::io::Write`] implementation on top of a channel sending [`bytes::Bytes`] chunks.
+    struct ChannelWriter {
+        buffer: BytesMut,
+        tx: mpsc::Sender<std::io::Result<Bytes>>,
+        written: usize,
+    }

-    let metrics = tokio::task::spawn_blocking(move || {
-        // Currently we take a lot of mutexes while collecting metrics, so it's
-        // better to spawn a blocking task to avoid blocking the event loop.
-        metrics::gather()
-    })
-    .await
-    .map_err(|e: JoinError| ApiError::InternalServerError(e.into()))?;
-    encoder.encode(&metrics, &mut buffer).unwrap();
+    impl ChannelWriter {
+        fn new(buf_len: usize, tx: mpsc::Sender<std::io::Result<Bytes>>) -> Self {
+            assert_ne!(buf_len, 0);
+            ChannelWriter {
+                // split about half off the buffer from the start, because we flush depending on
+                // capacity. first flush will come sooner than without this, but now resizes will
+                // have better chance of picking up the "other" half. not guaranteed of course.
+                buffer: BytesMut::with_capacity(buf_len).split_off(buf_len / 2),
+                tx,
+                written: 0,
+            }
+        }
+
+        fn flush0(&mut self) -> std::io::Result<usize> {
+            let n = self.buffer.len();
+            if n == 0 {
+                return Ok(0);
+            }
+
+            tracing::trace!(n, "flushing");
+            let ready = self.buffer.split().freeze();
+
+            // not ideal to call from blocking code to block_on, but we are sure that this
+            // operation does not spawn_blocking other tasks
+            let res: Result<(), ()> = tokio::runtime::Handle::current().block_on(async {
+                self.tx.send(Ok(ready)).await.map_err(|_| ())?;
+
+                // throttle sending to allow reuse of our buffer in `write`.
+                self.tx.reserve().await.map_err(|_| ())?;
+
+                // now the response task has picked up the buffer and hopefully started
+                // sending it to the client.
+                Ok(())
+            });
+            if res.is_err() {
+                return Err(std::io::ErrorKind::BrokenPipe.into());
+            }
+            self.written += n;
+            Ok(n)
+        }
+
+        fn flushed_bytes(&self) -> usize {
+            self.written
+        }
+    }
+
+    impl std::io::Write for ChannelWriter {
+        /// Buffers all of `buf`; flushes once first when `buf` exceeds the spare capacity.
+        fn write(&mut self, mut buf: &[u8]) -> std::io::Result<usize> {
+            let original_len = buf.len();
+            let remaining = self.buffer.capacity() - self.buffer.len();
+
+            if remaining < buf.len() {
+                // only `remaining` bytes fit (not `buf.len() - remaining`): fill up, flush, keep tail.
+                let (fits, rest) = buf.split_at(remaining);
+                self.buffer.extend_from_slice(fits);
+                buf = rest;
+                self.flush0()?;
+            }
+
+            // assume that this will often under normal operation just move the pointer back to the
+            // beginning of allocation, because previous split off parts are already sent and
+            // dropped.
+            self.buffer.extend_from_slice(buf);
+            Ok(original_len)
+        }
+
+        /// Sends any buffered bytes to the channel via `flush0`.
+        fn flush(&mut self) -> std::io::Result<()> {
+            self.flush0().map(|_| ())
+        }
+    }
+
+    let started_at = std::time::Instant::now();
+
+    let (tx, rx) = mpsc::channel(1);
+
+    let body = Body::wrap_stream(ReceiverStream::new(rx));
+
+    let mut writer = ChannelWriter::new(128 * 1024, tx);
+
+    let encoder = TextEncoder::new();

    let response = Response::builder()
        .status(200)
        .header(CONTENT_TYPE, encoder.format_type())
-        .body(Body::from(buffer))
+        .body(body)
        .unwrap();

+    let span = info_span!("blocking");
+    tokio::task::spawn_blocking(move || {
+        let _span = span.entered();
+        let metrics = metrics::gather();
+        let res = encoder
+            .encode(&metrics, &mut writer)
+            .and_then(|_| writer.flush().map_err(|e| e.into()));
+
+        match res {
+            Ok(()) => {
+                tracing::info!(
+                    bytes = writer.flushed_bytes(),
+                    elapsed_ms = started_at.elapsed().as_millis(),
+                    "responded /metrics"
+                );
+            }
+            Err(e) => {
+                tracing::warn!("failed to write out /metrics response: {e:#}");
+                // semantics of this error are quite... unclear. we want to error the stream out to
+                // abort the response to somehow notify the client that we failed.
+                //
+                // though, most likely the reason for failure is that the receiver is already gone.
+                drop(
+                    writer
+                        .tx
+                        .blocking_send(Err(std::io::ErrorKind::BrokenPipe.into())),
+                );
+            }
+        }
+    });
+
    Ok(response)
 }

--- a/libs/utils/src/id.rs
+++ b/libs/utils/src/id.rs
@@ -1,5 +1,7 @@
+use std::ffi::OsStr;
 use std::{fmt, str::FromStr};

+use anyhow::Context;
 use hex::FromHex;
 use rand::Rng;
 use serde::{Deserialize, Serialize};
@@ -213,6 +215,18 @@ pub struct TimelineId(Id);

 id_newtype!(TimelineId);

+impl TryFrom<Option<&OsStr>> for TimelineId {
+    type Error = anyhow::Error;
+
+    /// Parses the value as a `TimelineId`. An absent or non-UTF-8 value is parsed as the empty
+    /// string; a parse failure carries the original `value` in its context message.
+    fn try_from(value: Option<&OsStr>) -> Result<Self, Self::Error> {
+        let text = value.and_then(OsStr::to_str).unwrap_or_default();
+        text.parse::<TimelineId>()
+            .with_context(|| format!("Could not parse timeline id from {:?}", value))
+    }
+}
+
 /// Neon Tenant Id represents identifiar of a particular tenant.
 /// Is used for distinguishing requests and data belonging to different users.
 ///
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -63,6 +63,9 @@ pub mod rate_limit;
 /// Simple once-barrier and a guard which keeps barrier awaiting.
 pub mod completion;

+/// Reporting utilities
+pub mod error;
+
 mod failpoint_macro_helpers {

    /// use with fail::cfg("$name", "return(2000)")
--- a/libs/utils/src/tracing_span_assert.rs
+++ b/libs/utils/src/tracing_span_assert.rs
@@ -164,9 +164,7 @@ fn tracing_subscriber_configured() -> bool {
    tracing::dispatcher::get_default(|d| {
        // it is possible that this closure will not be invoked, but the current implementation
        // always invokes it
-        noop_configured = d
-            .downcast_ref::<tracing::subscriber::NoSubscriber>()
-            .is_some();
+        noop_configured = d.is::<tracing::subscriber::NoSubscriber>();
    });

    !noop_configured
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -35,6 +35,8 @@ humantime-serde.workspace = true
 hyper.workspace = true
 itertools.workspace = true
 nix.workspace = true
+# hack to get the number of worker threads tokio uses
+num_cpus = { version = "1.15" }
 num-traits.workspace = true
 once_cell.workspace = true
 pin-project-lite.workspace = true
@@ -82,6 +84,7 @@ strum_macros.workspace = true
 criterion.workspace = true
 hex-literal.workspace = true
 tempfile.workspace = true
+tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time", "test-util"] }

 [[bench]]
 name = "bench_layer_map"
--- a/pageserver/ctl/Cargo.toml
+++ b/pageserver/ctl/Cargo.toml
@@ -13,6 +13,7 @@ clap = { workspace = true, features = ["string"] }
 git-version.workspace = true
 pageserver = { path = ".." }
 postgres_ffi.workspace = true
+tokio.workspace = true
 utils.workspace = true
 svg_fmt.workspace = true
 workspace_hack.workspace = true
--- a/pageserver/ctl/src/layer_map_analyzer.rs
+++ b/pageserver/ctl/src/layer_map_analyzer.rs
@@ -95,7 +95,7 @@ pub(crate) fn parse_filename(name: &str) -> Option<LayerFile> {
 }

 // Finds the max_holes largest holes, ignoring any that are smaller than MIN_HOLE_LENGTH"
-fn get_holes(path: &Path, max_holes: usize) -> Result<Vec<Hole>> {
+async fn get_holes(path: &Path, max_holes: usize) -> Result<Vec<Hole>> {
    let file = FileBlockReader::new(VirtualFile::open(path)?);
    let summary_blk = file.read_blk(0)?;
    let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
@@ -129,7 +129,7 @@ fn get_holes(path: &Path, max_holes: usize) -> Result<Vec<Hole>> {
    Ok(holes)
 }

-pub(crate) fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
+pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
    let storage_path = &cmd.path;
    let max_holes = cmd.max_holes.unwrap_or(DEFAULT_MAX_HOLES);

@@ -160,7 +160,7 @@ pub(crate) fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
                    parse_filename(&layer.file_name().into_string().unwrap())
                {
                    if layer_file.is_delta {
-                        layer_file.holes = get_holes(&layer.path(), max_holes)?;
+                        layer_file.holes = get_holes(&layer.path(), max_holes).await?;
                        n_deltas += 1;
                    }
                    layers.push(layer_file);
--- a/pageserver/ctl/src/layers.rs
+++ b/pageserver/ctl/src/layers.rs
@@ -43,8 +43,7 @@ pub(crate) enum LayerCmd {
    },
 }

-fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
-    use pageserver::tenant::blob_io::BlobCursor;
+async fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
    use pageserver::tenant::block_io::BlockReader;

    let path = path.as_ref();
@@ -78,7 +77,7 @@ fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
    Ok(())
 }

-pub(crate) fn main(cmd: &LayerCmd) -> Result<()> {
+pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
    match cmd {
        LayerCmd::List { path } => {
            for tenant in fs::read_dir(path.join("tenants"))? {
@@ -153,7 +152,7 @@ pub(crate) fn main(cmd: &LayerCmd) -> Result<()> {
                        );

                        if layer_file.is_delta {
-                            read_delta_file(layer.path())?;
+                            read_delta_file(layer.path()).await?;
                        } else {
                            anyhow::bail!("not supported yet :(");
                        }
--- a/pageserver/ctl/src/main.rs
+++ b/pageserver/ctl/src/main.rs
@@ -72,12 +72,13 @@ struct AnalyzeLayerMapCmd {
    max_holes: Option<usize>,
 }

-fn main() -> anyhow::Result<()> {
+#[tokio::main]
+async fn main() -> anyhow::Result<()> {
    let cli = CliOpts::parse();

    match cli.command {
        Commands::Layer(cmd) => {
-            layers::main(&cmd)?;
+            layers::main(&cmd).await?;
        }
        Commands::Metadata(cmd) => {
            handle_metadata(&cmd)?;
@@ -86,7 +87,7 @@ fn main() -> anyhow::Result<()> {
            draw_timeline_dir::main()?;
        }
        Commands::AnalyzeLayerMap(cmd) => {
-            layer_map_analyzer::main(&cmd)?;
+            layer_map_analyzer::main(&cmd).await?;
        }
        Commands::PrintLayerFile(cmd) => {
            if let Err(e) = read_pg_control_file(&cmd.path) {
@@ -94,7 +95,7 @@ fn main() -> anyhow::Result<()> {
                    "Failed to read input file as a pg control one: {e:#}\n\
                    Attempting to read it as layer file"
                );
-                print_layerfile(&cmd.path)?;
+                print_layerfile(&cmd.path).await?;
            }
        }
    };
@@ -113,12 +114,12 @@ fn read_pg_control_file(control_file_path: &Path) -> anyhow::Result<()> {
    Ok(())
 }

-fn print_layerfile(path: &Path) -> anyhow::Result<()> {
+async fn print_layerfile(path: &Path) -> anyhow::Result<()> {
    // Basic initialization of things that don't change after startup
    virtual_file::init(10);
    page_cache::init(100);
    let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
-    dump_layerfile_from_path(path, true, &ctx)
+    dump_layerfile_from_path(path, true, &ctx).await
 }

 fn handle_metadata(
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -19,12 +19,6 @@ use tokio::io;
 use tokio::io::AsyncWrite;
 use tracing::*;

-/// NB: This relies on a modified version of tokio_tar that does *not* write the
-/// end-of-archive marker (1024 zero bytes), when the Builder struct is dropped
-/// without explicitly calling 'finish' or 'into_inner'!
-///
-/// See https://github.com/neondatabase/tokio-tar/pull/1
-///
 use tokio_tar::{Builder, EntryType, Header};

 use crate::context::RequestContext;
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -396,8 +396,8 @@ fn start_pageserver(

            let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial logical sizes completed"));

-            let init_sizes_done = tokio::select! {
-                _ = &mut init_sizes_done => {
+            let init_sizes_done = match tokio::time::timeout(timeout, &mut init_sizes_done).await {
+                Ok(_) => {
                    let now = std::time::Instant::now();
                    tracing::info!(
                        from_init_done_millis = (now - init_done).as_millis(),
@@ -406,7 +406,7 @@ fn start_pageserver(
                    );
                    None
                }
-                _ = tokio::time::sleep(timeout) => {
+                Err(_) => {
                    tracing::info!(
                        timeout_millis = timeout.as_millis(),
                        "Initial logical size timeout elapsed; starting background jobs"
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -33,7 +33,8 @@ use crate::tenant::config::TenantConf;
 use crate::tenant::config::TenantConfOpt;
 use crate::tenant::{TENANT_ATTACHING_MARKER_FILENAME, TIMELINES_SEGMENT_NAME};
 use crate::{
-    IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TIMELINE_UNINIT_MARK_SUFFIX,
+    IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX,
+    TIMELINE_UNINIT_MARK_SUFFIX,
 };

 pub mod defaults {
@@ -601,6 +602,17 @@ impl PageServerConf {
        )
    }

+    /// Path made from this timeline's `timeline_path` and [`TIMELINE_DELETE_MARK_SUFFIX`],
+    /// combined by `path_with_suffix_extension`.
+    pub fn timeline_delete_mark_file_path(
+        &self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+    ) -> PathBuf {
+        let timeline_path = self.timeline_path(&tenant_id, &timeline_id);
+        path_with_suffix_extension(timeline_path, TIMELINE_DELETE_MARK_SUFFIX)
+    }
+
    pub fn traces_path(&self) -> PathBuf {
        self.workdir.join("traces")
    }
@@ -655,7 +667,6 @@ impl PageServerConf {
        match pg_version {
            14 => Ok(path.join(format!("v{pg_version}"))),
            15 => Ok(path.join(format!("v{pg_version}"))),
-            16 => Ok(path.join(format!("v{pg_version}"))),
            _ => bail!("Unsupported postgres version: {}", pg_version),
        }
    }
@@ -664,7 +675,6 @@ impl PageServerConf {
        match pg_version {
            14 => Ok(self.pg_distrib_dir(pg_version)?.join("bin")),
            15 => Ok(self.pg_distrib_dir(pg_version)?.join("bin")),
-            16 => Ok(self.pg_distrib_dir(pg_version)?.join("bin")),
            _ => bail!("Unsupported postgres version: {}", pg_version),
        }
    }
@@ -672,7 +682,6 @@ impl PageServerConf {
        match pg_version {
            14 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")),
            15 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")),
-            16 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")),
            _ => bail!("Unsupported postgres version: {}", pg_version),
        }
    }
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -60,7 +60,7 @@ use utils::serde_percent::Percent;
 use crate::{
    config::PageServerConf,
    task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
-    tenant::{self, storage_layer::PersistentLayer, Timeline},
+    tenant::{self, storage_layer::PersistentLayer, timeline::EvictionError, Timeline},
 };

 #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
@@ -166,11 +166,11 @@ async fn disk_usage_eviction_task(
        .await;

        let sleep_until = start + task_config.period;
-        tokio::select! {
-            _ = tokio::time::sleep_until(sleep_until) => {},
-            _ = cancel.cancelled() => {
-                break
-            }
+        if tokio::time::timeout_at(sleep_until, cancel.cancelled())
+            .await
+            .is_ok()
+        {
+            break;
        }
    }
 }
@@ -390,13 +390,22 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
                    assert_eq!(results.len(), batch.len());
                    for (result, layer) in results.into_iter().zip(batch.iter()) {
                        match result {
-                            Some(Ok(true)) => {
+                            Some(Ok(())) => {
                                usage_assumed.add_available_bytes(layer.file_size());
                            }
-                            Some(Ok(false)) => {
-                                // this is:
-                                // - Replacement::{NotFound, Unexpected}
-                                // - it cannot be is_remote_layer, filtered already
+                            Some(Err(EvictionError::CannotEvictRemoteLayer)) => {
+                                unreachable!("get_local_layers_for_disk_usage_eviction finds only local layers")
+                            }
+                            Some(Err(EvictionError::FileNotFound)) => {
+                                evictions_failed.file_sizes += layer.file_size();
+                                evictions_failed.count += 1;
+                            }
+                            Some(Err(
+                                e @ EvictionError::LayerNotFound(_)
+                                | e @ EvictionError::StatFailed(_),
+                            )) => {
+                                let e = utils::error::report_compact_sources(&e);
+                                warn!(%layer, "failed to evict layer: {e}");
                                evictions_failed.file_sizes += layer.file_size();
                                evictions_failed.count += 1;
                            }
@@ -404,10 +413,6 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
                                assert!(cancel.is_cancelled());
                                return;
                            }
-                            Some(Err(e)) => {
-                                // we really shouldn't be getting this, precondition failure
-                                error!("failed to evict layer: {:#}", e);
-                            }
                        }
                    }
                }
@@ -540,12 +545,12 @@ async fn collect_eviction_candidates(
        // We could be better here, e.g., sum of all L0 layers + most recent L1 layer.
        // That's what's typically used by the various background loops.
        //
-        // The default can be overriden with a fixed value in the tenant conf.
+        // The default can be overridden with a fixed value in the tenant conf.
        // A default override can be put in the default tenant conf in the pageserver.toml.
        let min_resident_size = if let Some(s) = tenant.get_min_resident_size_override() {
            debug!(
                tenant_id=%tenant.tenant_id(),
-                overriden_size=s,
+                overridden_size=s,
                "using overridden min resident size for tenant"
            );
            s
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -994,31 +994,29 @@ async fn timeline_gc_handler(
 // Run compaction immediately on given timeline.
 async fn timeline_compact_handler(
    request: Request<Body>,
-    _cancel: CancellationToken,
+    cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    check_permission(&request, Some(tenant_id))?;

-    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-    let result_receiver = mgr::immediate_compact(tenant_id, timeline_id, &ctx)
-        .await
-        .context("spawn compaction task")
-        .map_err(ApiError::InternalServerError)?;
-
-    let result: anyhow::Result<()> = result_receiver
-        .await
-        .context("receive compaction result")
-        .map_err(ApiError::InternalServerError)?;
-    result.map_err(ApiError::InternalServerError)?;
-
-    json_response(StatusCode::OK, ())
+    async {
+        let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
+        let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
+        timeline
+            .compact(&cancel, &ctx)
+            .await
+            .map_err(ApiError::InternalServerError)?;
+        json_response(StatusCode::OK, ())
+    }
+    .instrument(info_span!("manual_compaction", %tenant_id, %timeline_id))
+    .await
 }

 // Run checkpoint immediately on given timeline.
 async fn timeline_checkpoint_handler(
    request: Request<Body>,
-    _cancel: CancellationToken,
+    cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
@@ -1031,13 +1029,13 @@ async fn timeline_checkpoint_handler(
            .await
            .map_err(ApiError::InternalServerError)?;
        timeline
-            .compact(&ctx)
+            .compact(&cancel, &ctx)
            .await
            .map_err(ApiError::InternalServerError)?;

        json_response(StatusCode::OK, ())
    }
-    .instrument(info_span!("manual_checkpoint", tenant_id = %tenant_id, timeline_id = %timeline_id))
+    .instrument(info_span!("manual_checkpoint", %tenant_id, %timeline_id))
    .await
 }

--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -109,6 +109,8 @@ pub const TEMP_FILE_SUFFIX: &str = "___temp";
 /// Full path: `tenants/<tenant_id>/timelines/<timeline_id>___uninit`.
 pub const TIMELINE_UNINIT_MARK_SUFFIX: &str = "___uninit";

+pub const TIMELINE_DELETE_MARK_SUFFIX: &str = "___delete";
+
 /// A marker file to prevent pageserver from loading a certain tenant on restart.
 /// Different from [`TIMELINE_UNINIT_MARK_SUFFIX`] due to semantics of the corresponding
 /// `ignore` management API command, that expects the ignored tenant to be properly loaded
@@ -123,15 +125,30 @@ pub fn is_temporary(path: &Path) -> bool {
    }
 }

-pub fn is_uninit_mark(path: &Path) -> bool {
+fn ends_with_suffix(path: &Path, suffix: &str) -> bool {
    match path.file_name() {
-        Some(name) => name
-            .to_string_lossy()
-            .ends_with(TIMELINE_UNINIT_MARK_SUFFIX),
+        Some(name) => name.to_string_lossy().ends_with(suffix),
        None => false,
    }
 }

+pub fn is_uninit_mark(path: &Path) -> bool {
+    ends_with_suffix(path, TIMELINE_UNINIT_MARK_SUFFIX)
+}
+
+pub fn is_delete_mark(path: &Path) -> bool {
+    ends_with_suffix(path, TIMELINE_DELETE_MARK_SUFFIX)
+}
+
+/// Tells whether `e` carries an underlying I/O error whose kind is `NotFound`.
+///
+/// Errors for which `io_error()` returns `None`, and I/O errors of any other kind, give
+/// `false`.
+fn is_walkdir_io_not_found(e: &walkdir::Error) -> bool {
+    let io_kind = e.io_error().map(std::io::Error::kind);
+    matches!(io_kind, Some(std::io::ErrorKind::NotFound))
+}
+
 /// During pageserver startup, we need to order operations not to exhaust tokio worker threads by
 /// blocking.
 ///
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -6,7 +6,6 @@ use metrics::{
    IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
 };
 use once_cell::sync::Lazy;
-use pageserver_api::models::TenantState;
 use strum::VariantNames;
 use strum_macros::{EnumVariantNames, IntoStaticStr};
 use utils::id::{TenantId, TimelineId};
@@ -74,7 +73,7 @@ pub static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy<IntCounterVec> = Lazy::new(|| {
 // Buckets for background operations like compaction, GC, size calculation
 const STORAGE_OP_BUCKETS: &[f64] = &[0.010, 0.100, 1.0, 10.0, 100.0, 1000.0];

-pub static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
+pub(crate) static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
    register_histogram_vec!(
        "pageserver_storage_operations_seconds_global",
        "Time spent on storage operations",
@@ -84,18 +83,17 @@ pub static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-static READ_NUM_FS_LAYERS: Lazy<HistogramVec> = Lazy::new(|| {
-    register_histogram_vec!(
+pub(crate) static READ_NUM_FS_LAYERS: Lazy<Histogram> = Lazy::new(|| {
+    register_histogram!(
        "pageserver_read_num_fs_layers",
        "Number of persistent layers accessed for processing a read request, including those in the cache",
-        &["tenant_id", "timeline_id"],
        vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 10.0, 20.0, 50.0, 100.0],
    )
    .expect("failed to define a metric")
 });

 // Metrics collected on operations on the storage repository.
-pub static RECONSTRUCT_TIME: Lazy<Histogram> = Lazy::new(|| {
+pub(crate) static RECONSTRUCT_TIME: Lazy<Histogram> = Lazy::new(|| {
    register_histogram!(
        "pageserver_getpage_reconstruct_seconds",
        "Time spent in reconstruct_value (reconstruct a page from deltas)",
@@ -104,7 +102,7 @@ pub static RECONSTRUCT_TIME: Lazy<Histogram> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy<IntCounter> = Lazy::new(|| {
+pub(crate) static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
        "pageserver_materialized_cache_hits_direct_total",
        "Number of cache hits from materialized page cache without redo",
@@ -112,17 +110,16 @@ pub static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy<IntCounter> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-static GET_RECONSTRUCT_DATA_TIME: Lazy<HistogramVec> = Lazy::new(|| {
-    register_histogram_vec!(
+pub(crate) static GET_RECONSTRUCT_DATA_TIME: Lazy<Histogram> = Lazy::new(|| {
+    register_histogram!(
        "pageserver_getpage_get_reconstruct_data_seconds",
        "Time spent in get_reconstruct_value_data",
-        &["tenant_id", "timeline_id"],
        CRITICAL_OP_BUCKETS.into(),
    )
    .expect("failed to define a metric")
 });

-pub static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounter> = Lazy::new(|| {
+pub(crate) static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
        "pageserver_materialized_cache_hits_total",
        "Number of cache hits from materialized page cache",
@@ -246,11 +243,10 @@ pub static PAGE_CACHE_SIZE: Lazy<PageCacheSizeMetrics> = Lazy::new(|| PageCacheS
    },
 });

-static WAIT_LSN_TIME: Lazy<HistogramVec> = Lazy::new(|| {
-    register_histogram_vec!(
+pub(crate) static WAIT_LSN_TIME: Lazy<Histogram> = Lazy::new(|| {
+    register_histogram!(
        "pageserver_wait_lsn_seconds",
        "Time spent waiting for WAL to arrive",
-        &["tenant_id", "timeline_id"],
        CRITICAL_OP_BUCKETS.into(),
    )
    .expect("failed to define a metric")
@@ -284,7 +280,7 @@ static REMOTE_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub static REMOTE_ONDEMAND_DOWNLOADED_LAYERS: Lazy<IntCounter> = Lazy::new(|| {
+pub(crate) static REMOTE_ONDEMAND_DOWNLOADED_LAYERS: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
        "pageserver_remote_ondemand_downloaded_layers_total",
        "Total on-demand downloaded layers"
@@ -292,7 +288,7 @@ pub static REMOTE_ONDEMAND_DOWNLOADED_LAYERS: Lazy<IntCounter> = Lazy::new(|| {
    .unwrap()
 });

-pub static REMOTE_ONDEMAND_DOWNLOADED_BYTES: Lazy<IntCounter> = Lazy::new(|| {
+pub(crate) static REMOTE_ONDEMAND_DOWNLOADED_BYTES: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
        "pageserver_remote_ondemand_downloaded_bytes_total",
        "Total bytes of layers on-demand downloaded",
@@ -309,16 +305,29 @@ static CURRENT_LOGICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
    .expect("failed to define current logical size metric")
 });

-pub static TENANT_STATE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
+pub(crate) static TENANT_STATE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
    register_uint_gauge_vec!(
        "pageserver_tenant_states_count",
        "Count of tenants per state",
-        &["tenant_id", "state"]
+        &["state"]
    )
    .expect("Failed to register pageserver_tenant_states_count metric")
 });

-pub static TENANT_SYNTHETIC_SIZE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
+/// A set of broken tenants, keyed by the `tenant_id` label.
+///
+/// Broken tenants are expected to be rare enough that a set is fine. "Set" here means one new
+/// timeseries per broken tenant.
+pub(crate) static BROKEN_TENANTS_SET: Lazy<UIntGaugeVec> = Lazy::new(|| {
+    register_uint_gauge_vec!(
+        "pageserver_broken_tenants_count",
+        "Set of broken tenants",
+        &["tenant_id"]
+    )
+    .expect("Failed to register pageserver_broken_tenants_count metric")
+});
+
+pub(crate) static TENANT_SYNTHETIC_SIZE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
    register_uint_gauge_vec!(
        "pageserver_tenant_synthetic_cached_size_bytes",
        "Synthetic size of each tenant in bytes",
@@ -376,7 +385,7 @@ static EVICTIONS_WITH_LOW_RESIDENCE_DURATION: Lazy<IntCounterVec> = Lazy::new(||
    .expect("failed to define a metric")
 });

-pub static UNEXPECTED_ONDEMAND_DOWNLOADS: Lazy<IntCounter> = Lazy::new(|| {
+pub(crate) static UNEXPECTED_ONDEMAND_DOWNLOADS: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
        "pageserver_unexpected_ondemand_downloads_count",
        "Number of unexpected on-demand downloads. \
@@ -499,23 +508,31 @@ const STORAGE_IO_TIME_BUCKETS: &[f64] = &[
    30.000,   // 30000 ms
 ];

-const STORAGE_IO_TIME_OPERATIONS: &[&str] = &[
-    "open", "close", "read", "write", "seek", "fsync", "gc", "metadata",
-];
-
-const STORAGE_IO_SIZE_OPERATIONS: &[&str] = &["read", "write"];
-
-pub static STORAGE_IO_TIME: Lazy<HistogramVec> = Lazy::new(|| {
+/// Tracks time taken by fs operations near VirtualFile.
+///
+/// Operations:
+/// - open ([`std::fs::OpenOptions::open`])
+/// - close (dropping [`std::fs::File`])
+/// - close-by-replace (close by replacement algorithm)
+/// - read (`read_at`)
+/// - write (`write_at`)
+/// - seek (modify internal position or file length query)
+/// - fsync ([`std::fs::File::sync_all`])
+/// - metadata ([`std::fs::File::metadata`])
+pub(crate) static STORAGE_IO_TIME: Lazy<HistogramVec> = Lazy::new(|| {
    register_histogram_vec!(
        "pageserver_io_operations_seconds",
        "Time spent in IO operations",
-        &["operation", "tenant_id", "timeline_id"],
+        &["operation"],
        STORAGE_IO_TIME_BUCKETS.into()
    )
    .expect("failed to define a metric")
 });

-pub static STORAGE_IO_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
+const STORAGE_IO_SIZE_OPERATIONS: &[&str] = &["read", "write"];
+
+// Needed for the https://neonprod.grafana.net/d/5uK9tHL4k/picking-tenant-for-relocation?orgId=1
+pub(crate) static STORAGE_IO_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
    register_int_gauge_vec!(
        "pageserver_io_operations_bytes_total",
        "Total amount of bytes read/written in IO operations",
@@ -605,7 +622,7 @@ static REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST: Lazy<HistogramVec> = Lazy::new
         at a given instant. It gives you a better idea of the queue depth \
         than plotting the gauge directly, since operations may complete faster \
         than the sampling interval.",
-        &["tenant_id", "timeline_id", "file_kind", "op_kind"],
+        &["file_kind", "op_kind"],
        // The calls_unfinished gauge is an integer gauge, hence we have integer buckets.
        vec![0.0, 1.0, 2.0, 4.0, 6.0, 8.0, 10.0, 15.0, 20.0, 40.0, 60.0, 80.0, 100.0, 500.0],
    )
@@ -662,18 +679,18 @@ impl RemoteOpFileKind {
    }
 }

-pub static REMOTE_OPERATION_TIME: Lazy<HistogramVec> = Lazy::new(|| {
+pub(crate) static REMOTE_OPERATION_TIME: Lazy<HistogramVec> = Lazy::new(|| {
    register_histogram_vec!(
        "pageserver_remote_operation_seconds",
        "Time spent on remote storage operations. \
-        Grouped by tenant, timeline, operation_kind and status. \
+        Grouped by file_kind, op_kind and status. \
        Does not account for time spent waiting in remote timeline client's queues.",
-        &["tenant_id", "timeline_id", "file_kind", "op_kind", "status"]
+        &["file_kind", "op_kind", "status"]
    )
    .expect("failed to define a metric")
 });

-pub static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
+pub(crate) static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "pageserver_tenant_task_events",
        "Number of task start/stop/fail events.",
@@ -682,7 +699,7 @@ pub static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
    .expect("Failed to register tenant_task_events metric")
 });

-pub static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
+pub(crate) static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "pageserver_background_loop_period_overrun_count",
        "Incremented whenever warn_when_period_overrun() logs a warning.",
@@ -693,7 +710,7 @@ pub static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy<IntCounterVec> = Lazy::new

 // walreceiver metrics

-pub static WALRECEIVER_STARTED_CONNECTIONS: Lazy<IntCounter> = Lazy::new(|| {
+pub(crate) static WALRECEIVER_STARTED_CONNECTIONS: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
        "pageserver_walreceiver_started_connections_total",
        "Number of started walreceiver connections"
@@ -701,7 +718,7 @@ pub static WALRECEIVER_STARTED_CONNECTIONS: Lazy<IntCounter> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub static WALRECEIVER_ACTIVE_MANAGERS: Lazy<IntGauge> = Lazy::new(|| {
+pub(crate) static WALRECEIVER_ACTIVE_MANAGERS: Lazy<IntGauge> = Lazy::new(|| {
    register_int_gauge!(
        "pageserver_walreceiver_active_managers",
        "Number of active walreceiver managers"
@@ -709,7 +726,7 @@ pub static WALRECEIVER_ACTIVE_MANAGERS: Lazy<IntGauge> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub static WALRECEIVER_SWITCHES: Lazy<IntCounterVec> = Lazy::new(|| {
+pub(crate) static WALRECEIVER_SWITCHES: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "pageserver_walreceiver_switches_total",
        "Number of walreceiver manager change_connection calls",
@@ -718,7 +735,7 @@ pub static WALRECEIVER_SWITCHES: Lazy<IntCounterVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub static WALRECEIVER_BROKER_UPDATES: Lazy<IntCounter> = Lazy::new(|| {
+pub(crate) static WALRECEIVER_BROKER_UPDATES: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
        "pageserver_walreceiver_broker_updates_total",
        "Number of received broker updates in walreceiver"
@@ -726,7 +743,7 @@ pub static WALRECEIVER_BROKER_UPDATES: Lazy<IntCounter> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub static WALRECEIVER_CANDIDATES_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
+pub(crate) static WALRECEIVER_CANDIDATES_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "pageserver_walreceiver_candidates_events_total",
        "Number of walreceiver candidate events",
@@ -735,10 +752,10 @@ pub static WALRECEIVER_CANDIDATES_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub static WALRECEIVER_CANDIDATES_ADDED: Lazy<IntCounter> =
+pub(crate) static WALRECEIVER_CANDIDATES_ADDED: Lazy<IntCounter> =
    Lazy::new(|| WALRECEIVER_CANDIDATES_EVENTS.with_label_values(&["add"]));

-pub static WALRECEIVER_CANDIDATES_REMOVED: Lazy<IntCounter> =
+pub(crate) static WALRECEIVER_CANDIDATES_REMOVED: Lazy<IntCounter> =
    Lazy::new(|| WALRECEIVER_CANDIDATES_EVENTS.with_label_values(&["remove"]));

 // Metrics collected on WAL redo operations
@@ -785,7 +802,7 @@ macro_rules! redo_bytes_histogram_count_buckets {
    };
 }

-pub static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
+pub(crate) static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
    register_histogram!(
        "pageserver_wal_redo_seconds",
        "Time spent on WAL redo",
@@ -794,7 +811,7 @@ pub static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub static WAL_REDO_WAIT_TIME: Lazy<Histogram> = Lazy::new(|| {
+pub(crate) static WAL_REDO_WAIT_TIME: Lazy<Histogram> = Lazy::new(|| {
    register_histogram!(
        "pageserver_wal_redo_wait_seconds",
        "Time spent waiting for access to the Postgres WAL redo process",
@@ -803,7 +820,7 @@ pub static WAL_REDO_WAIT_TIME: Lazy<Histogram> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub static WAL_REDO_RECORDS_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
+pub(crate) static WAL_REDO_RECORDS_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
    register_histogram!(
        "pageserver_wal_redo_records_histogram",
        "Histogram of number of records replayed per redo in the Postgres WAL redo process",
@@ -812,7 +829,7 @@ pub static WAL_REDO_RECORDS_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub static WAL_REDO_BYTES_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
+pub(crate) static WAL_REDO_BYTES_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
    register_histogram!(
        "pageserver_wal_redo_bytes_histogram",
        "Histogram of number of records replayed per redo sent to Postgres",
@@ -821,7 +838,8 @@ pub static WAL_REDO_BYTES_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub static WAL_REDO_RECORD_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
+// FIXME: isn't this already included by WAL_REDO_RECORDS_HISTOGRAM which has _count?
+pub(crate) static WAL_REDO_RECORD_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
        "pageserver_replayed_wal_records_total",
        "Number of WAL records replayed in WAL redo process"
@@ -897,7 +915,6 @@ impl StorageTimeMetrics {
 pub struct TimelineMetrics {
    tenant_id: String,
    timeline_id: String,
-    pub get_reconstruct_data_time_histo: Histogram,
    pub flush_time_histo: StorageTimeMetrics,
    pub compact_time_histo: StorageTimeMetrics,
    pub create_images_time_histo: StorageTimeMetrics,
@@ -906,9 +923,7 @@ pub struct TimelineMetrics {
    pub load_layer_map_histo: StorageTimeMetrics,
    pub garbage_collect_histo: StorageTimeMetrics,
    pub last_record_gauge: IntGauge,
-    pub wait_lsn_time_histo: Histogram,
    pub resident_physical_size_gauge: UIntGauge,
-    pub read_num_fs_layers: Histogram,
    /// copy of LayeredTimeline.current_logical_size
    pub current_logical_size_gauge: UIntGauge,
    pub num_persistent_files_created: IntCounter,
@@ -925,9 +940,6 @@ impl TimelineMetrics {
    ) -> Self {
        let tenant_id = tenant_id.to_string();
        let timeline_id = timeline_id.to_string();
-        let get_reconstruct_data_time_histo = GET_RECONSTRUCT_DATA_TIME
-            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
-            .unwrap();
        let flush_time_histo =
            StorageTimeMetrics::new(StorageTimeOperation::LayerFlush, &tenant_id, &timeline_id);
        let compact_time_histo =
@@ -948,9 +960,6 @@ impl TimelineMetrics {
        let last_record_gauge = LAST_RECORD_LSN
            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
            .unwrap();
-        let wait_lsn_time_histo = WAIT_LSN_TIME
-            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
-            .unwrap();
        let resident_physical_size_gauge = RESIDENT_PHYSICAL_SIZE
            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
            .unwrap();
@@ -966,16 +975,12 @@ impl TimelineMetrics {
        let evictions = EVICTIONS
            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
            .unwrap();
-        let read_num_fs_layers = READ_NUM_FS_LAYERS
-            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
-            .unwrap();
        let evictions_with_low_residence_duration =
            evictions_with_low_residence_duration_builder.build(&tenant_id, &timeline_id);

        TimelineMetrics {
            tenant_id,
            timeline_id,
-            get_reconstruct_data_time_histo,
            flush_time_histo,
            compact_time_histo,
            create_images_time_histo,
@@ -984,7 +989,6 @@ impl TimelineMetrics {
            garbage_collect_histo,
            load_layer_map_histo,
            last_record_gauge,
-            wait_lsn_time_histo,
            resident_physical_size_gauge,
            current_logical_size_gauge,
            num_persistent_files_created,
@@ -993,7 +997,6 @@ impl TimelineMetrics {
            evictions_with_low_residence_duration: std::sync::RwLock::new(
                evictions_with_low_residence_duration,
            ),
-            read_num_fs_layers,
        }
    }
 }
@@ -1002,15 +1005,12 @@ impl Drop for TimelineMetrics {
    fn drop(&mut self) {
        let tenant_id = &self.tenant_id;
        let timeline_id = &self.timeline_id;
-        let _ = GET_RECONSTRUCT_DATA_TIME.remove_label_values(&[tenant_id, timeline_id]);
        let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, timeline_id]);
-        let _ = WAIT_LSN_TIME.remove_label_values(&[tenant_id, timeline_id]);
        let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
        let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
        let _ = NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, timeline_id]);
        let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, timeline_id]);
        let _ = EVICTIONS.remove_label_values(&[tenant_id, timeline_id]);
-        let _ = READ_NUM_FS_LAYERS.remove_label_values(&[tenant_id, timeline_id]);

        self.evictions_with_low_residence_duration
            .write()
@@ -1022,9 +1022,6 @@ impl Drop for TimelineMetrics {
            let _ =
                STORAGE_TIME_COUNT_PER_TIMELINE.remove_label_values(&[op, tenant_id, timeline_id]);
        }
-        for op in STORAGE_IO_TIME_OPERATIONS {
-            let _ = STORAGE_IO_TIME.remove_label_values(&[op, tenant_id, timeline_id]);
-        }

        for op in STORAGE_IO_SIZE_OPERATIONS {
            let _ = STORAGE_IO_SIZE.remove_label_values(&[op, tenant_id, timeline_id]);
@@ -1039,9 +1036,7 @@ impl Drop for TimelineMetrics {
 pub fn remove_tenant_metrics(tenant_id: &TenantId) {
    let tid = tenant_id.to_string();
    let _ = TENANT_SYNTHETIC_SIZE_METRIC.remove_label_values(&[&tid]);
-    for state in TenantState::VARIANTS {
-        let _ = TENANT_STATE_METRIC.remove_label_values(&[&tid, state]);
-    }
+    // we leave the BROKEN_TENANTS_SET entry if any
 }

 use futures::Future;
@@ -1056,9 +1051,7 @@ pub struct RemoteTimelineClientMetrics {
    tenant_id: String,
    timeline_id: String,
    remote_physical_size_gauge: Mutex<Option<UIntGauge>>,
-    remote_operation_time: Mutex<HashMap<(&'static str, &'static str, &'static str), Histogram>>,
    calls_unfinished_gauge: Mutex<HashMap<(&'static str, &'static str), IntGauge>>,
-    calls_started_hist: Mutex<HashMap<(&'static str, &'static str), Histogram>>,
    bytes_started_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
    bytes_finished_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
 }
@@ -1068,14 +1061,13 @@ impl RemoteTimelineClientMetrics {
        RemoteTimelineClientMetrics {
            tenant_id: tenant_id.to_string(),
            timeline_id: timeline_id.to_string(),
-            remote_operation_time: Mutex::new(HashMap::default()),
            calls_unfinished_gauge: Mutex::new(HashMap::default()),
-            calls_started_hist: Mutex::new(HashMap::default()),
            bytes_started_counter: Mutex::new(HashMap::default()),
            bytes_finished_counter: Mutex::new(HashMap::default()),
            remote_physical_size_gauge: Mutex::new(None),
        }
    }
+
    pub fn remote_physical_size_gauge(&self) -> UIntGauge {
        let mut guard = self.remote_physical_size_gauge.lock().unwrap();
        guard
@@ -1089,26 +1081,17 @@ impl RemoteTimelineClientMetrics {
            })
            .clone()
    }
+
    pub fn remote_operation_time(
        &self,
        file_kind: &RemoteOpFileKind,
        op_kind: &RemoteOpKind,
        status: &'static str,
    ) -> Histogram {
-        let mut guard = self.remote_operation_time.lock().unwrap();
        let key = (file_kind.as_str(), op_kind.as_str(), status);
-        let metric = guard.entry(key).or_insert_with(move || {
-            REMOTE_OPERATION_TIME
-                .get_metric_with_label_values(&[
-                    &self.tenant_id.to_string(),
-                    &self.timeline_id.to_string(),
-                    key.0,
-                    key.1,
-                    key.2,
-                ])
-                .unwrap()
-        });
-        metric.clone()
+        REMOTE_OPERATION_TIME
+            .get_metric_with_label_values(&[key.0, key.1, key.2])
+            .unwrap()
    }

    fn calls_unfinished_gauge(
@@ -1136,19 +1119,10 @@ impl RemoteTimelineClientMetrics {
        file_kind: &RemoteOpFileKind,
        op_kind: &RemoteOpKind,
    ) -> Histogram {
-        let mut guard = self.calls_started_hist.lock().unwrap();
        let key = (file_kind.as_str(), op_kind.as_str());
-        let metric = guard.entry(key).or_insert_with(move || {
-            REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST
-                .get_metric_with_label_values(&[
-                    &self.tenant_id.to_string(),
-                    &self.timeline_id.to_string(),
-                    key.0,
-                    key.1,
-                ])
-                .unwrap()
-        });
-        metric.clone()
+        REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST
+            .get_metric_with_label_values(&[key.0, key.1])
+            .unwrap()
    }

    fn bytes_started_counter(
@@ -1328,15 +1302,10 @@ impl Drop for RemoteTimelineClientMetrics {
            tenant_id,
            timeline_id,
            remote_physical_size_gauge,
-            remote_operation_time,
            calls_unfinished_gauge,
-            calls_started_hist,
            bytes_started_counter,
            bytes_finished_counter,
        } = self;
-        for ((a, b, c), _) in remote_operation_time.get_mut().unwrap().drain() {
-            let _ = REMOTE_OPERATION_TIME.remove_label_values(&[tenant_id, timeline_id, a, b, c]);
-        }
        for ((a, b), _) in calls_unfinished_gauge.get_mut().unwrap().drain() {
            let _ = REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE.remove_label_values(&[
                tenant_id,
@@ -1345,14 +1314,6 @@ impl Drop for RemoteTimelineClientMetrics {
                b,
            ]);
        }
-        for ((a, b), _) in calls_started_hist.get_mut().unwrap().drain() {
-            let _ = REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST.remove_label_values(&[
-                tenant_id,
-                timeline_id,
-                a,
-                b,
-            ]);
-        }
        for ((a, b), _) in bytes_started_counter.get_mut().unwrap().drain() {
            let _ = REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER.remove_label_values(&[
                tenant_id,
@@ -1434,15 +1395,51 @@ impl<F: Future<Output = Result<O, E>>, O, E> Future for MeasuredRemoteOp<F> {
 }

 pub fn preinitialize_metrics() {
-    // We want to alert on this metric increasing.
-    // Initialize it eagerly, so that our alert rule can distinguish absence of the metric from metric value 0.
-    assert_eq!(UNEXPECTED_ONDEMAND_DOWNLOADS.get(), 0);
-    UNEXPECTED_ONDEMAND_DOWNLOADS.reset();
+    // Python tests need these to be initialized eagerly, and we alert on some of them.
+    //
+    // FIXME(4813): make it so that we have no top level metrics, as this fn will easily fall out
+    // of sync with them:
+    // - global metrics reside in a Lazy<PageserverMetrics>
+    //   - access via crate::metrics::PS_METRICS.materialized_page_cache_hit.inc()
+    // - could move the statics into TimelineMetrics::new()?

-    // Same as above for this metric, but, it's a Vec-type metric for which we don't know all the labels.
-    BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT.reset();
+    // counters
+    [
+        &MATERIALIZED_PAGE_CACHE_HIT,
+        &MATERIALIZED_PAGE_CACHE_HIT_DIRECT,
+        &UNEXPECTED_ONDEMAND_DOWNLOADS,
+        &WALRECEIVER_STARTED_CONNECTIONS,
+        &WALRECEIVER_BROKER_UPDATES,
+        &WALRECEIVER_CANDIDATES_ADDED,
+        &WALRECEIVER_CANDIDATES_REMOVED,
+    ]
+    .into_iter()
+    .for_each(|c| {
+        Lazy::force(c);
+    });

-    // Python tests need these.
-    MATERIALIZED_PAGE_CACHE_HIT_DIRECT.get();
-    MATERIALIZED_PAGE_CACHE_HIT.get();
+    // countervecs
+    [&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT]
+        .into_iter()
+        .for_each(|c| {
+            Lazy::force(c);
+        });
+
+    // gauges
+    WALRECEIVER_ACTIVE_MANAGERS.get();
+
+    // histograms
+    [
+        &READ_NUM_FS_LAYERS,
+        &RECONSTRUCT_TIME,
+        &WAIT_LSN_TIME,
+        &WAL_REDO_TIME,
+        &WAL_REDO_WAIT_TIME,
+        &WAL_REDO_RECORDS_HISTOGRAM,
+        &WAL_REDO_BYTES_HISTOGRAM,
+    ]
+    .into_iter()
+    .for_each(|h| {
+        Lazy::force(h);
+    });
 }
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -130,11 +130,25 @@ pub static WALRECEIVER_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
 pub static BACKGROUND_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
    tokio::runtime::Builder::new_multi_thread()
        .thread_name("background op worker")
+        // if you change the number of worker threads please change the constant below
        .enable_all()
        .build()
        .expect("Failed to create background op runtime")
 });

+pub(crate) static BACKGROUND_RUNTIME_WORKER_THREADS: Lazy<usize> = Lazy::new(|| {
+    // force the runtime to be initialized first, so that any panic from building it happens here
+    let _ = BACKGROUND_RUNTIME.handle();
+    // replicates tokio-1.28.1::loom::sys::num_cpus, which is not publicly available;
+    // tokio would have already panicked on parse errors or NotUnicode
+    //
+    // this will be wrong if any of the runtimes gets its worker threads configured to something
+    // else, but that has not been needed in a long time.
+    std::env::var("TOKIO_WORKER_THREADS")
+        .map(|s| s.parse::<usize>().unwrap())
+        .unwrap_or_else(|_e| usize::max(1, num_cpus::get()))
+});
+
 #[derive(Debug, Clone, Copy)]
 pub struct PageserverTaskId(u64);

@@ -511,17 +525,13 @@ pub async fn shutdown_tasks(
                    warn!(name = task.name, tenant_id = ?tenant_id, timeline_id = ?timeline_id, kind = ?task_kind, "stopping left-over");
                }
            }
-            let join_handle = tokio::select! {
-                biased;
-                _ = &mut join_handle => { None },
-                _ = tokio::time::sleep(std::time::Duration::from_secs(1)) => {
-                    // allow some time to elapse before logging to cut down the number of log
-                    // lines.
-                    info!("waiting for {} to shut down", task.name);
-                    Some(join_handle)
-                }
-            };
-            if let Some(join_handle) = join_handle {
+            if tokio::time::timeout(std::time::Duration::from_secs(1), &mut join_handle)
+                .await
+                .is_err()
+            {
+                // allow some time to elapse before logging to cut down the number of log
+                // lines.
+                info!("waiting for {} to shut down", task.name);
                // we never handled this return value, but:
                // - we don't deschedule which would lead to is_cancelled
                // - panics are already logged (is_panicked)
@@ -549,7 +559,7 @@ pub fn current_task_id() -> Option<PageserverTaskId> {
 pub async fn shutdown_watcher() {
    let token = SHUTDOWN_TOKEN
        .try_with(|t| t.clone())
-        .expect("shutdown_requested() called in an unexpected task or thread");
+        .expect("shutdown_watcher() called in an unexpected task or thread");

    token.cancelled().await;
 }
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
--- a/pageserver/src/tenant/blob_io.rs
+++ b/pageserver/src/tenant/blob_io.rs
@@ -16,29 +16,19 @@ use crate::tenant::block_io::{BlockCursor, BlockReader};
 use std::cmp::min;
 use std::io::{Error, ErrorKind};

-/// For reading
-pub trait BlobCursor {
+impl<R> BlockCursor<R>
+where
+    R: BlockReader,
+{
    /// Read a blob into a new buffer.
-    fn read_blob(&mut self, offset: u64) -> Result<Vec<u8>, std::io::Error> {
+    pub fn read_blob(&mut self, offset: u64) -> Result<Vec<u8>, std::io::Error> {
        let mut buf = Vec::new();
        self.read_blob_into_buf(offset, &mut buf)?;
        Ok(buf)
    }
-
    /// Read blob into the given buffer. Any previous contents in the buffer
    /// are overwritten.
-    fn read_blob_into_buf(
-        &mut self,
-        offset: u64,
-        dstbuf: &mut Vec<u8>,
-    ) -> Result<(), std::io::Error>;
-}
-
-impl<R> BlobCursor for BlockCursor<R>
-where
-    R: BlockReader,
-{
-    fn read_blob_into_buf(
+    pub fn read_blob_into_buf(
        &mut self,
        offset: u64,
        dstbuf: &mut Vec<u8>,
--- a/pageserver/src/tenant/delete.rs
+++ b/pageserver/src/tenant/delete.rs
@@ -0,0 +1,574 @@
+use std::{
+    ops::{Deref, DerefMut},
+    sync::Arc,
+};
+
+use anyhow::Context;
+use pageserver_api::models::TimelineState;
+use tokio::sync::OwnedMutexGuard;
+use tracing::{debug, error, info, instrument, warn, Instrument, Span};
+use utils::{
+    crashsafe, fs_ext,
+    id::{TenantId, TimelineId},
+};
+
+use crate::{
+    config::PageServerConf,
+    task_mgr::{self, TaskKind},
+    tenant::{remote_timeline_client, DeleteTimelineError},
+    InitializationOrder,
+};
+
+use super::{
+    metadata::TimelineMetadata,
+    remote_timeline_client::{PersistIndexPartWithDeletedFlagError, RemoteTimelineClient},
+    CreateTimelineCause, Tenant, Timeline,
+};
+
+/// Now that the Timeline is in Stopping state, request all the related tasks to shut down.
+async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
+    // Stop the walreceiver first.
+    debug!("waiting for wal receiver to shutdown");
+    let maybe_started_walreceiver = { timeline.walreceiver.lock().unwrap().take() };
+    if let Some(walreceiver) = maybe_started_walreceiver {
+        walreceiver.stop().await;
+    }
+    debug!("wal receiver shutdown confirmed");
+
+    // Prevent new uploads from starting.
+    if let Some(remote_client) = timeline.remote_client.as_ref() {
+        let res = remote_client.stop();
+        match res {
+            Ok(()) => {}
+            Err(e) => match e {
+                remote_timeline_client::StopError::QueueUninitialized => {
+                    // This case shouldn't happen currently because the
+                    // load and attach code bails out if _any_ of the timeline fails to fetch its IndexPart.
+                    // That is, before we declare the Tenant as Active.
+                    // But we only allow calls to delete_timeline on Active tenants.
+                    return Err(DeleteTimelineError::Other(anyhow::anyhow!("upload queue is uninitialized, likely the timeline was in Broken state prior to this call because it failed to fetch IndexPart during load or attach, check the logs")));
+                }
+            },
+        }
+    }
+
+    // Stop & wait for the remaining timeline tasks, including upload tasks.
+    // NB: This and other delete_timeline calls do not run as a task_mgr task,
+    //     so, they are not affected by this shutdown_tasks() call.
+    info!("waiting for timeline tasks to shutdown");
+    task_mgr::shutdown_tasks(None, Some(timeline.tenant_id), Some(timeline.timeline_id)).await;
+
+    fail::fail_point!("timeline-delete-before-index-deleted-at", |_| {
+        Err(anyhow::anyhow!(
+            "failpoint: timeline-delete-before-index-deleted-at"
+        ))?
+    });
+    Ok(())
+}
+
+/// Mark timeline as deleted in S3 so we won't pick it up next time
+/// during attach or pageserver restart.
+/// See comment in persist_index_part_with_deleted_flag.
+async fn set_deleted_in_remote_index(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
+    if let Some(remote_client) = timeline.remote_client.as_ref() {
+        match remote_client.persist_index_part_with_deleted_flag().await {
+            // If we (now, or already) marked it successfully as deleted, we can proceed
+            Ok(()) | Err(PersistIndexPartWithDeletedFlagError::AlreadyDeleted(_)) => (),
+            // Bail out otherwise
+            //
+            // AlreadyInProgress shouldn't happen, because the 'delete_lock' prevents
+            // two tasks from performing the deletion at the same time. The first task
+            // that starts deletion should run it to completion.
+            Err(e @ PersistIndexPartWithDeletedFlagError::AlreadyInProgress(_))
+            | Err(e @ PersistIndexPartWithDeletedFlagError::Other(_)) => {
+                return Err(DeleteTimelineError::Other(anyhow::anyhow!(e)));
+            }
+        }
+    }
+    Ok(())
+}
+
+// We delete local files first, so if pageserver restarts after local files deletion then remote deletion is not continued.
+// This can be solved with inversion of these steps. But even if these steps are inverted then, when index_part.json
+// gets deleted there is no way to distinguish between "this timeline is good, we just didn't upload it to remote"
+// and "this timeline is deleted, we should continue with removal of local state". So to avoid the ambiguity we use a mark file.
+// After the index part is deleted, presence of this mark file identifies that there was a deletion intention.
+// So we can just remove the mark file.
+async fn create_delete_mark(
+    conf: &PageServerConf,
+    tenant_id: TenantId,
+    timeline_id: TimelineId,
+) -> Result<(), DeleteTimelineError> {
+    fail::fail_point!("timeline-delete-before-delete-mark", |_| {
+        Err(anyhow::anyhow!(
+            "failpoint: timeline-delete-before-delete-mark"
+        ))?
+    });
+    let marker_path = conf.timeline_delete_mark_file_path(tenant_id, timeline_id);
+
+    // Note: we're ok to replace existing file.
+    let _ = std::fs::OpenOptions::new()
+        .write(true)
+        .create(true)
+        .open(&marker_path)
+        .with_context(|| format!("could not create delete marker file {marker_path:?}"))?;
+
+    crashsafe::fsync_file_and_parent(&marker_path).context("sync_mark")?;
+    Ok(())
+}
+
+/// Grab the layer_removal_cs lock, and actually perform the deletion.
+///
+/// This lock prevents GC or compaction from running at the same time.
+/// The GC task doesn't register itself with the timeline it's operating on,
+/// so it might still be running even though we called `shutdown_tasks`.
+///
+/// Note that there are still other race conditions between
+/// GC, compaction and timeline deletion. See
+/// <https://github.com/neondatabase/neon/issues/2671>
+///
+/// No timeout here, GC & Compaction should be responsive to the
+/// `TimelineState::Stopping` change.
+async fn delete_local_layer_files(
+    conf: &PageServerConf,
+    tenant_id: TenantId,
+    timeline: &Timeline,
+) -> anyhow::Result<()> {
+    info!("waiting for layer_removal_cs.lock()");
+    let layer_removal_guard = timeline.layer_removal_cs.lock().await;
+    info!("got layer_removal_cs.lock(), deleting layer files");
+
+    // NB: storage_sync upload tasks that reference these layers have been cancelled
+    //     by the caller.
+
+    let local_timeline_directory = conf.timeline_path(&tenant_id, &timeline.timeline_id);
+
+    fail::fail_point!("timeline-delete-before-rm", |_| {
+        Err(anyhow::anyhow!("failpoint: timeline-delete-before-rm"))?
+    });
+
+    // NB: This need not be atomic because the deleted flag in the IndexPart
+    // will be observed during tenant/timeline load. The deletion will be resumed there.
+    //
+    // For configurations without remote storage, we guarantee crash-safety by persisting the delete mark file.
+    //
+    // Note that here we do not bail out on std::io::ErrorKind::NotFound.
+    // This can happen if we're called a second time, e.g.,
+    // because of a previous failure/cancellation at/after
+    // failpoint timeline-delete-after-rm.
+    //
+    // It can also happen if we race with tenant detach, because,
+    // it doesn't grab the layer_removal_cs lock.
+    //
+    // For now, log and continue.
+    // warn! level is technically not appropriate for the
+    // first case because we should expect retries to happen.
+    // But the error is so rare, it seems better to get attention if it happens.
+    //
+    // Note that metadata removal is skipped, this is not technically needed,
+    // but allows to reuse timeline loading code during resumed deletion.
+    // (we always expect that metadata is in place when timeline is being loaded)
+
+    #[cfg(feature = "testing")]
+    let mut counter = 0;
+
+    // Timeline directory may not exist if we failed to delete mark file and request was retried.
+    if !local_timeline_directory.exists() {
+        return Ok(());
+    }
+
+    let metadata_path = conf.metadata_path(&tenant_id, &timeline.timeline_id);
+
+    for entry in walkdir::WalkDir::new(&local_timeline_directory).contents_first(true) {
+        #[cfg(feature = "testing")]
+        {
+            counter += 1;
+            if counter == 2 {
+                fail::fail_point!("timeline-delete-during-rm", |_| {
+                    Err(anyhow::anyhow!("failpoint: timeline-delete-during-rm"))?
+                });
+            }
+        }
+
+        let entry = entry?;
+        if entry.path() == metadata_path {
+            debug!("found metadata, skipping");
+            continue;
+        }
+
+        if entry.path() == local_timeline_directory {
+            // Keeping directory because metadata file is still there
+            debug!("found timeline dir itself, skipping");
+            continue;
+        }
+
+        let metadata = match entry.metadata() {
+            Ok(metadata) => metadata,
+            Err(e) => {
+                if crate::is_walkdir_io_not_found(&e) {
+                    warn!(
+                        timeline_dir=?local_timeline_directory,
+                        path=?entry.path().display(),
+                        "got not found err while removing timeline dir, proceeding anyway"
+                    );
+                    continue;
+                }
+                anyhow::bail!(e);
+            }
+        };
+
+        let r = if metadata.is_dir() {
+            // There shouldn't be any directories inside timeline dir as of current layout.
+            tokio::fs::remove_dir(entry.path()).await
+        } else {
+            tokio::fs::remove_file(entry.path()).await
+        };
+
+        if let Err(e) = r {
+            if e.kind() == std::io::ErrorKind::NotFound {
+                warn!(
+                    timeline_dir=?local_timeline_directory,
+                    path=?entry.path().display(),
+                    "got not found err while removing timeline dir, proceeding anyway"
+                );
+                continue;
+            }
+            anyhow::bail!(anyhow::anyhow!(
+                "Failed to remove: {}. Error: {e}",
+                entry.path().display()
+            ));
+        }
+    }
+
+    info!("finished deleting layer files, releasing layer_removal_cs.lock()");
+    drop(layer_removal_guard);
+
+    fail::fail_point!("timeline-delete-after-rm", |_| {
+        Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm"))?
+    });
+
+    Ok(())
+}
+
+/// Removes remote layers and an index file after them.
+async fn delete_remote_layers_and_index(timeline: &Timeline) -> anyhow::Result<()> {
+    if let Some(remote_client) = &timeline.remote_client {
+        remote_client.delete_all().await.context("delete_all")?
+    };
+
+    Ok(())
+}
+
+// This function removes remaining traces of a timeline on disk.
+// Namely: metadata file, timeline directory, delete mark.
+// Note: io::ErrorKind::NotFound errors are ignored for metadata and timeline dir.
+// delete mark should be present because it is the last step during deletion.
+// (nothing can fail after its deletion)
+async fn cleanup_remaining_timeline_fs_traces(
+    conf: &PageServerConf,
+    tenant_id: TenantId,
+    timeline_id: TimelineId,
+) -> anyhow::Result<()> {
+    // Remove local metadata
+    tokio::fs::remove_file(conf.metadata_path(&tenant_id, &timeline_id))
+        .await
+        .or_else(fs_ext::ignore_not_found)
+        .context("remove metadata")?;
+
+    fail::fail_point!("timeline-delete-after-rm-metadata", |_| {
+        Err(anyhow::anyhow!(
+            "failpoint: timeline-delete-after-rm-metadata"
+        ))?
+    });
+
+    // Remove timeline dir
+    tokio::fs::remove_dir(conf.timeline_path(&tenant_id, &timeline_id))
+        .await
+        .or_else(fs_ext::ignore_not_found)
+        .context("timeline dir")?;
+
+    fail::fail_point!("timeline-delete-after-rm-dir", |_| {
+        Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm-dir"))?
+    });
+
+    // Remove delete mark
+    tokio::fs::remove_file(conf.timeline_delete_mark_file_path(tenant_id, timeline_id))
+        .await
+        .context("remove delete mark")
+}
+
+/// It is important that this gets called when DeletionGuard is being held.
+/// For more context see comments in [`DeleteTimelineFlow::prepare`]
+async fn remove_timeline_from_tenant(
+    tenant: &Tenant,
+    timeline_id: TimelineId,
+    _: &DeletionGuard, // using it as a witness
+) -> anyhow::Result<()> {
+    // Remove the timeline from the map.
+    let mut timelines = tenant.timelines.lock().unwrap();
+    let children_exist = timelines
+        .iter()
+        .any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline_id));
+    // XXX this can happen because `branch_timeline` doesn't check `TimelineState::Stopping`.
+    // We already deleted the layer files, so it's probably best to panic.
+    // (Ideally, above remove_dir_all is atomic so we don't see this timeline after a restart)
+    if children_exist {
+        panic!("Timeline grew children while we removed layer files");
+    }
+
+    timelines
+        .remove(&timeline_id)
+        .expect("timeline that we were deleting was concurrently removed from 'timelines' map");
+
+    drop(timelines);
+
+    Ok(())
+}
+
+/// Orchestrates the shutdown of all timeline tasks, removes the timeline's in-memory structures,
+/// and deletes its data from both disk and s3.
+/// The sequence of steps:
+/// 1. Set deleted_at in remote index part.
+/// 2. Create local mark file.
+/// 3. Delete local files except metadata (it is simpler this way, to be able to reuse timeline initialization code that expects metadata)
+/// 4. Delete remote layers
+/// 5. Delete index part
+/// 6. Delete meta, timeline directory
+/// 7. Delete mark file
+/// It is resumable from any step in case a crash/restart occurs.
+/// There are three entrypoints to the process:
+/// 1. [`DeleteTimelineFlow::run`] this is the main one called by a management api handler.
+/// 2. [`DeleteTimelineFlow::resume_deletion`] is called during restarts when local metadata is still present
+/// and we possibly need to continue deletion of remote files.
+/// 3. [`DeleteTimelineFlow::cleanup_remaining_timeline_fs_traces`] is used when we deleted remote
+/// index but still have local metadata, timeline directory and delete mark.
+/// Note the only other place that messes around timeline delete mark is the logic that scans directory with timelines during tenant load.
+#[derive(Default)]
+pub enum DeleteTimelineFlow {
+    #[default]
+    NotStarted,
+    InProgress,
+    Finished,
+}
+
+impl DeleteTimelineFlow {
+    // These steps are run in the context of management api request handler.
+    // Long running steps are continued to run in the background.
+    // NB: If this fails half-way through, and is retried, the retry will go through
+    // all the same steps again. Make sure the code here is idempotent, and don't
+    // error out if some of the shutdown tasks have already been completed!
+    #[instrument(skip_all, fields(tenant_id=%tenant.tenant_id, %timeline_id))]
+    pub async fn run(
+        tenant: &Arc<Tenant>,
+        timeline_id: TimelineId,
+    ) -> Result<(), DeleteTimelineError> {
+        let (timeline, mut guard) = Self::prepare(tenant, timeline_id)?;
+
+        guard.mark_in_progress()?;
+
+        stop_tasks(&timeline).await?;
+
+        set_deleted_in_remote_index(&timeline).await?;
+
+        create_delete_mark(tenant.conf, timeline.tenant_id, timeline.timeline_id).await?;
+
+        fail::fail_point!("timeline-delete-before-schedule", |_| {
+            Err(anyhow::anyhow!(
+                "failpoint: timeline-delete-before-schedule"
+            ))?
+        });
+
+        Self::schedule_background(guard, tenant.conf, Arc::clone(tenant), timeline);
+
+        Ok(())
+    }
+
+    fn mark_in_progress(&mut self) -> anyhow::Result<()> {
+        match self {
+            Self::Finished => anyhow::bail!("Bug. Is in finished state"),
+            Self::InProgress { .. } => { /* We're in a retry */ }
+            Self::NotStarted => { /* Fresh start */ }
+        }
+
+        *self = Self::InProgress;
+
+        Ok(())
+    }
+
+    /// Shortcut to create Timeline in stopping state and spawn deletion task.
+    pub async fn resume_deletion(
+        tenant: Arc<Tenant>,
+        timeline_id: TimelineId,
+        local_metadata: &TimelineMetadata,
+        remote_client: Option<RemoteTimelineClient>,
+        init_order: Option<&InitializationOrder>,
+    ) -> anyhow::Result<()> {
+        // Note: here we even skip populating layer map. Timeline is essentially uninitialized.
+        // RemoteTimelineClient is the only functioning part.
+        let timeline = tenant
+            .create_timeline_struct(
+                timeline_id,
+                local_metadata,
+                None, // Ancestor is not needed for deletion.
+                remote_client,
+                init_order,
+                // Important. We don't pass ancestor above because it can be missing.
+                // Thus we need to skip the validation here.
+                CreateTimelineCause::Delete,
+            )
+            .context("create_timeline_struct")?;
+
+        let mut guard = DeletionGuard(
+            Arc::clone(&timeline.delete_progress)
+                .try_lock_owned()
+                .expect("cannot happen because we're the only owner"),
+        );
+
+        // We need to do this because when console retries delete request we shouldn't answer with 404
+        // because 404 means successful deletion.
+        {
+            let mut locked = tenant.timelines.lock().unwrap();
+            locked.insert(timeline_id, Arc::clone(&timeline));
+        }
+
+        guard.mark_in_progress()?;
+
+        // Note that delete mark can be missing on resume
+        // because we create delete mark after we set deleted_at in the index part.
+        create_delete_mark(tenant.conf, tenant.tenant_id, timeline_id).await?;
+
+        Self::schedule_background(guard, tenant.conf, tenant, timeline);
+
+        Ok(())
+    }
+
+    pub async fn cleanup_remaining_timeline_fs_traces(
+        tenant: &Tenant,
+        timeline_id: TimelineId,
+    ) -> anyhow::Result<()> {
+        cleanup_remaining_timeline_fs_traces(tenant.conf, tenant.tenant_id, timeline_id).await
+    }
+
+    fn prepare(
+        tenant: &Tenant,
+        timeline_id: TimelineId,
+    ) -> Result<(Arc<Timeline>, DeletionGuard), DeleteTimelineError> {
+        // Note the interaction between this guard and deletion guard.
+        // Here we attempt to lock deletion guard when we're holding a lock on timelines.
+        // This is important because when you take into account `remove_timeline_from_tenant`
+        // we remove timeline from memory when we still hold the deletion guard.
+        // So here when timeline deletion is finished timeline won't be present in timelines map at all
+        // which makes the following sequence impossible:
+        // T1: get preempted right before the try_lock on `Timeline::delete_progress`
+        // T2: do a full deletion, acquire and drop `Timeline::delete_progress`
+        // T1: acquire deletion lock, do another `DeleteTimelineFlow::run`
+        // For more context see this discussion: `https://github.com/neondatabase/neon/pull/4552#discussion_r1253437346`
+        let timelines = tenant.timelines.lock().unwrap();
+
+        let timeline = match timelines.get(&timeline_id) {
+            Some(t) => t,
+            None => return Err(DeleteTimelineError::NotFound),
+        };
+
+        // Ensure that there are no child timelines **attached to that pageserver**,
+        // because detach removes files, which will break child branches
+        let children: Vec<TimelineId> = timelines
+            .iter()
+            .filter_map(|(id, entry)| {
+                if entry.get_ancestor_timeline_id() == Some(timeline_id) {
+                    Some(*id)
+                } else {
+                    None
+                }
+            })
+            .collect();
+
+        if !children.is_empty() {
+            return Err(DeleteTimelineError::HasChildren(children));
+        }
+
+        // Note that using try_lock here is important to avoid a deadlock.
+        // Here we take lock on timelines and then the deletion guard.
+        // At the end of the operation we're holding the guard and need to lock timelines map
+        // to remove the timeline from it.
+        // Whenever two locks are taken in different orders, this can result in a deadlock.
+        let delete_lock_guard = DeletionGuard(
+            Arc::clone(&timeline.delete_progress)
+                .try_lock_owned()
+                .map_err(|_| DeleteTimelineError::AlreadyInProgress)?,
+        );
+
+        timeline.set_state(TimelineState::Stopping);
+
+        Ok((Arc::clone(timeline), delete_lock_guard))
+    }
+
+    fn schedule_background(
+        guard: DeletionGuard,
+        conf: &'static PageServerConf,
+        tenant: Arc<Tenant>,
+        timeline: Arc<Timeline>,
+    ) {
+        let tenant_id = timeline.tenant_id;
+        let timeline_id = timeline.timeline_id;
+
+        task_mgr::spawn(
+            task_mgr::BACKGROUND_RUNTIME.handle(),
+            TaskKind::TimelineDeletionWorker,
+            Some(tenant_id),
+            Some(timeline_id),
+            "timeline_delete",
+            false,
+            async move {
+                if let Err(err) = Self::background(guard, conf, &tenant, &timeline).await {
+                    error!("Error: {err:#}");
+                    timeline.set_broken(format!("{err:#}"))
+                };
+                Ok(())
+            }
+            .instrument({
+                let span =
+                    tracing::info_span!(parent: None, "delete_timeline", tenant_id=%tenant_id, timeline_id=%timeline_id);
+                span.follows_from(Span::current());
+                span
+            }),
+        );
+    }
+
+    async fn background(
+        mut guard: DeletionGuard,
+        conf: &PageServerConf,
+        tenant: &Tenant,
+        timeline: &Timeline,
+    ) -> Result<(), DeleteTimelineError> {
+        delete_local_layer_files(conf, tenant.tenant_id, timeline).await?;
+
+        delete_remote_layers_and_index(timeline).await?;
+
+        pausable_failpoint!("in_progress_delete");
+
+        cleanup_remaining_timeline_fs_traces(conf, tenant.tenant_id, timeline.timeline_id).await?;
+
+        remove_timeline_from_tenant(tenant, timeline.timeline_id, &guard).await?;
+
+        *guard.0 = Self::Finished;
+
+        Ok(())
+    }
+}
+
+struct DeletionGuard(OwnedMutexGuard<DeleteTimelineFlow>);
+
+impl Deref for DeletionGuard {
+    type Target = DeleteTimelineFlow;
+
+    fn deref(&self) -> &Self::Target {
+        &self.0
+    }
+}
+
+impl DerefMut for DeletionGuard {
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        &mut self.0
+    }
+}
--- a/pageserver/src/tenant/ephemeral_file.rs
+++ b/pageserver/src/tenant/ephemeral_file.rs
@@ -328,7 +328,7 @@ fn to_io_error(e: anyhow::Error, context: &str) -> io::Error {
 #[cfg(test)]
 mod tests {
    use super::*;
-    use crate::tenant::blob_io::{BlobCursor, BlobWriter};
+    use crate::tenant::blob_io::BlobWriter;
    use crate::tenant::block_io::BlockCursor;
    use rand::{seq::SliceRandom, thread_rng, RngCore};
    use std::fs;
--- a/pageserver/src/tenant/layer_map.rs
+++ b/pageserver/src/tenant/layer_map.rs
@@ -626,17 +626,17 @@ impl LayerMap {

    /// debugging function to print out the contents of the layer map
    #[allow(unused)]
-    pub fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
+    pub async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
        println!("Begin dump LayerMap");

        println!("open_layer:");
        if let Some(open_layer) = &self.open_layer {
-            open_layer.dump(verbose, ctx)?;
+            open_layer.dump(verbose, ctx).await?;
        }

        println!("frozen_layers:");
        for frozen_layer in self.frozen_layers.iter() {
-            frozen_layer.dump(verbose, ctx)?;
+            frozen_layer.dump(verbose, ctx).await?;
        }

        println!("historic_layers:");
--- a/pageserver/src/tenant/metadata.rs
+++ b/pageserver/src/tenant/metadata.rs
@@ -9,10 +9,11 @@
 //! [`remote_timeline_client`]: super::remote_timeline_client

 use std::fs::{File, OpenOptions};
-use std::io::Write;
+use std::io::{self, Write};

 use anyhow::{bail, ensure, Context};
 use serde::{Deserialize, Serialize};
+use thiserror::Error;
 use tracing::info_span;
 use utils::bin_ser::SerializeError;
 use utils::{
@@ -267,24 +268,24 @@ pub fn save_metadata(
    Ok(())
 }

+#[derive(Error, Debug)]
+pub enum LoadMetadataError {
+    #[error(transparent)]
+    Read(#[from] io::Error),
+
+    #[error(transparent)]
+    Decode(#[from] anyhow::Error),
+}
+
 pub fn load_metadata(
    conf: &'static PageServerConf,
    tenant_id: &TenantId,
    timeline_id: &TimelineId,
-) -> anyhow::Result<TimelineMetadata> {
+) -> Result<TimelineMetadata, LoadMetadataError> {
    let metadata_path = conf.metadata_path(tenant_id, timeline_id);
-    let metadata_bytes = std::fs::read(&metadata_path).with_context(|| {
-        format!(
-            "Failed to read metadata bytes from path {}",
-            metadata_path.display()
-        )
-    })?;
-    TimelineMetadata::from_bytes(&metadata_bytes).with_context(|| {
-        format!(
-            "Failed to parse metadata bytes from path {}",
-            metadata_path.display()
-        )
-    })
+    let metadata_bytes = std::fs::read(metadata_path)?;
+
+    Ok(TimelineMetadata::from_bytes(&metadata_bytes)?)
 }

 #[cfg(test)]
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -26,6 +26,8 @@ use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME};
 use utils::fs_ext::PathExt;
 use utils::id::{TenantId, TimelineId};

+use super::delete::DeleteTimelineFlow;
+
 /// The tenants known to the pageserver.
 /// The enum variants are used to distinguish the different states that the pageserver can be in.
 enum TenantsMap {
@@ -233,11 +235,17 @@ pub fn schedule_local_tenant_processing(
 /// That could be easily misinterpreted by control plane, the consumer of the
 /// management API. For example, it could attach the tenant on a different pageserver.
 /// We would then be in split-brain once this pageserver restarts.
-#[instrument]
+#[instrument(skip_all)]
 pub async fn shutdown_all_tenants() {
+    shutdown_all_tenants0(&TENANTS).await
+}
+
+async fn shutdown_all_tenants0(tenants: &tokio::sync::RwLock<TenantsMap>) {
+    use utils::completion;
+
    // Prevent new tenants from being created.
    let tenants_to_shut_down = {
-        let mut m = TENANTS.write().await;
+        let mut m = tenants.write().await;
        match &mut *m {
            TenantsMap::Initializing => {
                *m = TenantsMap::ShuttingDown(HashMap::default());
@@ -262,14 +270,41 @@ pub async fn shutdown_all_tenants() {
    for (tenant_id, tenant) in tenants_to_shut_down {
        join_set.spawn(
            async move {
-                let freeze_and_flush = true;
+                // ordering shouldn't matter for this, either we store true right away or never
+                let ordering = std::sync::atomic::Ordering::Relaxed;
+                let joined_other = std::sync::atomic::AtomicBool::new(false);

-                match tenant.shutdown(freeze_and_flush).await {
-                    Ok(()) => debug!("tenant successfully stopped"),
-                    Err(super::ShutdownError::AlreadyStopping) => {
-                        warn!("tenant was already shutting down")
+                let mut shutdown = std::pin::pin!(async {
+                    let freeze_and_flush = true;
+
+                    let res = {
+                        let (_guard, shutdown_progress) = completion::channel();
+                        tenant.shutdown(shutdown_progress, freeze_and_flush).await
+                    };
+
+                    if let Err(other_progress) = res {
+                        // join the other shutdown in progress
+                        joined_other.store(true, ordering);
+                        other_progress.wait().await;
                    }
-                }
+                });
+
+                // in practice we might not have a lot of time to go, since systemd is going to
+                // SIGKILL us at 10s, but we can try. delete tenant might take a while, so put out
+                // a warning.
+                let warning = std::time::Duration::from_secs(5);
+                let mut warning = std::pin::pin!(tokio::time::sleep(warning));
+
+                tokio::select! {
+                    _ = &mut shutdown => {},
+                    _ = &mut warning => {
+                        let joined_other = joined_other.load(ordering);
+                        warn!(%joined_other, "waiting for the shutdown to complete");
+                        shutdown.await;
+                    }
+                };
+
+                debug!("tenant successfully stopped");
            }
            .instrument(info_span!("shutdown", %tenant_id)),
        );
@@ -388,12 +423,10 @@ pub enum DeleteTimelineError {
 pub async fn delete_timeline(
    tenant_id: TenantId,
    timeline_id: TimelineId,
-    ctx: &RequestContext,
+    _ctx: &RequestContext,
 ) -> Result<(), DeleteTimelineError> {
    let tenant = get_tenant(tenant_id, true).await?;
-    tenant
-        .prepare_and_schedule_delete_timeline(timeline_id, ctx)
-        .await?;
+    DeleteTimelineFlow::run(&tenant, timeline_id).await?;
    Ok(())
 }

@@ -413,6 +446,15 @@ pub async fn detach_tenant(
    conf: &'static PageServerConf,
    tenant_id: TenantId,
    detach_ignored: bool,
+) -> Result<(), TenantStateError> {
+    detach_tenant0(conf, &TENANTS, tenant_id, detach_ignored).await
+}
+
+async fn detach_tenant0(
+    conf: &'static PageServerConf,
+    tenants: &tokio::sync::RwLock<TenantsMap>,
+    tenant_id: TenantId,
+    detach_ignored: bool,
 ) -> Result<(), TenantStateError> {
    let local_files_cleanup_operation = |tenant_id_to_clean| async move {
        let local_tenant_directory = conf.tenant_path(&tenant_id_to_clean);
@@ -425,7 +467,8 @@ pub async fn detach_tenant(
    };

    let removal_result =
-        remove_tenant_from_memory(tenant_id, local_files_cleanup_operation(tenant_id)).await;
+        remove_tenant_from_memory(tenants, tenant_id, local_files_cleanup_operation(tenant_id))
+            .await;

    // Ignored tenants are not present in memory and will bail the removal from memory operation.
    // Before returning the error, check for ignored tenant removal case — we only need to clean its local files then.
@@ -472,7 +515,15 @@ pub async fn ignore_tenant(
    conf: &'static PageServerConf,
    tenant_id: TenantId,
 ) -> Result<(), TenantStateError> {
-    remove_tenant_from_memory(tenant_id, async {
+    ignore_tenant0(conf, &TENANTS, tenant_id).await
+}
+
+async fn ignore_tenant0(
+    conf: &'static PageServerConf,
+    tenants: &tokio::sync::RwLock<TenantsMap>,
+    tenant_id: TenantId,
+) -> Result<(), TenantStateError> {
+    remove_tenant_from_memory(tenants, tenant_id, async {
        let ignore_mark_file = conf.tenant_ignore_mark_file_path(&tenant_id);
        fs::File::create(&ignore_mark_file)
            .await
@@ -597,18 +648,21 @@ where
 /// If the cleanup fails, tenant will stay in memory in [`TenantState::Broken`] state, and another removal
 /// operation would be needed to remove it.
 async fn remove_tenant_from_memory<V, F>(
+    tenants: &tokio::sync::RwLock<TenantsMap>,
    tenant_id: TenantId,
    tenant_cleanup: F,
 ) -> Result<V, TenantStateError>
 where
    F: std::future::Future<Output = anyhow::Result<V>>,
 {
+    use utils::completion;
+
    // It's important to keep the tenant in memory after the final cleanup, to avoid cleanup races.
    // The exclusive lock here ensures we don't miss the tenant state updates before trying another removal.
    // tenant-wde cleanup operations may take some time (removing the entire tenant directory), we want to
    // avoid holding the lock for the entire process.
    let tenant = {
-        TENANTS
+        tenants
            .write()
            .await
            .get(&tenant_id)
@@ -616,14 +670,20 @@ where
            .ok_or(TenantStateError::NotFound(tenant_id))?
    };

+    // allow pageserver shutdown to wait for our completion
+    let (_guard, progress) = completion::channel();
+
+    // whenever we remove a tenant from memory, we don't want to flush and wait for upload
    let freeze_and_flush = false;

    // shutdown is sure to transition tenant to stopping, and wait for all tasks to complete, so
    // that we can continue safely to cleanup.
-    match tenant.shutdown(freeze_and_flush).await {
+    match tenant.shutdown(progress, freeze_and_flush).await {
        Ok(()) => {}
-        Err(super::ShutdownError::AlreadyStopping) => {
-            return Err(TenantStateError::IsStopping(tenant_id))
+        Err(_other) => {
+            // if pageserver shutdown or other detach/ignore is already ongoing, we don't want to
+            // wait for it but return an error right away because these are distinct requests.
+            return Err(TenantStateError::IsStopping(tenant_id));
        }
    }

@@ -632,14 +692,14 @@ where
        .with_context(|| format!("Failed to run cleanup for tenant {tenant_id}"))
    {
        Ok(hook_value) => {
-            let mut tenants_accessor = TENANTS.write().await;
+            let mut tenants_accessor = tenants.write().await;
            if tenants_accessor.remove(&tenant_id).is_none() {
                warn!("Tenant {tenant_id} got removed from memory before operation finished");
            }
            Ok(hook_value)
        }
        Err(e) => {
-            let tenants_accessor = TENANTS.read().await;
+            let tenants_accessor = tenants.read().await;
            match tenants_accessor.get(&tenant_id) {
                Some(tenant) => {
                    tenant.set_broken(e.to_string()).await;
@@ -708,51 +768,108 @@ pub async fn immediate_gc(
    Ok(wait_task_done)
 }

-pub async fn immediate_compact(
-    tenant_id: TenantId,
-    timeline_id: TimelineId,
-    ctx: &RequestContext,
-) -> Result<tokio::sync::oneshot::Receiver<anyhow::Result<()>>, ApiError> {
-    let guard = TENANTS.read().await;
+#[cfg(test)]
+mod tests {
+    use std::collections::HashMap;
+    use std::sync::Arc;
+    use tracing::{info_span, Instrument};

-    let tenant = guard
-        .get(&tenant_id)
-        .map(Arc::clone)
-        .with_context(|| format!("tenant {tenant_id}"))
-        .map_err(|e| ApiError::NotFound(e.into()))?;
+    use super::{super::harness::TenantHarness, TenantsMap};

-    let timeline = tenant
-        .get_timeline(timeline_id, true)
-        .map_err(|e| ApiError::NotFound(e.into()))?;
+    #[tokio::test(start_paused = true)]
+    async fn shutdown_joins_remove_tenant_from_memory() {
+        // the test is a bit ugly with the lockstep together with spawned tasks. the aim is to make
+        // sure `shutdown_all_tenants0` per-tenant processing joins in any active
+        // remove_tenant_from_memory calls, which is enforced by making the operation last until
+        // we've run `shutdown_all_tenants0` for a long time.

-    // Run in task_mgr to avoid race with tenant_detach operation
-    let ctx = ctx.detached_child(TaskKind::Compaction, DownloadBehavior::Download);
-    let (task_done, wait_task_done) = tokio::sync::oneshot::channel();
-    task_mgr::spawn(
-        &tokio::runtime::Handle::current(),
-        TaskKind::Compaction,
-        Some(tenant_id),
-        Some(timeline_id),
-        &format!(
-            "timeline_compact_handler compaction run for tenant {tenant_id} timeline {timeline_id}"
-        ),
-        false,
-        async move {
-            let result = timeline
-                .compact(&ctx)
-                .instrument(info_span!("manual_compact", %tenant_id, %timeline_id))
-                .await;
+        let (t, _ctx) = TenantHarness::create("shutdown_joins_detach")
+            .unwrap()
+            .load()
+            .await;

-            match task_done.send(result) {
-                Ok(_) => (),
-                Err(result) => error!("failed to send compaction result: {result:?}"),
-            }
-            Ok(())
-        },
-    );
+        // harness loads it to active, which is forced and nothing is running on the tenant

-    // drop the guard until after we've spawned the task so that timeline shutdown will wait for the task
-    drop(guard);
+        let id = t.tenant_id();

-    Ok(wait_task_done)
+        // tenant harness configures the logging and we cannot escape it
+        let _e = info_span!("testing", tenant_id = %id).entered();
+
+        let tenants = HashMap::from([(id, t.clone())]);
+        let tenants = Arc::new(tokio::sync::RwLock::new(TenantsMap::Open(tenants)));
+
+        let (until_cleanup_completed, can_complete_cleanup) = utils::completion::channel();
+        let (until_cleanup_started, cleanup_started) = utils::completion::channel();
+
+        // start a "detaching operation", which will take a while, until can_complete_cleanup
+        let cleanup_task = {
+            let jh = tokio::spawn({
+                let tenants = tenants.clone();
+                async move {
+                    let cleanup = async move {
+                        drop(until_cleanup_started);
+                        can_complete_cleanup.wait().await;
+                        anyhow::Ok(())
+                    };
+                    super::remove_tenant_from_memory(&tenants, id, cleanup).await
+                }
+                .instrument(info_span!("foobar", tenant_id = %id))
+            });
+
+            // now the long cleanup should be in place, with the stopping state
+            cleanup_started.wait().await;
+            jh
+        };
+
+        let mut cleanup_progress = std::pin::pin!(t
+            .shutdown(utils::completion::Barrier::default(), false)
+            .await
+            .unwrap_err()
+            .wait());
+
+        let mut shutdown_task = {
+            let (until_shutdown_started, shutdown_started) = utils::completion::channel();
+
+            let shutdown_task = tokio::spawn(async move {
+                drop(until_shutdown_started);
+                super::shutdown_all_tenants0(&tenants).await;
+            });
+
+            shutdown_started.wait().await;
+            shutdown_task
+        };
+
+        // if the joining in is removed from shutdown_all_tenants0, the shutdown_task should always
+        // get to complete within timeout and fail the test. it is expected to continue awaiting
+        // until completion or SIGKILL during normal shutdown.
+        //
+        // the timeout is long to cover anything that shutdown_task could be doing, but it is
+        // handled instantly because we use tokio's time pausing in this test. 100s is much more than
+        // what we get from systemd on shutdown (10s).
+        let long_time = std::time::Duration::from_secs(100);
+        tokio::select! {
+            _ = &mut shutdown_task => unreachable!("shutdown must continue, until_cleanup_completed is not dropped"),
+            _ = &mut cleanup_progress => unreachable!("cleanup progress must continue, until_cleanup_completed is not dropped"),
+            _ = tokio::time::sleep(long_time) => {},
+        }
+
+        // allow the remove_tenant_from_memory and thus eventually the shutdown to continue
+        drop(until_cleanup_completed);
+
+        let (je, ()) = tokio::join!(shutdown_task, cleanup_progress);
+        je.expect("Tenant::shutdown shutdown not have panicked");
+        cleanup_task
+            .await
+            .expect("no panicking")
+            .expect("remove_tenant_from_memory failed");
+
+        futures::future::poll_immediate(
+            t.shutdown(utils::completion::Barrier::default(), false)
+                .await
+                .unwrap_err()
+                .wait(),
+        )
+        .await
+        .expect("the stopping progress must still be complete");
+    }
 }
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -514,7 +514,7 @@ impl RemoteTimelineClient {
    /// updated metadata.
    ///
    /// The upload will be added to the queue immediately, but it
-    /// won't be performed until all previosuly scheduled layer file
+    /// won't be performed until all previously scheduled layer file
    /// upload operations have completed successfully.  This is to
    /// ensure that when the index file claims that layers X, Y and Z
    /// exist in remote storage, they really do. To wait for the upload
@@ -625,7 +625,7 @@ impl RemoteTimelineClient {
    /// Note: This schedules an index file upload before the deletions.  The
    /// deletion won't actually be performed, until any previously scheduled
    /// upload operations, and the index file upload, have completed
-    /// succesfully.
+    /// successfully.
    pub fn schedule_layer_file_deletion(
        self: &Arc<Self>,
        names: &[LayerFileName],
@@ -827,7 +827,7 @@ impl RemoteTimelineClient {
            )
        };

-        receiver.changed().await?;
+        receiver.changed().await.context("upload queue shut down")?;

        // Do not delete index part yet, it is needed for possible retry. If we remove it first
        // and retry will arrive to different pageserver there wont be any traces of it on remote storage
@@ -855,11 +855,23 @@ impl RemoteTimelineClient {
            self.storage_impl.delete_objects(&remaining).await?;
        }

+        fail::fail_point!("timeline-delete-before-index-delete", |_| {
+            Err(anyhow::anyhow!(
+                "failpoint: timeline-delete-before-index-delete"
+            ))?
+        });
+
        let index_file_path = timeline_storage_path.join(Path::new(IndexPart::FILE_NAME));

        debug!("deleting index part");
        self.storage_impl.delete(&index_file_path).await?;

+        fail::fail_point!("timeline-delete-after-index-delete", |_| {
+            Err(anyhow::anyhow!(
+                "failpoint: timeline-delete-after-index-delete"
+            ))?
+        });
+
        info!(prefix=%timeline_storage_path, referenced=deletions_queued, not_referenced=%remaining.len(), "done deleting in timeline prefix, including index_part.json");

        Ok(())
@@ -1105,7 +1117,7 @@ impl RemoteTimelineClient {
            debug!("remote task {} completed successfully", task.op);
        }

-        // The task has completed succesfully. Remove it from the in-progress list.
+        // The task has completed successfully. Remove it from the in-progress list.
        {
            let mut upload_queue_guard = self.upload_queue.lock().unwrap();
            let upload_queue = match upload_queue_guard.deref_mut() {
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -338,7 +338,8 @@ impl LayerAccessStats {
 /// All layers should implement a minimal `std::fmt::Debug` without tenant or
 /// timeline names, because those are known in the context of which the layers
 /// are used in (timeline).
-pub trait Layer: std::fmt::Debug + std::fmt::Display + Send + Sync {
+#[async_trait::async_trait]
+pub trait Layer: std::fmt::Debug + std::fmt::Display + Send + Sync + 'static {
    /// Range of keys that this layer covers
    fn get_key_range(&self) -> Range<Key>;

@@ -368,7 +369,7 @@ pub trait Layer: std::fmt::Debug + std::fmt::Display + Send + Sync {
    /// is available. If this returns ValueReconstructResult::Continue, look up
    /// the predecessor layer and call again with the same 'reconstruct_data' to
    /// collect more data.
-    fn get_value_reconstruct_data(
+    async fn get_value_reconstruct_data(
        &self,
        key: Key,
        lsn_range: Range<Lsn>,
@@ -377,7 +378,7 @@ pub trait Layer: std::fmt::Debug + std::fmt::Display + Send + Sync {
    ) -> Result<ValueReconstructResult>;

    /// Dump summary of the contents of the layer to stdout
-    fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()>;
+    async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()>;
 }

 /// Returned by [`PersistentLayer::iter`]
@@ -442,6 +443,10 @@ pub trait PersistentLayer: Layer + AsLayerDesc {
        None
    }

+    fn downcast_delta_layer(self: Arc<Self>) -> Option<std::sync::Arc<DeltaLayer>> {
+        None
+    }
+
    fn is_remote_layer(&self) -> bool {
        false
    }
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -31,7 +31,7 @@ use crate::config::PageServerConf;
 use crate::context::RequestContext;
 use crate::page_cache::{PageReadGuard, PAGE_SZ};
 use crate::repository::{Key, Value, KEY_SIZE};
-use crate::tenant::blob_io::{BlobCursor, BlobWriter, WriteBlobWriter};
+use crate::tenant::blob_io::{BlobWriter, WriteBlobWriter};
 use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockReader, FileBlockReader};
 use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
 use crate::tenant::storage_layer::{
@@ -51,6 +51,7 @@ use std::io::{Seek, SeekFrom};
 use std::ops::Range;
 use std::os::unix::fs::FileExt;
 use std::path::{Path, PathBuf};
+use std::sync::Arc;
 use tracing::*;

 use utils::{
@@ -222,9 +223,10 @@ impl std::fmt::Debug for DeltaLayerInner {
    }
 }

+#[async_trait::async_trait]
 impl Layer for DeltaLayer {
    /// debugging function to print out the contents of the layer
-    fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
+    async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
        println!(
            "----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} size {} ----",
            self.desc.tenant_id,
@@ -299,7 +301,7 @@ impl Layer for DeltaLayer {
        Ok(())
    }

-    fn get_value_reconstruct_data(
+    async fn get_value_reconstruct_data(
        &self,
        key: Key,
        lsn_range: Range<Lsn>,
@@ -414,6 +416,10 @@ impl AsLayerDesc for DeltaLayer {
 }

 impl PersistentLayer for DeltaLayer {
+    fn downcast_delta_layer(self: Arc<Self>) -> Option<std::sync::Arc<DeltaLayer>> {
+        Some(self)
+    }
+
    fn local_path(&self) -> Option<PathBuf> {
        Some(self.path())
    }
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -27,7 +27,7 @@ use crate::config::PageServerConf;
 use crate::context::RequestContext;
 use crate::page_cache::PAGE_SZ;
 use crate::repository::{Key, KEY_SIZE};
-use crate::tenant::blob_io::{BlobCursor, BlobWriter, WriteBlobWriter};
+use crate::tenant::blob_io::{BlobWriter, WriteBlobWriter};
 use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader};
 use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
 use crate::tenant::storage_layer::{
@@ -38,6 +38,7 @@ use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
 use anyhow::{bail, ensure, Context, Result};
 use bytes::Bytes;
 use hex;
+use once_cell::sync::OnceCell;
 use pageserver_api::models::{HistoricLayerInfo, LayerAccessKind};
 use rand::{distributions::Alphanumeric, Rng};
 use serde::{Deserialize, Serialize};
@@ -47,7 +48,6 @@ use std::io::{Seek, SeekFrom};
 use std::ops::Range;
 use std::os::unix::prelude::FileExt;
 use std::path::{Path, PathBuf};
-use std::sync::{RwLock, RwLockReadGuard};
 use tracing::*;

 use utils::{
@@ -117,7 +117,7 @@ pub struct ImageLayer {

    access_stats: LayerAccessStats,

-    inner: RwLock<ImageLayerInner>,
+    inner: OnceCell<ImageLayerInner>,
 }

 impl std::fmt::Debug for ImageLayer {
@@ -134,30 +134,27 @@ impl std::fmt::Debug for ImageLayer {
 }

 pub struct ImageLayerInner {
-    /// If false, the 'index' has not been loaded into memory yet.
-    loaded: bool,
-
    // values copied from summary
    index_start_blk: u32,
    index_root_blk: u32,

-    /// Reader object for reading blocks from the file. (None if not loaded yet)
-    file: Option<FileBlockReader<VirtualFile>>,
+    /// Reader object for reading blocks from the file.
+    file: FileBlockReader<VirtualFile>,
 }

 impl std::fmt::Debug for ImageLayerInner {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("ImageLayerInner")
-            .field("loaded", &self.loaded)
            .field("index_start_blk", &self.index_start_blk)
            .field("index_root_blk", &self.index_root_blk)
            .finish()
    }
 }

+#[async_trait::async_trait]
 impl Layer for ImageLayer {
    /// debugging function to print out the contents of the layer
-    fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
+    async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
        println!(
            "----- image layer for ten {} tli {} key {}-{} at {} is_incremental {} size {} ----",
            self.desc.tenant_id,
@@ -174,7 +171,7 @@ impl Layer for ImageLayer {
        }

        let inner = self.load(LayerAccessKind::Dump, ctx)?;
-        let file = inner.file.as_ref().unwrap();
+        let file = &inner.file;
        let tree_reader =
            DiskBtreeReader::<_, KEY_SIZE>::new(inner.index_start_blk, inner.index_root_blk, file);

@@ -189,7 +186,7 @@ impl Layer for ImageLayer {
    }

    /// Look up given page in the file
-    fn get_value_reconstruct_data(
+    async fn get_value_reconstruct_data(
        &self,
        key: Key,
        lsn_range: Range<Lsn>,
@@ -202,7 +199,7 @@ impl Layer for ImageLayer {

        let inner = self.load(LayerAccessKind::GetValueReconstructData, ctx)?;

-        let file = inner.file.as_ref().unwrap();
+        let file = &inner.file;
        let tree_reader = DiskBtreeReader::new(inner.index_start_blk, inner.index_root_blk, file);

        let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
@@ -321,52 +318,26 @@ impl ImageLayer {
    /// Open the underlying file and read the metadata into memory, if it's
    /// not loaded already.
    ///
-    fn load(
-        &self,
-        access_kind: LayerAccessKind,
-        ctx: &RequestContext,
-    ) -> Result<RwLockReadGuard<ImageLayerInner>> {
+    fn load(&self, access_kind: LayerAccessKind, ctx: &RequestContext) -> Result<&ImageLayerInner> {
        self.access_stats
            .record_access(access_kind, ctx.task_kind());
        loop {
-            // Quick exit if already loaded
-            let inner = self.inner.read().unwrap();
-            if inner.loaded {
+            if let Some(inner) = self.inner.get() {
                return Ok(inner);
            }
-
-            // Need to open the file and load the metadata. Upgrade our lock to
-            // a write lock. (Or rather, release and re-lock in write mode.)
-            drop(inner);
-            let mut inner = self.inner.write().unwrap();
-            if !inner.loaded {
-                self.load_inner(&mut inner).with_context(|| {
-                    format!("Failed to load image layer {}", self.path().display())
-                })?
-            } else {
-                // Another thread loaded it while we were not holding the lock.
-            }
-
-            // We now have the file open and loaded. There's no function to do
-            // that in the std library RwLock, so we have to release and re-lock
-            // in read mode. (To be precise, the lock guard was moved in the
-            // above call to `load_inner`, so it's already been released). And
-            // while we do that, another thread could unload again, so we have
-            // to re-check and retry if that happens.
-            drop(inner);
+            self.inner
+                .get_or_try_init(|| self.load_inner())
+                .with_context(|| format!("Failed to load image layer {}", self.path().display()))?;
        }
    }

-    fn load_inner(&self, inner: &mut ImageLayerInner) -> Result<()> {
+    fn load_inner(&self) -> Result<ImageLayerInner> {
        let path = self.path();

        // Open the file if it's not open already.
-        if inner.file.is_none() {
-            let file = VirtualFile::open(&path)
-                .with_context(|| format!("Failed to open file '{}'", path.display()))?;
-            inner.file = Some(FileBlockReader::new(file));
-        }
-        let file = inner.file.as_mut().unwrap();
+        let file = VirtualFile::open(&path)
+            .with_context(|| format!("Failed to open file '{}'", path.display()))?;
+        let file = FileBlockReader::new(file);
        let summary_blk = file.read_blk(0)?;
        let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;

@@ -394,10 +365,11 @@ impl ImageLayer {
            }
        }

-        inner.index_start_blk = actual_summary.index_start_blk;
-        inner.index_root_blk = actual_summary.index_root_blk;
-        inner.loaded = true;
-        Ok(())
+        Ok(ImageLayerInner {
+            index_start_blk: actual_summary.index_start_blk,
+            index_root_blk: actual_summary.index_root_blk,
+            file,
+        })
    }

    /// Create an ImageLayer struct representing an existing file on disk
@@ -421,12 +393,7 @@ impl ImageLayer {
            ), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
            lsn: filename.lsn,
            access_stats,
-            inner: RwLock::new(ImageLayerInner {
-                loaded: false,
-                file: None,
-                index_start_blk: 0,
-                index_root_blk: 0,
-            }),
+            inner: OnceCell::new(),
        }
    }

@@ -453,12 +420,7 @@ impl ImageLayer {
            ), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
            lsn: summary.lsn,
            access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
-            inner: RwLock::new(ImageLayerInner {
-                file: None,
-                loaded: false,
-                index_start_blk: 0,
-                index_root_blk: 0,
-            }),
+            inner: OnceCell::new(),
        })
    }

@@ -619,12 +581,7 @@ impl ImageLayerWriterInner {
            desc,
            lsn: self.lsn,
            access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
-            inner: RwLock::new(ImageLayerInner {
-                loaded: false,
-                file: None,
-                index_start_blk,
-                index_root_blk,
-            }),
+            inner: OnceCell::new(),
        };

        // fsync the file
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -7,7 +7,7 @@
 use crate::config::PageServerConf;
 use crate::context::RequestContext;
 use crate::repository::{Key, Value};
-use crate::tenant::blob_io::{BlobCursor, BlobWriter};
+use crate::tenant::blob_io::BlobWriter;
 use crate::tenant::block_io::BlockReader;
 use crate::tenant::ephemeral_file::EphemeralFile;
 use crate::tenant::storage_layer::{ValueReconstructResult, ValueReconstructState};
@@ -110,6 +110,7 @@ impl InMemoryLayer {
    }
 }

+#[async_trait::async_trait]
 impl Layer for InMemoryLayer {
    fn get_key_range(&self) -> Range<Key> {
        Key::MIN..Key::MAX
@@ -132,7 +133,7 @@ impl Layer for InMemoryLayer {
    }

    /// debugging function to print out the contents of the layer
-    fn dump(&self, verbose: bool, _ctx: &RequestContext) -> Result<()> {
+    async fn dump(&self, verbose: bool, _ctx: &RequestContext) -> Result<()> {
        let inner = self.inner.read().unwrap();

        let end_str = inner
@@ -183,7 +184,7 @@ impl Layer for InMemoryLayer {
    }

    /// Look up given value in the layer.
-    fn get_value_reconstruct_data(
+    async fn get_value_reconstruct_data(
        &self,
        key: Key,
        lsn_range: Range<Lsn>,
--- a/pageserver/src/tenant/storage_layer/remote_layer.rs
+++ b/pageserver/src/tenant/storage_layer/remote_layer.rs
@@ -65,8 +65,9 @@ impl std::fmt::Debug for RemoteLayer {
    }
 }

+#[async_trait::async_trait]
 impl Layer for RemoteLayer {
-    fn get_value_reconstruct_data(
+    async fn get_value_reconstruct_data(
        &self,
        _key: Key,
        _lsn_range: Range<Lsn>,
@@ -77,7 +78,7 @@ impl Layer for RemoteLayer {
    }

    /// debugging function to print out the contents of the layer
-    fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
+    async fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
        println!(
            "----- remote layer for ten {} tli {} keys {}-{} lsn {}-{} is_delta {} is_incremental {} size {} ----",
            self.desc.tenant_id,
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -111,7 +111,7 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
                Duration::from_secs(10)
            } else {
                // Run compaction
-                if let Err(e) = tenant.compaction_iteration(&ctx).await {
+                if let Err(e) = tenant.compaction_iteration(&cancel, &ctx).await {
                    error!("Compaction failed, retrying in {:?}: {e:?}", wait_duration);
                    wait_duration
                } else {
@@ -122,12 +122,12 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
            warn_when_period_overrun(started_at.elapsed(), period, "compaction");

            // Sleep
-            tokio::select! {
-                _ = cancel.cancelled() => {
-                    info!("received cancellation request during idling");
-                    break;
-                },
-                _ = tokio::time::sleep(sleep_duration) => {},
+            if tokio::time::timeout(sleep_duration, cancel.cancelled())
+                .await
+                .is_ok()
+            {
+                info!("received cancellation request during idling");
+                break;
            }
        }
    }
@@ -196,12 +196,12 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
            warn_when_period_overrun(started_at.elapsed(), period, "gc");

            // Sleep
-            tokio::select! {
-                _ = cancel.cancelled() => {
-                    info!("received cancellation request during idling");
-                    break;
-                },
-                _ = tokio::time::sleep(sleep_duration) => {},
+            if tokio::time::timeout(sleep_duration, cancel.cancelled())
+                .await
+                .is_ok()
+            {
+                info!("received cancellation request during idling");
+                break;
            }
        }
    }
@@ -263,9 +263,9 @@ pub(crate) async fn random_init_delay(
        rng.gen_range(Duration::ZERO..=period)
    };

-    tokio::select! {
-        _ = cancel.cancelled() => Err(Cancelled),
-        _ = tokio::time::sleep(d) => Ok(()),
+    match tokio::time::timeout(d, cancel.cancelled()).await {
+        Ok(_) => Err(Cancelled),
+        Err(_) => Ok(()),
    }
 }

--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -24,7 +24,7 @@ use tracing::*;
 use utils::id::TenantTimelineId;

 use std::cmp::{max, min, Ordering};
-use std::collections::{BinaryHeap, HashMap};
+use std::collections::{BinaryHeap, HashMap, HashSet};
 use std::fs;
 use std::ops::{Deref, Range};
 use std::path::{Path, PathBuf};
@@ -86,6 +86,7 @@ use self::logical_size::LogicalSize;
 use self::walreceiver::{WalReceiver, WalReceiverConf};

 use super::config::TenantConf;
+use super::delete::DeleteTimelineFlow;
 use super::remote_timeline_client::index::IndexPart;
 use super::remote_timeline_client::RemoteTimelineClient;
 use super::storage_layer::{
@@ -237,11 +238,10 @@ pub struct Timeline {

    /// Layer removal lock.
    /// A lock to ensure that no layer of the timeline is removed concurrently by other tasks.
-    /// This lock is acquired in [`Timeline::gc`], [`Timeline::compact`],
-    /// and [`Tenant::delete_timeline`]. This is an `Arc<Mutex>` lock because we need an owned
+    /// This lock is acquired in [`Timeline::gc`] and [`Timeline::compact`].
+    /// This is an `Arc<Mutex>` lock because we need an owned
    /// lock guard in functions that will be spawned to tokio I/O pool (which requires `'static`).
-    ///
-    /// [`Tenant::delete_timeline`]: super::Tenant::delete_timeline
+    /// Note that [`DeleteTimelineFlow`] uses `delete_progress` field.
    pub(super) layer_removal_cs: Arc<tokio::sync::Mutex<()>>,

    // Needed to ensure that we can't create a branch at a point that was already garbage collected
@@ -283,7 +283,7 @@ pub struct Timeline {

    /// Prevent two tasks from deleting the timeline at the same time. If held, the
    /// timeline is being deleted. If 'true', the timeline has already been deleted.
-    pub delete_lock: Arc<tokio::sync::Mutex<bool>>,
+    pub delete_progress: Arc<tokio::sync::Mutex<DeleteTimelineFlow>>,

    eviction_task_timeline_state: tokio::sync::Mutex<EvictionTaskTimelineState>,

@@ -334,7 +334,7 @@ pub struct GcInfo {
 #[derive(thiserror::Error)]
 pub enum PageReconstructError {
    #[error(transparent)]
-    Other(#[from] anyhow::Error), // source and Display delegate to anyhow::Error
+    Other(#[from] anyhow::Error),

    /// The operation would require downloading a layer that is missing locally.
    NeedsDownload(TenantTimelineId, LayerFileName),
@@ -475,7 +475,7 @@ impl Timeline {
            img: cached_page_img,
        };

-        let timer = self.metrics.get_reconstruct_data_time_histo.start_timer();
+        let timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME.start_timer();
        self.get_reconstruct_data(key, lsn, &mut reconstruct_state, ctx)
            .await?;
        timer.stop_and_record();
@@ -555,7 +555,7 @@ impl Timeline {
            "wait_lsn cannot be called in WAL receiver"
        );

-        let _timer = self.metrics.wait_lsn_time_histo.start_timer();
+        let _timer = crate::metrics::WAIT_LSN_TIME.start_timer();

        match self
            .last_record_lsn
@@ -611,9 +611,46 @@ impl Timeline {
    }

    /// Outermost timeline compaction operation; downloads needed layers.
-    pub async fn compact(self: &Arc<Self>, ctx: &RequestContext) -> anyhow::Result<()> {
+    pub async fn compact(
+        self: &Arc<Self>,
+        cancel: &CancellationToken,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<()> {
        const ROUNDS: usize = 2;

+        static CONCURRENT_COMPACTIONS: once_cell::sync::Lazy<tokio::sync::Semaphore> =
+            once_cell::sync::Lazy::new(|| {
+                let total_threads = *task_mgr::BACKGROUND_RUNTIME_WORKER_THREADS;
+                let permits = usize::max(
+                    1,
+                    // while a lot of the work is done on spawn_blocking, we still do
+                    // repartitioning in the async context. this should give leave us some workers
+                    // unblocked to be blocked on other work, hopefully easing any outside visible
+                    // effects of restarts.
+                    //
+                    // 6/8 is a guess; previously we ran with unlimited 8 and more from
+                    // spawn_blocking.
+                    (total_threads * 3).checked_div(4).unwrap_or(0),
+                );
+                assert_ne!(permits, 0, "we will not be adding in permits later");
+                assert!(
+                    permits < total_threads,
+                    "need threads avail for shorter work"
+                );
+                tokio::sync::Semaphore::new(permits)
+            });
+
+        // this wait probably never needs any "long time spent" logging, because we already nag if
+        // compaction task goes over its period (20s) which is quite often in production.
+        let _permit = tokio::select! {
+            permit = CONCURRENT_COMPACTIONS.acquire() => {
+                permit
+            },
+            _ = cancel.cancelled() => {
+                return Ok(());
+            }
+        };
+
        let last_record_lsn = self.get_last_record_lsn();

        // Last record Lsn could be zero in case the timeline was just created
@@ -671,11 +708,9 @@ impl Timeline {

            let mut failed = 0;

-            let mut cancelled = pin!(task_mgr::shutdown_watcher());
-
            loop {
                tokio::select! {
-                    _ = &mut cancelled => anyhow::bail!("Cancelled while downloading remote layers"),
+                    _ = cancel.cancelled() => anyhow::bail!("Cancelled while downloading remote layers"),
                    res = downloads.next() => {
                        match res {
                            Some(Ok(())) => {},
@@ -890,7 +925,7 @@ impl Timeline {
                    new_state,
                    TimelineState::Stopping | TimelineState::Broken { .. }
                ) {
-                    // drop the copmletion guard, if any; it might be holding off the completion
+                    // drop the completion guard, if any; it might be holding off the completion
                    // forever needlessly
                    self.initial_logical_size_attempt
                        .lock()
@@ -1011,11 +1046,11 @@ impl Timeline {
            .evict_layer_batch(remote_client, &[local_layer], cancel)
            .await?;
        assert_eq!(results.len(), 1);
-        let result: Option<anyhow::Result<bool>> = results.into_iter().next().unwrap();
+        let result: Option<Result<(), EvictionError>> = results.into_iter().next().unwrap();
        match result {
            None => anyhow::bail!("task_mgr shutdown requested"),
-            Some(Ok(b)) => Ok(Some(b)),
-            Some(Err(e)) => Err(e),
+            Some(Ok(())) => Ok(Some(true)),
+            Some(Err(e)) => Err(anyhow::Error::new(e)),
        }
    }

@@ -1024,12 +1059,12 @@ impl Timeline {
    /// GenericRemoteStorage reference is required as a (witness)[witness_article] for "remote storage is configured."
    ///
    /// [witness_article]: https://willcrichton.net/rust-api-type-patterns/witnesses.html
-    pub async fn evict_layers(
+    pub(crate) async fn evict_layers(
        &self,
        _: &GenericRemoteStorage,
        layers_to_evict: &[Arc<dyn PersistentLayer>],
        cancel: CancellationToken,
-    ) -> anyhow::Result<Vec<Option<anyhow::Result<bool>>>> {
+    ) -> anyhow::Result<Vec<Option<Result<(), EvictionError>>>> {
        let remote_client = self.remote_client.clone().expect(
            "GenericRemoteStorage is configured, so timeline must have RemoteTimelineClient",
        );
@@ -1064,7 +1099,7 @@ impl Timeline {
        remote_client: &Arc<RemoteTimelineClient>,
        layers_to_evict: &[Arc<dyn PersistentLayer>],
        cancel: CancellationToken,
-    ) -> anyhow::Result<Vec<Option<anyhow::Result<bool>>>> {
+    ) -> anyhow::Result<Vec<Option<Result<(), EvictionError>>>> {
        // ensure that the layers have finished uploading
        // (don't hold the layer_removal_cs while we do it, we're not removing anything yet)
        remote_client
@@ -1110,11 +1145,9 @@ impl Timeline {
        _layer_removal_cs: &tokio::sync::MutexGuard<'_, ()>,
        local_layer: &Arc<dyn PersistentLayer>,
        layer_mgr: &mut LayerManager,
-    ) -> anyhow::Result<bool> {
+    ) -> Result<(), EvictionError> {
        if local_layer.is_remote_layer() {
-            // TODO(issue #3851): consider returning an err here instead of false,
-            // which is the same out the match later
-            return Ok(false);
+            return Err(EvictionError::CannotEvictRemoteLayer);
        }

        let layer_file_size = local_layer.file_size();
@@ -1123,13 +1156,22 @@ impl Timeline {
            .local_path()
            .expect("local layer should have a local path")
            .metadata()
-            .context("get local layer file stat")?
+            // when the eviction fails because we have already deleted the layer in compaction for
+            // example, a NotFound error bubbles up from here.
+            .map_err(|e| {
+                if e.kind() == std::io::ErrorKind::NotFound {
+                    EvictionError::FileNotFound
+                } else {
+                    EvictionError::StatFailed(e)
+                }
+            })?
            .modified()
-            .context("get mtime of layer file")?;
+            .map_err(EvictionError::StatFailed)?;
+
        let local_layer_residence_duration =
            match SystemTime::now().duration_since(local_layer_mtime) {
                Err(e) => {
-                    warn!("layer mtime is in the future: {}", e);
+                    warn!(layer = %local_layer, "layer mtime is in the future: {}", e);
                    None
                }
                Ok(delta) => Some(delta),
@@ -1160,54 +1202,65 @@ impl Timeline {

        assert_eq!(local_layer.layer_desc(), new_remote_layer.layer_desc());

-        let succeed = match layer_mgr.replace_and_verify(local_layer.clone(), new_remote_layer) {
-            Ok(()) => {
-                if let Err(e) = local_layer.delete_resident_layer_file() {
-                    error!("failed to remove layer file on evict after replacement: {e:#?}");
-                }
-                // Always decrement the physical size gauge, even if we failed to delete the file.
-                // Rationale: we already replaced the layer with a remote layer in the layer map,
-                // and any subsequent download_remote_layer will
-                // 1. overwrite the file on disk and
-                // 2. add the downloaded size to the resident size gauge.
-                //
-                // If there is no re-download, and we restart the pageserver, then load_layer_map
-                // will treat the file as a local layer again, count it towards resident size,
-                // and it'll be like the layer removal never happened.
-                // The bump in resident size is perhaps unexpected but overall a robust behavior.
-                self.metrics
-                    .resident_physical_size_gauge
-                    .sub(layer_file_size);
+        layer_mgr
+            .replace_and_verify(local_layer.clone(), new_remote_layer)
+            .map_err(EvictionError::LayerNotFound)?;

-                self.metrics.evictions.inc();
+        if let Err(e) = local_layer.delete_resident_layer_file() {
+            // this should never happen, because of layer_removal_cs usage and above stat
+            // access for mtime
+            error!("failed to remove layer file on evict after replacement: {e:#?}");
+        }
+        // Always decrement the physical size gauge, even if we failed to delete the file.
+        // Rationale: we already replaced the layer with a remote layer in the layer map,
+        // and any subsequent download_remote_layer will
+        // 1. overwrite the file on disk and
+        // 2. add the downloaded size to the resident size gauge.
+        //
+        // If there is no re-download, and we restart the pageserver, then load_layer_map
+        // will treat the file as a local layer again, count it towards resident size,
+        // and it'll be like the layer removal never happened.
+        // The bump in resident size is perhaps unexpected but overall a robust behavior.
+        self.metrics
+            .resident_physical_size_gauge
+            .sub(layer_file_size);

-                if let Some(delta) = local_layer_residence_duration {
-                    self.metrics
-                        .evictions_with_low_residence_duration
-                        .read()
-                        .unwrap()
-                        .observe(delta);
-                    info!(layer=%local_layer, residence_millis=delta.as_millis(), "evicted layer after known residence period");
-                } else {
-                    info!(layer=%local_layer, "evicted layer after unknown residence period");
-                }
+        self.metrics.evictions.inc();

-                true
-            }
-            Err(err) => {
-                if cfg!(debug_assertions) {
-                    panic!("failed to replace: {err}, evicted: {local_layer:?}");
-                } else {
-                    error!(evicted=?local_layer, "failed to replace: {err}");
-                }
-                false
-            }
-        };
+        if let Some(delta) = local_layer_residence_duration {
+            self.metrics
+                .evictions_with_low_residence_duration
+                .read()
+                .unwrap()
+                .observe(delta);
+            info!(layer=%local_layer, residence_millis=delta.as_millis(), "evicted layer after known residence period");
+        } else {
+            info!(layer=%local_layer, "evicted layer after unknown residence period");
+        }

-        Ok(succeed)
+        Ok(())
    }
 }

+#[derive(Debug, thiserror::Error)]
+pub(crate) enum EvictionError {
+    #[error("cannot evict a remote layer")]
+    CannotEvictRemoteLayer,
+    /// Most likely the to-be evicted layer has been deleted by compaction or gc which use the same
+    /// locks, so they got to execute before the eviction.
+    #[error("file backing the layer has been removed already")]
+    FileNotFound,
+    #[error("stat failed")]
+    StatFailed(#[source] std::io::Error),
+    /// In practice, this can be a number of things, but let's assume it means only this.
+    ///
+    /// This case includes situations such as the Layer having been evicted and redownloaded in
+    /// between: the file existed before a replacement attempt was made, but now the Layers are
+    /// different objects in memory.
+    #[error("layer was no longer part of LayerMap")]
+    LayerNotFound(#[source] anyhow::Error),
+}
+
 /// Number of times we will compute partition within a checkpoint distance.
 const REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE: u64 = 10;

@@ -1307,9 +1360,10 @@ impl Timeline {
        pg_version: u32,
        initial_logical_size_can_start: Option<completion::Barrier>,
        initial_logical_size_attempt: Option<completion::Completion>,
+        state: TimelineState,
    ) -> Arc<Self> {
        let disk_consistent_lsn = metadata.disk_consistent_lsn();
-        let (state, _) = watch::channel(TimelineState::Loading);
+        let (state, _) = watch::channel(state);

        let (layer_flush_start_tx, _) = tokio::sync::watch::channel(0);
        let (layer_flush_done_tx, _) = tokio::sync::watch::channel((0, Ok(())));
@@ -1400,7 +1454,7 @@ impl Timeline {
                eviction_task_timeline_state: tokio::sync::Mutex::new(
                    EvictionTaskTimelineState::default(),
                ),
-                delete_lock: Arc::new(tokio::sync::Mutex::new(false)),
+                delete_progress: Arc::new(tokio::sync::Mutex::new(DeleteTimelineFlow::default())),

                initial_logical_size_can_start,
                initial_logical_size_attempt: Mutex::new(initial_logical_size_attempt),
@@ -1865,6 +1919,15 @@ impl Timeline {
    }

    fn try_spawn_size_init_task(self: &Arc<Self>, lsn: Lsn, ctx: &RequestContext) {
+        let state = self.current_state();
+        if matches!(
+            state,
+            TimelineState::Broken { .. } | TimelineState::Stopping
+        ) {
+            // Can happen when the timeline detail endpoint is used while deletion is ongoing (or the timeline is broken).
+            return;
+        }
+
        let permit = match Arc::clone(&self.current_logical_size.initial_size_computation)
            .try_acquire_owned()
        {
@@ -2234,8 +2297,9 @@ impl Timeline {
        let mut timeline_owned;
        let mut timeline = self;

-        let mut read_count =
-            scopeguard::guard(0, |cnt| self.metrics.read_num_fs_layers.observe(cnt as f64));
+        let mut read_count = scopeguard::guard(0, |cnt| {
+            crate::metrics::READ_NUM_FS_LAYERS.observe(cnt as f64)
+        });

        // For debugging purposes, collect the path of layers that we traversed
        // through. It's included in the error message if we fail to find the key.
@@ -2369,12 +2433,15 @@ impl Timeline {
                            // Get all the data needed to reconstruct the page version from this layer.
                            // But if we have an older cached page image, no need to go past that.
                            let lsn_floor = max(cached_lsn + 1, start_lsn);
-                            result = match open_layer.get_value_reconstruct_data(
-                                key,
-                                lsn_floor..cont_lsn,
-                                reconstruct_state,
-                                ctx,
-                            ) {
+                            result = match open_layer
+                                .get_value_reconstruct_data(
+                                    key,
+                                    lsn_floor..cont_lsn,
+                                    reconstruct_state,
+                                    ctx,
+                                )
+                                .await
+                            {
                                Ok(result) => result,
                                Err(e) => return Err(PageReconstructError::from(e)),
                            };
@@ -2396,12 +2463,15 @@ impl Timeline {
                        if cont_lsn > start_lsn {
                            //info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.filename().display());
                            let lsn_floor = max(cached_lsn + 1, start_lsn);
-                            result = match frozen_layer.get_value_reconstruct_data(
-                                key,
-                                lsn_floor..cont_lsn,
-                                reconstruct_state,
-                                ctx,
-                            ) {
+                            result = match frozen_layer
+                                .get_value_reconstruct_data(
+                                    key,
+                                    lsn_floor..cont_lsn,
+                                    reconstruct_state,
+                                    ctx,
+                                )
+                                .await
+                            {
                                Ok(result) => result,
                                Err(e) => return Err(PageReconstructError::from(e)),
                            };
@@ -2432,12 +2502,15 @@ impl Timeline {
                            // Get all the data needed to reconstruct the page version from this layer.
                            // But if we have an older cached page image, no need to go past that.
                            let lsn_floor = max(cached_lsn + 1, lsn_floor);
-                            result = match layer.get_value_reconstruct_data(
-                                key,
-                                lsn_floor..cont_lsn,
-                                reconstruct_state,
-                                ctx,
-                            ) {
+                            result = match layer
+                                .get_value_reconstruct_data(
+                                    key,
+                                    lsn_floor..cont_lsn,
+                                    reconstruct_state,
+                                    ctx,
+                                )
+                                .await
+                            {
                                Ok(result) => result,
                                Err(e) => return Err(PageReconstructError::from(e)),
                            };
@@ -2685,7 +2758,7 @@ impl Timeline {
        // files instead. This is possible as long as *all* the data imported into the
        // repository have the same LSN.
        let lsn_range = frozen_layer.get_lsn_range();
-        let layer_paths_to_upload =
+        let (layer_paths_to_upload, delta_layer_to_add) =
            if lsn_range.start == self.initdb_lsn && lsn_range.end == Lsn(self.initdb_lsn.0 + 1) {
                #[cfg(test)]
                match &mut *self.flush_loop_state.lock().unwrap() {
@@ -2704,8 +2777,12 @@ impl Timeline {
                let (partitioning, _lsn) = self
                    .repartition(self.initdb_lsn, self.get_compaction_target_size(), ctx)
                    .await?;
-                self.create_image_layers(&partitioning, self.initdb_lsn, true, ctx)
-                    .await?
+                // For image layers, we add them immediately into the layer map.
+                (
+                    self.create_image_layers(&partitioning, self.initdb_lsn, true, ctx)
+                        .await?,
+                    None,
+                )
            } else {
                #[cfg(test)]
                match &mut *self.flush_loop_state.lock().unwrap() {
@@ -2719,35 +2796,50 @@ impl Timeline {
                        assert!(!*expect_initdb_optimization, "expected initdb optimization");
                    }
                }
-                // normal case, write out a L0 delta layer file.
-                let (delta_path, metadata) = self.create_delta_layer(&frozen_layer).await?;
-                HashMap::from([(delta_path, metadata)])
+                // Normal case, write out a L0 delta layer file.
+                // `create_delta_layer` will not modify the layer map.
+                // We will remove frozen layer and add delta layer in one atomic operation later.
+                let layer = self.create_delta_layer(&frozen_layer).await?;
+                (
+                    HashMap::from([(layer.filename(), LayerFileMetadata::new(layer.file_size()))]),
+                    Some(layer),
+                )
            };

-        // FIXME: between create_delta_layer and the scheduling of the upload in `update_metadata_file`,
-        // a compaction can delete the file and then it won't be available for uploads any more.
-        // We still schedule the upload, resulting in an error, but ideally we'd somehow avoid this
-        // race situation.
-        // See https://github.com/neondatabase/neon/issues/4526
-
-        pausable_failpoint!("flush-frozen-before-sync");
-
        // The new on-disk layers are now in the layer map. We can remove the
        // in-memory layer from the map now. The flushed layer is stored in
        // the mapping in `create_delta_layer`.
        {
            let mut guard = self.layers.write().await;
-            let l = guard.layer_map_mut().frozen_layers.pop_front();

-            // Only one thread may call this function at a time (for this
-            // timeline). If two threads tried to flush the same frozen
-            // layer to disk at the same time, that would not work.
-            assert!(compare_arced_layers(&l.unwrap(), &frozen_layer));
+            if let Some(ref l) = delta_layer_to_add {
+                // TODO: move access stats, metrics update, etc. into layer manager.
+                l.access_stats().record_residence_event(
+                    &guard,
+                    LayerResidenceStatus::Resident,
+                    LayerResidenceEventReason::LayerCreate,
+                );

+                // update metrics
+                let sz = l.file_size();
+                self.metrics.resident_physical_size_gauge.add(sz);
+                self.metrics.num_persistent_files_created.inc_by(1);
+                self.metrics.persistent_bytes_written.inc_by(sz);
+            }
+
+            guard.finish_flush_l0_layer(delta_layer_to_add, &frozen_layer);
            // release lock on 'layers'
        }

-        fail_point!("checkpoint-after-sync");
+        // FIXME: between create_delta_layer and the scheduling of the upload in `update_metadata_file`,
+        // a compaction can delete the file and then it won't be available for uploads any more.
+        // We still schedule the upload, resulting in an error, but ideally we'd somehow avoid this
+        // race situation.
+        // See https://github.com/neondatabase/neon/issues/4526
+        pausable_failpoint!("flush-frozen-pausable");
+
+        // This failpoint is used by another test case `test_pageserver_recovery`.
+        fail_point!("flush-frozen-exit");

        // Update the metadata file, with new 'disk_consistent_lsn'
        //
@@ -2829,11 +2921,12 @@ impl Timeline {
        Ok(())
    }

-    // Write out the given frozen in-memory layer as a new L0 delta file
+    // Write out the given frozen in-memory layer as a new L0 delta file. This L0 file will not be tracked
+    // in the layer map immediately. The caller is responsible for putting it into the layer map.
    async fn create_delta_layer(
        self: &Arc<Self>,
        frozen_layer: &Arc<InMemoryLayer>,
-    ) -> anyhow::Result<(LayerFileName, LayerFileMetadata)> {
+    ) -> anyhow::Result<DeltaLayer> {
        let span = tracing::info_span!("blocking");
        let new_delta: DeltaLayer = tokio::task::spawn_blocking({
            let _g = span.entered();
@@ -2870,25 +2963,8 @@ impl Timeline {
        })
        .await
        .context("spawn_blocking")??;
-        let new_delta_name = new_delta.filename();
-        let sz = new_delta.desc.file_size;

-        // Add it to the layer map
-        let l = Arc::new(new_delta);
-        let mut guard = self.layers.write().await;
-        l.access_stats().record_residence_event(
-            &guard,
-            LayerResidenceStatus::Resident,
-            LayerResidenceEventReason::LayerCreate,
-        );
-        guard.track_new_l0_delta_layer(l);
-
-        // update metrics
-        self.metrics.resident_physical_size_gauge.add(sz);
-        self.metrics.num_persistent_files_created.inc_by(1);
-        self.metrics.persistent_bytes_written.inc_by(sz);
-
-        Ok((new_delta_name, LayerFileMetadata::new(sz)))
+        Ok(new_delta)
    }

    async fn repartition(
@@ -3140,7 +3216,7 @@ impl Timeline {

 #[derive(Default)]
 struct CompactLevel0Phase1Result {
-    new_layers: Vec<DeltaLayer>,
+    new_layers: Vec<Arc<DeltaLayer>>,
    deltas_to_compact: Vec<Arc<PersistentLayerDesc>>,
 }

@@ -3318,6 +3394,37 @@ impl Timeline {
            return Ok(CompactLevel0Phase1Result::default());
        }

+        // This failpoint is used together with the `test_duplicate_layers` integration test.
+        // It makes compaction return exactly the same layers as were given to it as input.
+        // We want to ensure that this will not cause any problem when updating the layer map
+        // after the compaction is finished.
+        //
+        // Currently, there are two rare edge cases that will cause duplicated layers to be
+        // inserted.
+        // 1. The compaction job is interrupted / did not finish successfully. Assume we have file 1, 2, 3, 4, which
+        //    is compacted to 5, but the page server is shut down, next time we start page server we will get a layer
+        //    map containing 1, 2, 3, 4, and 5, whereas 5 has the same content as 4. If we trigger L0 compaction at this
+        //    point again, it is likely that we will get a file 6 which has the same content and the key range as 5,
+        //    and this causes an overwrite. This is acceptable because the content is the same, and we should do a
+        //    layer replace instead of the normal remove / upload process.
+        // 2. The input workload pattern creates exactly n files that are sorted, non-overlapping and of the target
+        //    file size. Compaction will likely create the same set of n files afterwards.
+        //
+        // This failpoint is a superset of both of the cases.
+        fail_point!("compact-level0-phase1-return-same", |_| {
+            println!("compact-level0-phase1-return-same"); // so that we can check if we hit the failpoint
+            Ok(CompactLevel0Phase1Result {
+                new_layers: level0_deltas
+                    .iter()
+                    .map(|x| x.clone().downcast_delta_layer().unwrap())
+                    .collect(),
+                deltas_to_compact: level0_deltas
+                    .iter()
+                    .map(|x| x.layer_desc().clone().into())
+                    .collect(),
+            })
+        });
+
        // Gather the files to compact in this iteration.
        //
        // Start with the oldest Level 0 delta file, and collect any other
@@ -3400,7 +3507,7 @@ impl Timeline {
        let mut prev: Option<Key> = None;
        for (next_key, _next_lsn, _size) in itertools::process_results(
            deltas_to_compact.iter().map(|l| l.key_iter(ctx)),
-            |iter_iter| iter_iter.kmerge_by(|a, b| a.0 <= b.0),
+            |iter_iter| iter_iter.kmerge_by(|a, b| a.0 < b.0),
        )? {
            if let Some(prev_key) = prev {
                // just first fast filter
@@ -3440,11 +3547,7 @@ impl Timeline {
                iter_iter.kmerge_by(|a, b| {
                    if let Ok((a_key, a_lsn, _)) = a {
                        if let Ok((b_key, b_lsn, _)) = b {
-                            match a_key.cmp(b_key) {
-                                Ordering::Less => true,
-                                Ordering::Equal => a_lsn <= b_lsn,
-                                Ordering::Greater => false,
-                            }
+                            (a_key, a_lsn) < (b_key, b_lsn)
                        } else {
                            false
                        }
@@ -3462,11 +3565,7 @@ impl Timeline {
                iter_iter.kmerge_by(|a, b| {
                    let (a_key, a_lsn, _) = a;
                    let (b_key, b_lsn, _) = b;
-                    match a_key.cmp(b_key) {
-                        Ordering::Less => true,
-                        Ordering::Equal => a_lsn <= b_lsn,
-                        Ordering::Greater => false,
-                    }
+                    (a_key, a_lsn) < (b_key, b_lsn)
                })
            },
        )?;
@@ -3576,7 +3675,9 @@ impl Timeline {
                        || contains_hole
                    {
                        // ... if so, flush previous layer and prepare to write new one
-                        new_layers.push(writer.take().unwrap().finish(prev_key.unwrap().next())?);
+                        new_layers.push(Arc::new(
+                            writer.take().unwrap().finish(prev_key.unwrap().next())?,
+                        ));
                        writer = None;

                        if contains_hole {
@@ -3614,7 +3715,7 @@ impl Timeline {
            prev_key = Some(key);
        }
        if let Some(writer) = writer {
-            new_layers.push(writer.finish(prev_key.unwrap().next())?);
+            new_layers.push(Arc::new(writer.finish(prev_key.unwrap().next())?));
        }

        // Sync layers
@@ -3723,6 +3824,11 @@ impl Timeline {
        let mut guard = self.layers.write().await;
        let mut new_layer_paths = HashMap::with_capacity(new_layers.len());

+        // In some rare cases, we may generate a file with exactly the same key range / LSN as before the compaction.
+        // We should move to numbering the layer files instead of naming them using key range / LSN some day. But for
+        // now, we just skip the file to avoid unintentional modification to files on the disk and in the layer map.
+        let mut duplicated_layers = HashSet::new();
+
        let mut insert_layers = Vec::new();
        let mut remove_layers = Vec::new();

@@ -3749,21 +3855,33 @@ impl Timeline {
                .add(metadata.len());

            new_layer_paths.insert(new_delta_path, LayerFileMetadata::new(metadata.len()));
-            let x: Arc<dyn PersistentLayer + 'static> = Arc::new(l);
-            x.access_stats().record_residence_event(
+            l.access_stats().record_residence_event(
                &guard,
                LayerResidenceStatus::Resident,
                LayerResidenceEventReason::LayerCreate,
            );
-            insert_layers.push(x);
+            let l = l as Arc<dyn PersistentLayer>;
+            if guard.contains(&l) {
+                duplicated_layers.insert(l.layer_desc().key());
+            } else {
+                if LayerMap::is_l0(l.layer_desc()) {
+                    return Err(CompactionError::Other(anyhow!("compaction generates a L0 layer file as output, which will cause infinite compaction.")));
+                }
+                insert_layers.push(l);
+            }
        }

        // Now that we have reshuffled the data to set of new delta layers, we can
        // delete the old ones
        let mut layer_names_to_delete = Vec::with_capacity(deltas_to_compact.len());
-        for l in deltas_to_compact {
-            layer_names_to_delete.push(l.filename());
-            remove_layers.push(guard.get_from_desc(&l));
+        for ldesc in deltas_to_compact {
+            if duplicated_layers.contains(&ldesc.key()) {
+                // skip duplicated layers, they will not be removed; we have already overwritten them
+                // with new layers in the compaction phase 1.
+                continue;
+            }
+            layer_names_to_delete.push(ldesc.filename());
+            remove_layers.push(guard.get_from_desc(&ldesc));
        }

        guard.finish_compact_l0(
@@ -4522,6 +4640,7 @@ impl LocalLayerInfoForDiskUsageEviction {
 }

 impl Timeline {
+    /// Returns non-remote layers for eviction.
    pub(crate) async fn get_local_layers_for_disk_usage_eviction(&self) -> DiskUsageEvictionInfo {
        let guard = self.layers.read().await;
        let layers = guard.layer_map();
@@ -4691,3 +4810,179 @@ pub fn compare_arced_layers<L: ?Sized>(left: &Arc<L>, right: &Arc<L>) -> bool {

    left == right
 }
+
+#[cfg(test)]
+mod tests {
+    use std::sync::Arc;
+
+    use utils::{id::TimelineId, lsn::Lsn};
+
+    use crate::tenant::{harness::TenantHarness, storage_layer::PersistentLayer};
+
+    use super::{EvictionError, Timeline};
+
+    #[tokio::test]
+    async fn two_layer_eviction_attempts_at_the_same_time() {
+        let harness =
+            TenantHarness::create("two_layer_eviction_attempts_at_the_same_time").unwrap();
+
+        let remote_storage = {
+            // this is never used for anything, because of how the create_test_timeline works, but
+            // it is with us in spirit and a Some.
+            use remote_storage::{GenericRemoteStorage, RemoteStorageConfig, RemoteStorageKind};
+            let path = harness.conf.workdir.join("localfs");
+            std::fs::create_dir_all(&path).unwrap();
+            let config = RemoteStorageConfig {
+                max_concurrent_syncs: std::num::NonZeroUsize::new(2_000_000).unwrap(),
+                max_sync_errors: std::num::NonZeroU32::new(3_000_000).unwrap(),
+                storage: RemoteStorageKind::LocalFs(path),
+            };
+            GenericRemoteStorage::from_config(&config).unwrap()
+        };
+
+        let ctx = any_context();
+        let tenant = harness.try_load(&ctx, Some(remote_storage)).await.unwrap();
+        let timeline = tenant
+            .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
+            .await
+            .unwrap();
+
+        let rc = timeline
+            .remote_client
+            .clone()
+            .expect("just configured this");
+
+        let layer = find_some_layer(&timeline).await;
+
+        let cancel = tokio_util::sync::CancellationToken::new();
+        let batch = [layer];
+
+        let first = {
+            let cancel = cancel.clone();
+            async {
+                timeline
+                    .evict_layer_batch(&rc, &batch, cancel)
+                    .await
+                    .unwrap()
+            }
+        };
+        let second = async {
+            timeline
+                .evict_layer_batch(&rc, &batch, cancel)
+                .await
+                .unwrap()
+        };
+
+        let (first, second) = tokio::join!(first, second);
+
+        let (first, second) = (only_one(first), only_one(second));
+
+        match (first, second) {
+            (Ok(()), Err(EvictionError::FileNotFound))
+            | (Err(EvictionError::FileNotFound), Ok(())) => {
+                // one of the evictions gets to do it,
+                // other one gets FileNotFound. all is good.
+            }
+            other => unreachable!("unexpected {:?}", other),
+        }
+    }
+
+    #[tokio::test]
+    async fn layer_eviction_aba_fails() {
+        let harness = TenantHarness::create("layer_eviction_aba_fails").unwrap();
+
+        let remote_storage = {
+            // this is never used for anything, because of how the create_test_timeline works, but
+            // it is with us in spirit and a Some.
+            use remote_storage::{GenericRemoteStorage, RemoteStorageConfig, RemoteStorageKind};
+            let path = harness.conf.workdir.join("localfs");
+            std::fs::create_dir_all(&path).unwrap();
+            let config = RemoteStorageConfig {
+                max_concurrent_syncs: std::num::NonZeroUsize::new(2_000_000).unwrap(),
+                max_sync_errors: std::num::NonZeroU32::new(3_000_000).unwrap(),
+                storage: RemoteStorageKind::LocalFs(path),
+            };
+            GenericRemoteStorage::from_config(&config).unwrap()
+        };
+
+        let ctx = any_context();
+        let tenant = harness.try_load(&ctx, Some(remote_storage)).await.unwrap();
+        let timeline = tenant
+            .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
+            .await
+            .unwrap();
+
+        let _e = tracing::info_span!("foobar", tenant_id = %tenant.tenant_id, timeline_id = %timeline.timeline_id).entered();
+
+        let rc = timeline.remote_client.clone().unwrap();
+
+        // TenantHarness allows uploads to happen given GenericRemoteStorage is configured
+        let layer = find_some_layer(&timeline).await;
+
+        let cancel = tokio_util::sync::CancellationToken::new();
+        let batch = [layer];
+
+        let first = {
+            let cancel = cancel.clone();
+            async {
+                timeline
+                    .evict_layer_batch(&rc, &batch, cancel)
+                    .await
+                    .unwrap()
+            }
+        };
+
+        // let's imagine this is stuck somehow, still referencing the original `Arc<dyn PersistentLayer>`
+        let second = {
+            let cancel = cancel.clone();
+            async {
+                timeline
+                    .evict_layer_batch(&rc, &batch, cancel)
+                    .await
+                    .unwrap()
+            }
+        };
+
+        // while it's stuck, we evict and end up redownloading it
+        only_one(first.await).expect("eviction succeeded");
+
+        let layer = find_some_layer(&timeline).await;
+        let layer = layer.downcast_remote_layer().unwrap();
+        timeline.download_remote_layer(layer).await.unwrap();
+
+        let res = only_one(second.await);
+
+        assert!(
+            matches!(res, Err(EvictionError::LayerNotFound(_))),
+            "{res:?}"
+        );
+
+        // no more specific assertions: apart from precondition failures, this is the only
+        // valid replacement failure
+    }
+
+    fn any_context() -> crate::context::RequestContext {
+        use crate::context::*;
+        use crate::task_mgr::*;
+        RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error)
+    }
+
+    fn only_one<T>(mut input: Vec<Option<T>>) -> T {
+        assert_eq!(1, input.len());
+        input
+            .pop()
+            .expect("length just checked")
+            .expect("no cancellation")
+    }
+
+    async fn find_some_layer(timeline: &Timeline) -> Arc<dyn PersistentLayer> {
+        let layers = timeline.layers.read().await;
+        let desc = layers
+            .layer_map()
+            .iter_historic_layers()
+            .next()
+            .expect("must find one layer to evict");
+
+        layers.get_from_desc(&desc)
+    }
+}
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -30,6 +30,7 @@ use crate::{
    tenant::{
        config::{EvictionPolicy, EvictionPolicyLayerAccessThreshold},
        storage_layer::PersistentLayer,
+        timeline::EvictionError,
        LogicalSizeCalculationCause, Tenant,
    },
 };
@@ -100,11 +101,11 @@ impl Timeline {
            match cf {
                ControlFlow::Break(()) => break,
                ControlFlow::Continue(sleep_until) => {
-                    tokio::select! {
-                        _ = cancel.cancelled() => {
-                            break;
-                        }
-                        _ = tokio::time::sleep_until(sleep_until) => { }
+                    if tokio::time::timeout_at(sleep_until, cancel.cancelled())
+                        .await
+                        .is_ok()
+                    {
+                        break;
                    }
                }
            }
@@ -270,20 +271,22 @@ impl Timeline {
                None => {
                    stats.skipped_for_shutdown += 1;
                }
-                Some(Ok(true)) => {
-                    debug!("evicted layer {l:?}");
+                Some(Ok(())) => {
                    stats.evicted += 1;
                }
-                Some(Ok(false)) => {
-                    debug!("layer is not evictable: {l:?}");
+                Some(Err(EvictionError::CannotEvictRemoteLayer)) => {
                    stats.not_evictable += 1;
                }
-                Some(Err(e)) => {
-                    // This variant is the case where an unexpected error happened during eviction.
-                    // Expected errors that result in non-eviction are `Some(Ok(false))`.
-                    // So, dump Debug here to gather as much info as possible in this rare case.
-                    warn!("failed to evict layer {l:?}: {e:?}");
-                    stats.errors += 1;
+                Some(Err(EvictionError::FileNotFound)) => {
+                    // compaction/gc removed the file while we were waiting on layer_removal_cs
+                    stats.not_evictable += 1;
+                }
+                Some(Err(
+                    e @ EvictionError::LayerNotFound(_) | e @ EvictionError::StatFailed(_),
+                )) => {
+                    let e = utils::error::report_compact_sources(&e);
+                    warn!(layer = %l, "failed to evict layer: {e}");
+                    stats.not_evictable += 1;
                }
            }
        }
--- a/pageserver/src/tenant/timeline/layer_manager.rs
+++ b/pageserver/src/tenant/timeline/layer_manager.rs
@@ -194,10 +194,23 @@ impl LayerManager {
        updates.flush();
    }

-    /// Insert into the layer map when a new delta layer is created, called from `create_delta_layer`.
-    pub fn track_new_l0_delta_layer(&mut self, delta_layer: Arc<DeltaLayer>) {
+    /// Pop the flushed frozen layer and add the written delta layer (if any) to the layer map.
+    pub fn finish_flush_l0_layer(
+        &mut self,
+        delta_layer: Option<DeltaLayer>,
+        frozen_layer_for_check: &Arc<InMemoryLayer>,
+    ) {
+        let l = self.layer_map.frozen_layers.pop_front();
        let mut updates = self.layer_map.batch_update();
-        Self::insert_historic_layer(delta_layer, &mut updates, &mut self.layer_fmgr);
+
+        // Only one thread may call this function at a time (for this
+        // timeline). If two threads tried to flush the same frozen
+        // layer to disk at the same time, that would not work.
+        assert!(compare_arced_layers(&l.unwrap(), frozen_layer_for_check));
+
+        if let Some(delta_layer) = delta_layer {
+            Self::insert_historic_layer(Arc::new(delta_layer), &mut updates, &mut self.layer_fmgr);
+        }
        updates.flush();
    }

@@ -295,6 +308,10 @@ impl LayerManager {

        Ok(())
    }
+
+    pub(crate) fn contains(&self, layer: &Arc<dyn PersistentLayer>) -> bool {
+        self.layer_fmgr.contains(layer)
+    }
 }

 pub struct LayerFileManager<T: AsLayerDesc + ?Sized = dyn PersistentLayer>(
@@ -319,6 +336,10 @@ impl<T: AsLayerDesc + ?Sized> LayerFileManager<T> {
        }
    }

+    pub(crate) fn contains(&self, layer: &Arc<T>) -> bool {
+        self.0.contains_key(&layer.layer_desc().key())
+    }
+
    pub(crate) fn new() -> Self {
        Self(HashMap::new())
    }
--- a/pageserver/src/tenant/timeline/uninit.rs
+++ b/pageserver/src/tenant/timeline/uninit.rs
@@ -2,13 +2,9 @@ use std::{collections::hash_map::Entry, fs, path::PathBuf, sync::Arc};

 use anyhow::Context;
 use tracing::{error, info, info_span, warn};
-use utils::{crashsafe, id::TimelineId, lsn::Lsn};
+use utils::{crashsafe, fs_ext, id::TimelineId, lsn::Lsn};

-use crate::{
-    context::RequestContext,
-    import_datadir,
-    tenant::{ignore_absent_files, Tenant},
-};
+use crate::{context::RequestContext, import_datadir, tenant::Tenant};

 use super::Timeline;

@@ -141,7 +137,7 @@ impl Drop for UninitializedTimeline<'_> {

 pub(crate) fn cleanup_timeline_directory(uninit_mark: TimelineUninitMark) {
    let timeline_path = &uninit_mark.timeline_path;
-    match ignore_absent_files(|| fs::remove_dir_all(timeline_path)) {
+    match fs_ext::ignore_absent_files(|| fs::remove_dir_all(timeline_path)) {
        Ok(()) => {
            info!("Timeline dir {timeline_path:?} removed successfully, removing the uninit mark")
        }
@@ -185,7 +181,7 @@ impl TimelineUninitMark {
        let uninit_mark_parent = uninit_mark_file
            .parent()
            .with_context(|| format!("Uninit mark file {uninit_mark_file:?} has no parent"))?;
-        ignore_absent_files(|| fs::remove_file(uninit_mark_file)).with_context(|| {
+        fs_ext::ignore_absent_files(|| fs::remove_file(uninit_mark_file)).with_context(|| {
            format!("Failed to remove uninit mark file at path {uninit_mark_file:?}")
        })?;
        crashsafe::fsync(uninit_mark_parent).context("Failed to fsync uninit mark parent")?;
--- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
@@ -1123,7 +1123,7 @@ mod tests {
    }

    #[tokio::test]
-    async fn lsn_wal_over_threshhold_current_candidate() -> anyhow::Result<()> {
+    async fn lsn_wal_over_threshold_current_candidate() -> anyhow::Result<()> {
        let harness = TenantHarness::create("lsn_wal_over_threshcurrent_candidate")?;
        let mut state = dummy_state(&harness).await;
        let current_lsn = Lsn(100_000).align();
@@ -1189,8 +1189,8 @@ mod tests {
    }

    #[tokio::test]
-    async fn timeout_connection_threshhold_current_candidate() -> anyhow::Result<()> {
-        let harness = TenantHarness::create("timeout_connection_threshhold_current_candidate")?;
+    async fn timeout_connection_threshold_current_candidate() -> anyhow::Result<()> {
+        let harness = TenantHarness::create("timeout_connection_threshold_current_candidate")?;
        let mut state = dummy_state(&harness).await;
        let current_lsn = Lsn(100_000).align();
        let now = Utc::now().naive_utc();
@@ -1252,8 +1252,8 @@ mod tests {
    }

    #[tokio::test]
-    async fn timeout_wal_over_threshhold_current_candidate() -> anyhow::Result<()> {
-        let harness = TenantHarness::create("timeout_wal_over_threshhold_current_candidate")?;
+    async fn timeout_wal_over_threshold_current_candidate() -> anyhow::Result<()> {
+        let harness = TenantHarness::create("timeout_wal_over_threshold_current_candidate")?;
        let mut state = dummy_state(&harness).await;
        let current_lsn = Lsn(100_000).align();
        let new_lsn = Lsn(100_100).align();
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -149,12 +149,10 @@ impl OpenFiles {
        // old file.
        //
        if let Some(old_file) = slot_guard.file.take() {
-            // We do not have information about tenant_id/timeline_id of evicted file.
-            // It is possible to store path together with file or use filepath crate,
-            // but as far as close() is not expected to be fast, it is not so critical to gather
-            // precise per-tenant statistic here.
+            // the normal path of dropping VirtualFile uses "close", use "close-by-replace" here to
+            // distinguish the two.
            STORAGE_IO_TIME
-                .with_label_values(&["close", "-", "-"])
+                .with_label_values(&["close-by-replace"])
                .observe_closure_duration(|| drop(old_file));
        }

@@ -208,7 +206,7 @@ impl VirtualFile {
        }
        let (handle, mut slot_guard) = get_open_files().find_victim_slot();
        let file = STORAGE_IO_TIME
-            .with_label_values(&["open", &tenant_id, &timeline_id])
+            .with_label_values(&["open"])
            .observe_closure_duration(|| open_options.open(path))?;

        // Strip all options other than read and write.
@@ -271,7 +269,7 @@ impl VirtualFile {
                            // Found a cached file descriptor.
                            slot.recently_used.store(true, Ordering::Relaxed);
                            return Ok(STORAGE_IO_TIME
-                                .with_label_values(&[op, &self.tenant_id, &self.timeline_id])
+                                .with_label_values(&[op])
                                .observe_closure_duration(|| func(file)));
                        }
                    }
@@ -298,12 +296,12 @@ impl VirtualFile {

        // Open the physical file
        let file = STORAGE_IO_TIME
-            .with_label_values(&["open", &self.tenant_id, &self.timeline_id])
+            .with_label_values(&["open"])
            .observe_closure_duration(|| self.open_options.open(&self.path))?;

        // Perform the requested operation on it
        let result = STORAGE_IO_TIME
-            .with_label_values(&[op, &self.tenant_id, &self.timeline_id])
+            .with_label_values(&[op])
            .observe_closure_duration(|| func(&file));

        // Store the File in the slot and update the handle in the VirtualFile
@@ -333,13 +331,11 @@ impl Drop for VirtualFile {
        let mut slot_guard = slot.inner.write().unwrap();
        if slot_guard.tag == handle.tag {
            slot.recently_used.store(false, Ordering::Relaxed);
-            // Unlike files evicted by replacement algorithm, here
-            // we group close time by tenant_id/timeline_id.
-            // At allows to compare number/time of "normal" file closes
-            // with file eviction.
+            // there is also a "close-by-replace" operation, recorded for closes done on
+            // eviction, for comparison.
            STORAGE_IO_TIME
-                .with_label_values(&["close", &self.tenant_id, &self.timeline_id])
-                .observe_closure_duration(|| slot_guard.file.take());
+                .with_label_values(&["close"])
+                .observe_closure_duration(|| drop(slot_guard.file.take()));
        }
    }
 }
--- a/pageserver/src/walrecord.rs
+++ b/pageserver/src/walrecord.rs
@@ -360,7 +360,6 @@ impl XlXactParsedRecord {
            }
        }
        let mut xnodes = Vec::<RelFileNode>::new();
-        // In v16 this XACT_XINFO_HAS_RELFILENODES is renamed to XACT_XINFO_HAS_RELFILELOCATORS
        if xinfo & pg_constants::XACT_XINFO_HAS_RELFILENODES != 0 {
            let nrels = buf.get_i32_le();
            for _i in 0..nrels {
--- a/pgxn/hnsw/hnsw.control
+++ b/pgxn/hnsw/hnsw.control
@@ -1,4 +1,4 @@
-comment = 'hnsw index'
+comment = '** Deprecated ** Please use pg_embedding instead'
 default_version = '0.1.0'
 module_pathname = '$libdir/hnsw'
 relocatable = true
--- a/pgxn/neon/file_cache.c
+++ b/pgxn/neon/file_cache.c
@@ -25,11 +25,7 @@
 #include "pagestore_client.h"
 #include "access/parallel.h"
 #include "postmaster/bgworker.h"
-#if PG_VERSION_NUM >= 160000
-#include "storage/relfilelocator.h"
-#else
 #include "storage/relfilenode.h"
-#endif
 #include "storage/buf_internals.h"
 #include "storage/latch.h"
 #include "storage/ipc.h"
@@ -43,7 +39,6 @@
 #include "postmaster/bgworker.h"
 #include "postmaster/interrupt.h"

-
 /*
 * Local file cache is used to temporary store relations pages in local file system.
 * All blocks of all relations are stored inside one file and addressed using shared hash map.
@@ -365,12 +360,9 @@ lfc_cache_contains(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno)
 	if (lfc_size_limit == 0) /* fast exit if file cache is disabled */
 		return false;

-#if PG_VERSION_NUM >= 160000
-	InitBufferTag(&tag, &rnode, forkNum, (blkno & ~(BLOCKS_PER_CHUNK-1)));
-#else
-	INIT_BUFFERTAG(tag, rnode, forkNum, (blkno & ~(BLOCKS_PER_CHUNK-1)));
-#endif
-
+	tag.rnode = rnode;
+	tag.forkNum = forkNum;
+	tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK-1);
 	hash = get_hash_value(lfc_hash, &tag);

 	LWLockAcquire(lfc_lock, LW_SHARED);
@@ -395,11 +387,7 @@ lfc_evict(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno)
 	if (lfc_size_limit == 0) /* fast exit if file cache is disabled */
 		return;

-#if PG_VERSION_NUM >= 160000
-	InitBufferTag(&tag, &rnode, forkNum, (blkno & ~(BLOCKS_PER_CHUNK-1)));
-#else
 	INIT_BUFFERTAG(tag, rnode, forkNum, (blkno & ~(BLOCKS_PER_CHUNK-1)));
-#endif

 	hash = get_hash_value(lfc_hash, &tag);

@@ -469,12 +457,10 @@ lfc_read(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,

 	if (lfc_size_limit == 0) /* fast exit if file cache is disabled */
 		return false;
-#if PG_VERSION_NUM >= 160000
-	InitBufferTag(&tag, &rnode, forkNum, (blkno & ~(BLOCKS_PER_CHUNK-1)));
-#else
-	INIT_BUFFERTAG(tag, rnode, forkNum, (blkno & ~(BLOCKS_PER_CHUNK-1)));
-#endif

+	tag.rnode = rnode;
+	tag.forkNum = forkNum;
+	tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK-1);
 	hash = get_hash_value(lfc_hash, &tag);

 	LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
@@ -540,12 +526,9 @@ lfc_write(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
 	if (lfc_size_limit == 0) /* fast exit if file cache is disabled */
 		return;

-#if PG_VERSION_NUM >= 160000
-	InitBufferTag(&tag, &rnode, forkNum, (blkno & ~(BLOCKS_PER_CHUNK-1)));
-#else
-	INIT_BUFFERTAG(tag, rnode, forkNum, (blkno & ~(BLOCKS_PER_CHUNK-1)));
-#endif
-
+	tag.rnode = rnode;
+	tag.forkNum = forkNum;
+	tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK-1);
 	hash = get_hash_value(lfc_hash, &tag);

 	LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
@@ -739,16 +722,9 @@ local_cache_pages(PG_FUNCTION_ARGS)
 				if (entry->bitmap[i >> 5] & (1 << (i & 31)))
 				{
 					fctx->record[n_pages].pageoffs = entry->offset*BLOCKS_PER_CHUNK + i;
-
-#if PG_VERSION_NUM >= 160000
-					fctx->record[n_pages].relfilenode =  entry->key.relNumber;
-					fctx->record[n_pages].reltablespace = entry->key.spcOid;
-					fctx->record[n_pages].reldatabase = entry->key.dbOid;
-#else
 					fctx->record[n_pages].relfilenode = entry->key.rnode.relNode;
 					fctx->record[n_pages].reltablespace = entry->key.rnode.spcNode;
 					fctx->record[n_pages].reldatabase = entry->key.rnode.dbNode;
-#endif
 					fctx->record[n_pages].forknum = entry->key.forkNum;
 					fctx->record[n_pages].blocknum = entry->key.blockNum + i;
 					fctx->record[n_pages].accesscount = entry->access_count;
--- a/pgxn/neon/libpqwalproposer.c
+++ b/pgxn/neon/libpqwalproposer.c
@@ -292,7 +292,7 @@ walprop_async_read(WalProposerConn *conn, char **buf, int *amount)
 	/*
 	 * The docs for PQgetCopyData list the return values as: 0 if the copy is
 	 * still in progress, but no "complete row" is available -1 if the copy is
-	 * done -2 if an error occured (> 0) if it was successful; that value is
+	 * done -2 if an error occurred (> 0) if it was successful; that value is
 	 * the amount transferred.
 	 *
 	 * The protocol we use between walproposer and safekeeper means that we
@@ -353,7 +353,7 @@ walprop_async_write(WalProposerConn *conn, void const *buf, size_t size)
 	/*
 	 * The docs for PQputcopyData list the return values as: 1 if the data was
 	 * queued, 0 if it was not queued because of full buffers, or -1 if an
-	 * error occured
+	 * error occurred
 	 */
 	result = PQputCopyData(conn->pg_conn, buf, size);

--- a/pgxn/neon/pagestore_client.h
+++ b/pgxn/neon/pagestore_client.h
@@ -16,11 +16,7 @@
 #include "postgres.h"

 #include "access/xlogdefs.h"
-#if PG_VERSION_NUM >= 160000
-#include "storage/relfilelocator.h"
-#else
 #include "storage/relfilenode.h"
-#endif
 #include "storage/block.h"
 #include "storage/smgr.h"
 #include "lib/stringinfo.h"
@@ -29,34 +25,6 @@

 #include "pg_config.h"

-// This is a hack to avoid too many ifdefs in the function definitions.
-#if PG_VERSION_NUM >= 160000
-typedef RelFileLocator RelFileNode;
-typedef RelFileLocatorBackend RelFileNodeBackend;
-#define RelFileNodeBackendIsTemp RelFileLocatorBackendIsTemp
-#endif
-
-#if PG_VERSION_NUM >= 160000
-#define RelnGetRnode(reln) (reln->smgr_rlocator.locator)
-#define RnodeGetSpcOid(rnode) (rnode.spcOid)
-#define RnodeGetDbOid(rnode) (rnode.dbOid)
-#define RnodeGetRelNumber(rnode) (rnode.relNumber)
-
-#define BufTagGetRnode(tag) (BufTagGetRelFileLocator(&tag))
-#else
-#define RelnGetRnode(reln) (reln->smgr_rnode.node)
-#define RnodeGetSpcOid(rnode) (rnode.spcNode)
-#define RnodeGetDbOid(rnode) (rnode.dbNode)
-#define RnodeGetRelNumber(rnode) (rnode.relNode)
-
-#define BufTagGetRnode(tag) (tag.rnode)
-
-#endif
-
-#define RelnGetSpcOid(reln) (RnodeGetRelNumber(RelnGetRnode(reln)))
-#define RelnGetDbOid(reln) (RnodeGetDbOid(RelnGetRnode(reln)))
-#define RelnGetRelNumber(reln) (RnodeGetRelNumber(RelnGetRnode(reln)))
-
 typedef enum
 {
 	/* pagestore_client -> pagestore */
@@ -117,7 +85,7 @@ typedef struct
 typedef struct
 {
 	NeonRequest req;
-	Oid			dbOid;
+	Oid			dbNode;
 }			NeonDbSizeRequest;

 typedef struct
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -58,11 +58,7 @@
 #include "postmaster/autovacuum.h"
 #include "replication/walsender.h"
 #include "storage/bufmgr.h"
-#if PG_VERSION_NUM >= 160000
-#include "storage/relfilelocator.h"
-#else
 #include "storage/relfilenode.h"
-#endif
 #include "storage/buf_internals.h"
 #include "storage/smgr.h"
 #include "storage/md.h"
@@ -74,8 +70,6 @@
 #include "access/xlogrecovery.h"
 #endif

-
-
 /*
 * If DEBUG_COMPARE_LOCAL is defined, we pass through all the SMGR API
 * calls to md.c, and *also* do the calls to the Page Server. On every
@@ -92,10 +86,7 @@
 static char *hexdump_page(char *page);
 #endif

-
-#define IS_LOCAL_REL(reln) (RelnGetDbOid(reln) != 0 && RelnGetRelNumber(reln) > FirstNormalObjectId)
-
-
+#define IS_LOCAL_REL(reln) (reln->smgr_rnode.node.dbNode != 0 && reln->smgr_rnode.node.relNode > FirstNormalObjectId)

 const int	SmgrTrace = DEBUG5;

@@ -193,13 +184,7 @@ typedef struct PrfHashEntry {
 	sizeof(BufferTag) \
 )

-
-#if PG_VERSION_NUM >= 160000
-#define SH_EQUAL(tb, a, b)	(BufferTagsEqual(&((a)->buftag),&((b)->buftag)))
-#else
 #define SH_EQUAL(tb, a, b)	(BUFFERTAGS_EQUAL((a)->buftag, (b)->buftag))
-#endif
-
 #define SH_SCOPE			static inline
 #define SH_DEFINE
 #define SH_DECLARE
@@ -649,7 +634,7 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force
 		.req.tag = T_NeonGetPageRequest,
 		.req.latest = false,
 		.req.lsn = 0,
-		.rnode = BufTagGetRnode(slot->buftag),
+		.rnode = slot->buftag.rnode,
 		.forknum = slot->buftag.forkNum,
 		.blkno = slot->buftag.blockNum,
 	};
@@ -664,7 +649,7 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force
 	{
 		XLogRecPtr lsn = neon_get_request_lsn(
 			&request.req.latest,
-			BufTagGetRnode(slot->buftag),
+			slot->buftag.rnode,
 			slot->buftag.forkNum,
 			slot->buftag.blockNum
 		);
@@ -744,11 +729,8 @@ prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_ls
 		Assert(slot->status != PRFS_UNUSED);
 		Assert(MyPState->ring_last <= ring_index &&
 			   ring_index < MyPState->ring_unused);
-#if PG_VERSION_NUM >= 160000
-		Assert(BufferTagsEqual(&slot->buftag, &tag));
-#else
 		Assert(BUFFERTAGS_EQUAL(slot->buftag, tag));
-#endif
+
 		/*
 		 * If we want a specific lsn, we do not accept requests that were made
 		 * with a potentially different LSN.
@@ -911,9 +893,9 @@ nm_pack_request(NeonRequest * msg)

 				pq_sendbyte(&s, msg_req->req.latest);
 				pq_sendint64(&s, msg_req->req.lsn);
-				pq_sendint32(&s, RnodeGetSpcOid(msg_req->rnode));
-				pq_sendint32(&s, RnodeGetDbOid(msg_req->rnode));
-				pq_sendint32(&s, RnodeGetRelNumber(msg_req->rnode));
+				pq_sendint32(&s, msg_req->rnode.spcNode);
+				pq_sendint32(&s, msg_req->rnode.dbNode);
+				pq_sendint32(&s, msg_req->rnode.relNode);
 				pq_sendbyte(&s, msg_req->forknum);

 				break;
@@ -924,9 +906,9 @@ nm_pack_request(NeonRequest * msg)

 				pq_sendbyte(&s, msg_req->req.latest);
 				pq_sendint64(&s, msg_req->req.lsn);
-				pq_sendint32(&s, RnodeGetSpcOid(msg_req->rnode));
-				pq_sendint32(&s, RnodeGetDbOid(msg_req->rnode));
-				pq_sendint32(&s, RnodeGetRelNumber(msg_req->rnode));
+				pq_sendint32(&s, msg_req->rnode.spcNode);
+				pq_sendint32(&s, msg_req->rnode.dbNode);
+				pq_sendint32(&s, msg_req->rnode.relNode);
 				pq_sendbyte(&s, msg_req->forknum);

 				break;
@@ -937,7 +919,7 @@ nm_pack_request(NeonRequest * msg)

 				pq_sendbyte(&s, msg_req->req.latest);
 				pq_sendint64(&s, msg_req->req.lsn);
-				pq_sendint32(&s, msg_req->dbOid);
+				pq_sendint32(&s, msg_req->dbNode);

 				break;
 			}
@@ -947,9 +929,9 @@ nm_pack_request(NeonRequest * msg)

 				pq_sendbyte(&s, msg_req->req.latest);
 				pq_sendint64(&s, msg_req->req.lsn);
-				pq_sendint32(&s, RnodeGetSpcOid(msg_req->rnode));
-				pq_sendint32(&s, RnodeGetDbOid(msg_req->rnode));
-				pq_sendint32(&s, RnodeGetRelNumber(msg_req->rnode));
+				pq_sendint32(&s, msg_req->rnode.spcNode);
+				pq_sendint32(&s, msg_req->rnode.dbNode);
+				pq_sendint32(&s, msg_req->rnode.relNode);
 				pq_sendbyte(&s, msg_req->forknum);
 				pq_sendint32(&s, msg_req->blkno);

@@ -1082,9 +1064,9 @@ nm_to_string(NeonMessage * msg)

 				appendStringInfoString(&s, "{\"type\": \"NeonExistsRequest\"");
 				appendStringInfo(&s, ", \"rnode\": \"%u/%u/%u\"",
-								 RnodeGetSpcOid(msg_req->rnode),
-								 RnodeGetDbOid(msg_req->rnode),
-								 RnodeGetRelNumber(msg_req->rnode));
+								 msg_req->rnode.spcNode,
+								 msg_req->rnode.dbNode,
+								 msg_req->rnode.relNode);
 				appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum);
 				appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn));
 				appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest);
@@ -1098,9 +1080,9 @@ nm_to_string(NeonMessage * msg)

 				appendStringInfoString(&s, "{\"type\": \"NeonNblocksRequest\"");
 				appendStringInfo(&s, ", \"rnode\": \"%u/%u/%u\"",
-								 RnodeGetSpcOid(msg_req->rnode),
-								 RnodeGetDbOid(msg_req->rnode),
-								 RnodeGetRelNumber(msg_req->rnode));
+								 msg_req->rnode.spcNode,
+								 msg_req->rnode.dbNode,
+								 msg_req->rnode.relNode);
 				appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum);
 				appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn));
 				appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest);
@@ -1114,9 +1096,9 @@ nm_to_string(NeonMessage * msg)

 				appendStringInfoString(&s, "{\"type\": \"NeonGetPageRequest\"");
 				appendStringInfo(&s, ", \"rnode\": \"%u/%u/%u\"",
-								 RnodeGetSpcOid(msg_req->rnode),
-								 RnodeGetDbOid(msg_req->rnode),
-								 RnodeGetRelNumber(msg_req->rnode));
+								 msg_req->rnode.spcNode,
+								 msg_req->rnode.dbNode,
+								 msg_req->rnode.relNode);
 				appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum);
 				appendStringInfo(&s, ", \"blkno\": %u", msg_req->blkno);
 				appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn));
@@ -1129,7 +1111,7 @@ nm_to_string(NeonMessage * msg)
 				NeonDbSizeRequest *msg_req = (NeonDbSizeRequest *) msg;

 				appendStringInfoString(&s, "{\"type\": \"NeonDbSizeRequest\"");
-				appendStringInfo(&s, ", \"dbnode\": \"%u\"", msg_req->dbOid);
+				appendStringInfo(&s, ", \"dbnode\": \"%u\"", msg_req->dbNode);
 				appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn));
 				appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest);
 				appendStringInfoChar(&s, '}');
@@ -1231,7 +1213,6 @@ static void
 neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool force)
 {
 	XLogRecPtr	lsn = PageGetLSN(buffer);
-	RelFileNode rnode = RelnGetRnode(reln);

 	if (ShutdownRequestPending)
 		return;
@@ -1251,16 +1232,15 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, ch
 		/* FSM is never WAL-logged and we don't care. */
 		XLogRecPtr	recptr;

-
-		recptr = log_newpage_copy(&rnode, forknum, blocknum, buffer, false);
+		recptr = log_newpage_copy(&reln->smgr_rnode.node, forknum, blocknum, buffer, false);
 		XLogFlush(recptr);
 		lsn = recptr;
 		ereport(SmgrTrace,
 				(errmsg("Page %u of relation %u/%u/%u.%u was force logged. Evicted at lsn=%X/%X",
 						blocknum,
-						RelnGetSpcOid(reln),
-						RelnGetDbOid(reln),
-						RelnGetRelNumber(reln),
+						reln->smgr_rnode.node.spcNode,
+						reln->smgr_rnode.node.dbNode,
+						reln->smgr_rnode.node.relNode,
 						forknum, LSN_FORMAT_ARGS(lsn))));
 	}
 	else if (lsn == InvalidXLogRecPtr)
@@ -1288,9 +1268,9 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, ch
 			ereport(SmgrTrace,
 					(errmsg("Page %u of relation %u/%u/%u.%u is all-zeros",
 							blocknum,
-							RelnGetSpcOid(reln),
-							RelnGetDbOid(reln),
-							RelnGetRelNumber(reln),
+							reln->smgr_rnode.node.spcNode,
+							reln->smgr_rnode.node.dbNode,
+							reln->smgr_rnode.node.relNode,
 							forknum)));
 		}
 		else if (PageIsEmptyHeapPage(buffer))
@@ -1298,9 +1278,9 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, ch
 			ereport(SmgrTrace,
 					(errmsg("Page %u of relation %u/%u/%u.%u is an empty heap page with no LSN",
 							blocknum,
-							RelnGetSpcOid(reln),
-							RelnGetDbOid(reln),
-							RelnGetRelNumber(reln),
+							reln->smgr_rnode.node.spcNode,
+							reln->smgr_rnode.node.dbNode,
+							reln->smgr_rnode.node.relNode,
 							forknum)));
 		}
 		else
@@ -1308,9 +1288,9 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, ch
 			ereport(PANIC,
 					(errmsg("Page %u of relation %u/%u/%u.%u is evicted with zero LSN",
 							blocknum,
-							RelnGetSpcOid(reln),
-							RelnGetDbOid(reln),
-							RelnGetRelNumber(reln),
+							reln->smgr_rnode.node.spcNode,
+							reln->smgr_rnode.node.dbNode,
+							reln->smgr_rnode.node.relNode,
 							forknum)));
 		}
 	}
@@ -1319,9 +1299,9 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, ch
 		ereport(SmgrTrace,
 				(errmsg("Page %u of relation %u/%u/%u.%u is already wal logged at lsn=%X/%X",
 						blocknum,
-						RelnGetSpcOid(reln),
-						RelnGetDbOid(reln),
-						RelnGetRelNumber(reln),
+						reln->smgr_rnode.node.spcNode,
+						reln->smgr_rnode.node.dbNode,
+						reln->smgr_rnode.node.relNode,
 						forknum, LSN_FORMAT_ARGS(lsn))));
 	}

@@ -1329,7 +1309,7 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, ch
 	 * Remember the LSN on this page. When we read the page again, we must
 	 * read the same or newer version of it.
 	 */
-	SetLastWrittenLSNForBlock(lsn, rnode, forknum, blocknum);
+	SetLastWrittenLSNForBlock(lsn, reln->smgr_rnode.node, forknum, blocknum);
 }

 /*
@@ -1479,7 +1459,6 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
 	BlockNumber n_blocks;
 	bool		latest;
 	XLogRecPtr	request_lsn;
-	RelFileNode rnode = RelnGetRnode(reln);

 	switch (reln->smgr_relpersistence)
 	{
@@ -1506,7 +1485,7 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
 			elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
 	}

-	if (get_cached_relsize(RelnGetRnode(reln), forkNum, &n_blocks))
+	if (get_cached_relsize(reln->smgr_rnode.node, forkNum, &n_blocks))
 	{
 		return true;
 	}
@@ -1521,20 +1500,20 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
 	 *
 	 * For now, handle that special case here.
 	 */
-	if (RelnGetSpcOid(reln) == 0 &&
-		RelnGetDbOid(reln) == 0 &&
-		RelnGetRelNumber(reln) == 0)
+	if (reln->smgr_rnode.node.spcNode == 0 &&
+		reln->smgr_rnode.node.dbNode == 0 &&
+		reln->smgr_rnode.node.relNode == 0)
 	{
 		return false;
 	}

-	request_lsn = neon_get_request_lsn(&latest, rnode, forkNum, REL_METADATA_PSEUDO_BLOCKNO);
+	request_lsn = neon_get_request_lsn(&latest, reln->smgr_rnode.node, forkNum, REL_METADATA_PSEUDO_BLOCKNO);
 	{
 		NeonExistsRequest request = {
 			.req.tag = T_NeonExistsRequest,
 			.req.latest = latest,
 			.req.lsn = request_lsn,
-			.rnode = rnode,
+			.rnode = reln->smgr_rnode.node,
 		.forknum = forkNum};

 		resp = page_server_request(&request);
@@ -1550,9 +1529,9 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
 			ereport(ERROR,
 					(errcode(ERRCODE_IO_ERROR),
 					 errmsg("could not read relation existence of rel %u/%u/%u.%u from page server at lsn %X/%08X",
-							RelnGetSpcOid(reln),
-							RelnGetDbOid(reln),
-							RelnGetRelNumber(reln),
+							reln->smgr_rnode.node.spcNode,
+							reln->smgr_rnode.node.dbNode,
+							reln->smgr_rnode.node.relNode,
 							forkNum,
 							(uint32) (request_lsn >> 32), (uint32) request_lsn),
 					 errdetail("page server returned error: %s",
@@ -1574,8 +1553,6 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
 void
 neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
 {
-	RelFileNode rnode = RelnGetRnode(reln);
-
 	switch (reln->smgr_relpersistence)
 	{
 		case 0:
@@ -1594,8 +1571,9 @@ neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
 	}

 	elog(SmgrTrace, "Create relation %u/%u/%u.%u",
-		 RelnGetSpcOid(reln),
-		 RelnGetDbOid(reln), RelnGetRelNumber(reln),
+		 reln->smgr_rnode.node.spcNode,
+		 reln->smgr_rnode.node.dbNode,
+		 reln->smgr_rnode.node.relNode,
 		 forkNum);

 	/*
@@ -1619,12 +1597,12 @@ neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
 	 */
 	if (isRedo)
 	{
-		update_cached_relsize(rnode, forkNum, 0);
-		get_cached_relsize(rnode, forkNum,
+		update_cached_relsize(reln->smgr_rnode.node, forkNum, 0);
+		get_cached_relsize(reln->smgr_rnode.node, forkNum,
 						   &reln->smgr_cached_nblocks[forkNum]);
 	}
 	else
-		set_cached_relsize(rnode, forkNum, 0);
+		set_cached_relsize(reln->smgr_rnode.node, forkNum, 0);

 #ifdef DEBUG_COMPARE_LOCAL
 	if (IS_LOCAL_REL(reln))
@@ -1661,12 +1639,7 @@ neon_unlink(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo)
 	mdunlink(rnode, forkNum, isRedo);
 	if (!RelFileNodeBackendIsTemp(rnode))
 	{
-
-#if PG_VERSION_NUM >= 160000
-		forget_cached_relsize(rnode.locator, forkNum);
-#else
 		forget_cached_relsize(rnode.node, forkNum);
-#endif
 	}
 }

@@ -1685,7 +1658,6 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
 {
 	XLogRecPtr	lsn;
 	BlockNumber	n_blocks = 0;
-	RelFileNode rnode = RelnGetRnode(reln);

 	switch (reln->smgr_relpersistence)
 	{
@@ -1735,16 +1707,17 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
 		neon_wallog_page(reln, forkNum, n_blocks++, buffer, true);

 	neon_wallog_page(reln, forkNum, blkno, buffer, false);
-	set_cached_relsize(rnode, forkNum, blkno + 1);
+	set_cached_relsize(reln->smgr_rnode.node, forkNum, blkno + 1);

 	lsn = PageGetLSN(buffer);
 	elog(SmgrTrace, "smgrextend called for %u/%u/%u.%u blk %u, page LSN: %X/%08X",
-		 RelnGetSpcOid(reln),
-		 RelnGetDbOid(reln), RelnGetRelNumber(reln),
+		 reln->smgr_rnode.node.spcNode,
+		 reln->smgr_rnode.node.dbNode,
+		 reln->smgr_rnode.node.relNode,
 		 forkNum, blkno,
 		 (uint32) (lsn >> 32), (uint32) lsn);

-	lfc_write(rnode, forkNum, blkno, buffer);
+	lfc_write(reln->smgr_rnode.node, forkNum, blkno, buffer);

 #ifdef DEBUG_COMPARE_LOCAL
 	if (IS_LOCAL_REL(reln))
@@ -1759,9 +1732,9 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
 	if (lsn == InvalidXLogRecPtr)
 	{
 		lsn = GetXLogInsertRecPtr();
-		SetLastWrittenLSNForBlock(lsn, rnode, forkNum, blkno);
+		SetLastWrittenLSNForBlock(lsn, reln->smgr_rnode.node, forkNum, blkno);
 	}
-	SetLastWrittenLSNForRelation(lsn, rnode, forkNum);
+	SetLastWrittenLSNForRelation(lsn, reln->smgr_rnode.node, forkNum);
 }

 /*
@@ -1805,8 +1778,6 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
 	BufferTag	tag;
 	uint64		ring_index PG_USED_FOR_ASSERTS_ONLY;

-	RelFileNode rnode = RelnGetRnode(reln);
-
 	switch (reln->smgr_relpersistence)
 	{
 		case 0: /* probably shouldn't happen, but ignore it */
@@ -1821,18 +1792,15 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
 			elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
 	}

-	if (lfc_cache_contains(rnode, forknum, blocknum))
+	if (lfc_cache_contains(reln->smgr_rnode.node, forknum, blocknum))
 		return false;

-#if PG_VERSION_NUM >= 160000
-	InitBufferTag(&tag, &rnode, forknum, blocknum);
-#else
 	tag = (BufferTag) {
-		.rnode = rnode,
+		.rnode = reln->smgr_rnode.node,
 		.forkNum = forknum,
 		.blockNum = blocknum
 	};
-#endif
+
 	ring_index = prefetch_register_buffer(tag, NULL, NULL);

 	Assert(ring_index < MyPState->ring_unused &&
@@ -1893,15 +1861,11 @@ neon_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
 	PrfHashEntry *entry;
 	PrefetchRequest *slot;

-#if PG_VERSION_NUM >= 160000
-	InitBufferTag(&buftag, &rnode, forkNum, blkno);
-#else
 	buftag = (BufferTag) {
 		.rnode = rnode,
 		.forkNum = forkNum,
-		.blockNum = blkno
+		.blockNum = blkno,
 	};
-#endif

 	/*
 	 * The redo process does not lock pages that it needs to replay but are
@@ -2001,9 +1965,9 @@ neon_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
 					(errcode(ERRCODE_IO_ERROR),
 					 errmsg("could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X",
 							blkno,
-							RnodeGetSpcOid(rnode),
-							RnodeGetDbOid(rnode),
-							RnodeGetRelNumber(rnode),
+							rnode.spcNode,
+							rnode.dbNode,
+							rnode.relNode,
 							forkNum,
 							(uint32) (request_lsn >> 32), (uint32) request_lsn),
 					 errdetail("page server returned error: %s",
@@ -2027,7 +1991,6 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
 {
 	bool		latest;
 	XLogRecPtr	request_lsn;
-	RelFileNode rnode = RelnGetRnode(reln);

 	switch (reln->smgr_relpersistence)
 	{
@@ -2047,13 +2010,13 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
 	}

 	/* Try to read from local file cache */
-	if (lfc_read(RelnGetRnode(reln), forkNum, blkno, buffer))
+	if (lfc_read(reln->smgr_rnode.node, forkNum, blkno, buffer))
 	{
 		return;
 	}

-	request_lsn = neon_get_request_lsn(&latest, rnode, forkNum, blkno);
-	neon_read_at_lsn(rnode, forkNum, blkno, request_lsn, latest, buffer);
+	request_lsn = neon_get_request_lsn(&latest, reln->smgr_rnode.node, forkNum, blkno);
+	neon_read_at_lsn(reln->smgr_rnode.node, forkNum, blkno, request_lsn, latest, buffer);

 #ifdef DEBUG_COMPARE_LOCAL
 	if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln))
@@ -2073,9 +2036,9 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
 			{
 				elog(PANIC, "page is new in MD but not in Page Server at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n",
 					 blkno,
-					 RelnGetSpcOid(reln),
-					 RelnGetDbOid(reln),
-					 RelnGetRelNumber(reln),
+					 reln->smgr_rnode.node.spcNode,
+					 reln->smgr_rnode.node.dbNode,
+					 reln->smgr_rnode.node.relNode,
 					 forkNum,
 					 (uint32) (request_lsn >> 32), (uint32) request_lsn,
 					 hexdump_page(buffer));
@@ -2085,9 +2048,9 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
 		{
 			elog(PANIC, "page is new in Page Server but not in MD at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n",
 				 blkno,
-				 RelnGetSpcOid(reln),
-				 RelnGetDbOid(reln),
-				 RelnGetRelNumber(reln),
+				 reln->smgr_rnode.node.spcNode,
+				 reln->smgr_rnode.node.dbNode,
+				 reln->smgr_rnode.node.relNode,
 				 forkNum,
 				 (uint32) (request_lsn >> 32), (uint32) request_lsn,
 				 hexdump_page(mdbuf));
@@ -2102,9 +2065,9 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
 			{
 				elog(PANIC, "heap buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n",
 					 blkno,
-					 RelnGetSpcOid(reln),
-					 RelnGetDbOid(reln),
-					 RelnGetRelNumber(reln),
+					 reln->smgr_rnode.node.spcNode,
+					 reln->smgr_rnode.node.dbNode,
+					 reln->smgr_rnode.node.relNode,
 					 forkNum,
 					 (uint32) (request_lsn >> 32), (uint32) request_lsn,
 					 hexdump_page(mdbuf_masked),
@@ -2123,9 +2086,9 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
 				{
 					elog(PANIC, "btree buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n",
 						 blkno,
-						 RelnGetSpcOid(reln),
-						 RelnGetDbOid(reln),
-						 RelnGetRelNumber(reln),
+						 reln->smgr_rnode.node.spcNode,
+						 reln->smgr_rnode.node.dbNode,
+						 reln->smgr_rnode.node.relNode,
 						 forkNum,
 						 (uint32) (request_lsn >> 32), (uint32) request_lsn,
 						 hexdump_page(mdbuf_masked),
@@ -2170,7 +2133,7 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 		   char *buffer, bool skipFsync)
 {
 	XLogRecPtr	lsn;
-	RelFileNode rnode = RelnGetRnode(reln);
+
 	switch (reln->smgr_relpersistence)
 	{
 		case 0:
@@ -2207,12 +2170,13 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,

 	lsn = PageGetLSN(buffer);
 	elog(SmgrTrace, "smgrwrite called for %u/%u/%u.%u blk %u, page LSN: %X/%08X",
-		 RelnGetSpcOid(reln),
-		 RelnGetDbOid(reln), RelnGetRelNumber(reln),
+		 reln->smgr_rnode.node.spcNode,
+		 reln->smgr_rnode.node.dbNode,
+		 reln->smgr_rnode.node.relNode,
 		 forknum, blocknum,
 		 (uint32) (lsn >> 32), (uint32) lsn);

-	lfc_write(rnode, forknum, blocknum, buffer);
+	lfc_write(reln->smgr_rnode.node, forknum, blocknum, buffer);

 #ifdef DEBUG_COMPARE_LOCAL
 	if (IS_LOCAL_REL(reln))
@@ -2230,7 +2194,6 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
 	BlockNumber n_blocks;
 	bool		latest;
 	XLogRecPtr	request_lsn;
-	RelFileNode rnode = RelnGetRnode(reln);

 	switch (reln->smgr_relpersistence)
 	{
@@ -2249,23 +2212,23 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
 			elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
 	}

-	if (get_cached_relsize(RelnGetRnode(reln), forknum, &n_blocks))
+	if (get_cached_relsize(reln->smgr_rnode.node, forknum, &n_blocks))
 	{
 		elog(SmgrTrace, "cached nblocks for %u/%u/%u.%u: %u blocks",
-			 RelnGetSpcOid(reln),
-			 RelnGetDbOid(reln),
-			 RelnGetRelNumber(reln),
+			 reln->smgr_rnode.node.spcNode,
+			 reln->smgr_rnode.node.dbNode,
+			 reln->smgr_rnode.node.relNode,
 			 forknum, n_blocks);
 		return n_blocks;
 	}

-	request_lsn = neon_get_request_lsn(&latest, rnode, forknum, REL_METADATA_PSEUDO_BLOCKNO);
+	request_lsn = neon_get_request_lsn(&latest, reln->smgr_rnode.node, forknum, REL_METADATA_PSEUDO_BLOCKNO);
 	{
 		NeonNblocksRequest request = {
 			.req.tag = T_NeonNblocksRequest,
 			.req.latest = latest,
 			.req.lsn = request_lsn,
-			.rnode = rnode,
+			.rnode = reln->smgr_rnode.node,
 			.forknum = forknum,
 		};

@@ -2282,9 +2245,9 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
 			ereport(ERROR,
 					(errcode(ERRCODE_IO_ERROR),
 					 errmsg("could not read relation size of rel %u/%u/%u.%u from page server at lsn %X/%08X",
-							RelnGetSpcOid(reln),
-							RelnGetDbOid(reln),
-							RelnGetRelNumber(reln),
+							reln->smgr_rnode.node.spcNode,
+							reln->smgr_rnode.node.dbNode,
+							reln->smgr_rnode.node.relNode,
 							forknum,
 							(uint32) (request_lsn >> 32), (uint32) request_lsn),
 					 errdetail("page server returned error: %s",
@@ -2294,11 +2257,12 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
 		default:
 			elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
 	}
-	update_cached_relsize(rnode, forknum, n_blocks);
+	update_cached_relsize(reln->smgr_rnode.node, forknum, n_blocks);

 	elog(SmgrTrace, "neon_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks",
-		 RelnGetSpcOid(reln),
-		 RelnGetDbOid(reln), RelnGetRelNumber(reln),
+		 reln->smgr_rnode.node.spcNode,
+		 reln->smgr_rnode.node.dbNode,
+		 reln->smgr_rnode.node.relNode,
 		 forknum,
 		 (uint32) (request_lsn >> 32), (uint32) request_lsn,
 		 n_blocks);
@@ -2311,7 +2275,7 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
 *	neon_db_size() -- Get the size of the database in bytes.
 */
 int64
-neon_dbsize(Oid dbOid)
+neon_dbsize(Oid dbNode)
 {
 	NeonResponse *resp;
 	int64		db_size;
@@ -2325,7 +2289,7 @@ neon_dbsize(Oid dbOid)
 			.req.tag = T_NeonDbSizeRequest,
 			.req.latest = latest,
 			.req.lsn = request_lsn,
-			.dbOid = dbOid,
+			.dbNode = dbNode,
 		};

 		resp = page_server_request(&request);
@@ -2341,7 +2305,7 @@ neon_dbsize(Oid dbOid)
 			ereport(ERROR,
 					(errcode(ERRCODE_IO_ERROR),
 					 errmsg("could not read db size of db %u from page server at lsn %X/%08X",
-							dbOid,
+							dbNode,
 							(uint32) (request_lsn >> 32), (uint32) request_lsn),
 					 errdetail("page server returned error: %s",
 							   ((NeonErrorResponse *) resp)->message)));
@@ -2352,7 +2316,7 @@ neon_dbsize(Oid dbOid)
 	}

 	elog(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes",
-		 dbOid,
+		 dbNode,
 		 (uint32) (request_lsn >> 32), (uint32) request_lsn,
 		 db_size);

@@ -2367,7 +2331,6 @@ void
 neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
 {
 	XLogRecPtr	lsn;
-	RelFileNode rnode = RelnGetRnode(reln);

 	switch (reln->smgr_relpersistence)
 	{
@@ -2387,7 +2350,7 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
 			elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
 	}

-	set_cached_relsize(rnode, forknum, nblocks);
+	set_cached_relsize(reln->smgr_rnode.node, forknum, nblocks);

 	/*
 	 * Truncating a relation drops all its buffers from the buffer cache
@@ -2415,7 +2378,7 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
 	 * for the extended pages, so there's no harm in leaving behind obsolete
 	 * entries for the truncated chunks.
 	 */
-	SetLastWrittenLSNForRelation(lsn, rnode, forknum);
+	SetLastWrittenLSNForRelation(lsn, reln->smgr_rnode.node, forknum);

 #ifdef DEBUG_COMPARE_LOCAL
 	if (IS_LOCAL_REL(reln))
@@ -2485,9 +2448,9 @@ neon_start_unlogged_build(SMgrRelation reln)

 	ereport(SmgrTrace,
 			(errmsg("starting unlogged build of relation %u/%u/%u",
-					RelnGetSpcOid(reln),
-					RelnGetDbOid(reln),
-					RelnGetRelNumber(reln))));
+					reln->smgr_rnode.node.spcNode,
+					reln->smgr_rnode.node.dbNode,
+					reln->smgr_rnode.node.relNode)));

 	switch (reln->smgr_relpersistence)
 	{
@@ -2537,9 +2500,9 @@ neon_finish_unlogged_build_phase_1(SMgrRelation reln)

 	ereport(SmgrTrace,
 			(errmsg("finishing phase 1 of unlogged build of relation %u/%u/%u",
-					RelnGetSpcOid(reln),
-					RelnGetDbOid(reln),
-					RelnGetRelNumber(reln))));
+					reln->smgr_rnode.node.spcNode,
+					reln->smgr_rnode.node.dbNode,
+					reln->smgr_rnode.node.relNode)));

 	if (unlogged_build_phase == UNLOGGED_BUILD_NOT_PERMANENT)
 		return;
@@ -2566,9 +2529,9 @@ neon_end_unlogged_build(SMgrRelation reln)

 	ereport(SmgrTrace,
 			(errmsg("ending unlogged build of relation %u/%u/%u",
-					RelnGetSpcOid(reln),
-					RelnGetDbOid(reln),
-					RelnGetRelNumber(reln))));
+					reln->smgr_rnode.node.spcNode,
+					reln->smgr_rnode.node.dbNode,
+					reln->smgr_rnode.node.relNode)));

 	if (unlogged_build_phase != UNLOGGED_BUILD_NOT_PERMANENT)
 	{
@@ -2581,24 +2544,16 @@ neon_end_unlogged_build(SMgrRelation reln)
 		reln->smgr_relpersistence = RELPERSISTENCE_PERMANENT;

 		/* Remove local copy */
-#if PG_VERSION_NUM >= 160000
-		rnode.locator = RelnGetRnode(reln);
-#else
-		rnode.node = RelnGetRnode(reln);
-#endif
+		rnode = reln->smgr_rnode;
 		for (int forknum = 0; forknum <= MAX_FORKNUM; forknum++)
 		{
 			elog(SmgrTrace, "forgetting cached relsize for %u/%u/%u.%u",
-				 RelnGetSpcOid(reln),
-				 RelnGetDbOid(reln),
-				 RelnGetRelNumber(reln),
+				 rnode.node.spcNode,
+				 rnode.node.dbNode,
+				 rnode.node.relNode,
 				 forknum);

-#if PG_VERSION_NUM >= 160000
-			forget_cached_relsize(rnode.locator, forknum);
-#else
 			forget_cached_relsize(rnode.node, forknum);
-#endif
 			mdclose(reln, forknum);
 			/* use isRedo == true, so that we drop it immediately */
 			mdunlink(rnode, forknum, true);
@@ -2751,16 +2706,10 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
 	 * regardless of whether the block is stored in shared buffers.
 	 * See also this function's top comment.
 	 */
-
-	if (!OidIsValid(RnodeGetDbOid(rnode)))
+	if (!OidIsValid(rnode.dbNode))
 		return false;

-#if PG_VERSION_NUM >= 160000
-	InitBufferTag(&tag, &rnode, forknum, blkno);
-#else
 	INIT_BUFFERTAG(tag, rnode, forknum, blkno);
-#endif
-
 	hash = BufTableHashCode(&tag);
 	partitionLock = BufMappingPartitionLock(hash);

--- a/pgxn/neon/relsize_cache.c
+++ b/pgxn/neon/relsize_cache.c
@@ -15,11 +15,7 @@
 #include "postgres.h"

 #include "pagestore_client.h"
-#if PG_VERSION_NUM >= 160000
-#include "storage/relfilelocator.h"
-#else
 #include "storage/relfilenode.h"
-#endif
 #include "storage/smgr.h"
 #include "storage/lwlock.h"
 #include "storage/ipc.h"
@@ -32,7 +28,6 @@
 #include "miscadmin.h"
 #endif

-
 typedef struct
 {
 	RelFileNode rnode;
--- a/pgxn/neon/walproposer.c
+++ b/pgxn/neon/walproposer.c
@@ -788,7 +788,7 @@ ReconnectSafekeepers(void)

 /*
 * Performs the logic for advancing the state machine of the specified safekeeper,
- * given that a certain set of events has occured.
+ * given that a certain set of events has occurred.
 */
 static void
 AdvancePollState(Safekeeper *sk, uint32 events)
@@ -1394,12 +1394,7 @@ WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRec
 	WalReceiverConn *wrconn;
 	WalRcvStreamOptions options;

-#if PG_VERSION_NUM >= 160000
-	bool must_use_password = false;
-	wrconn = walrcv_connect(safekeeper[donor].conninfo, false, must_use_password, "wal_proposer_recovery", &err);
-#else
 	wrconn = walrcv_connect(safekeeper[donor].conninfo, false, "wal_proposer_recovery", &err);
-#endif
 	if (!wrconn)
 	{
 		ereport(WARNING,
--- a/pgxn/neon/walproposer.h
+++ b/pgxn/neon/walproposer.h
@@ -23,7 +23,7 @@
 									 * message header */

 /*
- * In the spirit of WL_SOCKET_READABLE and others, this corresponds to no events having occured,
+ * In the spirit of WL_SOCKET_READABLE and others, this corresponds to no events having occurred,
 * because all WL_* events are given flags equal to some (1 << i), starting from i = 0
 */
 #define WL_NO_EVENTS 0
@@ -317,7 +317,7 @@ typedef struct AppendResponse
 	/* this is a criterion for walproposer --sync mode exit */
 	XLogRecPtr	commitLsn;
 	HotStandbyFeedback hs;
-	/* Feedback recieved from pageserver includes standby_status_update fields */
+	/* Feedback received from pageserver includes standby_status_update fields */
 	/* and custom neon feedback. */
 	/* This part of the message is extensible. */
 	PageserverFeedback rf;
--- a/pgxn/neon/walproposer_utils.c
+++ b/pgxn/neon/walproposer_utils.c
@@ -26,10 +26,6 @@
 #include "access/xlogrecovery.h"
 #endif

-#if PG_VERSION_NUM >= 160000
-#include "utils/guc.h"
-#endif
-
 /*
 * These variables are used similarly to openLogFile/SegNo,
 * but for walproposer to write the XLOG during recovery. walpropFileTLI is the TimeLineID
--- a/pgxn/neon_test_utils/neontest.c
+++ b/pgxn/neon_test_utils/neontest.c
@@ -128,11 +128,7 @@ clear_buffer_cache(PG_FUNCTION_ARGS)
 			else
 				isvalid = false;
 			bufferid = BufferDescriptorGetBuffer(bufHdr);
-#if PG_VERSION_NUM >= 160000
-			rnode = BufTagGetRelFileLocator(&bufHdr->tag);
-#else
 			rnode = bufHdr->tag.rnode;
-#endif
 			forknum = bufHdr->tag.forkNum;
 			blocknum = bufHdr->tag.blockNum;

@@ -242,7 +238,7 @@ get_raw_page_at_lsn(PG_FUNCTION_ARGS)
 	SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ);
 	raw_page_data = VARDATA(raw_page);

-	neon_read_at_lsn(RelnGetRnode(RelationGetSmgr(rel)), forknum, blkno, read_lsn, request_latest, raw_page_data);
+	neon_read_at_lsn(rel->rd_node, forknum, blkno, read_lsn, request_latest, raw_page_data);

 	relation_close(rel, AccessShareLock);

@@ -271,17 +267,11 @@ get_raw_page_at_lsn_ex(PG_FUNCTION_ARGS)
 		PG_RETURN_NULL();

 	{
-#if PG_VERSION_NUM >= 160000
-		RelFileLocator rnode = {
-			.spcOid = PG_GETARG_OID(0),
-			.dbOid = PG_GETARG_OID(1),
-		.relNumber = PG_GETARG_OID(2)};
-#else
 		RelFileNode rnode = {
 			.spcNode = PG_GETARG_OID(0),
 			.dbNode = PG_GETARG_OID(1),
 		.relNode = PG_GETARG_OID(2)};
-#endif
+
 		ForkNumber	forknum = PG_GETARG_UINT32(3);

 		uint32		blkno = PG_GETARG_UINT32(4);
--- a/pgxn/neon_walredo/inmem_smgr.c
+++ b/pgxn/neon_walredo/inmem_smgr.c
@@ -21,6 +21,7 @@
 #include "access/xlog.h"
 #include "storage/block.h"
 #include "storage/buf_internals.h"
+#include "storage/relfilenode.h"
 #include "storage/smgr.h"

 #if PG_VERSION_NUM >= 150000
@@ -29,7 +30,6 @@

 #include "inmem_smgr.h"

-
 /* Size of the in-memory smgr */
 #define MAX_PAGES 64

@@ -46,22 +46,12 @@ locate_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno)
 	/* We only hold a small number of pages, so linear search */
 	for (int i = 0; i < used_pages; i++)
 	{
-
-#if PG_VERSION_NUM >= 160000
-		if (BufTagMatchesRelFileLocator(&page_tag[i], &reln->smgr_rlocator.locator)
+		if (RelFileNodeEquals(reln->smgr_rnode.node, page_tag[i].rnode)
 			&& forknum == page_tag[i].forkNum
 			&& blkno == page_tag[i].blockNum)
 		{
 			return i;
 		}
-#else
-		if (RelFileNodeEquals(RelnGetRnode(reln), page_tag[i].rnode)
-			&& forknum == page_tag[i].forkNum
-			&& blkno == page_tag[i].blockNum)
-		{
-			return i;
-		}
-#endif
 	}
 	return -1;
 }
@@ -107,12 +97,8 @@ inmem_exists(SMgrRelation reln, ForkNumber forknum)
 {
 	for (int i = 0; i < used_pages; i++)
 	{
-#if PG_VERSION_NUM >= 160000
-		if (BufTagMatchesRelFileLocator(&page_tag[i], &reln->smgr_rlocator.locator)
-#else
-		if (RelFileNodeEquals(RelnGetRnode(reln), page_tag[i].rnode)
-#endif			
-		&& forknum == page_tag[i].forkNum)
+		if (RelFileNodeEquals(reln->smgr_rnode.node, page_tag[i].rnode)
+			&& forknum == page_tag[i].forkNum)
 		{
 			return true;
 		}
@@ -230,9 +216,9 @@ inmem_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 		 */
 		elog(used_pages >= WARN_PAGES ? WARNING : DEBUG1,
 			 "inmem_write() called for %u/%u/%u.%u blk %u: used_pages %u",
-			 RelnGetSpcOid(reln),
-			 RelnGetDbOid(reln),
-			 RelnGetRelNumber(reln),
+			 reln->smgr_rnode.node.spcNode,
+			 reln->smgr_rnode.node.dbNode,
+			 reln->smgr_rnode.node.relNode,
 			 forknum,
 			 blocknum,
 			 used_pages);
@@ -241,19 +227,14 @@ inmem_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,

 		pg = used_pages;
 		used_pages++;
-
-#if PG_VERSION_NUM >= 160000
-	InitBufferTag(&page_tag[pg], &RelnGetRnode(reln), forknum, blocknum);
-#else
-	INIT_BUFFERTAG(page_tag[pg], RelnGetRnode(reln), forknum, blocknum);
-#endif
+		INIT_BUFFERTAG(page_tag[pg], reln->smgr_rnode.node, forknum, blocknum);
 	}
 	else
 	{
 		elog(DEBUG1, "inmem_write() called for %u/%u/%u.%u blk %u: found at %u",
-			 RelnGetSpcOid(reln),
-			 RelnGetDbOid(reln),
-			 RelnGetRelNumber(reln),
+			 reln->smgr_rnode.node.spcNode,
+			 reln->smgr_rnode.node.dbNode,
+			 reln->smgr_rnode.node.relNode,
 			 forknum,
 			 blocknum,
 			 used_pages);
--- a/pgxn/neon_walredo/inmem_smgr.h
+++ b/pgxn/neon_walredo/inmem_smgr.h
@@ -11,40 +11,6 @@
 #ifndef INMEM_SMGR_H
 #define INMEM_SMGR_H

-#if PG_VERSION_NUM >= 160000
-#include "storage/relfilelocator.h"
-#else
-#include "storage/relfilenode.h"
-#endif
-
-// This is a hack to avoid too many ifdefs in the function definitions.
-#if PG_VERSION_NUM >= 160000
-typedef RelFileLocator RelFileNode;
-typedef RelFileLocatorBackend RelFileNodeBackend;
-#define RelFileNodeBackendIsTemp RelFileLocatorBackendIsTemp
-#endif
-
-#if PG_VERSION_NUM >= 160000
-#define RelnGetRnode(reln) (reln->smgr_rlocator.locator)
-#define RnodeGetSpcOid(rnode) (rnode.spcOid)
-#define RnodeGetDbOid(rnode) (rnode.dbOid)
-#define RnodeGetRelNumber(rnode) (rnode.relNumber)
-
-#define BufTagGetRnode(tag) (BufTagGetRelFileLocator(&tag))
-#else
-#define RelnGetRnode(reln) (reln->smgr_rnode.node)
-#define RnodeGetSpcOid(rnode) (rnode.spcNode)
-#define RnodeGetDbOid(rnode) (rnode.dbNode)
-#define RnodeGetRelNumber(rnode) (rnode.relNode)
-
-#define BufTagGetRnode(tag) (tag.rnode)
-
-#endif
-
-#define RelnGetSpcOid(reln) (RnodeGetRelNumber(RelnGetRnode(reln)))
-#define RelnGetDbOid(reln) (RnodeGetDbOid(RelnGetRnode(reln)))
-#define RelnGetRelNumber(reln) (RnodeGetRelNumber(RelnGetRnode(reln)))
-
 extern const f_smgr *smgr_inmem(BackendId backend, RelFileNode rnode);
 extern void smgr_init_inmem(void);

--- a/pgxn/neon_walredo/walredoproc.c
+++ b/pgxn/neon_walredo/walredoproc.c
@@ -62,10 +62,8 @@
 #endif

 #ifndef HAVE_GETRUSAGE
-#if PG_VERSION_NUM < 160000
 #include "rusagestub.h"
 #endif
-#endif

 #include "access/clog.h"
 #include "access/commit_ts.h"
@@ -119,7 +117,6 @@
 #include "neon_seccomp.h"
 #endif

-
 PG_MODULE_MAGIC;

 static int	ReadRedoCommand(StringInfo inBuf);
@@ -665,31 +662,18 @@ BeginRedoForBlock(StringInfo input_message)
 	 * BlockNumber
 	 */
 	forknum = pq_getmsgbyte(input_message);
-#if PG_VERSION_NUM >= 160000
-	rnode.spcOid = pq_getmsgint(input_message, 4);
-	rnode.dbOid = pq_getmsgint(input_message, 4);
-	rnode.relNumber = pq_getmsgint(input_message, 4);
-#else
 	rnode.spcNode = pq_getmsgint(input_message, 4);
 	rnode.dbNode = pq_getmsgint(input_message, 4);
 	rnode.relNode = pq_getmsgint(input_message, 4);
-#endif
 	blknum = pq_getmsgint(input_message, 4);
 	wal_redo_buffer = InvalidBuffer;

-#if PG_VERSION_NUM >= 160000
-	InitBufferTag(&target_redo_tag, &rnode, forknum, blknum);
-#else
 	INIT_BUFFERTAG(target_redo_tag, rnode, forknum, blknum);
-#endif
-

 	elog(TRACE, "BeginRedoForBlock %u/%u/%u.%d blk %u",
-#if PG_VERSION_NUM >= 160000
-			target_redo_tag.spcOid, target_redo_tag.dbOid, target_redo_tag.relNumber,
-#else
-			target_redo_tag.rnode.spcNode, target_redo_tag.rnode.dbNode, target_redo_tag.rnode.relNode,
-#endif	
+		 target_redo_tag.rnode.spcNode,
+		 target_redo_tag.rnode.dbNode,
+		 target_redo_tag.rnode.relNode,
 		 target_redo_tag.forkNum,
 		 target_redo_tag.blockNum);

@@ -725,15 +709,9 @@ PushPage(StringInfo input_message)
 	 * 8k page content
 	 */
 	forknum = pq_getmsgbyte(input_message);
-#if PG_VERSION_NUM >= 160000
-	rnode.spcOid = pq_getmsgint(input_message, 4);
-	rnode.dbOid = pq_getmsgint(input_message, 4);
-	rnode.relNumber = pq_getmsgint(input_message, 4);
-#else
 	rnode.spcNode = pq_getmsgint(input_message, 4);
 	rnode.dbNode = pq_getmsgint(input_message, 4);
 	rnode.relNode = pq_getmsgint(input_message, 4);
-#endif
 	blknum = pq_getmsgint(input_message, 4);
 	content = pq_getmsgbytes(input_message, BLCKSZ);

@@ -853,12 +831,7 @@ ApplyRecord(StringInfo input_message)
 	 */
 	if (BufferIsInvalid(wal_redo_buffer))
 	{
-		wal_redo_buffer = NeonRedoReadBuffer(
-#if PG_VERSION_NUM >= 160000
-											 BufTagGetRelFileLocator(&target_redo_tag),
-#else
-											 target_redo_tag.rnode,
-#endif
+		wal_redo_buffer = NeonRedoReadBuffer(target_redo_tag.rnode,
 											 target_redo_tag.forkNum,
 											 target_redo_tag.blockNum,
 											 RBM_NORMAL);
@@ -900,43 +873,12 @@ apply_error_callback(void *arg)
 }


-#if PG_VERSION_NUM >= 160000

 static bool
 redo_block_filter(XLogReaderState *record, uint8 block_id)
 {
 	BufferTag	target_tag;

-	RelFileLocator rlocator;
-	XLogRecGetBlockTag(record, block_id,
-					   &rlocator, &target_tag.forkNum, &target_tag.blockNum);
-
-	target_tag.spcOid = rlocator.spcOid;
-	target_tag.dbOid = rlocator.dbOid;
-	target_tag.relNumber = rlocator.relNumber;
-
-	/*
-	 * Can a WAL redo function ever access a relation other than the one that
-	 * it modifies? I don't see why it would.
-	 */
-	if (RelFileLocatorEquals(BufTagGetRelFileLocator(&target_tag), BufTagGetRelFileLocator(&target_redo_tag)))
-		elog(WARNING, "REDO accessing unexpected page: %u/%u/%u.%u blk %u",
-			target_tag.spcOid, target_tag.dbOid, target_tag.relNumber,
-			target_tag.forkNum, target_tag.blockNum);
-
-	/*
-	 * If this block isn't one we are currently restoring, then return 'true'
-	 * so that this gets ignored
-	 */
-	return !BufferTagsEqual(&target_tag, &target_redo_tag);
-}
-#else
-static bool
-redo_block_filter(XLogReaderState *record, uint8 block_id)
-{
-	BufferTag	target_tag;
-
-
 #if PG_VERSION_NUM >= 150000
 	XLogRecGetBlockTag(record, block_id,
 					   &target_tag.rnode, &target_tag.forkNum, &target_tag.blockNum);
@@ -955,18 +897,14 @@ redo_block_filter(XLogReaderState *record, uint8 block_id)
 	 */
 	if (!RelFileNodeEquals(target_tag.rnode, target_redo_tag.rnode))
 		elog(WARNING, "REDO accessing unexpected page: %u/%u/%u.%u blk %u",
-			target_tag.rnode.spcNode, target_tag.rnode.dbNode, target_tag.rnode.relNode,
-			target_tag.forkNum, target_tag.blockNum);
+			 target_tag.rnode.spcNode, target_tag.rnode.dbNode, target_tag.rnode.relNode, target_tag.forkNum, target_tag.blockNum);

 	/*
 	 * If this block isn't one we are currently restoring, then return 'true'
 	 * so that this gets ignored
 	 */
-
 	return !BUFFERTAGS_EQUAL(target_tag, target_redo_tag);
 }
-#endif
-

 /*
 * Get a page image back from buffer cache.
@@ -993,15 +931,9 @@ GetPage(StringInfo input_message)
 	 * BlockNumber
 	 */
 	forknum = pq_getmsgbyte(input_message);
-#if PG_VERSION_NUM >= 160000
-	rnode.spcOid = pq_getmsgint(input_message, 4);
-	rnode.dbOid = pq_getmsgint(input_message, 4);
-	rnode.relNumber = pq_getmsgint(input_message, 4);
-#else
 	rnode.spcNode = pq_getmsgint(input_message, 4);
 	rnode.dbNode = pq_getmsgint(input_message, 4);
 	rnode.relNode = pq_getmsgint(input_message, 4);
-#endif
 	blknum = pq_getmsgint(input_message, 4);

 	/* FIXME: check that we got a BeginRedoForBlock message or this earlier */
@@ -1029,11 +961,7 @@ GetPage(StringInfo input_message)
 	} while (tot_written < BLCKSZ);

 	ReleaseBuffer(buf);
-#if PG_VERSION_NUM >= 160000
-	DropRelationAllLocalBuffers(rnode);
-#else
 	DropRelFileNodeAllLocalBuffers(rnode);
-#endif
 	wal_redo_buffer = InvalidBuffer;

 	elog(TRACE, "Page sent back for block %u", blknum);
--- a/poetry.lock
+++ b/poetry.lock
@@ -2,60 +2,111 @@

 [[package]]
 name = "aiohttp"
-version = "3.7.4"
+version = "3.8.5"
 description = "Async http client/server framework (asyncio)"
 optional = false
 python-versions = ">=3.6"
 files = [
-    {file = "aiohttp-3.7.4-cp36-cp36m-macosx_10_14_x86_64.whl", hash = "sha256:6c8200abc9dc5f27203986100579fc19ccad7a832c07d2bc151ce4ff17190076"},
-    {file = "aiohttp-3.7.4-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:dd7936f2a6daa861143e376b3a1fb56e9b802f4980923594edd9ca5670974895"},
-    {file = "aiohttp-3.7.4-cp36-cp36m-manylinux2014_aarch64.whl", hash = "sha256:bc3d14bf71a3fb94e5acf5bbf67331ab335467129af6416a437bd6024e4f743d"},
-    {file = "aiohttp-3.7.4-cp36-cp36m-manylinux2014_i686.whl", hash = "sha256:8ec1a38074f68d66ccb467ed9a673a726bb397142c273f90d4ba954666e87d54"},
-    {file = "aiohttp-3.7.4-cp36-cp36m-manylinux2014_ppc64le.whl", hash = "sha256:b84ad94868e1e6a5e30d30ec419956042815dfaea1b1df1cef623e4564c374d9"},
-    {file = "aiohttp-3.7.4-cp36-cp36m-manylinux2014_s390x.whl", hash = "sha256:d5d102e945ecca93bcd9801a7bb2fa703e37ad188a2f81b1e65e4abe4b51b00c"},
-    {file = "aiohttp-3.7.4-cp36-cp36m-manylinux2014_x86_64.whl", hash = "sha256:c2a80fd9a8d7e41b4e38ea9fe149deed0d6aaede255c497e66b8213274d6d61b"},
-    {file = "aiohttp-3.7.4-cp36-cp36m-win32.whl", hash = "sha256:481d4b96969fbfdcc3ff35eea5305d8565a8300410d3d269ccac69e7256b1329"},
-    {file = "aiohttp-3.7.4-cp36-cp36m-win_amd64.whl", hash = "sha256:16d0683ef8a6d803207f02b899c928223eb219111bd52420ef3d7a8aa76227b6"},
-    {file = "aiohttp-3.7.4-cp37-cp37m-macosx_10_14_x86_64.whl", hash = "sha256:eab51036cac2da8a50d7ff0ea30be47750547c9aa1aa2cf1a1b710a1827e7dbe"},
-    {file = "aiohttp-3.7.4-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:feb24ff1226beeb056e247cf2e24bba5232519efb5645121c4aea5b6ad74c1f2"},
-    {file = "aiohttp-3.7.4-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:119feb2bd551e58d83d1b38bfa4cb921af8ddedec9fad7183132db334c3133e0"},
-    {file = "aiohttp-3.7.4-cp37-cp37m-manylinux2014_i686.whl", hash = "sha256:6ca56bdfaf825f4439e9e3673775e1032d8b6ea63b8953d3812c71bd6a8b81de"},
-    {file = "aiohttp-3.7.4-cp37-cp37m-manylinux2014_ppc64le.whl", hash = "sha256:5563ad7fde451b1986d42b9bb9140e2599ecf4f8e42241f6da0d3d624b776f40"},
-    {file = "aiohttp-3.7.4-cp37-cp37m-manylinux2014_s390x.whl", hash = "sha256:62bc216eafac3204877241569209d9ba6226185aa6d561c19159f2e1cbb6abfb"},
-    {file = "aiohttp-3.7.4-cp37-cp37m-manylinux2014_x86_64.whl", hash = "sha256:f4496d8d04da2e98cc9133e238ccebf6a13ef39a93da2e87146c8c8ac9768242"},
-    {file = "aiohttp-3.7.4-cp37-cp37m-win32.whl", hash = "sha256:2ffea7904e70350da429568113ae422c88d2234ae776519549513c8f217f58a9"},
-    {file = "aiohttp-3.7.4-cp37-cp37m-win_amd64.whl", hash = "sha256:5e91e927003d1ed9283dee9abcb989334fc8e72cf89ebe94dc3e07e3ff0b11e9"},
-    {file = "aiohttp-3.7.4-cp38-cp38-macosx_10_14_x86_64.whl", hash = "sha256:4c1bdbfdd231a20eee3e56bd0ac1cd88c4ff41b64ab679ed65b75c9c74b6c5c2"},
-    {file = "aiohttp-3.7.4-cp38-cp38-manylinux1_i686.whl", hash = "sha256:71680321a8a7176a58dfbc230789790639db78dad61a6e120b39f314f43f1907"},
-    {file = "aiohttp-3.7.4-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:7dbd087ff2f4046b9b37ba28ed73f15fd0bc9f4fdc8ef6781913da7f808d9536"},
-    {file = "aiohttp-3.7.4-cp38-cp38-manylinux2014_i686.whl", hash = "sha256:dee68ec462ff10c1d836c0ea2642116aba6151c6880b688e56b4c0246770f297"},
-    {file = "aiohttp-3.7.4-cp38-cp38-manylinux2014_ppc64le.whl", hash = "sha256:99c5a5bf7135607959441b7d720d96c8e5c46a1f96e9d6d4c9498be8d5f24212"},
-    {file = "aiohttp-3.7.4-cp38-cp38-manylinux2014_s390x.whl", hash = "sha256:5dde6d24bacac480be03f4f864e9a67faac5032e28841b00533cd168ab39cad9"},
-    {file = "aiohttp-3.7.4-cp38-cp38-manylinux2014_x86_64.whl", hash = "sha256:418597633b5cd9639e514b1d748f358832c08cd5d9ef0870026535bd5eaefdd0"},
-    {file = "aiohttp-3.7.4-cp38-cp38-win32.whl", hash = "sha256:e76e78863a4eaec3aee5722d85d04dcbd9844bc6cd3bfa6aa880ff46ad16bfcb"},
-    {file = "aiohttp-3.7.4-cp38-cp38-win_amd64.whl", hash = "sha256:950b7ef08b2afdab2488ee2edaff92a03ca500a48f1e1aaa5900e73d6cf992bc"},
-    {file = "aiohttp-3.7.4-cp39-cp39-macosx_10_14_x86_64.whl", hash = "sha256:2eb3efe243e0f4ecbb654b08444ae6ffab37ac0ef8f69d3a2ffb958905379daf"},
-    {file = "aiohttp-3.7.4-cp39-cp39-manylinux1_i686.whl", hash = "sha256:822bd4fd21abaa7b28d65fc9871ecabaddc42767884a626317ef5b75c20e8a2d"},
-    {file = "aiohttp-3.7.4-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:58c62152c4c8731a3152e7e650b29ace18304d086cb5552d317a54ff2749d32a"},
-    {file = "aiohttp-3.7.4-cp39-cp39-manylinux2014_i686.whl", hash = "sha256:7c7820099e8b3171e54e7eedc33e9450afe7cd08172632d32128bd527f8cb77d"},
-    {file = "aiohttp-3.7.4-cp39-cp39-manylinux2014_ppc64le.whl", hash = "sha256:5b50e0b9460100fe05d7472264d1975f21ac007b35dcd6fd50279b72925a27f4"},
-    {file = "aiohttp-3.7.4-cp39-cp39-manylinux2014_s390x.whl", hash = "sha256:c44d3c82a933c6cbc21039326767e778eface44fca55c65719921c4b9661a3f7"},
-    {file = "aiohttp-3.7.4-cp39-cp39-manylinux2014_x86_64.whl", hash = "sha256:cc31e906be1cc121ee201adbdf844522ea3349600dd0a40366611ca18cd40e81"},
-    {file = "aiohttp-3.7.4-cp39-cp39-win32.whl", hash = "sha256:fbd3b5e18d34683decc00d9a360179ac1e7a320a5fee10ab8053ffd6deab76e0"},
-    {file = "aiohttp-3.7.4-cp39-cp39-win_amd64.whl", hash = "sha256:40bd1b101b71a18a528ffce812cc14ff77d4a2a1272dfb8b11b200967489ef3e"},
-    {file = "aiohttp-3.7.4.tar.gz", hash = "sha256:5d84ecc73141d0a0d61ece0742bb7ff5751b0657dab8405f899d3ceb104cc7de"},
+    {file = "aiohttp-3.8.5-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:a94159871304770da4dd371f4291b20cac04e8c94f11bdea1c3478e557fbe0d8"},
+    {file = "aiohttp-3.8.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:13bf85afc99ce6f9ee3567b04501f18f9f8dbbb2ea11ed1a2e079670403a7c84"},
+    {file = "aiohttp-3.8.5-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2ce2ac5708501afc4847221a521f7e4b245abf5178cf5ddae9d5b3856ddb2f3a"},
+    {file = "aiohttp-3.8.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:96943e5dcc37a6529d18766597c491798b7eb7a61d48878611298afc1fca946c"},
+    {file = "aiohttp-3.8.5-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2ad5c3c4590bb3cc28b4382f031f3783f25ec223557124c68754a2231d989e2b"},
+    {file = "aiohttp-3.8.5-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0c413c633d0512df4dc7fd2373ec06cc6a815b7b6d6c2f208ada7e9e93a5061d"},
+    {file = "aiohttp-3.8.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:df72ac063b97837a80d80dec8d54c241af059cc9bb42c4de68bd5b61ceb37caa"},
+    {file = "aiohttp-3.8.5-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c48c5c0271149cfe467c0ff8eb941279fd6e3f65c9a388c984e0e6cf57538e14"},
+    {file = "aiohttp-3.8.5-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:368a42363c4d70ab52c2c6420a57f190ed3dfaca6a1b19afda8165ee16416a82"},
+    {file = "aiohttp-3.8.5-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:7607ec3ce4993464368505888af5beb446845a014bc676d349efec0e05085905"},
+    {file = "aiohttp-3.8.5-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:0d21c684808288a98914e5aaf2a7c6a3179d4df11d249799c32d1808e79503b5"},
+    {file = "aiohttp-3.8.5-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:312fcfbacc7880a8da0ae8b6abc6cc7d752e9caa0051a53d217a650b25e9a691"},
+    {file = "aiohttp-3.8.5-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:ad093e823df03bb3fd37e7dec9d4670c34f9e24aeace76808fc20a507cace825"},
+    {file = "aiohttp-3.8.5-cp310-cp310-win32.whl", hash = "sha256:33279701c04351a2914e1100b62b2a7fdb9a25995c4a104259f9a5ead7ed4802"},
+    {file = "aiohttp-3.8.5-cp310-cp310-win_amd64.whl", hash = "sha256:6e4a280e4b975a2e7745573e3fc9c9ba0d1194a3738ce1cbaa80626cc9b4f4df"},
+    {file = "aiohttp-3.8.5-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:ae871a964e1987a943d83d6709d20ec6103ca1eaf52f7e0d36ee1b5bebb8b9b9"},
+    {file = "aiohttp-3.8.5-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:461908b2578955045efde733719d62f2b649c404189a09a632d245b445c9c975"},
+    {file = "aiohttp-3.8.5-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:72a860c215e26192379f57cae5ab12b168b75db8271f111019509a1196dfc780"},
+    {file = "aiohttp-3.8.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cc14be025665dba6202b6a71cfcdb53210cc498e50068bc088076624471f8bb9"},
+    {file = "aiohttp-3.8.5-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8af740fc2711ad85f1a5c034a435782fbd5b5f8314c9a3ef071424a8158d7f6b"},
+    {file = "aiohttp-3.8.5-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:841cd8233cbd2111a0ef0a522ce016357c5e3aff8a8ce92bcfa14cef890d698f"},
+    {file = "aiohttp-3.8.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5ed1c46fb119f1b59304b5ec89f834f07124cd23ae5b74288e364477641060ff"},
+    {file = "aiohttp-3.8.5-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:84f8ae3e09a34f35c18fa57f015cc394bd1389bce02503fb30c394d04ee6b938"},
+    {file = "aiohttp-3.8.5-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:62360cb771707cb70a6fd114b9871d20d7dd2163a0feafe43fd115cfe4fe845e"},
+    {file = "aiohttp-3.8.5-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:23fb25a9f0a1ca1f24c0a371523546366bb642397c94ab45ad3aedf2941cec6a"},
+    {file = "aiohttp-3.8.5-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:b0ba0d15164eae3d878260d4c4df859bbdc6466e9e6689c344a13334f988bb53"},
+    {file = "aiohttp-3.8.5-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:5d20003b635fc6ae3f96d7260281dfaf1894fc3aa24d1888a9b2628e97c241e5"},
+    {file = "aiohttp-3.8.5-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:0175d745d9e85c40dcc51c8f88c74bfbaef9e7afeeeb9d03c37977270303064c"},
+    {file = "aiohttp-3.8.5-cp311-cp311-win32.whl", hash = "sha256:2e1b1e51b0774408f091d268648e3d57f7260c1682e7d3a63cb00d22d71bb945"},
+    {file = "aiohttp-3.8.5-cp311-cp311-win_amd64.whl", hash = "sha256:043d2299f6dfdc92f0ac5e995dfc56668e1587cea7f9aa9d8a78a1b6554e5755"},
+    {file = "aiohttp-3.8.5-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:cae533195e8122584ec87531d6df000ad07737eaa3c81209e85c928854d2195c"},
+    {file = "aiohttp-3.8.5-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4f21e83f355643c345177a5d1d8079f9f28b5133bcd154193b799d380331d5d3"},
+    {file = "aiohttp-3.8.5-cp36-cp36m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a7a75ef35f2df54ad55dbf4b73fe1da96f370e51b10c91f08b19603c64004acc"},
+    {file = "aiohttp-3.8.5-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2e2e9839e14dd5308ee773c97115f1e0a1cb1d75cbeeee9f33824fa5144c7634"},
+    {file = "aiohttp-3.8.5-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c44e65da1de4403d0576473e2344828ef9c4c6244d65cf4b75549bb46d40b8dd"},
+    {file = "aiohttp-3.8.5-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:78d847e4cde6ecc19125ccbc9bfac4a7ab37c234dd88fbb3c5c524e8e14da543"},
+    {file = "aiohttp-3.8.5-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:c7a815258e5895d8900aec4454f38dca9aed71085f227537208057853f9d13f2"},
+    {file = "aiohttp-3.8.5-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:8b929b9bd7cd7c3939f8bcfffa92fae7480bd1aa425279d51a89327d600c704d"},
+    {file = "aiohttp-3.8.5-cp36-cp36m-musllinux_1_1_ppc64le.whl", hash = "sha256:5db3a5b833764280ed7618393832e0853e40f3d3e9aa128ac0ba0f8278d08649"},
+    {file = "aiohttp-3.8.5-cp36-cp36m-musllinux_1_1_s390x.whl", hash = "sha256:a0215ce6041d501f3155dc219712bc41252d0ab76474615b9700d63d4d9292af"},
+    {file = "aiohttp-3.8.5-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:fd1ed388ea7fbed22c4968dd64bab0198de60750a25fe8c0c9d4bef5abe13824"},
+    {file = "aiohttp-3.8.5-cp36-cp36m-win32.whl", hash = "sha256:6e6783bcc45f397fdebc118d772103d751b54cddf5b60fbcc958382d7dd64f3e"},
+    {file = "aiohttp-3.8.5-cp36-cp36m-win_amd64.whl", hash = "sha256:b5411d82cddd212644cf9360879eb5080f0d5f7d809d03262c50dad02f01421a"},
+    {file = "aiohttp-3.8.5-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:01d4c0c874aa4ddfb8098e85d10b5e875a70adc63db91f1ae65a4b04d3344cda"},
+    {file = "aiohttp-3.8.5-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e5980a746d547a6ba173fd5ee85ce9077e72d118758db05d229044b469d9029a"},
+    {file = "aiohttp-3.8.5-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2a482e6da906d5e6e653be079b29bc173a48e381600161c9932d89dfae5942ef"},
+    {file = "aiohttp-3.8.5-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:80bd372b8d0715c66c974cf57fe363621a02f359f1ec81cba97366948c7fc873"},
+    {file = "aiohttp-3.8.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c1161b345c0a444ebcf46bf0a740ba5dcf50612fd3d0528883fdc0eff578006a"},
+    {file = "aiohttp-3.8.5-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cd56db019015b6acfaaf92e1ac40eb8434847d9bf88b4be4efe5bfd260aee692"},
+    {file = "aiohttp-3.8.5-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:153c2549f6c004d2754cc60603d4668899c9895b8a89397444a9c4efa282aaf4"},
+    {file = "aiohttp-3.8.5-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:4a01951fabc4ce26ab791da5f3f24dca6d9a6f24121746eb19756416ff2d881b"},
+    {file = "aiohttp-3.8.5-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:bfb9162dcf01f615462b995a516ba03e769de0789de1cadc0f916265c257e5d8"},
+    {file = "aiohttp-3.8.5-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:7dde0009408969a43b04c16cbbe252c4f5ef4574ac226bc8815cd7342d2028b6"},
+    {file = "aiohttp-3.8.5-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:4149d34c32f9638f38f544b3977a4c24052042affa895352d3636fa8bffd030a"},
+    {file = "aiohttp-3.8.5-cp37-cp37m-win32.whl", hash = "sha256:68c5a82c8779bdfc6367c967a4a1b2aa52cd3595388bf5961a62158ee8a59e22"},
+    {file = "aiohttp-3.8.5-cp37-cp37m-win_amd64.whl", hash = "sha256:2cf57fb50be5f52bda004b8893e63b48530ed9f0d6c96c84620dc92fe3cd9b9d"},
+    {file = "aiohttp-3.8.5-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:eca4bf3734c541dc4f374ad6010a68ff6c6748f00451707f39857f429ca36ced"},
+    {file = "aiohttp-3.8.5-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1274477e4c71ce8cfe6c1ec2f806d57c015ebf84d83373676036e256bc55d690"},
+    {file = "aiohttp-3.8.5-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:28c543e54710d6158fc6f439296c7865b29e0b616629767e685a7185fab4a6b9"},
+    {file = "aiohttp-3.8.5-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:910bec0c49637d213f5d9877105d26e0c4a4de2f8b1b29405ff37e9fc0ad52b8"},
+    {file = "aiohttp-3.8.5-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5443910d662db951b2e58eb70b0fbe6b6e2ae613477129a5805d0b66c54b6cb7"},
+    {file = "aiohttp-3.8.5-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2e460be6978fc24e3df83193dc0cc4de46c9909ed92dd47d349a452ef49325b7"},
+    {file = "aiohttp-3.8.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fb1558def481d84f03b45888473fc5a1f35747b5f334ef4e7a571bc0dfcb11f8"},
+    {file = "aiohttp-3.8.5-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:34dd0c107799dcbbf7d48b53be761a013c0adf5571bf50c4ecad5643fe9cfcd0"},
+    {file = "aiohttp-3.8.5-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:aa1990247f02a54185dc0dff92a6904521172a22664c863a03ff64c42f9b5410"},
+    {file = "aiohttp-3.8.5-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:0e584a10f204a617d71d359fe383406305a4b595b333721fa50b867b4a0a1548"},
+    {file = "aiohttp-3.8.5-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:a3cf433f127efa43fee6b90ea4c6edf6c4a17109d1d037d1a52abec84d8f2e42"},
+    {file = "aiohttp-3.8.5-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:c11f5b099adafb18e65c2c997d57108b5bbeaa9eeee64a84302c0978b1ec948b"},
+    {file = "aiohttp-3.8.5-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:84de26ddf621d7ac4c975dbea4c945860e08cccde492269db4e1538a6a6f3c35"},
+    {file = "aiohttp-3.8.5-cp38-cp38-win32.whl", hash = "sha256:ab88bafedc57dd0aab55fa728ea10c1911f7e4d8b43e1d838a1739f33712921c"},
+    {file = "aiohttp-3.8.5-cp38-cp38-win_amd64.whl", hash = "sha256:5798a9aad1879f626589f3df0f8b79b3608a92e9beab10e5fda02c8a2c60db2e"},
+    {file = "aiohttp-3.8.5-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:a6ce61195c6a19c785df04e71a4537e29eaa2c50fe745b732aa937c0c77169f3"},
+    {file = "aiohttp-3.8.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:773dd01706d4db536335fcfae6ea2440a70ceb03dd3e7378f3e815b03c97ab51"},
+    {file = "aiohttp-3.8.5-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:f83a552443a526ea38d064588613aca983d0ee0038801bc93c0c916428310c28"},
+    {file = "aiohttp-3.8.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1f7372f7341fcc16f57b2caded43e81ddd18df53320b6f9f042acad41f8e049a"},
+    {file = "aiohttp-3.8.5-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ea353162f249c8097ea63c2169dd1aa55de1e8fecbe63412a9bc50816e87b761"},
+    {file = "aiohttp-3.8.5-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e5d47ae48db0b2dcf70bc8a3bc72b3de86e2a590fc299fdbbb15af320d2659de"},
+    {file = "aiohttp-3.8.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d827176898a2b0b09694fbd1088c7a31836d1a505c243811c87ae53a3f6273c1"},
+    {file = "aiohttp-3.8.5-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3562b06567c06439d8b447037bb655ef69786c590b1de86c7ab81efe1c9c15d8"},
+    {file = "aiohttp-3.8.5-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:4e874cbf8caf8959d2adf572a78bba17cb0e9d7e51bb83d86a3697b686a0ab4d"},
+    {file = "aiohttp-3.8.5-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:6809a00deaf3810e38c628e9a33271892f815b853605a936e2e9e5129762356c"},
+    {file = "aiohttp-3.8.5-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:33776e945d89b29251b33a7e7d006ce86447b2cfd66db5e5ded4e5cd0340585c"},
+    {file = "aiohttp-3.8.5-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:eaeed7abfb5d64c539e2db173f63631455f1196c37d9d8d873fc316470dfbacd"},
+    {file = "aiohttp-3.8.5-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:e91d635961bec2d8f19dfeb41a539eb94bd073f075ca6dae6c8dc0ee89ad6f91"},
+    {file = "aiohttp-3.8.5-cp39-cp39-win32.whl", hash = "sha256:00ad4b6f185ec67f3e6562e8a1d2b69660be43070bd0ef6fcec5211154c7df67"},
+    {file = "aiohttp-3.8.5-cp39-cp39-win_amd64.whl", hash = "sha256:c0a9034379a37ae42dea7ac1e048352d96286626251862e448933c0f59cbd79c"},
+    {file = "aiohttp-3.8.5.tar.gz", hash = "sha256:b9552ec52cc147dbf1944ac7ac98af7602e51ea2dcd076ed194ca3c0d1c7d0bc"},
 ]

 [package.dependencies]
-async-timeout = ">=3.0,<4.0"
+aiosignal = ">=1.1.2"
+async-timeout = ">=4.0.0a3,<5.0"
 attrs = ">=17.3.0"
-chardet = ">=2.0,<4.0"
+charset-normalizer = ">=2.0,<4.0"
+frozenlist = ">=1.1.1"
 multidict = ">=4.5,<7.0"
-typing-extensions = ">=3.6.5"
 yarl = ">=1.0,<2.0"

 [package.extras]
-speedups = ["aiodns", "brotlipy", "cchardet"]
+speedups = ["Brotli", "aiodns", "cchardet"]

 [[package]]
 name = "aiopg"
@@ -75,6 +126,20 @@ psycopg2-binary = ">=2.8.4"
 [package.extras]
 sa = ["sqlalchemy[postgresql-psycopg2binary] (>=1.3,<1.5)"]

+[[package]]
+name = "aiosignal"
+version = "1.3.1"
+description = "aiosignal: a list of registered asynchronous callbacks"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "aiosignal-1.3.1-py3-none-any.whl", hash = "sha256:f8376fb07dd1e86a584e4fcdec80b36b7f81aac666ebc724e2c090300dd83b17"},
+    {file = "aiosignal-1.3.1.tar.gz", hash = "sha256:54cd96e15e1649b75d6c87526a6ff0b6c1b0dd3459f43d9ca11d48c339b68cfc"},
+]
+
+[package.dependencies]
+frozenlist = ">=1.1.0"
+
 [[package]]
 name = "allure-pytest"
 version = "2.13.2"
@@ -107,13 +172,13 @@ pluggy = ">=0.4.0"

 [[package]]
 name = "async-timeout"
-version = "3.0.1"
+version = "4.0.2"
 description = "Timeout context manager for asyncio programs"
 optional = false
-python-versions = ">=3.5.3"
+python-versions = ">=3.6"
 files = [
-    {file = "async-timeout-3.0.1.tar.gz", hash = "sha256:0c3c816a028d47f659d6ff5c745cb2acf1f966da1fe5c19c77a70282b25f4c5f"},
-    {file = "async_timeout-3.0.1-py3-none-any.whl", hash = "sha256:4291ca197d287d274d0b6cb5d6f8f8f82d434ed288f962539ff18cc9012f9ea3"},
+    {file = "async-timeout-4.0.2.tar.gz", hash = "sha256:2163e1640ddb52b7a8c80d0a67a08587e5d245cc9c553a74a847056bc2976b15"},
+    {file = "async_timeout-4.0.2-py3-none-any.whl", hash = "sha256:8ca1e4fcf50d07413d66d1a5e416e42cfdf5851c981d679a09851a6853383b3c"},
 ]

 [[package]]
@@ -675,13 +740,13 @@ typing-extensions = ">=4.1.0"

 [[package]]
 name = "certifi"
-version = "2022.12.7"
+version = "2023.7.22"
 description = "Python package for providing Mozilla's CA Bundle."
 optional = false
 python-versions = ">=3.6"
 files = [
-    {file = "certifi-2022.12.7-py3-none-any.whl", hash = "sha256:4ad3232f5e926d6718ec31cfc1fcadfde020920e278684144551c91769c7bc18"},
-    {file = "certifi-2022.12.7.tar.gz", hash = "sha256:35824b4c3a97115964b408844d64aa14db1cc518f6562e8d7261699d1350a9e3"},
+    {file = "certifi-2023.7.22-py3-none-any.whl", hash = "sha256:92d6037539857d8206b8f6ae472e8b77db8058fec5937a1ef3f54304089edbb9"},
+    {file = "certifi-2023.7.22.tar.gz", hash = "sha256:539cc1d13202e33ca466e88b2807e29f4c13049d6d87031a3c110744495cb082"},
 ]

 [[package]]
@@ -781,17 +846,6 @@ networkx = ">=2.4,<3.0"
 pyyaml = ">5.4"
 sarif-om = ">=1.0.4,<1.1.0"

-[[package]]
-name = "chardet"
-version = "3.0.4"
-description = "Universal encoding detector for Python 2 and 3"
-optional = false
-python-versions = "*"
-files = [
-    {file = "chardet-3.0.4-py2.py3-none-any.whl", hash = "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691"},
-    {file = "chardet-3.0.4.tar.gz", hash = "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae"},
-]
-
 [[package]]
 name = "charset-normalizer"
 version = "2.1.0"
@@ -980,6 +1034,76 @@ files = [
 Flask = ">=0.9"
 Six = "*"

+[[package]]
+name = "frozenlist"
+version = "1.4.0"
+description = "A list-like structure which implements collections.abc.MutableSequence"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "frozenlist-1.4.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:764226ceef3125e53ea2cb275000e309c0aa5464d43bd72abd661e27fffc26ab"},
+    {file = "frozenlist-1.4.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d6484756b12f40003c6128bfcc3fa9f0d49a687e171186c2d85ec82e3758c559"},
+    {file = "frozenlist-1.4.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9ac08e601308e41eb533f232dbf6b7e4cea762f9f84f6357136eed926c15d12c"},
+    {file = "frozenlist-1.4.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d081f13b095d74b67d550de04df1c756831f3b83dc9881c38985834387487f1b"},
+    {file = "frozenlist-1.4.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:71932b597f9895f011f47f17d6428252fc728ba2ae6024e13c3398a087c2cdea"},
+    {file = "frozenlist-1.4.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:981b9ab5a0a3178ff413bca62526bb784249421c24ad7381e39d67981be2c326"},
+    {file = "frozenlist-1.4.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e41f3de4df3e80de75845d3e743b3f1c4c8613c3997a912dbf0229fc61a8b963"},
+    {file = "frozenlist-1.4.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6918d49b1f90821e93069682c06ffde41829c346c66b721e65a5c62b4bab0300"},
+    {file = "frozenlist-1.4.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:0e5c8764c7829343d919cc2dfc587a8db01c4f70a4ebbc49abde5d4b158b007b"},
+    {file = "frozenlist-1.4.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:8d0edd6b1c7fb94922bf569c9b092ee187a83f03fb1a63076e7774b60f9481a8"},
+    {file = "frozenlist-1.4.0-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:e29cda763f752553fa14c68fb2195150bfab22b352572cb36c43c47bedba70eb"},
+    {file = "frozenlist-1.4.0-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:0c7c1b47859ee2cac3846fde1c1dc0f15da6cec5a0e5c72d101e0f83dcb67ff9"},
+    {file = "frozenlist-1.4.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:901289d524fdd571be1c7be054f48b1f88ce8dddcbdf1ec698b27d4b8b9e5d62"},
+    {file = "frozenlist-1.4.0-cp310-cp310-win32.whl", hash = "sha256:1a0848b52815006ea6596c395f87449f693dc419061cc21e970f139d466dc0a0"},
+    {file = "frozenlist-1.4.0-cp310-cp310-win_amd64.whl", hash = "sha256:b206646d176a007466358aa21d85cd8600a415c67c9bd15403336c331a10d956"},
+    {file = "frozenlist-1.4.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:de343e75f40e972bae1ef6090267f8260c1446a1695e77096db6cfa25e759a95"},
+    {file = "frozenlist-1.4.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ad2a9eb6d9839ae241701d0918f54c51365a51407fd80f6b8289e2dfca977cc3"},
+    {file = "frozenlist-1.4.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:bd7bd3b3830247580de99c99ea2a01416dfc3c34471ca1298bccabf86d0ff4dc"},
+    {file = "frozenlist-1.4.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bdf1847068c362f16b353163391210269e4f0569a3c166bc6a9f74ccbfc7e839"},
+    {file = "frozenlist-1.4.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:38461d02d66de17455072c9ba981d35f1d2a73024bee7790ac2f9e361ef1cd0c"},
+    {file = "frozenlist-1.4.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d5a32087d720c608f42caed0ef36d2b3ea61a9d09ee59a5142d6070da9041b8f"},
+    {file = "frozenlist-1.4.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dd65632acaf0d47608190a71bfe46b209719bf2beb59507db08ccdbe712f969b"},
+    {file = "frozenlist-1.4.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:261b9f5d17cac914531331ff1b1d452125bf5daa05faf73b71d935485b0c510b"},
+    {file = "frozenlist-1.4.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:b89ac9768b82205936771f8d2eb3ce88503b1556324c9f903e7156669f521472"},
+    {file = "frozenlist-1.4.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:008eb8b31b3ea6896da16c38c1b136cb9fec9e249e77f6211d479db79a4eaf01"},
+    {file = "frozenlist-1.4.0-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:e74b0506fa5aa5598ac6a975a12aa8928cbb58e1f5ac8360792ef15de1aa848f"},
+    {file = "frozenlist-1.4.0-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:490132667476f6781b4c9458298b0c1cddf237488abd228b0b3650e5ecba7467"},
+    {file = "frozenlist-1.4.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:76d4711f6f6d08551a7e9ef28c722f4a50dd0fc204c56b4bcd95c6cc05ce6fbb"},
+    {file = "frozenlist-1.4.0-cp311-cp311-win32.whl", hash = "sha256:a02eb8ab2b8f200179b5f62b59757685ae9987996ae549ccf30f983f40602431"},
+    {file = "frozenlist-1.4.0-cp311-cp311-win_amd64.whl", hash = "sha256:515e1abc578dd3b275d6a5114030b1330ba044ffba03f94091842852f806f1c1"},
+    {file = "frozenlist-1.4.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:f0ed05f5079c708fe74bf9027e95125334b6978bf07fd5ab923e9e55e5fbb9d3"},
+    {file = "frozenlist-1.4.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:ca265542ca427bf97aed183c1676e2a9c66942e822b14dc6e5f42e038f92a503"},
+    {file = "frozenlist-1.4.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:491e014f5c43656da08958808588cc6c016847b4360e327a62cb308c791bd2d9"},
+    {file = "frozenlist-1.4.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:17ae5cd0f333f94f2e03aaf140bb762c64783935cc764ff9c82dff626089bebf"},
+    {file = "frozenlist-1.4.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1e78fb68cf9c1a6aa4a9a12e960a5c9dfbdb89b3695197aa7064705662515de2"},
+    {file = "frozenlist-1.4.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d5655a942f5f5d2c9ed93d72148226d75369b4f6952680211972a33e59b1dfdc"},
+    {file = "frozenlist-1.4.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c11b0746f5d946fecf750428a95f3e9ebe792c1ee3b1e96eeba145dc631a9672"},
+    {file = "frozenlist-1.4.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e66d2a64d44d50d2543405fb183a21f76b3b5fd16f130f5c99187c3fb4e64919"},
+    {file = "frozenlist-1.4.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:88f7bc0fcca81f985f78dd0fa68d2c75abf8272b1f5c323ea4a01a4d7a614efc"},
+    {file = "frozenlist-1.4.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:5833593c25ac59ede40ed4de6d67eb42928cca97f26feea219f21d0ed0959b79"},
+    {file = "frozenlist-1.4.0-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:fec520865f42e5c7f050c2a79038897b1c7d1595e907a9e08e3353293ffc948e"},
+    {file = "frozenlist-1.4.0-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:b826d97e4276750beca7c8f0f1a4938892697a6bcd8ec8217b3312dad6982781"},
+    {file = "frozenlist-1.4.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:ceb6ec0a10c65540421e20ebd29083c50e6d1143278746a4ef6bcf6153171eb8"},
+    {file = "frozenlist-1.4.0-cp38-cp38-win32.whl", hash = "sha256:2b8bcf994563466db019fab287ff390fffbfdb4f905fc77bc1c1d604b1c689cc"},
+    {file = "frozenlist-1.4.0-cp38-cp38-win_amd64.whl", hash = "sha256:a6c8097e01886188e5be3e6b14e94ab365f384736aa1fca6a0b9e35bd4a30bc7"},
+    {file = "frozenlist-1.4.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:6c38721585f285203e4b4132a352eb3daa19121a035f3182e08e437cface44bf"},
+    {file = "frozenlist-1.4.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:a0c6da9aee33ff0b1a451e867da0c1f47408112b3391dd43133838339e410963"},
+    {file = "frozenlist-1.4.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:93ea75c050c5bb3d98016b4ba2497851eadf0ac154d88a67d7a6816206f6fa7f"},
+    {file = "frozenlist-1.4.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f61e2dc5ad442c52b4887f1fdc112f97caeff4d9e6ebe78879364ac59f1663e1"},
+    {file = "frozenlist-1.4.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:aa384489fefeb62321b238e64c07ef48398fe80f9e1e6afeff22e140e0850eef"},
+    {file = "frozenlist-1.4.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:10ff5faaa22786315ef57097a279b833ecab1a0bfb07d604c9cbb1c4cdc2ed87"},
+    {file = "frozenlist-1.4.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:007df07a6e3eb3e33e9a1fe6a9db7af152bbd8a185f9aaa6ece10a3529e3e1c6"},
+    {file = "frozenlist-1.4.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7f4f399d28478d1f604c2ff9119907af9726aed73680e5ed1ca634d377abb087"},
+    {file = "frozenlist-1.4.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:c5374b80521d3d3f2ec5572e05adc94601985cc526fb276d0c8574a6d749f1b3"},
+    {file = "frozenlist-1.4.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:ce31ae3e19f3c902de379cf1323d90c649425b86de7bbdf82871b8a2a0615f3d"},
+    {file = "frozenlist-1.4.0-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:7211ef110a9194b6042449431e08c4d80c0481e5891e58d429df5899690511c2"},
+    {file = "frozenlist-1.4.0-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:556de4430ce324c836789fa4560ca62d1591d2538b8ceb0b4f68fb7b2384a27a"},
+    {file = "frozenlist-1.4.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:7645a8e814a3ee34a89c4a372011dcd817964ce8cb273c8ed6119d706e9613e3"},
+    {file = "frozenlist-1.4.0-cp39-cp39-win32.whl", hash = "sha256:19488c57c12d4e8095a922f328df3f179c820c212940a498623ed39160bc3c2f"},
+    {file = "frozenlist-1.4.0-cp39-cp39-win_amd64.whl", hash = "sha256:6221d84d463fb110bdd7619b69cb43878a11d51cbb9394ae3105d082d5199167"},
+    {file = "frozenlist-1.4.0.tar.gz", hash = "sha256:09163bdf0b2907454042edb19f887c6d33806adc71fbd54afc14908bfdc22251"},
+]
+
 [[package]]
 name = "graphql-core"
 version = "3.2.1"
@@ -1868,6 +1992,20 @@ files = [
 packaging = ">=17.1"
 pytest = ">=5.3"

+[[package]]
+name = "pytest-split"
+version = "0.8.1"
+description = "Pytest plugin which splits the test suite to equally sized sub suites based on test execution time."
+optional = false
+python-versions = ">=3.7.1,<4.0"
+files = [
+    {file = "pytest_split-0.8.1-py3-none-any.whl", hash = "sha256:74b110ea091bd147cc1c5f9665a59506e5cedfa66f96a89fb03e4ab447c2c168"},
+    {file = "pytest_split-0.8.1.tar.gz", hash = "sha256:2d88bd3dc528689a7a3f58fc12ea165c3aa62e90795e420dfad920afe5612d6d"},
+]
+
+[package.dependencies]
+pytest = ">=5,<8"
+
 [[package]]
 name = "pytest-timeout"
 version = "2.1.0"
@@ -2513,4 +2651,4 @@ testing = ["func-timeout", "jaraco.itertools", "pytest (>=6)", "pytest-black (>=
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "fe771b153ef7e308d6d04421d0eb3f97d00780882277d2b4fc1f296054d8db79"
+content-hash = "c40f62277e788011920f4edb6f7392046ee440f792a104c903097415def9a916"
--- a/proxy/src/auth/backend/classic.rs
+++ b/proxy/src/auth/backend/classic.rs
@@ -1,8 +1,11 @@
+use std::ops::ControlFlow;
+
 use super::AuthSuccess;
 use crate::{
    auth::{self, AuthFlow, ClientCredentials},
    compute,
    console::{self, AuthInfo, CachedNodeInfo, ConsoleReqExtra},
+    proxy::{try_wake, NUM_RETRIES_CONNECT},
    sasl, scram,
    stream::PqStream,
 };
@@ -48,7 +51,15 @@ pub(super) async fn authenticate(
        }
    };

-    let mut node = api.wake_compute(extra, creds).await?;
+    let mut num_retries = 0;
+    let mut node = loop {
+        num_retries += 1;
+        match try_wake(api, extra, creds).await? {
+            ControlFlow::Break(n) => break n,
+            ControlFlow::Continue(_) if num_retries < NUM_RETRIES_CONNECT => continue,
+            ControlFlow::Continue(e) => return Err(e.into()),
+        }
+    };
    if let Some(keys) = scram_keys {
        use tokio_postgres::config::AuthKeys;
        node.config.auth_keys(AuthKeys::ScramSha256(keys));
--- a/proxy/src/auth/credentials.rs
+++ b/proxy/src/auth/credentials.rs
@@ -48,6 +48,14 @@ impl ClientCredentials<'_> {
 }

 impl<'a> ClientCredentials<'a> {
+    #[cfg(test)]
+    pub fn new_noop() -> Self {
+        ClientCredentials {
+            user: "",
+            project: None,
+        }
+    }
+
    pub fn parse(
        params: &'a StartupMessageParams,
        sni: Option<&str>,
--- a/proxy/src/cache.rs
+++ b/proxy/src/cache.rs
@@ -262,24 +262,21 @@ pub mod timed_lru {
        token: Option<(C, C::LookupInfo<C::Key>)>,

        /// The value itself.
-        pub value: C::Value,
+        value: C::Value,
    }

    impl<C: Cache> Cached<C> {
        /// Place any entry into this wrapper; invalidation will be a no-op.
-        /// Unfortunately, rust doesn't let us implement [`From`] or [`Into`].
-        pub fn new_uncached(value: impl Into<C::Value>) -> Self {
-            Self {
-                token: None,
-                value: value.into(),
-            }
+        pub fn new_uncached(value: C::Value) -> Self {
+            Self { token: None, value }
        }

         /// Drop this entry from a cache if it's still there, returning the wrapped value.
-        pub fn invalidate(&self) {
+        pub fn invalidate(self) -> C::Value {
            if let Some((cache, info)) = &self.token {
                cache.invalidate(info);
            }
+            self.value
        }

        /// Tell if this entry is actually cached.
--- a/proxy/src/compute.rs
+++ b/proxy/src/compute.rs
@@ -1,4 +1,9 @@
-use crate::{auth::parse_endpoint_param, cancellation::CancelClosure, error::UserFacingError};
+use crate::{
+    auth::parse_endpoint_param,
+    cancellation::CancelClosure,
+    console::errors::WakeComputeError,
+    error::{io_error, UserFacingError},
+};
 use futures::{FutureExt, TryFutureExt};
 use itertools::Itertools;
 use pq_proto::StartupMessageParams;
@@ -24,6 +29,12 @@ pub enum ConnectionError {
    TlsError(#[from] native_tls::Error),
 }

+impl From<WakeComputeError> for ConnectionError {
+    fn from(value: WakeComputeError) -> Self {
+        io_error(value).into()
+    }
+}
+
 impl UserFacingError for ConnectionError {
    fn to_string_client(&self) -> String {
        use ConnectionError::*;
--- a/proxy/src/console/provider.rs
+++ b/proxy/src/console/provider.rs
@@ -14,6 +14,7 @@ pub mod errors {
    use crate::{
        error::{io_error, UserFacingError},
        http,
+        proxy::ShouldRetry,
    };
    use thiserror::Error;

@@ -72,6 +73,24 @@ pub mod errors {
        }
    }

+    impl ShouldRetry for ApiError {
+        fn could_retry(&self) -> bool {
+            match self {
+                // transport errors: defer to the wrapped error's own `could_retry` verdict
+                Self::Transport(io) => io.could_retry(),
+                // BAD_REQUEST / LOCKED are treated as temporary: the compute was in a bad state
+                // (bad request can be returned when the endpoint was in transition)
+                Self::Console {
+                    status: http::StatusCode::BAD_REQUEST | http::StatusCode::LOCKED,
+                    ..
+                } => true,
+                // any other console response is retried only if its status is a server error
+                Self::Console { status, .. } if status.is_server_error() => true,
+                _ => false,
+            }
+        }
+    }
+
    impl From<reqwest::Error> for ApiError {
        fn from(e: reqwest::Error) -> Self {
            io_error(e).into()
@@ -186,14 +205,14 @@ pub trait Api {
    async fn get_auth_info(
        &self,
        extra: &ConsoleReqExtra<'_>,
-        creds: &ClientCredentials<'_>,
+        creds: &ClientCredentials,
    ) -> Result<Option<AuthInfo>, errors::GetAuthInfoError>;

    /// Wake up the compute node and return the corresponding connection info.
    async fn wake_compute(
        &self,
        extra: &ConsoleReqExtra<'_>,
-        creds: &ClientCredentials<'_>,
+        creds: &ClientCredentials,
    ) -> Result<CachedNodeInfo, errors::WakeComputeError>;
 }

--- a/proxy/src/console/provider/mock.rs
+++ b/proxy/src/console/provider/mock.rs
@@ -106,7 +106,7 @@ impl super::Api for Api {
    async fn get_auth_info(
        &self,
        _extra: &ConsoleReqExtra<'_>,
-        creds: &ClientCredentials<'_>,
+        creds: &ClientCredentials,
    ) -> Result<Option<AuthInfo>, GetAuthInfoError> {
        self.do_get_auth_info(creds).await
    }
@@ -115,7 +115,7 @@ impl super::Api for Api {
    async fn wake_compute(
        &self,
        _extra: &ConsoleReqExtra<'_>,
-        _creds: &ClientCredentials<'_>,
+        _creds: &ClientCredentials,
    ) -> Result<CachedNodeInfo, WakeComputeError> {
        self.do_wake_compute()
            .map_ok(CachedNodeInfo::new_uncached)
--- a/proxy/src/console/provider/neon.rs
+++ b/proxy/src/console/provider/neon.rs
@@ -123,7 +123,7 @@ impl super::Api for Api {
    async fn get_auth_info(
        &self,
        extra: &ConsoleReqExtra<'_>,
-        creds: &ClientCredentials<'_>,
+        creds: &ClientCredentials,
    ) -> Result<Option<AuthInfo>, GetAuthInfoError> {
        self.do_get_auth_info(extra, creds).await
    }
@@ -132,7 +132,7 @@ impl super::Api for Api {
    async fn wake_compute(
        &self,
        extra: &ConsoleReqExtra<'_>,
-        creds: &ClientCredentials<'_>,
+        creds: &ClientCredentials,
    ) -> Result<CachedNodeInfo, WakeComputeError> {
        let key = creds.project().expect("impossible");

--- a/proxy/src/http/conn_pool.rs
+++ b/proxy/src/http/conn_pool.rs
@@ -1,19 +1,17 @@
+use anyhow::Context;
+use async_trait::async_trait;
 use parking_lot::Mutex;
 use pq_proto::StartupMessageParams;
 use std::fmt;
-use std::ops::ControlFlow;
 use std::{collections::HashMap, sync::Arc};
 use tokio::time;

-use crate::config;
 use crate::{auth, console};
+use crate::{compute, config};

 use super::sql_over_http::MAX_RESPONSE_SIZE;

-use crate::proxy::{
-    can_retry_tokio_postgres_error, invalidate_cache, retry_after, try_wake,
-    NUM_RETRIES_WAKE_COMPUTE,
-};
+use crate::proxy::ConnectMechanism;

 use tracing::error;
 use tracing::info;
@@ -187,6 +185,27 @@ impl GlobalConnPool {
    }
 }

+struct TokioMechanism<'a> {
+    conn_info: &'a ConnInfo,
+}
+
+#[async_trait]
+impl ConnectMechanism for TokioMechanism<'_> {
+    type Connection = tokio_postgres::Client;
+    type ConnectError = tokio_postgres::Error;
+    type Error = anyhow::Error;
+
+    async fn connect_once(
+        &self,
+        node_info: &console::CachedNodeInfo,
+        timeout: time::Duration,
+    ) -> Result<Self::Connection, Self::ConnectError> {
+        connect_to_compute_once(node_info, self.conn_info, timeout).await
+    }
+
+    fn update_connect_config(&self, _config: &mut compute::ConnCfg) {}
+}
+
 // Wake up the destination if needed. Code here is a bit involved because
 // we reuse the code from the usual proxy and we need to prepare a few structures
 // that this code expects.
@@ -220,72 +239,18 @@ async fn connect_to_compute(
        application_name: Some(APP_NAME),
    };

-    let node_info = &mut creds.wake_compute(&extra).await?.expect("msg");
+    let node_info = creds
+        .wake_compute(&extra)
+        .await?
+        .context("missing cache entry from wake_compute")?;

-    let mut num_retries = 0;
-    let mut wait_duration = time::Duration::ZERO;
-    let mut should_wake_with_error = None;
-    loop {
-        if !wait_duration.is_zero() {
-            time::sleep(wait_duration).await;
-        }
-
-        // try wake the compute node if we have determined it's sensible to do so
-        if let Some(err) = should_wake_with_error.take() {
-            match try_wake(node_info, &extra, &creds).await {
-                // we can't wake up the compute node
-                Ok(None) => return Err(err),
-                // there was an error communicating with the control plane
-                Err(e) => return Err(e.into()),
-                // failed to wake up but we can continue to retry
-                Ok(Some(ControlFlow::Continue(()))) => {
-                    wait_duration = retry_after(num_retries);
-                    should_wake_with_error = Some(err);
-
-                    num_retries += 1;
-                    info!(num_retries, "retrying wake compute");
-                    continue;
-                }
-                // successfully woke up a compute node and can break the wakeup loop
-                Ok(Some(ControlFlow::Break(()))) => {}
-            }
-        }
-
-        match connect_to_compute_once(node_info, conn_info).await {
-            Ok(res) => return Ok(res),
-            Err(e) => {
-                error!(error = ?e, "could not connect to compute node");
-                if !can_retry_error(&e, num_retries) {
-                    return Err(e.into());
-                }
-                wait_duration = retry_after(num_retries);
-
-                // after the first connect failure,
-                // we should invalidate the cache and wake up a new compute node
-                if num_retries == 0 {
-                    invalidate_cache(node_info);
-                    should_wake_with_error = Some(e.into());
-                }
-            }
-        }
-
-        num_retries += 1;
-        info!(num_retries, "retrying connect");
-    }
-}
-
-fn can_retry_error(err: &tokio_postgres::Error, num_retries: u32) -> bool {
-    match err {
-        // retry all errors at least once
-        _ if num_retries == 0 => true,
-        _ if num_retries >= NUM_RETRIES_WAKE_COMPUTE => false,
-        err => can_retry_tokio_postgres_error(err),
-    }
+    crate::proxy::connect_to_compute(&TokioMechanism { conn_info }, node_info, &extra, &creds).await
 }

 async fn connect_to_compute_once(
    node_info: &console::CachedNodeInfo,
    conn_info: &ConnInfo,
+    timeout: time::Duration,
 ) -> Result<tokio_postgres::Client, tokio_postgres::Error> {
    let mut config = (*node_info.config).clone();

@@ -294,6 +259,7 @@ async fn connect_to_compute_once(
        .password(&conn_info.password)
        .dbname(&conn_info.dbname)
        .max_backend_message_size(MAX_RESPONSE_SIZE)
+        .connect_timeout(timeout)
        .connect(tokio_postgres::NoTls)
        .await?;

--- a/proxy/src/http/sql_over_http.rs
+++ b/proxy/src/http/sql_over_http.rs
@@ -11,6 +11,7 @@ use serde_json::Map;
 use serde_json::Value;
 use tokio_postgres::types::Kind;
 use tokio_postgres::types::Type;
+use tokio_postgres::GenericClient;
 use tokio_postgres::Row;
 use url::Url;

@@ -23,6 +24,13 @@ struct QueryData {
    params: Vec<serde_json::Value>,
 }

+#[derive(serde::Deserialize)]
+#[serde(untagged)]
+enum Payload {
+    Single(QueryData),
+    Batch(Vec<QueryData>),
+}
+
 pub const MAX_RESPONSE_SIZE: usize = 1024 * 1024; // 1 MB
 const MAX_REQUEST_SIZE: u64 = 1024 * 1024; // 1 MB

@@ -192,15 +200,53 @@ pub async fn handle(
    // Read the query and query params from the request body
    //
    let body = hyper::body::to_bytes(request.into_body()).await?;
-    let QueryData { query, params } = serde_json::from_slice(&body)?;
-    let query_params = json_to_pg_text(params)?;
+    let payload: Payload = serde_json::from_slice(&body)?;
+
+    let mut client = conn_pool.get(&conn_info, !allow_pool).await?;

    //
    // Now execute the query and return the result
    //
-    let client = conn_pool.get(&conn_info, !allow_pool).await?;
+    let result = match payload {
+        Payload::Single(query) => query_to_json(&client, query, raw_output, array_mode).await,
+        Payload::Batch(queries) => {
+            let mut results = Vec::new();
+            let transaction = client.transaction().await?;
+            for query in queries {
+                let result = query_to_json(&transaction, query, raw_output, array_mode).await;
+                match result {
+                    Ok(r) => results.push(r),
+                    Err(e) => {
+                        transaction.rollback().await?;
+                        return Err(e);
+                    }
+                }
+            }
+            transaction.commit().await?;
+            Ok(json!({ "results": results }))
+        }
+    };

-    let row_stream = client.query_raw_txt(query, query_params).await?;
+    if allow_pool {
+        // return connection to the pool
+        tokio::task::spawn(async move {
+            let _ = conn_pool.put(&conn_info, client).await;
+        });
+    }
+
+    result
+}
+
+async fn query_to_json<T: GenericClient>(
+    client: &T,
+    data: QueryData,
+    raw_output: bool,
+    array_mode: bool,
+) -> anyhow::Result<Value> {
+    let query_params = json_to_pg_text(data.params)?;
+    let row_stream = client
+        .query_raw_txt::<String, _>(data.query, query_params)
+        .await?;

    // Manually drain the stream into a vector to leave row_stream hanging
    // around to get a command tag. Also check that the response is not too
@@ -256,13 +302,6 @@ pub async fn handle(
        .map(|row| pg_text_row_to_json(row, raw_output, array_mode))
        .collect::<Result<Vec<_>, _>>()?;

-    if allow_pool {
-        // return connection to the pool
-        tokio::task::spawn(async move {
-            let _ = conn_pool.put(&conn_info, client).await;
-        });
-    }
-
    // resulting JSON format is based on the format of node-postgres result
    Ok(json!({
        "command": command_tag_name,
--- a/proxy/src/http/websocket.rs
+++ b/proxy/src/http/websocket.rs
@@ -1,5 +1,8 @@
 use crate::{
-    cancellation::CancelMap, config::ProxyConfig, error::io_error, proxy::handle_ws_client,
+    cancellation::CancelMap,
+    config::ProxyConfig,
+    error::io_error,
+    proxy::{handle_client, ClientMode},
 };
 use bytes::{Buf, Bytes};
 use futures::{Sink, Stream, StreamExt};
@@ -150,12 +153,12 @@ async fn serve_websocket(
    hostname: Option<String>,
 ) -> anyhow::Result<()> {
    let websocket = websocket.await?;
-    handle_ws_client(
+    handle_client(
        config,
        cancel_map,
        session_id,
        WebSocketRw::new(websocket),
-        hostname,
+        ClientMode::Websockets { hostname },
    )
    .await?;
    Ok(())
@@ -178,13 +181,15 @@ async fn ws_handler(

    // Check if the request is a websocket upgrade request.
    if hyper_tungstenite::is_upgrade_request(&request) {
+        info!(session_id = ?session_id, "performing websocket upgrade");
+
        let (response, websocket) = hyper_tungstenite::upgrade(&mut request, None)
            .map_err(|e| ApiError::BadRequest(e.into()))?;

        tokio::spawn(async move {
            if let Err(e) = serve_websocket(websocket, config, &cancel_map, session_id, host).await
            {
-                error!("error in websocket connection: {e:?}");
+                error!(session_id = ?session_id, "error in websocket connection: {e:?}");
            }
        });

@@ -221,6 +226,18 @@ async fn ws_handler(
            );
            r
        })
+    } else if request.uri().path() == "/sql" && request.method() == Method::OPTIONS {
+        Response::builder()
+            .header("Allow", "OPTIONS, POST")
+            .header("Access-Control-Allow-Origin", "*")
+            .header(
+                "Access-Control-Allow-Headers",
+                "Neon-Connection-String, Neon-Raw-Text-Output, Neon-Array-Mode, Neon-Pool-Opt-In",
+            )
+            .header("Access-Control-Max-Age", "86400" /* 24 hours */)
+            .status(StatusCode::OK) // 204 is also valid, but see: https://developer.mozilla.org/en-US/docs/Web/HTTP/Methods/OPTIONS#status_code
+            .body(Body::empty())
+            .map_err(|e| ApiError::BadRequest(e.into()))
    } else {
        json_response(StatusCode::BAD_REQUEST, "query is not supported")
    }
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -6,21 +6,18 @@ use crate::{
    cancellation::{self, CancelMap},
    compute::{self, PostgresConnection},
    config::{ProxyConfig, TlsConfig},
-    console::{
-        self,
-        errors::{ApiError, WakeComputeError},
-        messages::MetricsAuxInfo,
-    },
-    error::io_error,
+    console::{self, errors::WakeComputeError, messages::MetricsAuxInfo},
    stream::{PqStream, Stream},
 };
 use anyhow::{bail, Context};
+use async_trait::async_trait;
 use futures::TryFutureExt;
-use hyper::StatusCode;
-use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec};
+use metrics::{
+    exponential_buckets, register_histogram, register_int_counter_vec, Histogram, IntCounterVec,
+};
 use once_cell::sync::Lazy;
 use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams};
-use std::{error::Error, ops::ControlFlow, sync::Arc};
+use std::{error::Error, io, ops::ControlFlow, sync::Arc};
 use tokio::{
    io::{AsyncRead, AsyncWrite, AsyncWriteExt},
    time,
@@ -31,24 +28,37 @@ use utils::measured_stream::MeasuredStream;

 /// Number of times we should retry the `/proxy_wake_compute` http request.
 /// Retry duration is BASE_RETRY_WAIT_DURATION * 1.5^n
-pub const NUM_RETRIES_WAKE_COMPUTE: u32 = 10;
+pub const NUM_RETRIES_CONNECT: u32 = 10;
+const CONNECT_TIMEOUT: time::Duration = time::Duration::from_secs(2);
 const BASE_RETRY_WAIT_DURATION: time::Duration = time::Duration::from_millis(100);

 const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)";
 const ERR_PROTO_VIOLATION: &str = "protocol violation";

-static NUM_CONNECTIONS_ACCEPTED_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
-    register_int_counter!(
+static NUM_CONNECTIONS_ACCEPTED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
        "proxy_accepted_connections_total",
-        "Number of TCP client connections accepted."
+        "Number of TCP client connections accepted.",
+        &["protocol"],
    )
    .unwrap()
 });

-static NUM_CONNECTIONS_CLOSED_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
-    register_int_counter!(
+static NUM_CONNECTIONS_CLOSED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
        "proxy_closed_connections_total",
-        "Number of TCP client connections closed."
+        "Number of TCP client connections closed.",
+        &["protocol"],
+    )
+    .unwrap()
+});
+
+static COMPUTE_CONNECTION_LATENCY: Lazy<Histogram> = Lazy::new(|| {
+    register_histogram!(
+        "proxy_compute_connection_latency_seconds",
+        "Time it took for proxy to establish a connection to the compute endpoint",
+        // 16 buckets doubling from 0.5ms; largest bucket = 2^15 * 0.5ms ~= 16.4s
+        exponential_buckets(0.0005, 2.0, 16).unwrap(),
     )
     .unwrap()
 });
@@ -103,7 +113,8 @@ pub async fn task_main(
                            .set_nodelay(true)
                            .context("failed to set socket option")?;

-                        handle_client(config, &cancel_map, session_id, socket).await
+                        handle_client(config, &cancel_map, session_id, socket, ClientMode::Tcp)
+                        .await
                    }
                    .unwrap_or_else(move |e| {
                        // Acknowledge that the task has finished with an error.
@@ -128,26 +139,74 @@ pub async fn task_main(
    Ok(())
 }

-// TODO(tech debt): unite this with its twin below.
+pub enum ClientMode {
+    Tcp,
+    Websockets { hostname: Option<String> },
+}
+
+/// Abstracts the logic of handling TCP vs WS clients
+impl ClientMode {
+    fn protocol_label(&self) -> &'static str {
+        match self {
+            ClientMode::Tcp => "tcp",
+            ClientMode::Websockets { .. } => "ws",
+        }
+    }
+
+    fn allow_cleartext(&self) -> bool {
+        match self {
+            ClientMode::Tcp => false,
+            ClientMode::Websockets { .. } => true,
+        }
+    }
+
+    fn allow_self_signed_compute(&self, config: &ProxyConfig) -> bool {
+        match self {
+            ClientMode::Tcp => config.allow_self_signed_compute,
+            ClientMode::Websockets { .. } => false,
+        }
+    }
+
+    fn hostname<'a, S>(&'a self, s: &'a Stream<S>) -> Option<&'a str> {
+        match self {
+            ClientMode::Tcp => s.sni_hostname(),
+            ClientMode::Websockets { hostname } => hostname.as_deref(),
+        }
+    }
+
+    fn handshake_tls<'a>(&self, tls: Option<&'a TlsConfig>) -> Option<&'a TlsConfig> {
+        match self {
+            ClientMode::Tcp => tls,
+            // TLS is None here if using websockets, because the connection is already encrypted.
+            ClientMode::Websockets { .. } => None,
+        }
+    }
+}
+
 #[tracing::instrument(fields(session_id = ?session_id), skip_all)]
-pub async fn handle_ws_client(
+pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
    config: &'static ProxyConfig,
    cancel_map: &CancelMap,
    session_id: uuid::Uuid,
-    stream: impl AsyncRead + AsyncWrite + Unpin,
-    hostname: Option<String>,
+    stream: S,
+    mode: ClientMode,
 ) -> anyhow::Result<()> {
+    info!(
+        protocol = mode.protocol_label(),
+        "handling interactive connection from client"
+    );
+
    // The `closed` counter will increase when this future is destroyed.
-    NUM_CONNECTIONS_ACCEPTED_COUNTER.inc();
+    NUM_CONNECTIONS_ACCEPTED_COUNTER
+        .with_label_values(&[mode.protocol_label()])
+        .inc();
    scopeguard::defer! {
-        NUM_CONNECTIONS_CLOSED_COUNTER.inc();
+        NUM_CONNECTIONS_CLOSED_COUNTER.with_label_values(&[mode.protocol_label()]).inc();
    }

    let tls = config.tls_config.as_ref();
-    let hostname = hostname.as_deref();

-    // TLS is None here, because the connection is already encrypted.
-    let do_handshake = handshake(stream, None, cancel_map);
+    let do_handshake = handshake(stream, mode.handshake_tls(tls), cancel_map);
    let (mut stream, params) = match do_handshake.await? {
        Some(x) => x,
        None => return Ok(()), // it's a cancellation request
@@ -155,6 +214,7 @@ pub async fn handle_ws_client(

    // Extract credentials which we're going to use for auth.
    let creds = {
+        let hostname = mode.hostname(stream.get_ref());
        let common_names = tls.and_then(|tls| tls.common_names.clone());
        let result = config
            .auth_backend
@@ -168,59 +228,15 @@ pub async fn handle_ws_client(
        }
    };

-    let client = Client::new(stream, creds, &params, session_id, false);
-    cancel_map
-        .with_session(|session| client.connect_to_db(session, true))
-        .await
-}
-
-#[tracing::instrument(fields(session_id = ?session_id), skip_all)]
-async fn handle_client(
-    config: &'static ProxyConfig,
-    cancel_map: &CancelMap,
-    session_id: uuid::Uuid,
-    stream: impl AsyncRead + AsyncWrite + Unpin,
-) -> anyhow::Result<()> {
-    // The `closed` counter will increase when this future is destroyed.
-    NUM_CONNECTIONS_ACCEPTED_COUNTER.inc();
-    scopeguard::defer! {
-        NUM_CONNECTIONS_CLOSED_COUNTER.inc();
-    }
-
-    let tls = config.tls_config.as_ref();
-    let do_handshake = handshake(stream, tls, cancel_map);
-    let (mut stream, params) = match do_handshake.await? {
-        Some(x) => x,
-        None => return Ok(()), // it's a cancellation request
-    };
-
-    // Extract credentials which we're going to use for auth.
-    let creds = {
-        let sni = stream.get_ref().sni_hostname();
-        let common_names = tls.and_then(|tls| tls.common_names.clone());
-        let result = config
-            .auth_backend
-            .as_ref()
-            .map(|_| auth::ClientCredentials::parse(&params, sni, common_names))
-            .transpose();
-
-        match result {
-            Ok(creds) => creds,
-            Err(e) => stream.throw_error(e).await?,
-        }
-    };
-
-    let allow_self_signed_compute = config.allow_self_signed_compute;
-
    let client = Client::new(
        stream,
        creds,
        &params,
        session_id,
-        allow_self_signed_compute,
+        mode.allow_self_signed_compute(config),
    );
    cancel_map
-        .with_session(|session| client.connect_to_db(session, false))
+        .with_session(|session| client.connect_to_db(session, mode.allow_cleartext()))
        .await
 }

@@ -303,18 +319,18 @@ async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
 /// (e.g. the compute node's address might've changed at the wrong time).
 /// Invalidate the cache entry (if any) to prevent subsequent errors.
 #[tracing::instrument(name = "invalidate_cache", skip_all)]
-pub fn invalidate_cache(node_info: &console::CachedNodeInfo) {
+pub fn invalidate_cache(node_info: console::CachedNodeInfo) -> compute::ConnCfg {
    let is_cached = node_info.cached();
    if is_cached {
        warn!("invalidating stalled compute node info cache entry");
-        node_info.invalidate();
    }
-
    let label = match is_cached {
        true => "compute_cached",
        false => "compute_uncached",
    };
    NUM_CONNECTION_FAILURES.with_label_values(&[label]).inc();
+
+    node_info.invalidate().config
 }

 /// Try to connect to the compute node once.
@@ -331,157 +347,208 @@ async fn connect_to_compute_once(
        .await
 }

+enum ConnectionState<E> {
+    Cached(console::CachedNodeInfo), // node info to attempt a connection with
+    Invalid(compute::ConnCfg, E), // config kept from an invalidated node + the connect error
+}
+
+#[async_trait]
+pub trait ConnectMechanism {
+    type Connection;
+    type ConnectError; // error produced by a single `connect_once` attempt
+    type Error: From<Self::ConnectError>; // error type returned by `connect_to_compute`
+    async fn connect_once(
+        &self,
+        node_info: &console::CachedNodeInfo,
+        timeout: time::Duration,
+    ) -> Result<Self::Connection, Self::ConnectError>;
+    /// Lets the mechanism amend the connection config before a connect attempt.
+    fn update_connect_config(&self, conf: &mut compute::ConnCfg);
+}
+
+pub struct TcpMechanism<'a> {
+    /// KV-dictionary with PostgreSQL connection params, applied to the compute connection config.
+    pub params: &'a StartupMessageParams,
+}
+
+#[async_trait]
+impl ConnectMechanism for TcpMechanism<'_> {
+    type Connection = PostgresConnection;
+    type ConnectError = compute::ConnectionError;
+    type Error = compute::ConnectionError;
+    /// Makes a single connection attempt by delegating to `connect_to_compute_once`.
+    async fn connect_once(
+        &self,
+        node_info: &console::CachedNodeInfo,
+        timeout: time::Duration,
+    ) -> Result<Self::Connection, Self::ConnectError> {
+        connect_to_compute_once(node_info, timeout).await
+    }
+    /// Copies the client's startup parameters into the compute connection config.
+    fn update_connect_config(&self, config: &mut compute::ConnCfg) {
+        config.set_startup_params(self.params);
+    }
+}
+
 /// Try to connect to the compute node, retrying if necessary.
-/// This function might update `node_info`, so we take it by `&mut`.
+/// Takes `node_info` by value; it may be invalidated and replaced by a freshly woken node.
 #[tracing::instrument(skip_all)]
-async fn connect_to_compute(
-    node_info: &mut console::CachedNodeInfo,
-    params: &StartupMessageParams,
+pub async fn connect_to_compute<M: ConnectMechanism>(
+    mechanism: &M,
+    mut node_info: console::CachedNodeInfo,
    extra: &console::ConsoleReqExtra<'_>,
    creds: &auth::BackendType<'_, auth::ClientCredentials<'_>>,
-) -> Result<PostgresConnection, compute::ConnectionError> {
+) -> Result<M::Connection, M::Error>
+where
+    M::ConnectError: ShouldRetry + std::fmt::Debug,
+    M::Error: From<WakeComputeError>,
+{
+    let _timer = COMPUTE_CONNECTION_LATENCY.start_timer();
+    // Let the mechanism amend the config of the initial (possibly cached) node info.
+    mechanism.update_connect_config(&mut node_info.config);
+
    let mut num_retries = 0;
-    let mut wait_duration = time::Duration::ZERO;
-    let mut should_wake_with_error = None;
+    let mut state = ConnectionState::<M::ConnectError>::Cached(node_info);
+
    loop {
-        // Apply startup params to the (possibly, cached) compute node info.
-        node_info.config.set_startup_params(params);
+        match state {
+            ConnectionState::Invalid(config, err) => {
+                let wake_res = match creds {
+                    auth::BackendType::Console(api, creds) => {
+                        try_wake(api.as_ref(), extra, creds).await
+                    }
+                    auth::BackendType::Postgres(api, creds) => {
+                        try_wake(api.as_ref(), extra, creds).await
+                    }
+                    // no wake-up call is made for the Link backend; give up with the connect error
+                    auth::BackendType::Link(_) => return Err(err.into()),
+                };

-        if !wait_duration.is_zero() {
-            time::sleep(wait_duration).await;
-        }
+                match wake_res {
+                    // there was an error communicating with the control plane
+                    Err(e) => return Err(e.into()),
+                    // retryable wake-up failure; NOTE(review): no retry cap here — confirm intended
+                    Ok(ControlFlow::Continue(_)) => {
+                        state = ConnectionState::Invalid(config, err);
+                        let wait_duration = retry_after(num_retries);
+                        num_retries += 1;

-        // try wake the compute node if we have determined it's sensible to do so
-        if let Some(err) = should_wake_with_error.take() {
-            match try_wake(node_info, extra, creds).await {
-                // we can't wake up the compute node
-                Ok(None) => return Err(err),
-                // there was an error communicating with the control plane
-                Err(e) => return Err(io_error(e).into()),
-                // failed to wake up but we can continue to retry
-                Ok(Some(ControlFlow::Continue(()))) => {
-                    wait_duration = retry_after(num_retries);
-                    should_wake_with_error = Some(err);
-
-                    num_retries += 1;
-                    info!(num_retries, "retrying wake compute");
-                    continue;
+                        info!(num_retries, "retrying wake compute");
+                        time::sleep(wait_duration).await;
+                        continue;
+                    }
+                    // successfully woke up a compute node and can break the wakeup loop
+                    Ok(ControlFlow::Break(mut node_info)) => {
+                        node_info.config.reuse_password(&config);
+                        mechanism.update_connect_config(&mut node_info.config);
+                        state = ConnectionState::Cached(node_info)
+                    }
                }
-                // successfully woke up a compute node and can break the wakeup loop
-                Ok(Some(ControlFlow::Break(()))) => {}
            }
-        }
+            ConnectionState::Cached(node_info) => {
+                match mechanism.connect_once(&node_info, CONNECT_TIMEOUT).await {
+                    Ok(res) => return Ok(res),
+                    Err(e) => {
+                        error!(error = ?e, "could not connect to compute node");
+                        if !e.should_retry(num_retries) {
+                            return Err(e.into());
+                        }

-        // Set a shorter timeout for the initial connection attempt.
-        //
-        // In case we try to connect to an outdated address that is no longer valid, the
-        // default behavior of Kubernetes is to drop the packets, causing us to wait for
-        // the entire timeout period. We want to fail fast in such cases.
-        //
-        // A specific case to consider is when we have cached compute node information
-        // with a 4-minute TTL (Time To Live), but the user has executed a `/suspend` API
-        // call, resulting in the nonexistence of the compute node.
-        //
-        // We only use caching in case of scram proxy backed by the console, so reduce
-        // the timeout only in that case.
-        let is_scram_proxy = matches!(creds, auth::BackendType::Console(_, _));
-        let timeout = if is_scram_proxy && num_retries == 0 {
-            time::Duration::from_secs(2)
-        } else {
-            time::Duration::from_secs(10)
-        };
+                        // after the first connect failure,
+                        // we should invalidate the cache and wake up a new compute node
+                        if num_retries == 0 {
+                            state = ConnectionState::Invalid(invalidate_cache(node_info), e);
+                        } else {
+                            state = ConnectionState::Cached(node_info);
+                        }

-        // do this again to ensure we have username?
-        node_info.config.set_startup_params(params);
+                        let wait_duration = retry_after(num_retries);
+                        num_retries += 1;

-        match connect_to_compute_once(node_info, timeout).await {
-            Ok(res) => return Ok(res),
-            Err(e) => {
-                error!(error = ?e, "could not connect to compute node");
-                if !can_retry_error(&e, num_retries) {
-                    return Err(e);
-                }
-                wait_duration = retry_after(num_retries);
-
-                // after the first connect failure,
-                // we should invalidate the cache and wake up a new compute node
-                if num_retries == 0 {
-                    invalidate_cache(node_info);
-                    should_wake_with_error = Some(e);
+                        info!(num_retries, "retrying connect");
+                        time::sleep(wait_duration).await;
+                    }
                }
            }
        }
-
-        num_retries += 1;
-        info!(num_retries, "retrying connect");
    }
 }

 /// Attempts to wake up the compute node.
-/// * Returns Ok(Some(true)) if there was an error waking but retries are acceptable
-/// * Returns Ok(Some(false)) if the wakeup succeeded
-/// * Returns Ok(None) or Err(e) if there was an error
+/// * Returns Ok(Continue(e)) if the wake-up failed with an API error `e` that reports itself retryable
+/// * Returns Ok(Break(node)) with the new node info if the wakeup succeeded
+/// * Returns Err(e) for any other error
 pub async fn try_wake(
-    node_info: &mut console::CachedNodeInfo,
+    api: &impl console::Api,
    extra: &console::ConsoleReqExtra<'_>,
-    creds: &auth::BackendType<'_, auth::ClientCredentials<'_>>,
-) -> Result<Option<ControlFlow<()>>, WakeComputeError> {
+    creds: &auth::ClientCredentials<'_>,
+) -> Result<ControlFlow<console::CachedNodeInfo, WakeComputeError>, WakeComputeError> {
    info!("compute node's state has likely changed; requesting a wake-up");
-    match creds.wake_compute(extra).await {
-        // retry wake if the compute was in an invalid state
-        Err(WakeComputeError::ApiError(ApiError::Console {
-            status: StatusCode::BAD_REQUEST,
-            ..
-        })) => Ok(Some(ControlFlow::Continue(()))),
-        // Update `node_info` and try again.
-        Ok(Some(mut new)) => {
-            new.config.reuse_password(&node_info.config);
-            *node_info = new;
-            Ok(Some(ControlFlow::Break(())))
+    match api.wake_compute(extra, creds).await {
+        Err(err) => match &err {
+            WakeComputeError::ApiError(api) if api.could_retry() => Ok(ControlFlow::Continue(err)),
+            _ => Err(err),
+        },
+        // Wake-up succeeded: pass the new node info to the caller, who retries the connection.
+        Ok(new) => Ok(ControlFlow::Break(new)),
+    }
+}
+
+pub trait ShouldRetry {
+    fn could_retry(&self) -> bool; // whether this kind of error is retryable at all
+    fn should_retry(&self, num_retries: u32) -> bool {
+        // retry all errors at least once
+        if num_retries == 0 {
+            return true;
+        }
+        // past the cap nothing is retried; below it the error itself decides
+        num_retries < NUM_RETRIES_CONNECT && self.could_retry()
-        Err(e) => Err(e),
-        Ok(None) => Ok(None),
    }
 }

-fn can_retry_error(err: &compute::ConnectionError, num_retries: u32) -> bool {
-    match err {
-        // retry all errors at least once
-        _ if num_retries == 0 => true,
-        _ if num_retries >= NUM_RETRIES_WAKE_COMPUTE => false,
-        compute::ConnectionError::Postgres(err) => can_retry_tokio_postgres_error(err),
-        compute::ConnectionError::CouldNotConnect(err) => is_io_connection_err(err),
-        _ => false,
+impl ShouldRetry for io::Error {
+    fn could_retry(&self) -> bool {
+        use std::io::ErrorKind;
+        matches!(
+            self.kind(), // retryable only for the kinds listed on the next line
+            ErrorKind::ConnectionRefused | ErrorKind::AddrNotAvailable | ErrorKind::TimedOut
+        )
    }
 }

-pub fn can_retry_tokio_postgres_error(err: &tokio_postgres::Error) -> bool {
-    if let Some(io_err) = err.source().and_then(|x| x.downcast_ref()) {
-        is_io_connection_err(io_err)
-    } else if let Some(db_err) = err.source().and_then(|x| x.downcast_ref()) {
-        is_sql_connection_err(db_err)
-    } else {
-        false
+impl ShouldRetry for tokio_postgres::error::DbError {
+    fn could_retry(&self) -> bool {
+        use tokio_postgres::error::SqlState;
+        matches!(
+            self.code(), // retryable only for the connection-related SQLSTATEs below
+            &SqlState::CONNECTION_FAILURE
+                | &SqlState::CONNECTION_EXCEPTION
+                | &SqlState::CONNECTION_DOES_NOT_EXIST
+                | &SqlState::SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION,
+        )
    }
 }

-fn is_sql_connection_err(err: &tokio_postgres::error::DbError) -> bool {
-    use tokio_postgres::error::SqlState;
-    matches!(
-        err.code(),
-        &SqlState::CONNECTION_FAILURE
-            | &SqlState::CONNECTION_EXCEPTION
-            | &SqlState::CONNECTION_DOES_NOT_EXIST
-            | &SqlState::SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION,
-    )
+impl ShouldRetry for tokio_postgres::Error {
+    fn could_retry(&self) -> bool {
+        if let Some(io_err) = self.source().and_then(|x| x.downcast_ref()) {
+            io::Error::could_retry(io_err) // this call fixes the downcast target above to io::Error
+        } else if let Some(db_err) = self.source().and_then(|x| x.downcast_ref()) {
+            tokio_postgres::error::DbError::could_retry(db_err) // likewise for DbError
+        } else {
+            false // any other (or missing) source is not retried
+        }
+    }
 }

-fn is_io_connection_err(err: &std::io::Error) -> bool {
-    use std::io::ErrorKind;
-    matches!(
-        err.kind(),
-        ErrorKind::ConnectionRefused | ErrorKind::AddrNotAvailable | ErrorKind::TimedOut
-    )
+impl ShouldRetry for compute::ConnectionError {
+    fn could_retry(&self) -> bool {
+        match self {
+            compute::ConnectionError::Postgres(err) => err.could_retry(),
+            compute::ConnectionError::CouldNotConnect(err) => err.could_retry(),
+            _ => false, // every other variant is treated as non-retryable
+        }
+    }
 }

 pub fn retry_after(num_retries: u32) -> time::Duration {
@@ -637,7 +704,8 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {

        node_info.allow_self_signed_compute = allow_self_signed_compute;

-        let mut node = connect_to_compute(&mut node_info, params, &extra, &creds)
+        let aux = node_info.aux.clone();
+        let mut node = connect_to_compute(&TcpMechanism { params }, node_info, &extra, &creds)
            .or_else(|e| stream.throw_error(e))
            .await?;

@@ -648,6 +716,6 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
        // immediately after opening the connection.
        let (stream, read_buf) = stream.into_inner();
        node.stream.write_all(&read_buf).await?;
-        proxy_pass(stream, node.stream, &node_info.aux).await
+        proxy_pass(stream, node.stream, &aux).await
    }
 }
--- a/proxy/src/proxy/tests.rs
+++ b/proxy/src/proxy/tests.rs
@@ -1,5 +1,9 @@
 //! A group of high-level tests for connection establishing logic and auth.
+use std::borrow::Cow;
+
 use super::*;
+use crate::auth::ClientCredentials;
+use crate::console::{CachedNodeInfo, NodeInfo};
 use crate::{auth, sasl, scram};
 use async_trait::async_trait;
 use rstest::rstest;
@@ -304,3 +308,148 @@ fn connect_compute_total_wait() {
    assert!(total_wait < tokio::time::Duration::from_secs(12));
    assert!(total_wait > tokio::time::Duration::from_secs(10));
 }
+
+#[derive(Clone, Copy)]
+enum ConnectAction {
+    Connect, // the attempt succeeds
+    Retry, // the attempt fails with a retryable error
+    Fail, // the attempt fails with a non-retryable error
+}
+
+struct TestConnectMechanism {
+    counter: Arc<std::sync::Mutex<usize>>,
+    sequence: Vec<ConnectAction>,
+}
+
+impl TestConnectMechanism {
+    fn new(sequence: Vec<ConnectAction>) -> Self {
+        Self {
+            counter: Arc::new(std::sync::Mutex::new(0)),
+            sequence,
+        }
+    }
+}
+
+#[derive(Debug)]
+struct TestConnection;
+
+#[derive(Debug)]
+struct TestConnectError {
+    retryable: bool,
+}
+
+impl std::fmt::Display for TestConnectError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{:?}", self)
+    }
+}
+
+impl std::error::Error for TestConnectError {}
+
+impl ShouldRetry for TestConnectError {
+    fn could_retry(&self) -> bool {
+        self.retryable
+    }
+}
+
+#[async_trait]
+impl ConnectMechanism for TestConnectMechanism {
+    type Connection = TestConnection;
+    type ConnectError = TestConnectError;
+    type Error = anyhow::Error;
+    /// Replays the scripted `sequence`, one action per call (indexing panics if it runs out).
+    async fn connect_once(
+        &self,
+        _node_info: &console::CachedNodeInfo,
+        _timeout: time::Duration,
+    ) -> Result<Self::Connection, Self::ConnectError> {
+        let mut counter = self.counter.lock().unwrap();
+        let action = self.sequence[*counter];
+        *counter += 1;
+        match action {
+            ConnectAction::Connect => Ok(TestConnection),
+            ConnectAction::Retry => Err(TestConnectError { retryable: true }),
+            ConnectAction::Fail => Err(TestConnectError { retryable: false }),
+        }
+    }
+    /// The test mechanism leaves the connection config untouched.
+    fn update_connect_config(&self, _conf: &mut compute::ConnCfg) {}
+}
+
+fn helper_create_connect_info() -> (
+    CachedNodeInfo,
+    console::ConsoleReqExtra<'static>,
+    auth::BackendType<'static, ClientCredentials<'static>>,
+) {
+    let node = NodeInfo {
+        config: compute::ConnCfg::new(),
+        aux: Default::default(),
+        allow_self_signed_compute: false,
+    };
+    let cache = CachedNodeInfo::new_uncached(node);
+    let extra = console::ConsoleReqExtra {
+        session_id: uuid::Uuid::new_v4(),
+        application_name: Some("TEST"),
+    };
+    let url = "https://TEST_URL".parse().unwrap();
+    let api = console::provider::mock::Api::new(url);
+    let creds = auth::BackendType::Postgres(Cow::Owned(api), ClientCredentials::new_noop());
+    (cache, extra, creds)
+}
+
+#[tokio::test]
+async fn connect_to_compute_success() {
+    use ConnectAction::*;
+    let mechanism = TestConnectMechanism::new(vec![Connect]);
+    let (cache, extra, creds) = helper_create_connect_info();
+    connect_to_compute(&mechanism, cache, &extra, &creds)
+        .await
+        .unwrap();
+}
+
+#[tokio::test]
+async fn connect_to_compute_retry() {
+    use ConnectAction::*;
+    let mechanism = TestConnectMechanism::new(vec![Retry, Retry, Connect]);
+    let (cache, extra, creds) = helper_create_connect_info();
+    connect_to_compute(&mechanism, cache, &extra, &creds)
+        .await
+        .unwrap();
+}
+
+/// Test that we don't retry if the error is not retryable.
+#[tokio::test]
+async fn connect_to_compute_non_retry_1() {
+    use ConnectAction::*;
+    let mechanism = TestConnectMechanism::new(vec![Retry, Retry, Fail]);
+    let (cache, extra, creds) = helper_create_connect_info();
+    connect_to_compute(&mechanism, cache, &extra, &creds)
+        .await
+        .unwrap_err();
+}
+
+/// Even for non-retryable errors, we should retry at least once.
+#[tokio::test]
+async fn connect_to_compute_non_retry_2() {
+    use ConnectAction::*;
+    let mechanism = TestConnectMechanism::new(vec![Fail, Retry, Connect]);
+    let (cache, extra, creds) = helper_create_connect_info();
+    connect_to_compute(&mechanism, cache, &extra, &creds)
+        .await
+        .unwrap();
+}
+
+/// Retry for at most `NUM_RETRIES_CONNECT` times.
+#[tokio::test]
+async fn connect_to_compute_non_retry_3() {
+    assert_eq!(NUM_RETRIES_CONNECT, 10);
+    use ConnectAction::*;
+    let mechanism = TestConnectMechanism::new(vec![
+        Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry,
+        /* the 11th time */ Retry,
+    ]);
+    let (cache, extra, creds) = helper_create_connect_info();
+    connect_to_compute(&mechanism, cache, &extra, &creds)
+        .await
+        .unwrap_err();
+}
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -33,9 +33,10 @@ psutil = "^5.9.4"
 types-psutil = "^5.9.5.12"
 types-toml = "^0.10.8.6"
 pytest-httpserver = "^1.0.8"
-aiohttp = "3.7.4"
+aiohttp = "3.8.5"
 pytest-rerunfailures = "^11.1.2"
 types-pytest-lazy-fixture = "^0.6.3.3"
+pytest-split = "^0.8.1"

 [tool.poetry.group.dev.dependencies]
 black = "^23.3.0"
@@ -78,6 +79,7 @@ module = [
 ignore_missing_imports = true

 [tool.ruff]
+target-version = "py39"
 extend-exclude = ["vendor/"]
 ignore = ["E501"]
 select = [
@@ -85,4 +87,5 @@ select = [
    "F", # Pyflakes
    "I", # isort
    "W", # pycodestyle
+    "B", # bugbear
 ]
--- a/safekeeper/src/bin/safekeeper.rs
+++ b/safekeeper/src/bin/safekeeper.rs
@@ -37,7 +37,7 @@ use safekeeper::{http, WAL_REMOVER_RUNTIME};
 use safekeeper::{remove_wal, WAL_BACKUP_RUNTIME};
 use safekeeper::{wal_backup, HTTP_RUNTIME};
 use storage_broker::DEFAULT_ENDPOINT;
-use utils::auth::JwtAuth;
+use utils::auth::{JwtAuth, Scope};
 use utils::{
    id::NodeId,
    logging::{self, LogFormat},
@@ -72,6 +72,10 @@ struct Args {
    /// Listen endpoint for receiving/sending WAL in the form host:port.
    #[arg(short, long, default_value = DEFAULT_PG_LISTEN_ADDR)]
    listen_pg: String,
+    /// Listen endpoint for receiving/sending WAL in the form host:port allowing
+    /// only tenant scoped auth tokens. Pointless if auth is disabled.
+    #[arg(long, default_value = None, verbatim_doc_comment)]
+    listen_pg_tenant_only: Option<String>,
    /// Listen http endpoint for management and metrics in the form host:port.
    #[arg(long, default_value = DEFAULT_HTTP_LISTEN_ADDR)]
    listen_http: String,
@@ -94,7 +98,7 @@ struct Args {
    broker_keepalive_interval: Duration,
    /// Peer safekeeper is considered dead after not receiving heartbeats from
    /// it during this period passed as a human readable duration.
-    #[arg(long, value_parser= humantime::parse_duration, default_value = DEFAULT_HEARTBEAT_TIMEOUT)]
+    #[arg(long, value_parser= humantime::parse_duration, default_value = DEFAULT_HEARTBEAT_TIMEOUT, verbatim_doc_comment)]
    heartbeat_timeout: Duration,
    /// Remote storage configuration for WAL backup (offloading to s3) as TOML
    /// inline table, e.g.
@@ -179,6 +183,7 @@ async fn main() -> anyhow::Result<()> {
        workdir,
        my_id: id,
        listen_pg_addr: args.listen_pg,
+        listen_pg_addr_tenant_only: args.listen_pg_tenant_only,
        listen_http_addr: args.listen_http,
        availability_zone: args.availability_zone,
        no_sync: args.no_sync,
@@ -222,6 +227,21 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
        e
    })?;

+    let pg_listener_tenant_only =
+        if let Some(listen_pg_addr_tenant_only) = &conf.listen_pg_addr_tenant_only {
+            info!(
+                "starting safekeeper tenant scoped WAL service on {}",
+                listen_pg_addr_tenant_only
+            );
+            let listener = tcp_listener::bind(listen_pg_addr_tenant_only.clone()).map_err(|e| {
+                error!("failed to bind to address {}: {}", listen_pg_addr_tenant_only, e);
+                e
+            })?;
+            Some(listener)
+        } else {
+            None // no tenant-only address configured, so no extra listener
+        };
+
    info!(
        "starting safekeeper HTTP service on {}",
        conf.listen_http_addr
@@ -253,14 +273,34 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
    let current_thread_rt = conf
        .current_thread_runtime
        .then(|| Handle::try_current().expect("no runtime in main"));
+
    let wal_service_handle = current_thread_rt
        .as_ref()
        .unwrap_or_else(|| WAL_SERVICE_RUNTIME.handle())
-        .spawn(wal_service::task_main(conf_, pg_listener))
+        .spawn(wal_service::task_main(
+            conf_,
+            pg_listener,
+            Some(Scope::SafekeeperData),
+        ))
        // wrap with task name for error reporting
        .map(|res| ("WAL service main".to_owned(), res));
    tasks_handles.push(Box::pin(wal_service_handle));

+    if let Some(pg_listener_tenant_only) = pg_listener_tenant_only {
+        let conf_ = conf.clone();
+        let wal_service_handle = current_thread_rt
+            .as_ref()
+            .unwrap_or_else(|| WAL_SERVICE_RUNTIME.handle())
+            .spawn(wal_service::task_main(
+                conf_,
+                pg_listener_tenant_only,
+                Some(Scope::Tenant),
+            ))
+            // wrap with task name for error reporting
+            .map(|res| ("WAL service tenant only main".to_owned(), res));
+        tasks_handles.push(Box::pin(wal_service_handle));
+    }
+
    let conf_ = conf.clone();
    let http_handle = current_thread_rt
        .as_ref()
--- a/Show More
+++ b/Show More