From eb36403e71210b1be7e2482fc385b8da8c149d5f Mon Sep 17 00:00:00 2001 From: Vadim Kharitonov Date: Tue, 31 Jan 2023 14:06:35 +0100 Subject: [PATCH] Release 2023 01 31 (#3497) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Kirill Bulatov Co-authored-by: Heikki Linnakangas Co-authored-by: Anastasia Lubennikova Co-authored-by: bojanserafimov Co-authored-by: Christian Schwarz Co-authored-by: Alexey Kondratov Co-authored-by: Joonas Koivunen Co-authored-by: Konstantin Knizhnik Co-authored-by: Shany Pozin Co-authored-by: Sergey Melnikov Co-authored-by: Dmitry Rodionov Co-authored-by: Rory de Zoete <33318916+zoete@users.noreply.github.com> Co-authored-by: Rory de Zoete Co-authored-by: Rory de Zoete Co-authored-by: Lassi Pölönen --- .../actions/run-python-test-set/action.yml | 4 +- .github/ansible/deploy.yaml | 6 +- .../dev-us-east-2-beta.neon-proxy-link.yaml | 1 + ...prod-us-east-2-delta.neon-proxy-link.yaml} | 24 +- ...us-west-2-eta.neon-proxy-scram-legacy.yaml | 61 ++ .github/workflows/build_and_test.yml | 572 +++--------- .github/workflows/deploy-dev.yml | 179 ++++ .github/workflows/deploy-prod.yml | 277 ++++++ .github/workflows/release.yml | 33 + Cargo.lock | 328 ++++--- Cargo.toml | 10 +- ...ompute-node-v14 => Dockerfile.compute-node | 34 +- Dockerfile.compute-node-v15 | 220 ----- compute_tools/Cargo.toml | 3 + compute_tools/src/bin/compute_ctl.rs | 32 +- compute_tools/src/http/api.rs | 27 +- compute_tools/src/logger.rs | 24 +- compute_tools/src/params.rs | 8 +- compute_tools/src/spec.rs | 23 +- libs/metrics/src/lib.rs | 1 + libs/pageserver_api/src/models.rs | 37 +- libs/tracing-utils/Cargo.toml | 17 + libs/tracing-utils/src/http.rs | 96 ++ libs/tracing-utils/src/lib.rs | 168 ++++ libs/utils/Cargo.toml | 1 + libs/utils/src/http/error.rs | 17 +- libs/utils/src/logging.rs | 2 +- pageserver/Cargo.toml | 3 +- pageserver/benches/bench_layer_map.rs | 224 ++--- pageserver/src/basebackup.rs | 50 +- pageserver/src/bin/pageserver.rs | 57 +- pageserver/src/broker_client.rs | 48 + pageserver/src/config.rs | 28 + pageserver/src/consumption_metrics.rs | 24 +- pageserver/src/context.rs | 199 ++++ pageserver/src/http/openapi_spec.yml | 10 +- pageserver/src/http/routes.rs | 170 ++-- pageserver/src/import_datadir.rs | 52 +- pageserver/src/lib.rs | 3 +- pageserver/src/metrics.rs | 166 +++- pageserver/src/page_service.rs | 223 +++-- pageserver/src/pgdatadir_mapping.rs | 169 ++-- pageserver/src/repository.rs | 11 + pageserver/src/task_mgr.rs | 45 +- pageserver/src/tenant.rs | 576 ++++++------ pageserver/src/tenant/config.rs | 7 +- pageserver/src/tenant/layer_map.rs | 877 +++++++++--------- .../layer_map/historic_layer_coverage.rs | 583 ++++++++++++ .../src/tenant/layer_map/layer_coverage.rs | 154 +++ pageserver/src/tenant/mgr.rs | 236 +++-- .../src/tenant/remote_timeline_client.rs | 25 +- pageserver/src/tenant/size.rs | 16 +- pageserver/src/tenant/storage_layer.rs | 47 + pageserver/src/tenant/tasks.rs | 13 +- pageserver/src/tenant/timeline.rs | 327 +++++-- .../src/{ => tenant/timeline}/walreceiver.rs | 44 - .../walreceiver/connection_manager.rs | 55 +- .../walreceiver/walreceiver_connection.rs | 29 +- pageserver/src/walingest.rs | 418 ++++++--- pageserver/src/walredo.rs | 329 +++++-- poetry.lock | 247 ++++- proxy/src/main.rs | 4 +- pyproject.toml | 1 + safekeeper/src/bin/safekeeper.rs | 7 +- scripts/force_layer_download.py | 324 +++++++ storage_broker/src/bin/storage_broker.rs | 4 +- test_runner/fixtures/metrics.py | 12 +- 
test_runner/regress/test_tenant_conf.py | 55 +- test_runner/regress/test_tenant_detach.py | 46 +- test_runner/regress/test_tenants.py | 56 +- workspace_hack/Cargo.toml | 8 +- 71 files changed, 5779 insertions(+), 2408 deletions(-) rename .github/helm-values/{production.proxy.yaml => prod-us-east-2-delta.neon-proxy-link.yaml} (80%) create mode 100644 .github/helm-values/prod-us-west-2-eta.neon-proxy-scram-legacy.yaml create mode 100644 .github/workflows/deploy-dev.yml create mode 100644 .github/workflows/deploy-prod.yml create mode 100644 .github/workflows/release.yml rename Dockerfile.compute-node-v14 => Dockerfile.compute-node (86%) delete mode 100644 Dockerfile.compute-node-v15 create mode 100644 libs/tracing-utils/Cargo.toml create mode 100644 libs/tracing-utils/src/http.rs create mode 100644 libs/tracing-utils/src/lib.rs create mode 100644 pageserver/src/broker_client.rs create mode 100644 pageserver/src/context.rs create mode 100644 pageserver/src/tenant/layer_map/historic_layer_coverage.rs create mode 100644 pageserver/src/tenant/layer_map/layer_coverage.rs rename pageserver/src/{ => tenant/timeline}/walreceiver.rs (83%) rename pageserver/src/{ => tenant/timeline}/walreceiver/connection_manager.rs (96%) rename pageserver/src/{ => tenant/timeline}/walreceiver/walreceiver_connection.rs (94%) create mode 100644 scripts/force_layer_download.py diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index 990c7e25a9..29b04a3478 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -123,8 +123,8 @@ runs: exit 1 fi if [[ "${{ inputs.run_in_parallel }}" == "true" ]]; then - # -n4 uses four processes to run tests via pytest-xdist - EXTRA_PARAMS="-n4 $EXTRA_PARAMS" + # -n16 uses sixteen processes to run tests via pytest-xdist + EXTRA_PARAMS="-n16 $EXTRA_PARAMS" # --dist=loadgroup points tests marked with @pytest.mark.xdist_group # to the same worker to make @pytest.mark.order work with xdist diff --git a/.github/ansible/deploy.yaml b/.github/ansible/deploy.yaml index 4adc685684..a17dc9c78f 100644 --- a/.github/ansible/deploy.yaml +++ b/.github/ansible/deploy.yaml @@ -117,7 +117,8 @@ shell: cmd: | INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id) - curl -sfS -d '{"version": {{ current_version }} }' -X PATCH {{ console_mgmt_base_url }}/api/v1/pageservers/$INSTANCE_ID + curl -sfS -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" {{ console_mgmt_base_url }}/management/api/v2/pageservers/$INSTANCE_ID | jq '.version = {{ current_version }}' > /tmp/new_version + curl -sfS -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" -X POST -d@/tmp/new_version {{ console_mgmt_base_url }}/management/api/v2/pageservers tags: - pageserver @@ -186,6 +187,7 @@ shell: cmd: | INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id) - curl -sfS -d '{"version": {{ current_version }} }' -X PATCH {{ console_mgmt_base_url }}/api/v1/safekeepers/$INSTANCE_ID + curl -sfS -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" {{ console_mgmt_base_url }}/management/api/v2/safekeepers/$INSTANCE_ID | jq '.version = {{ current_version }}' > /tmp/new_version + curl -sfS -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" -X POST -d@/tmp/new_version {{ console_mgmt_base_url }}/management/api/v2/safekeepers tags: - safekeeper diff --git a/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml b/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml index 
cb062f705d..157ae66ed1 100644 --- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml +++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml @@ -8,6 +8,7 @@ settings: authBackend: "link" authEndpoint: "https://console.stage.neon.tech/authenticate_proxy_request/" uri: "https://console.stage.neon.tech/psql_session/" + domain: "pg.neon.build" sentryEnvironment: "staging" metricCollectionEndpoint: "http://console-staging.local/billing/api/v1/usage_events" metricCollectionInterval: "1min" diff --git a/.github/helm-values/production.proxy.yaml b/.github/helm-values/prod-us-east-2-delta.neon-proxy-link.yaml similarity index 80% rename from .github/helm-values/production.proxy.yaml rename to .github/helm-values/prod-us-east-2-delta.neon-proxy-link.yaml index dbaf3cd096..eff24302bb 100644 --- a/.github/helm-values/production.proxy.yaml +++ b/.github/helm-values/prod-us-east-2-delta.neon-proxy-link.yaml @@ -1,37 +1,37 @@ +# Helm chart values for neon-proxy-link. +# This is a YAML-formatted file. + +image: + repository: neondatabase/neon + settings: authBackend: "link" authEndpoint: "https://console.neon.tech/authenticate_proxy_request/" uri: "https://console.neon.tech/psql_session/" + domain: "pg.neon.tech" sentryEnvironment: "production" # -- Additional labels for zenith-proxy pods podLabels: zenith_service: proxy zenith_env: production - zenith_region: us-west-2 - zenith_region_slug: oregon + zenith_region: us-east-2 + zenith_region_slug: us-east-2 service: + type: LoadBalancer annotations: service.beta.kubernetes.io/aws-load-balancer-type: external service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip service.beta.kubernetes.io/aws-load-balancer-scheme: internal - external-dns.alpha.kubernetes.io/hostname: proxy-release.local - type: LoadBalancer + external-dns.alpha.kubernetes.io/hostname: neon-proxy-link-mgmt.delta.us-east-2.aws.neon.tech exposedService: annotations: service.beta.kubernetes.io/aws-load-balancer-type: external service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing - external-dns.alpha.kubernetes.io/hostname: connect.neon.tech,pg.neon.tech - -metrics: - enabled: true - serviceMonitor: - enabled: true - selector: - release: kube-prometheus-stack + external-dns.alpha.kubernetes.io/hostname: neon-proxy-link.delta.us-east-2.aws.neon.tech extraManifests: - apiVersion: operator.victoriametrics.com/v1beta1 diff --git a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram-legacy.yaml b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram-legacy.yaml new file mode 100644 index 0000000000..3a5cde4b01 --- /dev/null +++ b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram-legacy.yaml @@ -0,0 +1,61 @@ +# Helm chart values for neon-proxy-scram. +# This is a YAML-formatted file. 
+ +image: + repository: neondatabase/neon + +settings: + authBackend: "console" + authEndpoint: "http://console-release.local/management/api/v2" + domain: "*.cloud.neon.tech" + sentryEnvironment: "production" + wssPort: 8443 + metricCollectionEndpoint: "http://console-release.local/billing/api/v1/usage_events" + metricCollectionInterval: "10min" + +# -- Additional labels for neon-proxy pods +podLabels: + zenith_service: proxy-scram + zenith_env: prod + zenith_region: us-west-2 + zenith_region_slug: us-west-2 + +exposedService: + annotations: + service.beta.kubernetes.io/aws-load-balancer-type: external + service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip + service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing + external-dns.alpha.kubernetes.io/hostname: neon-proxy-scram-legacy.eta.us-west-2.aws.neon.tech + httpsPort: 443 + +#metrics: +# enabled: true +# serviceMonitor: +# enabled: true +# selector: +# release: kube-prometheus-stack + +extraManifests: + - apiVersion: operator.victoriametrics.com/v1beta1 + kind: VMServiceScrape + metadata: + name: "{{ include \"neon-proxy.fullname\" . }}" + labels: + helm.sh/chart: neon-proxy-{{ .Chart.Version }} + app.kubernetes.io/name: neon-proxy + app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}" + app.kubernetes.io/version: "{{ .Chart.AppVersion }}" + app.kubernetes.io/managed-by: Helm + namespace: "{{ .Release.Namespace }}" + spec: + selector: + matchLabels: + app.kubernetes.io/name: "neon-proxy" + endpoints: + - port: http + path: /metrics + interval: 10s + scrapeTimeout: 10s + namespaceSelector: + matchNames: + - "{{ .Release.Namespace }}" diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 918e568e27..89e12360f9 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -1,4 +1,4 @@ -name: Test and Deploy +name: Build and Test on: push: @@ -19,10 +19,12 @@ concurrency: env: RUST_BACKTRACE: 1 COPT: '-Werror' + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} jobs: tag: - runs-on: [ self-hosted, dev, x64 ] + runs-on: [ self-hosted, gen3, small ] container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned outputs: build-tag: ${{steps.build-tag.outputs.tag}} @@ -50,7 +52,7 @@ jobs: id: build-tag check-codestyle-python: - runs-on: [ self-hosted, dev, x64 ] + runs-on: [ self-hosted, gen3, small ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cloud:pinned options: --init @@ -85,7 +87,7 @@ jobs: run: poetry run mypy . 
check-codestyle-rust: - runs-on: [ self-hosted, dev, x64 ] + runs-on: [ self-hosted, gen3, large ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned options: --init @@ -97,16 +99,16 @@ jobs: submodules: true fetch-depth: 1 - - name: Restore cargo deps cache - id: cache_cargo - uses: actions/cache@v3 - with: - path: | - ~/.cargo/registry/ - !~/.cargo/registry/src - ~/.cargo/git/ - target/ - key: v1-${{ runner.os }}-cargo-clippy-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }} +# Disabled for now +# - name: Restore cargo deps cache +# id: cache_cargo +# uses: actions/cache@v3 +# with: +# path: | +# !~/.cargo/registry/src +# ~/.cargo/git/ +# target/ +# key: v1-${{ runner.os }}-cargo-clippy-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }} # Some of our rust modules use FFI and need those to be checked - name: Get postgres headers @@ -133,7 +135,7 @@ jobs: run: cargo deny check build-neon: - runs-on: [ self-hosted, dev, x64 ] + runs-on: [ self-hosted, gen3, large ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned options: --init @@ -141,7 +143,6 @@ jobs: fail-fast: false matrix: build_type: [ debug, release ] - env: BUILD_TYPE: ${{ matrix.build_type }} GIT_VERSION: ${{ github.sha }} @@ -194,24 +195,26 @@ jobs: echo "cov_prefix=${cov_prefix}" >> $GITHUB_ENV echo "CARGO_FEATURES=${CARGO_FEATURES}" >> $GITHUB_ENV echo "CARGO_FLAGS=${CARGO_FLAGS}" >> $GITHUB_ENV + echo "CARGO_HOME=${GITHUB_WORKSPACE}/.cargo" >> $GITHUB_ENV + # Disabled for now # Don't include the ~/.cargo/registry/src directory. It contains just # uncompressed versions of the crates in ~/.cargo/registry/cache # directory, and it's faster to let 'cargo' to rebuild it from the # compressed crates. 
- - name: Cache cargo deps - id: cache_cargo - uses: actions/cache@v3 - with: - path: | - ~/.cargo/registry/ - !~/.cargo/registry/src - ~/.cargo/git/ - target/ - # Fall back to older versions of the key, if no cache for current Cargo.lock was found - key: | - v1-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }} - v1-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}- +# - name: Cache cargo deps +# id: cache_cargo +# uses: actions/cache@v3 +# with: +# path: | +# ~/.cargo/registry/ +# !~/.cargo/registry/src +# ~/.cargo/git/ +# target/ +# # Fall back to older versions of the key, if no cache for current Cargo.lock was found +# key: | +# v1-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }} +# v1-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}- - name: Cache postgres v14 build id: cache_pg_14 @@ -301,7 +304,7 @@ jobs: uses: ./.github/actions/save-coverage-data regress-tests: - runs-on: [ self-hosted, dev, x64 ] + runs-on: [ self-hosted, gen3, large ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned options: --init @@ -334,7 +337,7 @@ jobs: uses: ./.github/actions/save-coverage-data benchmarks: - runs-on: [ self-hosted, dev, x64 ] + runs-on: [ self-hosted, gen3, small ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned options: --init @@ -365,7 +368,7 @@ jobs: # while coverage is currently collected for the debug ones merge-allure-report: - runs-on: [ self-hosted, dev, x64 ] + runs-on: [ self-hosted, gen3, small ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned options: --init @@ -402,7 +405,7 @@ jobs: DATABASE_URL="$TEST_RESULT_CONNSTR" poetry run python3 scripts/ingest_regress_test_result.py --revision ${SHA} --reference ${GITHUB_REF} --build-type ${BUILD_TYPE} --ingest suites.json coverage-report: - runs-on: [ self-hosted, dev, x64 ] + runs-on: [ self-hosted, gen3, small ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned options: --init @@ -418,16 +421,17 @@ jobs: submodules: true fetch-depth: 1 - - name: Restore cargo deps cache - id: cache_cargo - uses: actions/cache@v3 - with: - path: | - ~/.cargo/registry/ - !~/.cargo/registry/src - ~/.cargo/git/ - target/ - key: v1-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }} +# Disabled for now +# - name: Restore cargo deps cache +# id: cache_cargo +# uses: actions/cache@v3 +# with: +# path: | +# ~/.cargo/registry/ +# !~/.cargo/registry/src +# ~/.cargo/git/ +# target/ +# key: v1-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }} - name: Get Neon artifact uses: ./.github/actions/download @@ -477,7 +481,7 @@ jobs: }" trigger-e2e-tests: - runs-on: [ self-hosted, dev, x64 ] + runs-on: [ self-hosted, gen3, small ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned options: --init @@ -522,9 +526,10 @@ jobs: }" neon-image: - runs-on: [ self-hosted, dev, x64 ] + runs-on: [ self-hosted, gen3, large ] needs: [ tag ] - container: gcr.io/kaniko-project/executor:v1.9.0-debug + # https://github.com/GoogleContainerTools/kaniko/issues/2005 + container: gcr.io/kaniko-project/executor:v1.7.0-debug defaults: run: shell: sh -eu {0} @@ -540,12 +545,16 @@ jobs: run: echo 
"{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json - name: Kaniko build neon - run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} + run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} + + # Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied + - name: Cleanup ECR folder + run: rm -rf ~/.ecr compute-tools-image: - runs-on: [ self-hosted, dev, x64 ] + runs-on: [ self-hosted, gen3, large ] needs: [ tag ] - container: gcr.io/kaniko-project/executor:v1.9.0-debug + container: gcr.io/kaniko-project/executor:v1.7.0-debug defaults: run: shell: sh -eu {0} @@ -558,11 +567,14 @@ jobs: run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json - name: Kaniko build compute tools - run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-tools --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} + run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-tools --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} + + - name: Cleanup ECR folder + run: rm -rf ~/.ecr compute-node-image: - runs-on: [ self-hosted, dev, x64 ] - container: gcr.io/kaniko-project/executor:v1.9.0-debug + runs-on: [ self-hosted, gen3, large ] + container: gcr.io/kaniko-project/executor:v1.7.0-debug needs: [ tag ] strategy: fail-fast: false @@ -583,10 +595,13 @@ jobs: run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json - name: Kaniko build compute node with extensions - run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-${{ matrix.version }} --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} + run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . 
--build-arg GIT_VERSION=${{ github.sha }} --build-arg PG_VERSION=${{ matrix.version }} --dockerfile Dockerfile.compute-node --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} + + - name: Cleanup ECR folder + run: rm -rf ~/.ecr vm-compute-node-image: - runs-on: [ self-hosted, dev, x64 ] + runs-on: [ self-hosted, gen3, large ] needs: [ tag, compute-node-image ] strategy: fail-fast: false @@ -631,7 +646,7 @@ jobs: test-images: needs: [ tag, neon-image, compute-node-image, compute-tools-image ] - runs-on: [ self-hosted, dev, x64 ] + runs-on: [ self-hosted, gen3, small ] steps: - name: Checkout @@ -673,20 +688,39 @@ jobs: docker compose -f ./docker-compose/docker-compose.yml down promote-images: - runs-on: [ self-hosted, dev, x64 ] + runs-on: [ self-hosted, gen3, small ] needs: [ tag, test-images, vm-compute-node-image ] + container: golang:1.19-bullseye if: github.event_name != 'workflow_dispatch' - container: amazon/aws-cli - strategy: - fail-fast: false - matrix: - name: [ neon, compute-node-v14, vm-compute-node-v14, compute-node-v15, vm-compute-node-v15, compute-tools] steps: - - name: Promote image to latest + - name: Install Crane & ECR helper + if: | + (github.ref_name == 'main' || github.ref_name == 'release') && + github.event_name != 'workflow_dispatch' run: | - export MANIFEST=$(aws ecr batch-get-image --repository-name ${{ matrix.name }} --image-ids imageTag=${{needs.tag.outputs.build-tag}} --query 'images[].imageManifest' --output text) - aws ecr put-image --repository-name ${{ matrix.name }} --image-tag latest --image-manifest "$MANIFEST" + go install github.com/google/go-containerregistry/cmd/crane@31786c6cbb82d6ec4fb8eb79cd9387905130534e # v0.11.0 + go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@69c85dc22db6511932bbf119e1a0cc5c90c69a7f # v0.6.0 + + - name: Configure ECR login + run: | + mkdir /github/home/.docker/ + echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json + + - name: Add latest tag to images + if: | + (github.ref_name == 'main' || github.ref_name == 'release') && + github.event_name != 'workflow_dispatch' + run: | + crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} latest + crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} latest + crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} latest + crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest + crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} latest + crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest + + - name: Cleanup ECR folder + run: rm -rf ~/.ecr push-docker-hub: runs-on: [ self-hosted, dev, x64 ] @@ -776,114 +810,11 @@ jobs: crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest crane tag neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest - calculate-deploy-targets: - runs-on: [ self-hosted, dev, x64 ] - if: | - github.ref_name == 'release' && - github.event_name != 'workflow_dispatch' - outputs: - matrix-include: ${{ steps.set-matrix.outputs.include }} - steps: - - id: set-matrix - run: | - if [[ "$GITHUB_REF_NAME" == "release" ]]; then - PRODUCTION='{"env_name": 
"production", "proxy_job": "neon-proxy", "proxy_config": "production.proxy", "storage_broker_ns": "neon-storage-broker", "storage_broker_config": "production.neon-storage-broker", "kubeconfig_secret": "PRODUCTION_KUBECONFIG_DATA", "console_api_key_secret": "NEON_PRODUCTION_API_KEY"}' - echo "include=[$PRODUCTION]" >> $GITHUB_OUTPUT - else - echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to 'release'" - exit 1 - fi - - deploy: - runs-on: [ self-hosted, dev, x64 ] - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned - # We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version. - # If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly - needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ] - if: | - github.ref_name == 'release' && - github.event_name != 'workflow_dispatch' - defaults: - run: - shell: bash - strategy: - matrix: - include: ${{fromJSON(needs.calculate-deploy-targets.outputs.matrix-include)}} - environment: - name: prod-old - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: true - fetch-depth: 0 - - - name: Redeploy - run: | - export DOCKER_TAG=${{needs.tag.outputs.build-tag}} - cd "$(pwd)/.github/ansible" - - if [[ "$GITHUB_REF_NAME" == "main" ]]; then - ./get_binaries.sh - elif [[ "$GITHUB_REF_NAME" == "release" ]]; then - RELEASE=true ./get_binaries.sh - else - echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" - exit 1 - fi - - eval $(ssh-agent) - echo "${{ secrets.TELEPORT_SSH_KEY }}" | tr -d '\n'| base64 --decode >ssh-key - echo "${{ secrets.TELEPORT_SSH_CERT }}" | tr -d '\n'| base64 --decode >ssh-key-cert.pub - chmod 0600 ssh-key - ssh-add ssh-key - rm -f ssh-key ssh-key-cert.pub - ANSIBLE_CONFIG=./ansible.cfg ansible-galaxy collection install sivel.toiletwater - ANSIBLE_CONFIG=./ansible.cfg ansible-playbook deploy.yaml -i ${{ matrix.env_name }}.hosts.yaml -e CONSOLE_API_TOKEN=${{ secrets[matrix.console_api_key_secret] }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }} - rm -f neon_install.tar.gz .neon_current_version - - deploy-new: - runs-on: [ self-hosted, dev, x64 ] - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned - # We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version. - # If it notices a fresh storage it may bump the compute version. 
And if compute image failed to build it may break things badly - needs: [ push-docker-hub, tag, regress-tests ] - if: | - (github.ref_name == 'main') && - github.event_name != 'workflow_dispatch' - defaults: - run: - shell: bash - strategy: - matrix: - target_region: [ eu-west-1, us-east-2 ] - environment: - name: dev-${{ matrix.target_region }} - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: true - fetch-depth: 0 - - - name: Redeploy - run: | - export DOCKER_TAG=${{needs.tag.outputs.build-tag}} - cd "$(pwd)/.github/ansible" - if [[ "$GITHUB_REF_NAME" == "main" ]]; then - ./get_binaries.sh - elif [[ "$GITHUB_REF_NAME" == "release" ]]; then - RELEASE=true ./get_binaries.sh - else - echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" - exit 1 - fi - ansible-galaxy collection install sivel.toiletwater - ansible-playbook deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_STAGING_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }} - rm -f neon_install.tar.gz .neon_current_version + - name: Cleanup ECR folder + run: rm -rf ~/.ecr deploy-pr-test-new: - runs-on: [ self-hosted, dev, x64 ] + runs-on: [ self-hosted, gen3, small ] container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned # We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version. # If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly @@ -915,311 +846,40 @@ jobs: ansible-playbook deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_STAGING_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }} rm -f neon_install.tar.gz .neon_current_version - deploy-prod-new: - runs-on: prod - container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest - # We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version. - # If it notices a fresh storage it may bump the compute version. 
And if compute image failed to build it may break things badly + - name: Cleanup ansible folder + run: rm -rf ~/.ansible + + deploy: + runs-on: [ self-hosted, gen3, small ] + container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest needs: [ push-docker-hub, tag, regress-tests ] - if: | - (github.ref_name == 'release') && - github.event_name != 'workflow_dispatch' - defaults: - run: - shell: bash - strategy: - matrix: - target_region: [ us-east-2, us-west-2, eu-central-1, ap-southeast-1 ] - environment: - name: prod-${{ matrix.target_region }} + if: ( github.ref_name == 'main' || github.ref_name == 'release' ) && github.event_name != 'workflow_dispatch' steps: - name: Checkout uses: actions/checkout@v3 with: - submodules: true + submodules: false fetch-depth: 0 - - name: Redeploy + - name: Trigger deploy workflow + env: + GH_TOKEN: ${{ github.token }} run: | - export DOCKER_TAG=${{needs.tag.outputs.build-tag}} - cd "$(pwd)/.github/ansible" - if [[ "$GITHUB_REF_NAME" == "main" ]]; then - ./get_binaries.sh + gh workflow run deploy-dev.yml --ref main -f branch=${{ github.sha }} -f dockerTag=${{needs.tag.outputs.build-tag}} elif [[ "$GITHUB_REF_NAME" == "release" ]]; then - RELEASE=true ./get_binaries.sh + gh workflow run deploy-prod.yml --ref release -f branch=${{ github.sha }} -f dockerTag=${{needs.tag.outputs.build-tag}} else echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" exit 1 fi - ansible-galaxy collection install sivel.toiletwater - ansible-playbook deploy.yaml -i prod.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_PRODUCTION_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }} - rm -f neon_install.tar.gz .neon_current_version - - deploy-proxy: - runs-on: [ self-hosted, dev, x64 ] - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned - # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently. 
- needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ] - if: | - github.ref_name == 'release' && - github.event_name != 'workflow_dispatch' - defaults: - run: - shell: bash - strategy: - matrix: - include: ${{fromJSON(needs.calculate-deploy-targets.outputs.matrix-include)}} - environment: - name: prod-old - env: - KUBECONFIG: .kubeconfig - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: true - fetch-depth: 0 - - - name: Add curl - run: apt update && apt install curl -y - - - name: Store kubeconfig file - run: | - echo "${{ secrets[matrix.kubeconfig_secret] }}" | base64 --decode > ${KUBECONFIG} - chmod 0600 ${KUBECONFIG} - - - name: Setup helm v3 - run: | - curl -s https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash - helm repo add neondatabase https://neondatabase.github.io/helm-charts - - - name: Re-deploy proxy - run: | - DOCKER_TAG=${{needs.tag.outputs.build-tag}} - helm upgrade ${{ matrix.proxy_job }} neondatabase/neon-proxy --namespace neon-proxy --install --atomic -f .github/helm-values/${{ matrix.proxy_config }}.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s - helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace neon-proxy --install --atomic -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s - - deploy-storage-broker: - name: deploy storage broker on old staging and old prod - runs-on: [ self-hosted, dev, x64 ] - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned - # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently. - needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ] - if: | - github.ref_name == 'release' && - github.event_name != 'workflow_dispatch' - defaults: - run: - shell: bash - strategy: - matrix: - include: ${{fromJSON(needs.calculate-deploy-targets.outputs.matrix-include)}} - environment: - name: prod-old - env: - KUBECONFIG: .kubeconfig - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: true - fetch-depth: 0 - - - name: Add curl - run: apt update && apt install curl -y - - - name: Store kubeconfig file - run: | - echo "${{ secrets[matrix.kubeconfig_secret] }}" | base64 --decode > ${KUBECONFIG} - chmod 0600 ${KUBECONFIG} - - - name: Setup helm v3 - run: | - curl -s https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash - helm repo add neondatabase https://neondatabase.github.io/helm-charts - - - name: Deploy storage-broker - run: - helm upgrade neon-storage-broker neondatabase/neon-storage-broker --namespace ${{ matrix.storage_broker_ns }} --create-namespace --install --atomic -f .github/helm-values/${{ matrix.storage_broker_config }}.yaml --set image.tag=${{ needs.tag.outputs.build-tag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s - - deploy-proxy-new: - runs-on: [ self-hosted, dev, x64 ] - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned - # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently. 
- needs: [ push-docker-hub, tag, regress-tests ] - if: | - (github.ref_name == 'main') && - github.event_name != 'workflow_dispatch' - defaults: - run: - shell: bash - strategy: - matrix: - include: - - target_region: us-east-2 - target_cluster: dev-us-east-2-beta - deploy_link_proxy: true - deploy_legacy_scram_proxy: true - - target_region: eu-west-1 - target_cluster: dev-eu-west-1-zeta - deploy_link_proxy: false - deploy_legacy_scram_proxy: false - environment: - name: dev-${{ matrix.target_region }} - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: true - fetch-depth: 0 - - - name: Configure environment - run: | - helm repo add neondatabase https://neondatabase.github.io/helm-charts - aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }} - - - name: Re-deploy scram proxy - run: | - DOCKER_TAG=${{needs.tag.outputs.build-tag}} - helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s - - - name: Re-deploy link proxy - if: matrix.deploy_link_proxy - run: | - DOCKER_TAG=${{needs.tag.outputs.build-tag}} - helm upgrade neon-proxy-link neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-link.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s - - - name: Re-deploy legacy scram proxy - if: matrix.deploy_legacy_scram_proxy - run: | - DOCKER_TAG=${{needs.tag.outputs.build-tag}} - helm upgrade neon-proxy-scram-legacy neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram-legacy.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s - - deploy-storage-broker-dev-new: - runs-on: [ self-hosted, dev, x64 ] - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned - # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently. 
- needs: [ push-docker-hub, tag, regress-tests ] - if: | - (github.ref_name == 'main') && - github.event_name != 'workflow_dispatch' - defaults: - run: - shell: bash - strategy: - matrix: - include: - - target_region: us-east-2 - target_cluster: dev-us-east-2-beta - - target_region: eu-west-1 - target_cluster: dev-eu-west-1-zeta - environment: - name: dev-${{ matrix.target_region }} - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: true - fetch-depth: 0 - - - name: Configure environment - run: | - helm repo add neondatabase https://neondatabase.github.io/helm-charts - aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }} - - - name: Deploy storage-broker - run: - helm upgrade neon-storage-broker-lb neondatabase/neon-storage-broker --namespace neon-storage-broker-lb --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${{ needs.tag.outputs.build-tag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s - - deploy-proxy-prod-new: - runs-on: prod - container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest - # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently. - needs: [ push-docker-hub, tag, regress-tests ] - if: | - (github.ref_name == 'release') && - github.event_name != 'workflow_dispatch' - defaults: - run: - shell: bash - strategy: - matrix: - include: - - target_region: us-east-2 - target_cluster: prod-us-east-2-delta - - target_region: us-west-2 - target_cluster: prod-us-west-2-eta - - target_region: eu-central-1 - target_cluster: prod-eu-central-1-gamma - - target_region: ap-southeast-1 - target_cluster: prod-ap-southeast-1-epsilon - environment: - name: prod-${{ matrix.target_region }} - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: true - fetch-depth: 0 - - - name: Configure environment - run: | - helm repo add neondatabase https://neondatabase.github.io/helm-charts - aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }} - - - name: Re-deploy proxy - run: | - DOCKER_TAG=${{needs.tag.outputs.build-tag}} - helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s - - deploy-storage-broker-prod-new: - runs-on: prod - container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest - # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently. 
- needs: [ push-docker-hub, tag, regress-tests ] - if: | - (github.ref_name == 'release') && - github.event_name != 'workflow_dispatch' - defaults: - run: - shell: bash - strategy: - matrix: - include: - - target_region: us-east-2 - target_cluster: prod-us-east-2-delta - - target_region: us-west-2 - target_cluster: prod-us-west-2-eta - - target_region: eu-central-1 - target_cluster: prod-eu-central-1-gamma - - target_region: ap-southeast-1 - target_cluster: prod-ap-southeast-1-epsilon - environment: - name: prod-${{ matrix.target_region }} - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: true - fetch-depth: 0 - - - name: Configure environment - run: | - helm repo add neondatabase https://neondatabase.github.io/helm-charts - aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }} - - - name: Deploy storage-broker - run: - helm upgrade neon-storage-broker-lb neondatabase/neon-storage-broker --namespace neon-storage-broker-lb --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${{ needs.tag.outputs.build-tag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s - promote-compatibility-data: - runs-on: [ self-hosted, dev, x64 ] + runs-on: [ self-hosted, gen3, small ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned options: --init - needs: [ deploy, deploy-proxy ] + needs: [ push-docker-hub, tag, regress-tests ] if: github.ref_name == 'release' && github.event_name != 'workflow_dispatch' steps: - name: Promote compatibility snapshot for the release diff --git a/.github/workflows/deploy-dev.yml b/.github/workflows/deploy-dev.yml new file mode 100644 index 0000000000..409517bf63 --- /dev/null +++ b/.github/workflows/deploy-dev.yml @@ -0,0 +1,179 @@ +name: Neon Deploy dev + +on: + workflow_dispatch: + inputs: + dockerTag: + description: 'Docker tag to deploy' + required: true + type: string + branch: + description: 'Branch or commit used for deploy scripts and configs' + required: true + type: string + default: 'main' + deployStorage: + description: 'Deploy storage' + required: true + type: boolean + default: true + deployProxy: + description: 'Deploy proxy' + required: true + type: boolean + default: true + deployStorageBroker: + description: 'Deploy storage-broker' + required: true + type: boolean + default: true + +env: + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} + +concurrency: + group: deploy-dev + cancel-in-progress: false + +jobs: + deploy-storage-new: + runs-on: [ self-hosted, gen3, small ] + container: + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned + options: --user root --privileged + if: inputs.deployStorage + defaults: + run: + shell: bash + strategy: + matrix: + target_region: [ eu-west-1, us-east-2 ] + environment: + name: dev-${{ matrix.target_region }} + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 0 + ref: ${{ inputs.branch }} + + - name: Redeploy + run: | + export DOCKER_TAG=${{ inputs.dockerTag }} + cd "$(pwd)/.github/ansible" + + ./get_binaries.sh + + ansible-galaxy collection install sivel.toiletwater + ansible-playbook deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_STAGING_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e 
SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }} + rm -f neon_install.tar.gz .neon_current_version + + - name: Cleanup ansible folder + run: rm -rf ~/.ansible + + deploy-proxy-new: + runs-on: [ self-hosted, gen3, small ] + container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned + if: inputs.deployProxy + defaults: + run: + shell: bash + strategy: + matrix: + include: + - target_region: us-east-2 + target_cluster: dev-us-east-2-beta + deploy_link_proxy: true + deploy_legacy_scram_proxy: true + - target_region: eu-west-1 + target_cluster: dev-eu-west-1-zeta + deploy_link_proxy: false + deploy_legacy_scram_proxy: false + environment: + name: dev-${{ matrix.target_region }} + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 0 + ref: ${{ inputs.branch }} + + - name: Configure AWS Credentials + uses: aws-actions/configure-aws-credentials@v1-node16 + with: + role-to-assume: arn:aws:iam::369495373322:role/github-runner + aws-region: eu-central-1 + role-skip-session-tagging: true + role-duration-seconds: 1800 + + - name: Configure environment + run: | + helm repo add neondatabase https://neondatabase.github.io/helm-charts + aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }} + + - name: Re-deploy scram proxy + run: | + DOCKER_TAG=${{ inputs.dockerTag }} + helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s + + - name: Re-deploy link proxy + if: matrix.deploy_link_proxy + run: | + DOCKER_TAG=${{ inputs.dockerTag }} + helm upgrade neon-proxy-link neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-link.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s + + - name: Re-deploy legacy scram proxy + if: matrix.deploy_legacy_scram_proxy + run: | + DOCKER_TAG=${{ inputs.dockerTag }} + helm upgrade neon-proxy-scram-legacy neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram-legacy.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s + + - name: Cleanup helm folder + run: rm -rf ~/.cache + + deploy-storage-broker-new: + runs-on: [ self-hosted, gen3, small ] + container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned + if: inputs.deployStorageBroker + defaults: + run: + shell: bash + strategy: + matrix: + include: + - target_region: us-east-2 + target_cluster: dev-us-east-2-beta + - target_region: eu-west-1 + target_cluster: dev-eu-west-1-zeta + environment: + name: dev-${{ matrix.target_region }} + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 0 + ref: ${{ inputs.branch }} + + - name: Configure AWS Credentials + uses: aws-actions/configure-aws-credentials@v1-node16 + with: + role-to-assume: arn:aws:iam::369495373322:role/github-runner + aws-region: eu-central-1 + role-skip-session-tagging: true + role-duration-seconds: 1800 + + - name: Configure environment + run: | + helm repo add neondatabase https://neondatabase.github.io/helm-charts + aws --region ${{ 
matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }} + + - name: Deploy storage-broker + run: + helm upgrade neon-storage-broker-lb neondatabase/neon-storage-broker --namespace neon-storage-broker-lb --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${{ inputs.dockerTag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s + + - name: Cleanup helm folder + run: rm -rf ~/.cache diff --git a/.github/workflows/deploy-prod.yml b/.github/workflows/deploy-prod.yml new file mode 100644 index 0000000000..e1954b5540 --- /dev/null +++ b/.github/workflows/deploy-prod.yml @@ -0,0 +1,277 @@ +name: Neon Deploy prod + +on: + workflow_dispatch: + inputs: + dockerTag: + description: 'Docker tag to deploy' + required: true + type: string + branch: + description: 'Branch or commit used for deploy scripts and configs' + required: true + type: string + default: 'main' + deployStorage: + description: 'Deploy storage' + required: true + type: boolean + default: true + deployProxy: + description: 'Deploy proxy' + required: true + type: boolean + default: true + deployStorageBroker: + description: 'Deploy storage-broker' + required: true + type: boolean + default: true + +concurrency: + group: deploy-prod + cancel-in-progress: false + +jobs: + deploy-prod-new: + runs-on: prod + container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest + if: inputs.deployStorage + defaults: + run: + shell: bash + strategy: + matrix: + target_region: [ us-east-2, us-west-2, eu-central-1, ap-southeast-1 ] + environment: + name: prod-${{ matrix.target_region }} + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 0 + ref: ${{ inputs.branch }} + + - name: Redeploy + run: | + export DOCKER_TAG=${{ inputs.dockerTag }} + cd "$(pwd)/.github/ansible" + + ./get_binaries.sh + + ansible-galaxy collection install sivel.toiletwater + ansible-playbook deploy.yaml -i prod.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_PRODUCTION_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }} + rm -f neon_install.tar.gz .neon_current_version + + deploy-proxy-prod-new: + runs-on: prod + container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest + if: inputs.deployProxy + defaults: + run: + shell: bash + strategy: + matrix: + include: + - target_region: us-east-2 + target_cluster: prod-us-east-2-delta + deploy_link_proxy: true + deploy_legacy_scram_proxy: false + - target_region: us-west-2 + target_cluster: prod-us-west-2-eta + deploy_link_proxy: false + deploy_legacy_scram_proxy: true + - target_region: eu-central-1 + target_cluster: prod-eu-central-1-gamma + deploy_link_proxy: false + deploy_legacy_scram_proxy: false + - target_region: ap-southeast-1 + target_cluster: prod-ap-southeast-1-epsilon + deploy_link_proxy: false + deploy_legacy_scram_proxy: false + environment: + name: prod-${{ matrix.target_region }} + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 0 + ref: ${{ inputs.branch }} + + - name: Configure environment + run: | + helm repo add neondatabase https://neondatabase.github.io/helm-charts + aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }} + + - name: Re-deploy scram proxy + run: | + DOCKER_TAG=${{ inputs.dockerTag 
}} + helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s + + - name: Re-deploy link proxy + if: matrix.deploy_link_proxy + run: | + DOCKER_TAG=${{ inputs.dockerTag }} + helm upgrade neon-proxy-link neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-link.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s + + - name: Re-deploy legacy scram proxy + if: matrix.deploy_legacy_scram_proxy + run: | + DOCKER_TAG=${{ inputs.dockerTag }} + helm upgrade neon-proxy-scram-legacy neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram-legacy.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s + + deploy-storage-broker-prod-new: + runs-on: prod + container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest + if: inputs.deployStorageBroker + defaults: + run: + shell: bash + strategy: + matrix: + include: + - target_region: us-east-2 + target_cluster: prod-us-east-2-delta + - target_region: us-west-2 + target_cluster: prod-us-west-2-eta + - target_region: eu-central-1 + target_cluster: prod-eu-central-1-gamma + - target_region: ap-southeast-1 + target_cluster: prod-ap-southeast-1-epsilon + environment: + name: prod-${{ matrix.target_region }} + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 0 + ref: ${{ inputs.branch }} + + - name: Configure environment + run: | + helm repo add neondatabase https://neondatabase.github.io/helm-charts + aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }} + + - name: Deploy storage-broker + run: + helm upgrade neon-storage-broker-lb neondatabase/neon-storage-broker --namespace neon-storage-broker-lb --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${{ inputs.dockerTag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s + + # Deploy to old account below + + deploy: + runs-on: [ self-hosted, gen3, small ] + container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned + if: inputs.deployStorage + defaults: + run: + shell: bash + environment: + name: prod-old + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 0 + ref: ${{ inputs.branch }} + + - name: Redeploy + run: | + export DOCKER_TAG=${{ inputs.dockerTag }} + cd "$(pwd)/.github/ansible" + + if [[ "$GITHUB_REF_NAME" == "main" ]]; then + ./get_binaries.sh + elif [[ "$GITHUB_REF_NAME" == "release" ]]; then + RELEASE=true ./get_binaries.sh + else + echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" + exit 1 + fi + + eval $(ssh-agent) + echo "${{ secrets.TELEPORT_SSH_KEY }}" | tr -d '\n'| base64 --decode >ssh-key + echo "${{ secrets.TELEPORT_SSH_CERT }}" | tr -d '\n'| base64 --decode >ssh-key-cert.pub + chmod 0600 ssh-key + ssh-add ssh-key + rm -f ssh-key ssh-key-cert.pub + ANSIBLE_CONFIG=./ansible.cfg ansible-galaxy collection install sivel.toiletwater + 
ANSIBLE_CONFIG=./ansible.cfg ansible-playbook deploy.yaml -i production.hosts.yaml -e CONSOLE_API_TOKEN=${{ secrets.NEON_PRODUCTION_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }} + rm -f neon_install.tar.gz .neon_current_version + + # Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ansible/collections': Permission denied + - name: Cleanup ansible folder + run: rm -rf ~/.ansible + + deploy-proxy: + runs-on: [ self-hosted, gen3, small ] + container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned + if: inputs.deployProxy + defaults: + run: + shell: bash + environment: + name: prod-old + env: + KUBECONFIG: .kubeconfig + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 0 + ref: ${{ inputs.branch }} + + - name: Store kubeconfig file + run: | + echo "${{ secrets.PRODUCTION_KUBECONFIG_DATA }}" | base64 --decode > ${KUBECONFIG} + chmod 0600 ${KUBECONFIG} + + - name: Add neon helm chart + run: helm repo add neondatabase https://neondatabase.github.io/helm-charts + + - name: Re-deploy proxy + run: | + DOCKER_TAG=${{ inputs.dockerTag }} + helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --install --atomic -f .github/helm-values/production.proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s + + - name: Cleanup helm folder + run: rm -rf ~/.cache + + deploy-storage-broker: + name: deploy storage broker on old staging and old prod + runs-on: [ self-hosted, gen3, small ] + container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned + if: inputs.deployStorageBroker + defaults: + run: + shell: bash + environment: + name: prod-old + env: + KUBECONFIG: .kubeconfig + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 0 + ref: ${{ inputs.branch }} + + - name: Store kubeconfig file + run: | + echo "${{ secrets.PRODUCTION_KUBECONFIG_DATA }}" | base64 --decode > ${KUBECONFIG} + chmod 0600 ${KUBECONFIG} + + - name: Add neon helm chart + run: helm repo add neondatabase https://neondatabase.github.io/helm-charts + + - name: Deploy storage-broker + run: + helm upgrade neon-storage-broker neondatabase/neon-storage-broker --namespace neon-storage-broker --create-namespace --install --atomic -f .github/helm-values/production.neon-storage-broker.yaml --set image.tag=${{ inputs.dockerTag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s + + - name: Cleanup helm folder + run: rm -rf ~/.cache diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 0000000000..49e04ee001 --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,33 @@ +name: Create Release Branch + +on: + schedule: + - cron: '0 10 * * 2' + +jobs: + create_release_branch: + runs-on: [ubuntu-latest] + + steps: + - name: Check out code + uses: actions/checkout@v3 + with: + ref: main + + - name: Get current date + id: date + run: echo "date=(date +'%Y-%m-%d')" >> $GITHUB_OUTPUT + + - name: Create release branch + run: git checkout -b release/${{ steps.date.outputs.date }} + + - name: Push new branch + run: git push origin release/${{ steps.date.outputs.date }} + + - name: Create pull request into release + uses: thomaseizinger/create-pull-request@e3972219c86a56550fb70708d96800d8e24ba862 # 1.3.0 + with: + GITHUB_TOKEN: ${{ 
secrets.GITHUB_TOKEN }} + head: release/${{ steps.date.outputs.date }} + base: release + title: Release ${{ steps.date.outputs.date }} diff --git a/Cargo.lock b/Cargo.lock index d8aba9ba68..2985a654f3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -37,11 +37,6 @@ dependencies = [ "memchr", ] -[[package]] -name = "amplify_num" -version = "0.4.1" -source = "git+https://github.com/rust-amplify/rust-amplify.git?tag=v4.0.0-beta.1#3ad006cf2804e1862ec7725a7684a493f3023523" - [[package]] name = "android_system_properties" version = "0.1.5" @@ -66,6 +61,15 @@ dependencies = [ "backtrace", ] +[[package]] +name = "archery" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0a8da9bc4c4053ee067669762bcaeea6e241841295a2b6c948312dad6ef4cc02" +dependencies = [ + "static_assertions", +] + [[package]] name = "asn1-rs" version = "0.5.1" @@ -137,15 +141,6 @@ dependencies = [ "syn", ] -[[package]] -name = "atomic-polyfill" -version = "0.1.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3ff7eb3f316534d83a8a2c3d1674ace8a5a71198eba31e2e2b597833f699b28" -dependencies = [ - "critical-section", -] - [[package]] name = "atty" version = "0.2.14" @@ -629,9 +624,9 @@ dependencies = [ [[package]] name = "bumpalo" -version = "3.11.1" +version = "3.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "572f695136211188308f16ad2ca5c851a712c464060ae6974944458eb83880ba" +checksum = "0d261e256854913907f67ed06efbc3338dfe6179796deefc1ff763fc1aee5535" [[package]] name = "byteorder" @@ -750,13 +745,13 @@ dependencies = [ [[package]] name = "clap" -version = "4.0.32" +version = "4.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a7db700bc935f9e43e88d00b0850dae18a63773cfbec6d8e070fccf7fef89a39" +checksum = "4ec7a4128863c188deefe750ac1d1dfe66c236909f845af04beed823638dc1b2" dependencies = [ "bitflags", "clap_derive", - "clap_lex 0.3.0", + "clap_lex 0.3.1", "is-terminal", "once_cell", "strsim", @@ -765,9 +760,9 @@ dependencies = [ [[package]] name = "clap_derive" -version = "4.0.21" +version = "4.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0177313f9f02afc995627906bbd8967e2be069f5261954222dac78290c2b9014" +checksum = "684a277d672e91966334af371f1a7b5833f9aa00b07c84e92fbce95e00208ce8" dependencies = [ "heck", "proc-macro-error", @@ -787,9 +782,9 @@ dependencies = [ [[package]] name = "clap_lex" -version = "0.3.0" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d4198f73e42b4936b35b5bb248d81d2b595ecb170da0bac7655c54eedfa8da8" +checksum = "783fe232adfca04f90f56201b26d79682d4cd2625e0bc7290b95123afe558ade" dependencies = [ "os_str_bytes", ] @@ -832,10 +827,11 @@ version = "0.1.0" dependencies = [ "anyhow", "chrono", - "clap 4.0.32", + "clap 4.1.1", "futures", "hyper", "notify", + "opentelemetry", "postgres", "regex", "serde", @@ -844,7 +840,9 @@ dependencies = [ "tokio", "tokio-postgres", "tracing", + "tracing-opentelemetry", "tracing-subscriber", + "tracing-utils", "url", "workspace_hack", ] @@ -887,7 +885,7 @@ name = "control_plane" version = "0.1.0" dependencies = [ "anyhow", - "clap 4.0.32", + "clap 4.1.1", "comfy-table", "git-version", "nix", @@ -988,12 +986,6 @@ dependencies = [ "itertools", ] -[[package]] -name = "critical-section" -version = "1.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6548a0ad5d2549e111e1f6a11a6c2e2d00ce6a3dafe22948d67c2b443f775e52" - [[package]] 
name = "crossbeam-channel" version = "0.5.6" @@ -1030,12 +1022,11 @@ dependencies = [ [[package]] name = "crossbeam-utils" -version = "0.8.11" +version = "0.8.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "51887d4adc7b564537b15adcfb307936f8075dfcd5f00dde9a9f1d29383682bc" +checksum = "4fb766fa798726286dbbb842f174001dab8abc7b627a1dd86e0b7222a95d929f" dependencies = [ "cfg-if", - "once_cell", ] [[package]] @@ -1152,6 +1143,19 @@ dependencies = [ "syn", ] +[[package]] +name = "dashmap" +version = "5.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "907076dfda823b0b36d2a1bb5f90c96660a5bbcd7729e10727f07858f22c4edc" +dependencies = [ + "cfg-if", + "hashbrown 0.12.3", + "lock_api", + "once_cell", + "parking_lot_core", +] + [[package]] name = "data-encoding" version = "2.3.3" @@ -1506,15 +1510,6 @@ version = "1.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7" -[[package]] -name = "hash32" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b0c35f58762feb77d74ebe43bdbc3210f09be9fe6742234d573bacc26ed92b67" -dependencies = [ - "byteorder", -] - [[package]] name = "hashbrown" version = "0.12.3" @@ -1530,19 +1525,6 @@ dependencies = [ "ahash", ] -[[package]] -name = "heapless" -version = "0.7.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "db04bc24a18b9ea980628ecf00e6c0264f3c1426dac36c00cb49b6fbad8b0743" -dependencies = [ - "atomic-polyfill", - "hash32", - "rustc_version", - "spin 0.9.4", - "stable_deref_trait", -] - [[package]] name = "heck" version = "0.4.0" @@ -1804,9 +1786,9 @@ dependencies = [ [[package]] name = "io-lifetimes" -version = "1.0.3" +version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "46112a93252b123d31a119a8d1a1ac19deac4fac6e0e8b0df58f0d4e5870e63c" +checksum = "e7d6c6f8c91b4b9ed43484ad1a938e393caf35960fce7f82a040497207bd8e9e" dependencies = [ "libc", "windows-sys", @@ -1916,12 +1898,6 @@ dependencies = [ "winapi", ] -[[package]] -name = "libm" -version = "0.2.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "348108ab3fba42ec82ff6e9564fc4ca0247bdccdc68dd8af9764bbc79c3c8ffb" - [[package]] name = "link-cplusplus" version = "1.0.8" @@ -2067,9 +2043,9 @@ checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a" [[package]] name = "nix" -version = "0.26.1" +version = "0.26.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "46a58d1d356c6597d08cde02c2f09d785b09e28711837b1ed667dc652c08a694" +checksum = "bfdda3d196821d6af13126e40375cdf7da646a96114af134d5f417a9a1dc8e1a" dependencies = [ "bitflags", "cfg-if", @@ -2081,9 +2057,9 @@ dependencies = [ [[package]] name = "nom" -version = "7.1.2" +version = "7.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5507769c4919c998e69e49c839d9dc6e693ede4cc4290d6ad8b41d4f09c548c" +checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" dependencies = [ "memchr", "minimal-lexical", @@ -2154,7 +2130,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd" dependencies = [ "autocfg", - "libm", ] [[package]] @@ -2203,6 +2178,108 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" +[[package]] +name = "opentelemetry" +version = "0.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69d6c3d7288a106c0a363e4b0e8d308058d56902adefb16f4936f417ffef086e" +dependencies = [ + "opentelemetry_api", + "opentelemetry_sdk", +] + +[[package]] +name = "opentelemetry-http" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1edc79add46364183ece1a4542592ca593e6421c60807232f5b8f7a31703825d" +dependencies = [ + "async-trait", + "bytes", + "http", + "opentelemetry_api", + "reqwest", +] + +[[package]] +name = "opentelemetry-otlp" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d1c928609d087790fc936a1067bdc310ae702bdf3b090c3f281b713622c8bbde" +dependencies = [ + "async-trait", + "futures", + "futures-util", + "http", + "opentelemetry", + "opentelemetry-http", + "opentelemetry-proto", + "prost", + "reqwest", + "thiserror", +] + +[[package]] +name = "opentelemetry-proto" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d61a2f56df5574508dd86aaca016c917489e589ece4141df1b5e349af8d66c28" +dependencies = [ + "futures", + "futures-util", + "opentelemetry", + "prost", + "tonic", + "tonic-build", +] + +[[package]] +name = "opentelemetry-semantic-conventions" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b02e0230abb0ab6636d18e2ba8fa02903ea63772281340ccac18e0af3ec9eeb" +dependencies = [ + "opentelemetry", +] + +[[package]] +name = "opentelemetry_api" +version = "0.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c24f96e21e7acc813c7a8394ee94978929db2bcc46cf6b5014fc612bf7760c22" +dependencies = [ + "fnv", + "futures-channel", + "futures-util", + "indexmap", + "js-sys", + "once_cell", + "pin-project-lite", + "thiserror", +] + +[[package]] +name = "opentelemetry_sdk" +version = "0.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ca41c4933371b61c2a2f214bf16931499af4ec90543604ec828f7a625c09113" +dependencies = [ + "async-trait", + "crossbeam-channel", + "dashmap", + "fnv", + "futures-channel", + "futures-executor", + "futures-util", + "once_cell", + "opentelemetry_api", + "percent-encoding", + "rand", + "thiserror", + "tokio", + "tokio-stream", +] + [[package]] name = "os_info" version = "3.5.1" @@ -2230,14 +2307,13 @@ checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" name = "pageserver" version = "0.1.0" dependencies = [ - "amplify_num", "anyhow", "async-stream", "async-trait", "byteorder", "bytes", "chrono", - "clap 4.0.32", + "clap 4.1.1", "close_fds", "const_format", "consumption_metrics", @@ -2269,7 +2345,7 @@ dependencies = [ "regex", "remote_storage", "reqwest", - "rstar", + "rpds", "scopeguard", "serde", "serde_json", @@ -2581,9 +2657,9 @@ checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068" [[package]] name = "proc-macro2" -version = "1.0.49" +version = "1.0.50" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "57a8eca9f9c4ffde41714334dee777596264c7825420f521abc92b5b5deb63a5" +checksum = "6ef7d57beacfaf2d8aee5937dab7b7f28de3cb8b1828479bb5de2a7106f2bae2" dependencies = [ "unicode-ident", ] @@ -2683,7 +2759,7 @@ dependencies = [ "bstr", "bytes", "chrono", - "clap 4.0.32", + "clap 4.1.1", "consumption_metrics", "futures", "git-version", @@ -2742,14 
+2818,13 @@ dependencies = [ [[package]] name = "rand" -version = "0.8.4" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e7573632e6454cf6b99d7aac4ccca54be06da05aca2ef7423d22d27d4d4bcd8" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" dependencies = [ "libc", "rand_chacha", "rand_core", - "rand_hc", ] [[package]] @@ -2771,15 +2846,6 @@ dependencies = [ "getrandom", ] -[[package]] -name = "rand_hc" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d51e9f596de227fda2ea6c84607f5558e196eeaf43c986b724ba4fb8fdf497e7" -dependencies = [ - "rand_core", -] - [[package]] name = "rayon" version = "1.6.1" @@ -2930,7 +2996,7 @@ dependencies = [ "cc", "libc", "once_cell", - "spin 0.5.2", + "spin", "untrusted", "web-sys", "winapi", @@ -2950,14 +3016,12 @@ dependencies = [ ] [[package]] -name = "rstar" -version = "0.9.3" +name = "rpds" +version = "0.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b40f1bfe5acdab44bc63e6699c28b74f75ec43afb59f3eda01e145aff86a25fa" +checksum = "66262ea963eff99163e6b741fbc3417a52cc13074728c1047e9911789df9b000" dependencies = [ - "heapless", - "num-traits", - "smallvec", + "archery", ] [[package]] @@ -3018,9 +3082,9 @@ dependencies = [ [[package]] name = "rustix" -version = "0.36.6" +version = "0.36.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4feacf7db682c6c329c4ede12649cd36ecab0f3be5b7d74e6a20304725db4549" +checksum = "d4fdebc4b395b7fbb9ab11e462e20ed9051e7b16e42d24042c776eca0ac81b03" dependencies = [ "bitflags", "errno", @@ -3093,7 +3157,7 @@ dependencies = [ "async-trait", "byteorder", "bytes", - "clap 4.0.32", + "clap 4.1.1", "const_format", "crc32c", "fs2", @@ -3479,21 +3543,6 @@ version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d" -[[package]] -name = "spin" -version = "0.9.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f6002a767bff9e83f8eeecf883ecb8011875a21ae8da43bffb817a57e78cc09" -dependencies = [ - "lock_api", -] - -[[package]] -name = "stable_deref_trait" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" - [[package]] name = "static_assertions" version = "1.1.0" @@ -3507,7 +3556,7 @@ dependencies = [ "anyhow", "async-stream", "bytes", - "clap 4.0.32", + "clap 4.1.1", "const_format", "futures", "futures-core", @@ -3639,9 +3688,9 @@ dependencies = [ [[package]] name = "termcolor" -version = "1.1.3" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bab24d30b911b2376f3a13cc2cd443142f0c81dda04c118693e35b3835757755" +checksum = "be55cf8942feac5c765c2c993422806843c9a9a45d4d5c407ad6dd2ea95eb9b6" dependencies = [ "winapi-util", ] @@ -3749,9 +3798,9 @@ dependencies = [ [[package]] name = "tokio" -version = "1.24.1" +version = "1.24.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d9f76183f91ecfb55e1d7d5602bd1d979e38a3a522fe900241cf195624d67ae" +checksum = "597a12a59981d9e3c38d216785b0c37399f6e415e8d0712047620f189371b0bb" dependencies = [ "autocfg", "bytes", @@ -4071,6 +4120,20 @@ dependencies = [ "tracing-core", ] +[[package]] +name = "tracing-opentelemetry" +version = "0.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "21ebb87a95ea13271332df069020513ab70bdb5637ca42d6e492dc3bbbad48de" +dependencies = [ + "once_cell", + "opentelemetry", + "tracing", + "tracing-core", + "tracing-log", + "tracing-subscriber", +] + [[package]] name = "tracing-serde" version = "0.1.3" @@ -4102,6 +4165,22 @@ dependencies = [ "tracing-serde", ] +[[package]] +name = "tracing-utils" +version = "0.1.0" +dependencies = [ + "hyper", + "opentelemetry", + "opentelemetry-otlp", + "opentelemetry-semantic-conventions", + "reqwest", + "tokio", + "tracing", + "tracing-opentelemetry", + "tracing-subscriber", + "workspace_hack", +] + [[package]] name = "try-lock" version = "0.2.4" @@ -4183,9 +4262,9 @@ checksum = "a156c684c91ea7d62626509bce3cb4e1d9ed5c4d978f7b4352658f96a4c26b4a" [[package]] name = "ureq" -version = "2.6.1" +version = "2.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "733b5ad78377302af52c0dbcb2623d78fe50e4b3bf215948ff29e9ee031d8566" +checksum = "338b31dd1314f68f3aabf3ed57ab922df95ffcd902476ca7ba3c4ce7b908c46d" dependencies = [ "base64 0.13.1", "log", @@ -4226,6 +4305,7 @@ version = "0.1.0" dependencies = [ "anyhow", "async-trait", + "atty", "bincode", "byteorder", "bytes", @@ -4287,7 +4367,7 @@ name = "wal_craft" version = "0.1.0" dependencies = [ "anyhow", - "clap 4.0.32", + "clap 4.1.1", "env_logger", "log", "once_cell", @@ -4534,11 +4614,13 @@ dependencies = [ "anyhow", "bytes", "chrono", - "clap 4.0.32", + "clap 4.1.1", "crossbeam-utils", "either", "fail", + "futures", "futures-channel", + "futures-executor", "futures-task", "futures-util", "indexmap", @@ -4554,6 +4636,9 @@ dependencies = [ "rand", "regex", "regex-syntax", + "reqwest", + "ring", + "rustls", "scopeguard", "serde", "serde_json", @@ -4561,6 +4646,7 @@ dependencies = [ "syn", "tokio", "tokio-util", + "tonic", "tower", "tracing", "tracing-core", diff --git a/Cargo.toml b/Cargo.toml index 74cc16d690..e6695c4246 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -61,6 +61,10 @@ nix = "0.26" notify = "5.0.0" num-traits = "0.2.15" once_cell = "1.13" +opentelemetry = "0.18.0" +opentelemetry-otlp = { version = "0.11.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] } +opentelemetry-semantic-conventions = "0.10.0" +tracing-opentelemetry = "0.18.0" parking_lot = "0.12" pin-project-lite = "0.2" prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency @@ -69,7 +73,7 @@ rand = "0.8" regex = "1.4" reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] } routerify = "3" -rstar = "0.9.3" +rpds = "0.12.0" rustls = "0.20" rustls-pemfile = "1" rustls-split = "0.3" @@ -107,9 +111,6 @@ x509-parser = "0.14" env_logger = "0.10" log = "0.4" -## TODO switch when the new release is made -amplify_num = { git = "https://github.com/rust-amplify/rust-amplify.git", tag = "v4.0.0-beta.1" } - ## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" } postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" } @@ -128,6 +129,7 @@ remote_storage = { version = "0.1", path = "./libs/remote_storage/" } safekeeper_api = { version = "0.1", path = "./libs/safekeeper_api" } storage_broker = { version = "0.1", path = "./storage_broker/" } # Note: main broker code is inside the binary crate, so linking with the library shouldn't be 
heavy. tenant_size_model = { version = "0.1", path = "./libs/tenant_size_model/" } +tracing-utils = { version = "0.1", path = "./libs/tracing-utils/" } utils = { version = "0.1", path = "./libs/utils/" } ## Common library dependency diff --git a/Dockerfile.compute-node-v14 b/Dockerfile.compute-node similarity index 86% rename from Dockerfile.compute-node-v14 rename to Dockerfile.compute-node index 2deb95a93f..936f368833 100644 --- a/Dockerfile.compute-node-v14 +++ b/Dockerfile.compute-node @@ -1,8 +1,5 @@ -# -# This file is identical to the Dockerfile.compute-node-v15 file -# except for the version of Postgres that is built. -# - +ARG REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com +ARG IMAGE=rust ARG TAG=pinned ######################################################################################### @@ -22,7 +19,8 @@ RUN apt update && \ # ######################################################################################### FROM build-deps AS pg-build -COPY vendor/postgres-v14 postgres +ARG PG_VERSION +COPY vendor/postgres-${PG_VERSION} postgres RUN cd postgres && \ ./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp && \ make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \ @@ -135,6 +133,27 @@ RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.0.1.tar.gz -O h3 echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3_postgis.control +######################################################################################### +# +# Layer "unit-pg-build" +# compile unit extension +# +######################################################################################### +FROM build-deps AS unit-pg-build +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ + +RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz && \ + tar xvzf 7.7.tar.gz && \ + cd postgresql-unit-7.7 && \ + make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + # unit extension's "create extension" script relies on absolute install path to fill some reference tables. + # We move the extension from '/usr/local/pgsql/' to '/usr/local/' after it is build. So we need to adjust the path. + # This one-liner removes pgsql/ part of the path. + # NOTE: Other extensions that rely on MODULEDIR variable after building phase will need the same fix. 
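+ # (Concretely: an embedded '/usr/local/pgsql/share/...' reference inside those unit*.sql files becomes '/usr/local/share/...'.)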
+ find /usr/local/pgsql/share/extension/ -name "unit*.sql" -print0 | xargs -0 sed -i "s|pgsql/||g" && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/unit.control + ######################################################################################### # # Layer "neon-pg-ext-build" @@ -146,6 +165,7 @@ COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=h3-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=h3-pg-build /h3/usr / +COPY --from=unit-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY pgxn/ pgxn/ RUN make -j $(getconf _NPROCESSORS_ONLN) \ @@ -158,7 +178,7 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \ # Compile and run the Neon-specific `compute_ctl` binary # ######################################################################################### -FROM 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:$TAG AS compute-tools +FROM $REPOSITORY/$IMAGE:$TAG AS compute-tools USER nonroot # Copy entire project to get Cargo.* files with proper dependencies for the whole project COPY --chown=nonroot . . diff --git a/Dockerfile.compute-node-v15 b/Dockerfile.compute-node-v15 deleted file mode 100644 index 8647ce2bf4..0000000000 --- a/Dockerfile.compute-node-v15 +++ /dev/null @@ -1,220 +0,0 @@ -# -# This file is identical to the Dockerfile.compute-node-v14 file -# except for the version of Postgres that is built. -# - -ARG TAG=pinned - -######################################################################################### -# -# Layer "build-deps" -# -######################################################################################### -FROM debian:bullseye-slim AS build-deps -RUN apt update && \ - apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \ - zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libssl-dev - -######################################################################################### -# -# Layer "pg-build" -# Build Postgres from the neon postgres repository. -# -######################################################################################### -FROM build-deps AS pg-build -COPY vendor/postgres-v15 postgres -RUN cd postgres && \ - ./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp && \ - make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \ - make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \ - # Install headers - make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install && \ - make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/interfaces/libpq install && \ - # Enable some of contrib extensions - echo 'trusted = true' >> /usr/local/pgsql/share/extension/bloom.control && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrowlocks.control && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/intagg.control && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgstattuple.control && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/earthdistance.control - -######################################################################################### -# -# Layer "postgis-build" -# Build PostGIS from the upstream PostGIS mirror. 
-# -######################################################################################### -FROM build-deps AS postgis-build -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN apt update && \ - apt install -y gdal-bin libgdal-dev libprotobuf-c-dev protobuf-c-compiler xsltproc - -RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.1.tar.gz && \ - tar xvzf postgis-3.3.1.tar.gz && \ - cd postgis-3.3.1 && \ - ./autogen.sh && \ - export PATH="/usr/local/pgsql/bin:$PATH" && \ - ./configure && \ - make -j $(getconf _NPROCESSORS_ONLN) install && \ - cd extensions/postgis && \ - make clean && \ - make -j $(getconf _NPROCESSORS_ONLN) install && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis.control && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_raster.control && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_tiger_geocoder.control && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_topology.control && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer.control && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer_data_us.control - -######################################################################################### -# -# Layer "plv8-build" -# Build plv8 -# -######################################################################################### -FROM build-deps AS plv8-build -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN apt update && \ - apt install -y ninja-build python3-dev libc++-dev libc++abi-dev libncurses5 binutils - -# https://github.com/plv8/plv8/issues/475: -# v8 uses gold for linking and sets `--thread-count=4` which breaks -# gold version <= 1.35 (https://sourceware.org/bugzilla/show_bug.cgi?id=23607) -# Install newer gold version manually as debian-testing binutils version updates -# libc version, which in turn breaks other extension built against non-testing libc. 
-RUN wget https://ftp.gnu.org/gnu/binutils/binutils-2.38.tar.gz && \ - tar xvzf binutils-2.38.tar.gz && \ - cd binutils-2.38 && \ - cd libiberty && ./configure && make -j $(getconf _NPROCESSORS_ONLN) && \ - cd ../bfd && ./configure && make bfdver.h && \ - cd ../gold && ./configure && make -j $(getconf _NPROCESSORS_ONLN) && make install && \ - cp /usr/local/bin/ld.gold /usr/bin/gold - -# Sed is used to patch for https://github.com/plv8/plv8/issues/503 -RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \ - tar xvzf v3.1.4.tar.gz && \ - cd plv8-3.1.4 && \ - export PATH="/usr/local/pgsql/bin:$PATH" && \ - sed -i 's/MemoryContextAlloc(/MemoryContextAllocZero(/' plv8.cc && \ - make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \ - rm -rf /plv8-* && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control - -######################################################################################### -# -# Layer "h3-pg-build" -# Build h3_pg -# -######################################################################################### -FROM build-deps AS h3-pg-build -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ - -# packaged cmake is too old -RUN wget https://github.com/Kitware/CMake/releases/download/v3.24.2/cmake-3.24.2-linux-x86_64.sh \ - -q -O /tmp/cmake-install.sh \ - && chmod u+x /tmp/cmake-install.sh \ - && /tmp/cmake-install.sh --skip-license --prefix=/usr/local/ \ - && rm /tmp/cmake-install.sh - -RUN wget https://github.com/uber/h3/archive/refs/tags/v4.0.1.tar.gz -O h3.tgz && \ - tar xvzf h3.tgz && \ - cd h3-4.0.1 && \ - mkdir build && \ - cd build && \ - cmake .. -DCMAKE_BUILD_TYPE=Release && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ - DESTDIR=/h3 make install && \ - cp -R /h3/usr / && \ - rm -rf build - -RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.0.1.tar.gz -O h3-pg.tgz && \ - tar xvzf h3-pg.tgz && \ - cd h3-pg-4.0.1 && \ - export PATH="/usr/local/pgsql/bin:$PATH" && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ - make -j $(getconf _NPROCESSORS_ONLN) install && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3_postgis.control - -######################################################################################### -# -# Layer "neon-pg-ext-build" -# compile neon extensions -# -######################################################################################### -FROM build-deps AS neon-pg-ext-build -COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=h3-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=h3-pg-build /h3/usr / -COPY pgxn/ pgxn/ - -RUN make -j $(getconf _NPROCESSORS_ONLN) \ - PG_CONFIG=/usr/local/pgsql/bin/pg_config \ - -C pgxn/neon \ - -s install - -######################################################################################### -# -# Compile and run the Neon-specific `compute_ctl` binary -# -######################################################################################### -FROM 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:$TAG AS compute-tools -USER nonroot -# Copy entire project to get Cargo.* files with proper dependencies for the whole project -COPY --chown=nonroot . . 
-RUN cd compute_tools && cargo build --locked --profile release-line-debug-size-lto - -######################################################################################### -# -# Clean up postgres folder before inclusion -# -######################################################################################### -FROM neon-pg-ext-build AS postgres-cleanup-layer -COPY --from=neon-pg-ext-build /usr/local/pgsql /usr/local/pgsql - -# Remove binaries from /bin/ that we won't use (or would manually copy & install otherwise) -RUN cd /usr/local/pgsql/bin && rm ecpg raster2pgsql shp2pgsql pgtopo_export pgtopo_import pgsql2shp - -# Remove headers that we won't need anymore - we've completed installation of all extensions -RUN rm -r /usr/local/pgsql/include - -# Remove static postgresql libraries - all compilation is finished, so we -# can now remove these files - they must be included in other binaries by now -# if they were to be used by other libraries. -RUN rm /usr/local/pgsql/lib/lib*.a - -######################################################################################### -# -# Final layer -# Put it all together into the final image -# -######################################################################################### -FROM debian:bullseye-slim -# Add user postgres -RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \ - echo "postgres:test_console_pass" | chpasswd && \ - mkdir /var/db/postgres/compute && mkdir /var/db/postgres/specs && \ - chown -R postgres:postgres /var/db/postgres && \ - chmod 0750 /var/db/postgres/compute && \ - echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig - -COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local -COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl - -# Install: -# libreadline8 for psql -# libossp-uuid16 for extension ossp-uuid -# libgeos, libgdal, libproj and libprotobuf-c1 for PostGIS -RUN apt update && \ - apt install --no-install-recommends -y \ - libreadline8 \ - libossp-uuid16 \ - libgeos-c1v5 \ - libgdal28 \ - libproj19 \ - libprotobuf-c1 \ - gdb && \ - rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* - -USER postgres -ENTRYPOINT ["/usr/local/bin/compute_ctl"] diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 4536604bdf..f8c3481f57 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -11,6 +11,7 @@ clap.workspace = true futures.workspace = true hyper = { workspace = true, features = ["full"] } notify.workspace = true +opentelemetry.workspace = true postgres.workspace = true regex.workspace = true serde.workspace = true @@ -19,7 +20,9 @@ tar.workspace = true tokio = { workspace = true, features = ["rt", "rt-multi-thread"] } tokio-postgres.workspace = true tracing.workspace = true +tracing-opentelemetry.workspace = true tracing-subscriber.workspace = true +tracing-utils.workspace = true url.workspace = true workspace_hack.workspace = true diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index e5ab8eb153..2c42662020 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -53,7 +53,7 @@ use compute_tools::spec::*; use url::Url; fn main() -> Result<()> { - init_logger(DEFAULT_LOG_LEVEL)?; + init_tracing_and_logging(DEFAULT_LOG_LEVEL)?; let matches = cli().get_matches(); @@ -84,6 +84,29 @@ fn main() -> Result<()> { } }; + // Extract OpenTelemetry context for the startup actions from the spec, and 
+ // attach it to the current tracing context. + // + // This is used to propagate the context for the 'start_compute' operation + // from the neon control plane. This allows linking together the wider + // 'start_compute' operation that creates the compute container, with the + // startup actions here within the container. + // + // Switch to the startup context here, and exit it once the startup has + // completed and Postgres is up and running. + // + // NOTE: This is supposed to only cover the *startup* actions. Once + // postgres is configured and up-and-running, we exit this span. Any other + // actions that are performed on incoming HTTP requests, for example, are + // performed in separate spans. + let startup_context_guard = if let Some(ref carrier) = spec.startup_tracing_context { + use opentelemetry::propagation::TextMapPropagator; + use opentelemetry::sdk::propagation::TraceContextPropagator; + Some(TraceContextPropagator::new().extract(carrier).attach()) + } else { + None + }; + let pageserver_connstr = spec .cluster .settings @@ -140,6 +163,9 @@ fn main() -> Result<()> { // Wait for the child Postgres process forever. In this state Ctrl+C will // propagate to Postgres and it will be shut down as well. if let Some(mut pg) = pg { + // Startup is finished, exit the startup tracing span + drop(startup_context_guard); + let ecode = pg .wait() .expect("failed to start waiting on Postgres process"); @@ -159,6 +185,10 @@ fn main() -> Result<()> { info!("shutting down"); } + // Shutdown trace pipeline gracefully, so that it has a chance to send any + // pending traces before we exit. + tracing_utils::shutdown_tracing(); + exit(exit_code.unwrap_or(1)) } diff --git a/compute_tools/src/http/api.rs b/compute_tools/src/http/api.rs index f2a49f332c..589a8e1434 100644 --- a/compute_tools/src/http/api.rs +++ b/compute_tools/src/http/api.rs @@ -3,16 +3,21 @@ use std::net::SocketAddr; use std::sync::Arc; use std::thread; +use crate::compute::ComputeNode; use anyhow::Result; use hyper::service::{make_service_fn, service_fn}; use hyper::{Body, Method, Request, Response, Server, StatusCode}; use serde_json; use tracing::{error, info}; - -use crate::compute::ComputeNode; +use tracing_utils::http::OtelName; // Service function to handle all available routes. -async fn routes(req: Request, compute: Arc) -> Response { +async fn routes(req: Request, compute: &Arc) -> Response { + // + // NOTE: The URI path is currently included in traces. That's OK because + // it doesn't contain any variable parts or sensitive information. But + // please keep that in mind if you change the routing here. + // match (req.method(), req.uri().path()) { // Serialized compute state. (&Method::GET, "/status") => { @@ -30,7 +35,7 @@ async fn routes(req: Request, compute: Arc) -> Response (&Method::POST, "/check_writability") => { info!("serving /check_writability POST request"); - let res = crate::checker::check_writability(&compute).await; + let res = crate::checker::check_writability(compute).await; match res { Ok(_) => Response::new(Body::from("true")), Err(e) => Response::new(Body::from(e.to_string())), @@ -56,7 +61,19 @@ async fn serve(state: Arc) { async move { Ok::<_, Infallible>(service_fn(move |req: Request| { let state = state.clone(); - async move { Ok::<_, Infallible>(routes(req, state).await) } + async move { + Ok::<_, Infallible>( + // NOTE: We include the URI path in the string. It + // doesn't contain any variable parts or sensitive + // information in this API. 
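+ // (The wrapper below, from libs/tracing-utils, runs routes() inside an
+ // OpenTelemetry span and attaches any trace context found in the incoming
+ // request headers, so the request is recorded as part of the caller's trace.)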
+ tracing_utils::http::tracing_handler( + req, + |req| routes(req, &state), + OtelName::UriPath, + ) + .await, + ) + } })) } }); diff --git a/compute_tools/src/logger.rs b/compute_tools/src/logger.rs index 57e5496e86..1b5cf647b0 100644 --- a/compute_tools/src/logger.rs +++ b/compute_tools/src/logger.rs @@ -1,21 +1,37 @@ -use anyhow::Result; +use tracing_opentelemetry::OpenTelemetryLayer; use tracing_subscriber::layer::SubscriberExt; use tracing_subscriber::prelude::*; -/// Initialize `env_logger` using either `default_level` or +/// Initialize logging to stderr, and OpenTelemetry tracing and exporter. +/// +/// Logging is configured using either `default_log_level` or /// `RUST_LOG` environment variable as default log level. -pub fn init_logger(default_level: &str) -> Result<()> { +/// +/// OpenTelemetry is configured with OTLP/HTTP exporter. It picks up +/// configuration from environment variables. For example, to change the destination, +/// set `OTEL_EXPORTER_OTLP_ENDPOINT=http://jaeger:4318`. See +/// `tracing-utils` package description. +/// +pub fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result<()> { + // Initialize Logging let env_filter = tracing_subscriber::EnvFilter::try_from_default_env() - .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new(default_level)); + .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new(default_log_level)); let fmt_layer = tracing_subscriber::fmt::layer() .with_target(false) .with_writer(std::io::stderr); + // Initialize OpenTelemetry + let otlp_layer = + tracing_utils::init_tracing_without_runtime("compute_ctl").map(OpenTelemetryLayer::new); + + // Put it all together tracing_subscriber::registry() .with(env_filter) + .with(otlp_layer) .with(fmt_layer) .init(); + tracing::info!("logging and tracing started"); Ok(()) } diff --git a/compute_tools/src/params.rs b/compute_tools/src/params.rs index 925a2f8ef3..0ce01ff478 100644 --- a/compute_tools/src/params.rs +++ b/compute_tools/src/params.rs @@ -1,3 +1,9 @@ pub const DEFAULT_LOG_LEVEL: &str = "info"; -pub const DEFAULT_CONNSTRING: &str = "host=localhost user=postgres"; +// From Postgres docs: +// To ease transition from the md5 method to the newer SCRAM method, if md5 is specified +// as a method in pg_hba.conf but the user's password on the server is encrypted for SCRAM +// (see below), then SCRAM-based authentication will automatically be chosen instead. +// https://www.postgresql.org/docs/15/auth-password.html +// +// So it's safe to set md5 here, as `control-plane` anyway uses SCRAM for all roles. pub const PG_HBA_ALL_MD5: &str = "host\tall\t\tall\t\t0.0.0.0/0\t\tmd5"; diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 97cd623052..bbd0ec21ed 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -1,3 +1,4 @@ +use std::collections::HashMap; use std::path::Path; use std::str::FromStr; @@ -22,6 +23,8 @@ pub struct ComputeSpec { /// Expected cluster state at the end of transition process. pub cluster: Cluster, pub delta_operations: Option>, + + pub startup_tracing_context: Option>, } /// Cluster state seen from the perspective of the external tools @@ -152,8 +155,20 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> { { RoleAction::Update } else if let Some(pg_pwd) = &r.encrypted_password { - // Check whether password changed or not (trim 'md5:' prefix first) - if pg_pwd[3..] 
!= *role.encrypted_password.as_ref().unwrap() { + // Check whether password changed or not (trim 'md5' prefix first if any) + // + // This is a backward compatibility hack, which comes from the times when we were using + // md5 for everyone and hashes were stored in the console db without md5 prefix. So when + // role comes from the control-plane (json spec) `Role.encrypted_password` doesn't have md5 prefix, + // but when role comes from Postgres (`get_existing_roles` / `existing_roles`) it has this prefix. + // Here is the only place so far where we compare hashes, so it seems to be the best candidate + // to place this compatibility layer. + let pg_pwd = if let Some(stripped) = pg_pwd.strip_prefix("md5") { + stripped + } else { + pg_pwd + }; + if pg_pwd != *role.encrypted_password.as_ref().unwrap() { RoleAction::Update } else { RoleAction::None @@ -372,13 +387,13 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> { name.pg_quote(), db.owner.pg_quote() ); - let _ = info_span!("executing", query).entered(); + let _guard = info_span!("executing", query).entered(); client.execute(query.as_str(), &[])?; } DatabaseAction::Create => { let mut query: String = format!("CREATE DATABASE {} ", name.pg_quote()); query.push_str(&db.to_pg_options()); - let _ = info_span!("executing", query).entered(); + let _guard = info_span!("executing", query).entered(); client.execute(query.as_str(), &[])?; } }; diff --git a/libs/metrics/src/lib.rs b/libs/metrics/src/lib.rs index 880ab0e83c..07d220195b 100644 --- a/libs/metrics/src/lib.rs +++ b/libs/metrics/src/lib.rs @@ -8,6 +8,7 @@ pub use prometheus::opts; pub use prometheus::register; pub use prometheus::{core, default_registry, proto}; pub use prometheus::{exponential_buckets, linear_buckets}; +pub use prometheus::{register_counter_vec, Counter, CounterVec}; pub use prometheus::{register_gauge, Gauge}; pub use prometheus::{register_gauge_vec, GaugeVec}; pub use prometheus::{register_histogram, Histogram}; diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index b5027cb331..0d7aa2db55 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -29,6 +29,14 @@ pub enum TenantState { Broken, } +pub mod state { + pub const LOADING: &str = "loading"; + pub const ATTACHING: &str = "attaching"; + pub const ACTIVE: &str = "active"; + pub const STOPPING: &str = "stopping"; + pub const BROKEN: &str = "broken"; +} + impl TenantState { pub fn has_in_progress_downloads(&self) -> bool { match self { @@ -39,23 +47,32 @@ impl TenantState { Self::Broken => false, } } + + pub fn as_str(&self) -> &'static str { + match self { + TenantState::Loading => state::LOADING, + TenantState::Attaching => state::ATTACHING, + TenantState::Active => state::ACTIVE, + TenantState::Stopping => state::STOPPING, + TenantState::Broken => state::BROKEN, + } + } } /// A state of a timeline in pageserver's memory. #[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] pub enum TimelineState { - /// Timeline is fully operational. If the containing Tenant is Active, the timeline's - /// background jobs are running otherwise they will be launched when the tenant is activated. + /// The timeline is recognized by the pageserver but is not yet operational. + /// In particular, the walreceiver connection loop is not running for this timeline. + /// It will eventually transition to state Active or Broken. + Loading, + /// The timeline is fully operational. 
+ /// It can be queried, and the walreceiver connection loop is running. Active, - /// A timeline is recognized by pageserver, but not yet ready to operate. - /// The status indicates, that the timeline could eventually go back to Active automatically: - /// for example, if the owning tenant goes back to Active again. - Suspended, - /// A timeline is recognized by pageserver, but not yet ready to operate and not allowed to - /// automatically become Active after certain events: only a management call can change this status. + /// The timeline was previously Loading or Active but is shutting down. + /// It cannot transition back into any other state. Stopping, - /// A timeline is recognized by the pageserver, but can no longer be used for - /// any operations, because it failed to be activated. + /// The timeline is broken and not operational (previous states: Loading or Active). Broken, } diff --git a/libs/tracing-utils/Cargo.toml b/libs/tracing-utils/Cargo.toml new file mode 100644 index 0000000000..8c3d3f9063 --- /dev/null +++ b/libs/tracing-utils/Cargo.toml @@ -0,0 +1,17 @@ +[package] +name = "tracing-utils" +version = "0.1.0" +edition.workspace = true +license.workspace = true + +[dependencies] +hyper.workspace = true +opentelemetry = { workspace = true, features=["rt-tokio"] } +opentelemetry-otlp = { workspace = true, default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] } +opentelemetry-semantic-conventions.workspace = true +reqwest = { workspace = true, default-features = false, features = ["rustls-tls"] } +tokio = { workspace = true, features = ["rt", "rt-multi-thread"] } +tracing.workspace = true +tracing-opentelemetry.workspace = true +tracing-subscriber.workspace = true +workspace_hack = { version = "0.1", path = "../../workspace_hack" } diff --git a/libs/tracing-utils/src/http.rs b/libs/tracing-utils/src/http.rs new file mode 100644 index 0000000000..3f80f49de1 --- /dev/null +++ b/libs/tracing-utils/src/http.rs @@ -0,0 +1,96 @@ +//! Tracing wrapper for Hyper HTTP server + +use hyper::HeaderMap; +use hyper::{Body, Request, Response}; +use std::future::Future; +use tracing::Instrument; +use tracing_opentelemetry::OpenTelemetrySpanExt; + +/// Configuration option for what to use as the "otel.name" field in the traces. +pub enum OtelName<'a> { + /// Use a constant string + Constant(&'a str), + + /// Use the path from the request. + /// + /// That's very useful information, but is not appropriate if the + /// path contains parameters that differ on ever request, or worse, + /// sensitive information like usernames or email addresses. + /// + /// See + UriPath, +} + +/// Handle an incoming HTTP request using the given handler function, +/// with OpenTelemetry tracing. +/// +/// This runs 'handler' on the request in a new span, with fields filled in +/// from the request. Notably, if the request contains tracing information, +/// it is propagated to the span, so that this request is traced as part of +/// the same trace. +/// +/// XXX: Usually, this is handled by existing libraries, or built +/// directly into HTTP servers. However, I couldn't find one for Hyper, +/// so I had to write our own. OpenTelemetry website has a registry of +/// instrumentation libraries at: +/// https://opentelemetry.io/registry/?language=rust&component=instrumentation +/// If a Hyper crate appears, consider switching to that. 
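+/// # Example
+///
+/// A minimal usage sketch; `hello` and `serve_one` are hypothetical names, and the
+/// real call site in this patch is in `compute_tools/src/http/api.rs`:
+///
+/// ```rust,no_run
+/// use hyper::{Body, Request, Response};
+/// use tracing_utils::http::{tracing_handler, OtelName};
+///
+/// async fn hello(_req: Request<Body>) -> Response<Body> {
+///     Response::new(Body::from("hello"))
+/// }
+///
+/// async fn serve_one(req: Request<Body>) -> Response<Body> {
+///     // Runs `hello` inside a span named "hello"; any W3C trace context in the
+///     // request headers becomes the parent of that span.
+///     tracing_handler(req, hello, OtelName::Constant("hello")).await
+/// }
+/// ```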
+pub async fn tracing_handler( + req: Request, + handler: F, + otel_name: OtelName<'_>, +) -> Response +where + F: Fn(Request) -> R, + R: Future>, +{ + // Create a tracing span, with context propagated from the incoming + // request if any. + // + // See list of standard fields defined for HTTP requests at + // https://github.com/open-telemetry/opentelemetry-specification/blob/main/specification/trace/semantic_conventions/http.md + // We only fill in a few of the most useful ones here. + let otel_name = match otel_name { + OtelName::Constant(s) => s, + OtelName::UriPath => req.uri().path(), + }; + + let span = tracing::info_span!( + "http request", + otel.name= %otel_name, + http.method = %req.method(), + http.status_code = tracing::field::Empty, + ); + let parent_ctx = extract_remote_context(req.headers()); + span.set_parent(parent_ctx); + + // Handle the request within the span + let response = handler(req).instrument(span.clone()).await; + + // Fill in the fields from the response code + let status = response.status(); + span.record("http.status_code", status.as_str()); + span.record( + "otel.status_code", + if status.is_success() { "OK" } else { "ERROR" }, + ); + + response +} + +// Extract remote tracing context from the HTTP headers +fn extract_remote_context(headers: &HeaderMap) -> opentelemetry::Context { + struct HeaderExtractor<'a>(&'a HeaderMap); + + impl<'a> opentelemetry::propagation::Extractor for HeaderExtractor<'a> { + fn get(&self, key: &str) -> Option<&str> { + self.0.get(key).and_then(|value| value.to_str().ok()) + } + + fn keys(&self) -> Vec<&str> { + self.0.keys().map(|value| value.as_str()).collect() + } + } + let extractor = HeaderExtractor(headers); + opentelemetry::global::get_text_map_propagator(|propagator| propagator.extract(&extractor)) +} diff --git a/libs/tracing-utils/src/lib.rs b/libs/tracing-utils/src/lib.rs new file mode 100644 index 0000000000..de0e2ad799 --- /dev/null +++ b/libs/tracing-utils/src/lib.rs @@ -0,0 +1,168 @@ +//! Helper functions to set up OpenTelemetry tracing. +//! +//! This comes in two variants, depending on whether you have a Tokio runtime available. +//! If you do, call `init_tracing()`. It sets up the trace processor and exporter to use +//! the current tokio runtime. If you don't have a runtime available, or you don't want +//! to share the runtime with the tracing tasks, call `init_tracing_without_runtime()` +//! instead. It sets up a dedicated single-threaded Tokio runtime for the tracing tasks. +//! +//! Example: +//! +//! ```rust,no_run +//! use tracing_subscriber::prelude::*; +//! use tracing_opentelemetry::OpenTelemetryLayer; +//! +//! #[tokio::main] +//! async fn main() { +//! // Set up logging to stderr +//! let env_filter = tracing_subscriber::EnvFilter::try_from_default_env() +//! .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new("info")); +//! let fmt_layer = tracing_subscriber::fmt::layer() +//! .with_target(false) +//! .with_writer(std::io::stderr); +//! +//! // Initialize OpenTelemetry. Exports tracing spans as OpenTelemetry traces +//! let otlp_layer = tracing_utils::init_tracing("my_application").await.map(OpenTelemetryLayer::new); +//! +//! // Put it all together +//! tracing_subscriber::registry() +//! .with(env_filter) +//! .with(otlp_layer) +//! .with(fmt_layer) +//! .init(); +//! } +//! 
``` + +use opentelemetry::sdk::Resource; +use opentelemetry::KeyValue; +use opentelemetry_otlp::WithExportConfig; +use opentelemetry_otlp::{OTEL_EXPORTER_OTLP_ENDPOINT, OTEL_EXPORTER_OTLP_TRACES_ENDPOINT}; + +pub use tracing_opentelemetry::OpenTelemetryLayer; + +pub mod http; + +/// Set up OpenTelemetry exporter, using configuration from environment variables. +/// +/// `service_name` is set as the OpenTelemetry 'service.name' resource (see +/// ) +/// +/// We try to follow the conventions for the environment variables specified in +/// +/// +/// However, we only support a subset of those options: +/// +/// - OTEL_SDK_DISABLED is supported. The default is "false", meaning tracing +/// is enabled by default. Set it to "true" to disable. +/// +/// - We use the OTLP exporter, with HTTP protocol. Most of the OTEL_EXPORTER_OTLP_* +/// settings specified in +/// +/// are supported, as they are handled by the `opentelemetry-otlp` crate. +/// Settings related to other exporters have no effect. +/// +/// - Some other settings are supported by the `opentelemetry` crate. +/// +/// If you need some other setting, please test if it works first. And perhaps +/// add a comment in the list above to save the effort of testing for the next +/// person. +/// +/// This doesn't block, but is marked as 'async' to hint that this must be called in +/// asynchronous execution context. +pub async fn init_tracing(service_name: &str) -> Option { + if std::env::var("OTEL_SDK_DISABLED") == Ok("true".to_string()) { + return None; + }; + Some(init_tracing_internal(service_name.to_string())) +} + +/// Like `init_tracing`, but creates a separate tokio Runtime for the tracing +/// tasks. +pub fn init_tracing_without_runtime( + service_name: &str, +) -> Option { + if std::env::var("OTEL_SDK_DISABLED") == Ok("true".to_string()) { + return None; + }; + + // The opentelemetry batch processor and the OTLP exporter needs a Tokio + // runtime. Create a dedicated runtime for them. One thread should be + // enough. + // + // (Alternatively, instead of batching, we could use the "simple + // processor", which doesn't need Tokio, and use "reqwest-blocking" + // feature for the OTLP exporter, which also doesn't need Tokio. However, + // batching is considered best practice, and also I have the feeling that + // the non-Tokio codepaths in the opentelemetry crate are less used and + // might be more buggy, so better to stay on the well-beaten path.) + // + // We leak the runtime so that it keeps running after we exit the + // function. + let runtime = Box::leak(Box::new( + tokio::runtime::Builder::new_multi_thread() + .enable_all() + .thread_name("otlp runtime thread") + .worker_threads(1) + .build() + .unwrap(), + )); + let _guard = runtime.enter(); + + Some(init_tracing_internal(service_name.to_string())) +} + +fn init_tracing_internal(service_name: String) -> opentelemetry::sdk::trace::Tracer { + // Set up exporter from the OTEL_EXPORTER_* environment variables + let mut exporter = opentelemetry_otlp::new_exporter().http().with_env(); + + // XXX opentelemetry-otlp v0.18.0 has a bug in how it uses the + // OTEL_EXPORTER_OTLP_ENDPOINT env variable. According to the + // OpenTelemetry spec at + // , + // the full exporter URL is formed by appending "/v1/traces" to the value + // of OTEL_EXPORTER_OTLP_ENDPOINT. However, opentelemetry-otlp only does + // that with the grpc-tonic exporter. Other exporters, like the HTTP + // exporter, use the URL from OTEL_EXPORTER_OTLP_ENDPOINT as is, without + // appending "/v1/traces". 
+ // + // See https://github.com/open-telemetry/opentelemetry-rust/pull/950 + // + // Work around that by checking OTEL_EXPORTER_OTLP_ENDPOINT, and setting + // the endpoint url with the "/v1/traces" path ourselves. If the bug is + // fixed in a later version, we can remove this code. But if we don't + // remember to remove this, it won't do any harm either, as the crate will + // just ignore the OTEL_EXPORTER_OTLP_ENDPOINT setting when the endpoint + // is set directly with `with_endpoint`. + if std::env::var(OTEL_EXPORTER_OTLP_TRACES_ENDPOINT).is_err() { + if let Ok(mut endpoint) = std::env::var(OTEL_EXPORTER_OTLP_ENDPOINT) { + if !endpoint.ends_with('/') { + endpoint.push('/'); + } + endpoint.push_str("v1/traces"); + exporter = exporter.with_endpoint(endpoint); + } + } + + // Propagate trace information in the standard W3C TraceContext format. + opentelemetry::global::set_text_map_propagator( + opentelemetry::sdk::propagation::TraceContextPropagator::new(), + ); + + opentelemetry_otlp::new_pipeline() + .tracing() + .with_exporter(exporter) + .with_trace_config( + opentelemetry::sdk::trace::config().with_resource(Resource::new(vec![KeyValue::new( + opentelemetry_semantic_conventions::resource::SERVICE_NAME, + service_name, + )])), + ) + .install_batch(opentelemetry::runtime::Tokio) + .expect("could not initialize opentelemetry exporter") +} + +// Shutdown trace pipeline gracefully, so that it has a chance to send any +// pending traces before we exit. +pub fn shutdown_tracing() { + opentelemetry::global::shutdown_tracer_provider(); +} diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index 020e4d9dd7..1f6c96bdbe 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -5,6 +5,7 @@ edition.workspace = true license.workspace = true [dependencies] +atty.workspace = true sentry.workspace = true async-trait.workspace = true anyhow.workspace = true diff --git a/libs/utils/src/http/error.rs b/libs/utils/src/http/error.rs index b0ecb746d9..1ba0422993 100644 --- a/libs/utils/src/http/error.rs +++ b/libs/utils/src/http/error.rs @@ -1,6 +1,7 @@ use hyper::{header, Body, Response, StatusCode}; use serde::{Deserialize, Serialize}; use thiserror::Error; +use tracing::error; #[derive(Debug, Error)] pub enum ApiError { @@ -76,8 +77,16 @@ impl HttpErrorBody { } pub async fn handler(err: routerify::RouteError) -> Response { - tracing::error!("Error processing HTTP request: {:?}", err); - err.downcast::() - .expect("handler should always return api error") - .into_response() + let api_error = err + .downcast::() + .expect("handler should always return api error"); + + // Print a stack trace for Internal Server errors + if let ApiError::InternalServerError(_) = api_error.as_ref() { + error!("Error processing HTTP request: {api_error:?}"); + } else { + error!("Error processing HTTP request: {api_error:#}"); + } + + api_error.into_response() } diff --git a/libs/utils/src/logging.rs b/libs/utils/src/logging.rs index 82c9267f4a..02684d3d16 100644 --- a/libs/utils/src/logging.rs +++ b/libs/utils/src/logging.rs @@ -34,7 +34,7 @@ pub fn init(log_format: LogFormat) -> anyhow::Result<()> { let base_logger = tracing_subscriber::fmt() .with_env_filter(env_filter) .with_target(false) - .with_ansi(false) + .with_ansi(atty::is(atty::Stream::Stdout)) .with_writer(std::io::stdout); match log_format { diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index cb9e4478bf..66c25e8576 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -11,7 +11,6 @@ default = [] testing = 
["fail/failpoints"] [dependencies] -amplify_num.workspace = true anyhow.workspace = true async-stream.workspace = true async-trait.workspace = true @@ -41,7 +40,6 @@ postgres-protocol.workspace = true postgres-types.workspace = true rand.workspace = true regex.workspace = true -rstar.workspace = true scopeguard.workspace = true serde.workspace = true serde_json = { workspace = true, features = ["raw_value"] } @@ -68,6 +66,7 @@ tenant_size_model.workspace = true utils.workspace = true workspace_hack.workspace = true reqwest.workspace = true +rpds.workspace = true [dev-dependencies] criterion.workspace = true diff --git a/pageserver/benches/bench_layer_map.rs b/pageserver/benches/bench_layer_map.rs index 6a01fdfc6f..e18c00da96 100644 --- a/pageserver/benches/bench_layer_map.rs +++ b/pageserver/benches/bench_layer_map.rs @@ -1,13 +1,12 @@ -use anyhow::Result; +use pageserver::keyspace::{KeyPartitioning, KeySpace}; use pageserver::repository::Key; use pageserver::tenant::layer_map::LayerMap; -use pageserver::tenant::storage_layer::{DeltaFileName, ImageFileName, ValueReconstructState}; -use pageserver::tenant::storage_layer::{Layer, ValueReconstructResult}; +use pageserver::tenant::storage_layer::Layer; +use pageserver::tenant::storage_layer::{DeltaFileName, ImageFileName, LayerDescriptor}; use rand::prelude::{SeedableRng, SliceRandom, StdRng}; use std::cmp::{max, min}; use std::fs::File; use std::io::{BufRead, BufReader}; -use std::ops::Range; use std::path::PathBuf; use std::str::FromStr; use std::sync::Arc; @@ -17,102 +16,35 @@ use utils::lsn::Lsn; use criterion::{criterion_group, criterion_main, Criterion}; -struct DummyDelta { - key_range: Range, - lsn_range: Range, -} - -impl Layer for DummyDelta { - fn get_key_range(&self) -> Range { - self.key_range.clone() - } - - fn get_lsn_range(&self) -> Range { - self.lsn_range.clone() - } - fn get_value_reconstruct_data( - &self, - _key: Key, - _lsn_range: Range, - _reconstruct_data: &mut ValueReconstructState, - ) -> Result { - panic!() - } - - fn is_incremental(&self) -> bool { - true - } - - fn dump(&self, _verbose: bool) -> Result<()> { - unimplemented!() - } - - fn short_id(&self) -> String { - unimplemented!() - } -} - -struct DummyImage { - key_range: Range, - lsn: Lsn, -} - -impl Layer for DummyImage { - fn get_key_range(&self) -> Range { - self.key_range.clone() - } - - fn get_lsn_range(&self) -> Range { - // End-bound is exclusive - self.lsn..(self.lsn + 1) - } - - fn get_value_reconstruct_data( - &self, - _key: Key, - _lsn_range: Range, - _reconstruct_data: &mut ValueReconstructState, - ) -> Result { - panic!() - } - - fn is_incremental(&self) -> bool { - false - } - - fn dump(&self, _verbose: bool) -> Result<()> { - unimplemented!() - } - - fn short_id(&self) -> String { - unimplemented!() - } -} - -fn build_layer_map(filename_dump: PathBuf) -> LayerMap { - let mut layer_map = LayerMap::::default(); +fn build_layer_map(filename_dump: PathBuf) -> LayerMap { + let mut layer_map = LayerMap::::default(); let mut min_lsn = Lsn(u64::MAX); let mut max_lsn = Lsn(0); let filenames = BufReader::new(File::open(filename_dump).unwrap()).lines(); + let mut updates = layer_map.batch_update(); for fname in filenames { let fname = &fname.unwrap(); if let Some(imgfilename) = ImageFileName::parse_str(fname) { - let layer = DummyImage { - key_range: imgfilename.key_range, - lsn: imgfilename.lsn, + let layer = LayerDescriptor { + key: imgfilename.key_range, + lsn: imgfilename.lsn..(imgfilename.lsn + 1), + is_incremental: false, + short_id: 
fname.to_string(), }; - layer_map.insert_historic(Arc::new(layer)); + updates.insert_historic(Arc::new(layer)); min_lsn = min(min_lsn, imgfilename.lsn); max_lsn = max(max_lsn, imgfilename.lsn); } else if let Some(deltafilename) = DeltaFileName::parse_str(fname) { - let layer = DummyDelta { - key_range: deltafilename.key_range, - lsn_range: deltafilename.lsn_range.clone(), + let layer = LayerDescriptor { + key: deltafilename.key_range.clone(), + lsn: deltafilename.lsn_range.clone(), + is_incremental: true, + short_id: fname.to_string(), }; - layer_map.insert_historic(Arc::new(layer)); + updates.insert_historic(Arc::new(layer)); min_lsn = min(min_lsn, deltafilename.lsn_range.start); max_lsn = max(max_lsn, deltafilename.lsn_range.end); } else { @@ -122,11 +54,12 @@ fn build_layer_map(filename_dump: PathBuf) -> LayerMap { println!("min: {min_lsn}, max: {max_lsn}"); + updates.flush(); layer_map } /// Construct a layer map query pattern for benchmarks -fn uniform_query_pattern(layer_map: &LayerMap) -> Vec<(Key, Lsn)> { +fn uniform_query_pattern(layer_map: &LayerMap) -> Vec<(Key, Lsn)> { // For each image layer we query one of the pages contained, at LSN right // before the image layer was created. This gives us a somewhat uniform // coverage of both the lsn and key space because image layers have @@ -150,6 +83,41 @@ fn uniform_query_pattern(layer_map: &LayerMap) -> Vec<(Key, Lsn)> { .collect() } +// Construct a partitioning for testing get_difficulty map when we +// don't have an exact result of `collect_keyspace` to work with. +fn uniform_key_partitioning(layer_map: &LayerMap, _lsn: Lsn) -> KeyPartitioning { + let mut parts = Vec::new(); + + // We add a partition boundary at the start of each image layer, + // no matter what lsn range it covers. This is just the easiest + // thing to do. A better thing to do would be to get a real + // partitioning from some database. Even better, remove the need + // for key partitions by deciding where to create image layers + // directly based on a coverage-based difficulty map. + let mut keys: Vec<_> = layer_map + .iter_historic_layers() + .filter_map(|l| { + if l.is_incremental() { + None + } else { + let kr = l.get_key_range(); + Some(kr.start.next()) + } + }) + .collect(); + keys.sort(); + + let mut current_key = Key::from_hex("000000000000000000000000000000000000").unwrap(); + for key in keys { + parts.push(KeySpace { + ranges: vec![current_key..key], + }); + current_key = key; + } + + KeyPartitioning { parts } +} + // Benchmark using metadata extracted from our performance test environment, from // a project where we have run pgbench many times. The pgbench database was initialized // between each test run. @@ -183,24 +151,68 @@ fn bench_from_captest_env(c: &mut Criterion) { // Benchmark using metadata extracted from a real project that was taking // too long processing layer map queries.
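As an aside before the real-project benchmark below: the benchmarks above now go through the batch-update API (`batch_update()`, `insert_historic()`, `flush()`) instead of inserting layers one by one. Here is a minimal, self-contained sketch of that flow, assuming the same crate paths the benchmark uses; the single hard-coded image layer is purely illustrative.

```rust
// Illustrative sketch only (not part of the patch): build a LayerMap with one
// image layer via the batch-update flow used by the benchmarks above.
use std::sync::Arc;

use pageserver::repository::Key;
use pageserver::tenant::layer_map::LayerMap;
use pageserver::tenant::storage_layer::LayerDescriptor;
use utils::lsn::Lsn;

fn example() {
    let mut layer_map = LayerMap::default();

    // Inserts are staged on the update handle and become visible on flush().
    let mut updates = layer_map.batch_update();
    let zero = Key::from_hex("000000000000000000000000000000000000").unwrap();
    updates.insert_historic(Arc::new(LayerDescriptor {
        key: zero..zero.add(10),
        // Image layers cover a single LSN; the end bound is exclusive.
        lsn: Lsn(10)..Lsn(11),
        is_incremental: false,
        short_id: "example image layer".to_string(),
    }));
    updates.flush();

    // Queries go through the map itself, as in the benchmarks.
    let _ = layer_map.search(zero.add(5), Lsn(20));
}
```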
fn bench_from_real_project(c: &mut Criterion) { - // TODO consider compressing this file + // Init layer map + let now = Instant::now(); let layer_map = build_layer_map(PathBuf::from("benches/odd-brook-layernames.txt")); + println!("Finished layer map init in {:?}", now.elapsed()); + + // Choose uniformly distributed queries let queries: Vec<(Key, Lsn)> = uniform_query_pattern(&layer_map); - // Test with uniform query pattern - c.bench_function("real_map_uniform_queries", |b| { + // Choose inputs for get_difficulty_map + let latest_lsn = layer_map + .iter_historic_layers() + .map(|l| l.get_lsn_range().end) + .max() + .unwrap(); + let partitioning = uniform_key_partitioning(&layer_map, latest_lsn); + + // Check correctness of get_difficulty_map + // TODO put this in a dedicated test outside of this mod + { + println!("running correctness check"); + + let now = Instant::now(); + let result_bruteforce = layer_map.get_difficulty_map_bruteforce(latest_lsn, &partitioning); + assert!(result_bruteforce.len() == partitioning.parts.len()); + println!("Finished bruteforce in {:?}", now.elapsed()); + + let now = Instant::now(); + let result_fast = layer_map.get_difficulty_map(latest_lsn, &partitioning, None); + assert!(result_fast.len() == partitioning.parts.len()); + println!("Finished fast in {:?}", now.elapsed()); + + // Assert results are equal. Manually iterate for easier debugging. + let zip = std::iter::zip( + &partitioning.parts, + std::iter::zip(result_bruteforce, result_fast), + ); + for (_part, (bruteforce, fast)) in zip { + assert_eq!(bruteforce, fast); + } + + println!("No issues found"); + } + + // Define and name the benchmark function + let mut group = c.benchmark_group("real_map"); + group.bench_function("uniform_queries", |b| { b.iter(|| { for q in queries.clone().into_iter() { layer_map.search(q.0, q.1); } }); }); + group.bench_function("get_difficulty_map", |b| { + b.iter(|| { + layer_map.get_difficulty_map(latest_lsn, &partitioning, Some(3)); + }); + }); + group.finish(); } // Benchmark using synthetic data. Arrange image layers on stacked diagonal lines. fn bench_sequential(c: &mut Criterion) { - let mut layer_map: LayerMap = LayerMap::default(); - // Init layer map. Create 100_000 layers arranged in 1000 diagonal lines. // // TODO This code is pretty slow and runs even if we're only running other @@ -208,39 +220,39 @@ fn bench_sequential(c: &mut Criterion) { // Putting it inside the `bench_function` closure is not a solution // because then it runs multiple times during warmup. let now = Instant::now(); + let mut layer_map = LayerMap::default(); + let mut updates = layer_map.batch_update(); for i in 0..100_000 { - // TODO try inserting a super-wide layer in between every 10 to reflect - // what often happens with L1 layers that include non-rel changes. - // Maybe do that as a separate test. let i32 = (i as u32) % 100; let zero = Key::from_hex("000000000000000000000000000000000000").unwrap(); - let layer = DummyImage { - key_range: zero.add(10 * i32)..zero.add(10 * i32 + 1), - lsn: Lsn(10 * i), + let layer = LayerDescriptor { + key: zero.add(10 * i32)..zero.add(10 * i32 + 1), + lsn: Lsn(i)..Lsn(i + 1), + is_incremental: false, + short_id: format!("Layer {}", i), }; - layer_map.insert_historic(Arc::new(layer)); + updates.insert_historic(Arc::new(layer)); } - - // Manually measure runtime without criterion because criterion - // has a minimum sample size of 10 and I don't want to run it 10 times. 
- println!("Finished init in {:?}", now.elapsed()); + updates.flush(); + println!("Finished layer map init in {:?}", now.elapsed()); // Choose 100 uniformly random queries let rng = &mut StdRng::seed_from_u64(1); let queries: Vec<(Key, Lsn)> = uniform_query_pattern(&layer_map) - .choose_multiple(rng, 1) + .choose_multiple(rng, 100) .copied() .collect(); // Define and name the benchmark function - c.bench_function("sequential_uniform_queries", |b| { - // Run the search queries + let mut group = c.benchmark_group("sequential"); + group.bench_function("uniform_queries", |b| { b.iter(|| { for q in queries.clone().into_iter() { layer_map.search(q.0, q.1); } }); }); + group.finish(); } criterion_group!(group_1, bench_from_captest_env); diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index f1d92ac36b..06d4853274 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -27,6 +27,7 @@ use tracing::*; /// use tokio_tar::{Builder, EntryType, Header}; +use crate::context::RequestContext; use crate::tenant::Timeline; use pageserver_api::reltag::{RelTag, SlruKind}; @@ -52,6 +53,7 @@ pub async fn send_basebackup_tarball<'a, W>( req_lsn: Option, prev_lsn: Option, full_backup: bool, + ctx: &'a RequestContext, ) -> anyhow::Result<()> where W: AsyncWrite + Send + Sync + Unpin, @@ -110,6 +112,7 @@ where lsn: backup_lsn, prev_record_lsn: prev_lsn, full_backup, + ctx, }; basebackup .send_tarball() @@ -129,6 +132,7 @@ where lsn: Lsn, prev_record_lsn: Lsn, full_backup: bool, + ctx: &'a RequestContext, } impl<'a, W> Basebackup<'a, W> @@ -171,23 +175,37 @@ where SlruKind::MultiXactOffsets, SlruKind::MultiXactMembers, ] { - for segno in self.timeline.list_slru_segments(kind, self.lsn).await? { + for segno in self + .timeline + .list_slru_segments(kind, self.lsn, self.ctx) + .await? + { self.add_slru_segment(kind, segno).await?; } } // Create tablespace directories - for ((spcnode, dbnode), has_relmap_file) in self.timeline.list_dbdirs(self.lsn).await? { + for ((spcnode, dbnode), has_relmap_file) in + self.timeline.list_dbdirs(self.lsn, self.ctx).await? + { self.add_dbdir(spcnode, dbnode, has_relmap_file).await?; // Gather and send relational files in each database if full backup is requested. if self.full_backup { - for rel in self.timeline.list_rels(spcnode, dbnode, self.lsn).await? { + for rel in self + .timeline + .list_rels(spcnode, dbnode, self.lsn, self.ctx) + .await? + { self.add_rel(rel).await?; } } } - for xid in self.timeline.list_twophase_files(self.lsn).await? { + for xid in self + .timeline + .list_twophase_files(self.lsn, self.ctx) + .await? 
+ { self.add_twophase_file(xid).await?; } @@ -203,7 +221,10 @@ where } async fn add_rel(&mut self, tag: RelTag) -> anyhow::Result<()> { - let nblocks = self.timeline.get_rel_size(tag, self.lsn, false).await?; + let nblocks = self + .timeline + .get_rel_size(tag, self.lsn, false, self.ctx) + .await?; // If the relation is empty, create an empty file if nblocks == 0 { @@ -223,7 +244,7 @@ where for blknum in startblk..endblk { let img = self .timeline - .get_rel_page_at_lsn(tag, blknum, self.lsn, false) + .get_rel_page_at_lsn(tag, blknum, self.lsn, false, self.ctx) .await?; segment_data.extend_from_slice(&img[..]); } @@ -245,14 +266,14 @@ where async fn add_slru_segment(&mut self, slru: SlruKind, segno: u32) -> anyhow::Result<()> { let nblocks = self .timeline - .get_slru_segment_size(slru, segno, self.lsn) + .get_slru_segment_size(slru, segno, self.lsn, self.ctx) .await?; let mut slru_buf: Vec = Vec::with_capacity(nblocks as usize * BLCKSZ as usize); for blknum in 0..nblocks { let img = self .timeline - .get_slru_page_at_lsn(slru, segno, blknum, self.lsn) + .get_slru_page_at_lsn(slru, segno, blknum, self.lsn, self.ctx) .await?; if slru == SlruKind::Clog { @@ -287,7 +308,7 @@ where let relmap_img = if has_relmap_file { let img = self .timeline - .get_relmap_file(spcnode, dbnode, self.lsn) + .get_relmap_file(spcnode, dbnode, self.lsn, self.ctx) .await?; ensure!(img.len() == 512); Some(img) @@ -323,7 +344,7 @@ where if !has_relmap_file && self .timeline - .list_rels(spcnode, dbnode, self.lsn) + .list_rels(spcnode, dbnode, self.lsn, self.ctx) .await? .is_empty() { @@ -356,7 +377,10 @@ where // Extract twophase state files // async fn add_twophase_file(&mut self, xid: TransactionId) -> anyhow::Result<()> { - let img = self.timeline.get_twophase_file(xid, self.lsn).await?; + let img = self + .timeline + .get_twophase_file(xid, self.lsn, self.ctx) + .await?; let mut buf = BytesMut::new(); buf.extend_from_slice(&img[..]); @@ -394,12 +418,12 @@ where let checkpoint_bytes = self .timeline - .get_checkpoint(self.lsn) + .get_checkpoint(self.lsn, self.ctx) .await .context("failed to get checkpoint bytes")?; let pg_control_bytes = self .timeline - .get_control_file(self.lsn) + .get_control_file(self.lsn, self.ctx) .await .context("failed get control bytes")?; diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 5de6e4def5..f2cd93bd3a 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -13,6 +13,7 @@ use tracing::*; use metrics::set_build_info_metric; use pageserver::{ config::{defaults::*, PageServerConf}, + context::{DownloadBehavior, RequestContext}, http, page_cache, page_service, task_mgr, task_mgr::TaskKind, task_mgr::{ @@ -26,7 +27,7 @@ use utils::{ logging, postgres_backend::AuthType, project_git_version, - sentry_init::{init_sentry, release_name}, + sentry_init::init_sentry, signals::{self, Signal}, tcp_listener, }; @@ -85,7 +86,10 @@ fn main() -> anyhow::Result<()> { }; // initialize sentry if SENTRY_DSN is provided - let _sentry_guard = init_sentry(release_name!(), &[("node_id", &conf.id.to_string())]); + let _sentry_guard = init_sentry( + Some(GIT_VERSION.into()), + &[("node_id", &conf.id.to_string())], + ); let tenants_path = conf.tenants_path(); if !tenants_path.exists() { @@ -246,7 +250,7 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> { let signals = signals::install_shutdown_handlers()?; // Launch broker client - WALRECEIVER_RUNTIME.block_on(pageserver::walreceiver::init_broker_client(conf))?; 
+ WALRECEIVER_RUNTIME.block_on(pageserver::broker_client::init_broker_client(conf))?; // Initialize authentication for incoming connections let auth = match &conf.auth_type { @@ -325,6 +329,13 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> { ); if let Some(metric_collection_endpoint) = &conf.metric_collection_endpoint { + let metrics_ctx = RequestContext::todo_child( + TaskKind::MetricsCollection, + // This task itself shouldn't download anything. + // The actual size calculation does need downloads, and + // creates a child context with the right DownloadBehavior. + DownloadBehavior::Error, + ); task_mgr::spawn( MGMT_REQUEST_RUNTIME.handle(), TaskKind::MetricsCollection, @@ -338,6 +349,7 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> { conf.metric_collection_interval, conf.synthetic_size_calculation_interval, conf.id, + metrics_ctx, ) .instrument(info_span!("metrics_collection")) .await?; @@ -349,17 +361,34 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> { // Spawn a task to listen for libpq connections. It will spawn further tasks // for each connection. We created the listener earlier already. - task_mgr::spawn( - COMPUTE_REQUEST_RUNTIME.handle(), - TaskKind::LibpqEndpointListener, - None, - None, - "libpq endpoint listener", - true, - async move { - page_service::libpq_listener_main(conf, auth, pageserver_listener, conf.auth_type).await - }, - ); + { + let libpq_ctx = RequestContext::todo_child( + TaskKind::LibpqEndpointListener, + // listener task shouldn't need to download anything. (We will + // create a separate sub-context for each connection, with its + // own download behavior. This context is used only to listen and + // accept connections.) + DownloadBehavior::Error, + ); + task_mgr::spawn( + COMPUTE_REQUEST_RUNTIME.handle(), + TaskKind::LibpqEndpointListener, + None, + None, + "libpq endpoint listener", + true, + async move { + page_service::libpq_listener_main( + conf, + auth, + pageserver_listener, + conf.auth_type, + libpq_ctx, + ) + .await + }, + ); + } // All started up! Now just sit and wait for shutdown signal. signals.handle(|signal| match signal { diff --git a/pageserver/src/broker_client.rs b/pageserver/src/broker_client.rs new file mode 100644 index 0000000000..6c92967ca3 --- /dev/null +++ b/pageserver/src/broker_client.rs @@ -0,0 +1,48 @@ +//! The broker client instance of the pageserver, created during pageserver startup. +//! Used by each timeline's [`walreceiver`]. + +use crate::config::PageServerConf; + +use anyhow::Context; +use once_cell::sync::OnceCell; +use storage_broker::BrokerClientChannel; +use tracing::*; + +static BROKER_CLIENT: OnceCell = OnceCell::new(); + +/// +/// Initialize the broker client. This must be called once at page server startup. +/// +pub async fn init_broker_client(conf: &'static PageServerConf) -> anyhow::Result<()> { + let broker_endpoint = conf.broker_endpoint.clone(); + + // Note: we do not attempt connecting here (but validate endpoints sanity).
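The remainder of this new module (continued below) follows a plain init-once/read-many pattern around a `OnceCell`. A tiny self-contained sketch of that pattern, with a `String` standing in for the real `BrokerClientChannel`; only `once_cell::sync::OnceCell` is real here, the other names are illustrative.

```rust
// Sketch of the init-once / read-many pattern used by broker_client.
// A String stands in for BrokerClientChannel.
use once_cell::sync::OnceCell;

static CLIENT: OnceCell<String> = OnceCell::new();

fn init_client(endpoint: &str) {
    // Called exactly once at startup; a second call panics, mirroring
    // how init_broker_client treats an already-set BROKER_CLIENT.
    if CLIENT.set(format!("client for {endpoint}")).is_err() {
        panic!("client already initialized");
    }
}

fn get_client() -> &'static String {
    CLIENT.get().expect("client not initialized")
}

fn main() {
    init_client("http://storage-broker.local:50051");
    println!("using {}", get_client());
}
```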
+ let broker_client = + storage_broker::connect(broker_endpoint.clone(), conf.broker_keepalive_interval).context( + format!( + "Failed to create broker client to {}", + &conf.broker_endpoint + ), + )?; + + if BROKER_CLIENT.set(broker_client).is_err() { + panic!("broker already initialized"); + } + + info!( + "Initialized broker client with endpoints: {}", + broker_endpoint + ); + Ok(()) +} + +/// +/// Get a handle to the broker client +/// +pub fn get_broker_client() -> &'static BrokerClientChannel { + BROKER_CLIENT.get().expect("broker client not initialized") +} + +pub fn is_broker_client_initialized() -> bool { + BROKER_CLIENT.get().is_some() +} diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 51d1664e52..a3b051279d 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -158,6 +158,8 @@ pub struct PageServerConf { pub synthetic_size_calculation_interval: Duration, pub test_remote_failures: u64, + + pub ondemand_download_behavior_treat_error_as_warn: bool, } /// We do not want to store this in a PageServerConf because the latter may be logged @@ -222,6 +224,8 @@ struct PageServerConfigBuilder { synthetic_size_calculation_interval: BuilderValue, test_remote_failures: BuilderValue, + + ondemand_download_behavior_treat_error_as_warn: BuilderValue, } impl Default for PageServerConfigBuilder { @@ -267,6 +271,8 @@ impl Default for PageServerConfigBuilder { metric_collection_endpoint: Set(DEFAULT_METRIC_COLLECTION_ENDPOINT), test_remote_failures: Set(0), + + ondemand_download_behavior_treat_error_as_warn: Set(false), } } } @@ -363,6 +369,14 @@ impl PageServerConfigBuilder { self.test_remote_failures = BuilderValue::Set(fail_first); } + pub fn ondemand_download_behavior_treat_error_as_warn( + &mut self, + ondemand_download_behavior_treat_error_as_warn: bool, + ) { + self.ondemand_download_behavior_treat_error_as_warn = + BuilderValue::Set(ondemand_download_behavior_treat_error_as_warn); + } + pub fn build(self) -> anyhow::Result { Ok(PageServerConf { listen_pg_addr: self @@ -422,6 +436,11 @@ impl PageServerConfigBuilder { test_remote_failures: self .test_remote_failures .ok_or(anyhow!("missing test_remote_failuers"))?, + ondemand_download_behavior_treat_error_as_warn: self + .ondemand_download_behavior_treat_error_as_warn + .ok_or(anyhow!( + "missing ondemand_download_behavior_treat_error_as_warn" + ))?, }) } } @@ -600,6 +619,7 @@ impl PageServerConf { "synthetic_size_calculation_interval" => builder.synthetic_size_calculation_interval(parse_toml_duration(key, item)?), "test_remote_failures" => builder.test_remote_failures(parse_toml_u64(key, item)?), + "ondemand_download_behavior_treat_error_as_warn" => builder.ondemand_download_behavior_treat_error_as_warn(parse_toml_bool(key, item)?), _ => bail!("unrecognized pageserver option '{key}'"), } } @@ -724,6 +744,7 @@ impl PageServerConf { metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT, synthetic_size_calculation_interval: Duration::from_secs(60), test_remote_failures: 0, + ondemand_download_behavior_treat_error_as_warn: false, } } } @@ -749,6 +770,11 @@ fn parse_toml_u64(name: &str, item: &Item) -> Result { Ok(i as u64) } +fn parse_toml_bool(name: &str, item: &Item) -> Result { + item.as_bool() + .with_context(|| format!("configure option {name} is not a bool")) +} + fn parse_toml_duration(name: &str, item: &Item) -> Result { let s = item .as_str() @@ -907,6 +933,7 @@ log_format = 'json' defaults::DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL )?, test_remote_failures: 0, + 
ondemand_download_behavior_treat_error_as_warn: false, }, "Correct defaults should be used when no config values are provided" ); @@ -954,6 +981,7 @@ log_format = 'json' metric_collection_endpoint: Some(Url::parse("http://localhost:80/metrics")?), synthetic_size_calculation_interval: Duration::from_secs(333), test_remote_failures: 0, + ondemand_download_behavior_treat_error_as_warn: false, }, "Should be able to parse all basic config values correctly" ); diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs index c07026261d..d848ec5ee5 100644 --- a/pageserver/src/consumption_metrics.rs +++ b/pageserver/src/consumption_metrics.rs @@ -3,6 +3,7 @@ //! and push them to a HTTP endpoint. //! Cache metrics to send only the updated ones. //! +use crate::context::{DownloadBehavior, RequestContext}; use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}; use crate::tenant::mgr; use anyhow; @@ -47,12 +48,15 @@ pub async fn collect_metrics( metric_collection_interval: Duration, synthetic_size_calculation_interval: Duration, node_id: NodeId, + ctx: RequestContext, ) -> anyhow::Result<()> { let mut ticker = tokio::time::interval(metric_collection_interval); info!("starting collect_metrics"); // spin up background worker that caclulates tenant sizes + let worker_ctx = + ctx.detached_child(TaskKind::CalculateSyntheticSize, DownloadBehavior::Download); task_mgr::spawn( BACKGROUND_RUNTIME.handle(), TaskKind::CalculateSyntheticSize, @@ -61,7 +65,7 @@ pub async fn collect_metrics( "synthetic size calculation", false, async move { - calculate_synthetic_size_worker(synthetic_size_calculation_interval) + calculate_synthetic_size_worker(synthetic_size_calculation_interval, &worker_ctx) .instrument(info_span!("synthetic_size_worker")) .await?; Ok(()) @@ -79,7 +83,7 @@ pub async fn collect_metrics( return Ok(()); }, _ = ticker.tick() => { - if let Err(err) = collect_metrics_iteration(&client, &mut cached_metrics, metric_collection_endpoint, node_id).await + if let Err(err) = collect_metrics_iteration(&client, &mut cached_metrics, metric_collection_endpoint, node_id, &ctx).await { error!("metrics collection failed: {err:?}"); } @@ -102,6 +106,7 @@ pub async fn collect_metrics_iteration( cached_metrics: &mut HashMap, metric_collection_endpoint: &reqwest::Url, node_id: NodeId, + ctx: &RequestContext, ) -> anyhow::Result<()> { let mut current_metrics: Vec<(PageserverConsumptionMetricsKey, u64)> = Vec::new(); trace!( @@ -110,7 +115,7 @@ pub async fn collect_metrics_iteration( ); // get list of tenants - let tenants = mgr::list_tenants().await; + let tenants = mgr::list_tenants().await?; // iterate through list of Active tenants and collect metrics for (tenant_id, tenant_state) in tenants { @@ -137,7 +142,7 @@ pub async fn collect_metrics_iteration( timeline_written_size, )); - let (timeline_logical_size, is_exact) = timeline.get_current_logical_size()?; + let (timeline_logical_size, is_exact) = timeline.get_current_logical_size(ctx)?; // Only send timeline logical size when it is fully calculated. 
if is_exact { current_metrics.push(( @@ -258,6 +263,7 @@ pub async fn collect_metrics_iteration( /// Caclculate synthetic size for each active tenant pub async fn calculate_synthetic_size_worker( synthetic_size_calculation_interval: Duration, + ctx: &RequestContext, ) -> anyhow::Result<()> { info!("starting calculate_synthetic_size_worker"); @@ -270,7 +276,13 @@ pub async fn calculate_synthetic_size_worker( }, _ = ticker.tick() => { - let tenants = mgr::list_tenants().await; + let tenants = match mgr::list_tenants().await { + Ok(tenants) => tenants, + Err(e) => { + warn!("cannot get tenant list: {e:#}"); + continue; + } + }; // iterate through list of Active tenants and collect metrics for (tenant_id, tenant_state) in tenants { @@ -280,7 +292,7 @@ pub async fn calculate_synthetic_size_worker( if let Ok(tenant) = mgr::get_tenant(tenant_id, true).await { - if let Err(e) = tenant.calculate_synthetic_size().await { + if let Err(e) = tenant.calculate_synthetic_size(ctx).await { error!("failed to calculate synthetic size for tenant {}: {}", tenant_id, e); } } diff --git a/pageserver/src/context.rs b/pageserver/src/context.rs new file mode 100644 index 0000000000..e826d28e6d --- /dev/null +++ b/pageserver/src/context.rs @@ -0,0 +1,199 @@ +//! This module defines `RequestContext`, a structure that we use throughout +//! the pageserver to propagate high-level context from places +//! that _originate_ activity down to the shared code paths at the +//! heart of the pageserver. It's inspired by Golang's `context.Context`. +//! +//! For example, in `Timeline::get(page_nr, lsn)` we need to answer the following questions: +//! 1. What high-level activity ([`TaskKind`]) needs this page? +//! We need that information as a categorical dimension for page access +//! statistics, which we, in turn, need to guide layer eviction policy design. +//! 2. How should we behave if, to produce the page image, we need to +//! on-demand download a layer file ([`DownloadBehavior`]). +//! +//! [`RequestContext`] satisfies those needs. +//! The current implementation is a small `struct` that is passed through +//! the call chain by reference. +//! +//! ### Future Work +//! +//! However, we do not intend to stop here, since there are other needs that +//! require carrying information from high to low levels of the app. +//! +//! Most importantly, **cancellation signaling** in response to +//! 1. timeouts (page_service max response time) and +//! 2. lifecycle requests (detach tenant, delete timeline). +//! +//! Related to that, there is sometimes a need to ensure that all tokio tasks spawned +//! by the transitive callees of a request have finished. The keyword here +//! is **Structured Concurrency**, and right now, we use `task_mgr` in most places, +//! `TaskHandle` in some places, and careful code review around `FuturesUnordered` +//! or `JoinSet` in other places. +//! +//! We do not yet have a systematic cancellation story in pageserver, and it is +//! pretty clear that [`RequestContext`] will be responsible for that. +//! So, the API already prepares for this role through the +//! [`RequestContext::detached_child`] and [`RequestContext::attached_child`] methods. +//! See their doc comments for details on how we will use them in the future. +//! +//! It is not clear whether or how we will enforce Structured Concurrency, and +//! what role [`RequestContext`] will play there. +//! So, the API doesn't prepare us for this topic. +//! +//! Other future uses of `RequestContext`: +//! 
- Communicate compute & IO priorities (user-initiated request vs. background-loop) +//! - Request IDs for distributed tracing +//! - Request/Timeline/Tenant-scoped log levels +//! +//! RequestContext might look quite different once it supports those features. +//! Likely, it will have a shape similar to Golang's `context.Context`. +//! +//! ### Why A Struct Instead Of Method Parameters +//! +//! What's typical about such information is that it needs to be passed down +//! along the call chain from high level to low level, but few of the functions +//! in the middle need to understand it. +//! Further, it is to be expected that we will need to propagate more data +//! in the future (see the earlier section on future work). +//! Hence, for functions in the middle of the call chain, we have the following +//! requirements: +//! 1. It should be easy to forward the context to callees. +//! 2. To propagate more data from high-level to low-level code, the functions in +//! the middle should not need to be modified. +//! The solution is to have a container structure ([`RequestContext`]) that +//! carries the information. Functions that don't care about what's in it +//! pass it along to callees. +//! +//! ### Why Not Task-Local Variables +//! +//! One could use task-local variables (the equivalent of thread-local variables) +//! to address the immediate needs outlined above. +//! However, we reject task-local variables because: +//! 1. they are implicit, thereby making it harder to trace the data flow in code +//! reviews and during debugging, +//! 2. they can be mutable, which enables implicit return data flow, +//! 3. they are restrictive in that code which fans out into multiple tasks, +//! or even threads, needs to carefully propagate the state. +//! +//! In contrast, information flow with [`RequestContext`] is +//! 1. always explicit, +//! 2. strictly uni-directional because RequestContext is immutable, +//! 3. tangible because a [`RequestContext`] is just a value. +//! When creating child activities, regardless of whether it's a task, +//! thread, or even an RPC to another service, the value can +//! be used like any other argument. +//! +//! The solution is that all code paths are infected with precisely one +//! [`RequestContext`] argument. Functions in the middle of the call chain +//! only need to pass it on. +use crate::task_mgr::TaskKind; + +// The main structure of this module, see module-level comment. +pub struct RequestContext { + task_kind: TaskKind, + download_behavior: DownloadBehavior, +} + +/// Desired behavior if the operation requires an on-demand download +/// to proceed. +#[derive(Clone, Copy, PartialEq, Eq)] +pub enum DownloadBehavior { + /// Download the layer file. It can take a while. + Download, + + /// Download the layer file, but print a warning to the log. This should be used + /// in code where the layer file is expected to already exist locally. + Warn, + + /// Return a PageReconstructError::NeedsDownload error + Error, +} + +impl RequestContext { + /// Create a new RequestContext that has no parent. + /// + /// The function is called `new` because, once we add children + /// to it using `detached_child` or `attached_child`, the context + /// form a tree (not implemented yet since cancellation will be + /// the first feature that requires a tree). + /// + /// # Future: Cancellation + /// + /// The only reason why a context like this one can be canceled is + /// because someone explicitly canceled it. 
+ /// It has no parent, so it cannot inherit cancellation from there. + pub fn new(task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self { + RequestContext { + task_kind, + download_behavior, + } + } + + /// Create a detached child context for a task that may outlive `self`. + /// + /// Use this when spawning new background activity that should complete + /// even if the current request is canceled. + /// + /// # Future: Cancellation + /// + /// Cancellation of `self` will not propagate to the child context returned + /// by this method. + /// + /// # Future: Structured Concurrency + /// + /// We could add the Future as a parameter to this function, spawn it as a task, + /// and pass to the new task the child context as an argument. + /// That would be an ergonomic improvement. + /// + /// We could make new calls to this function fail if `self` is already canceled. + pub fn detached_child(&self, task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self { + self.child_impl(task_kind, download_behavior) + } + + /// Create a child of context `self` for a task that shall not outlive `self`. + /// + /// Use this when fanning-out work to other async tasks. + /// + /// # Future: Cancellation + /// + /// Cancelling a context will propagate to its attached children. + /// + /// # Future: Structured Concurrency + /// + /// We could add the Future as a parameter to this function, spawn it as a task, + /// and track its `JoinHandle` inside the `RequestContext`. + /// + /// We could then provide another method to allow waiting for all child tasks + /// to finish. + /// + /// We could make new calls to this function fail if `self` is already canceled. + /// Alternatively, we could allow the creation but not spawn the task. + /// The method to wait for child tasks would return an error, indicating + /// that the child task was not started because the context was canceled. + pub fn attached_child(&self) -> Self { + self.child_impl(self.task_kind(), self.download_behavior()) + } + + /// Use this function when you should be creating a child context using + /// [`attached_child`] or [`detached_child`], but your caller doesn't provide + /// a context and you are unwilling to change all callers to provide one. + /// + /// Before we add cancellation, we should get rid of this method. + pub fn todo_child(task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self { + Self::new(task_kind, download_behavior) + } + + fn child_impl(&self, task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self { + RequestContext { + task_kind, + download_behavior, + } + } + + pub fn task_kind(&self) -> TaskKind { + self.task_kind + } + + pub fn download_behavior(&self) -> DownloadBehavior { + self.download_behavior + } +} diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index f9b8a81dad..23faff7ace 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -430,6 +430,13 @@ paths: schema: type: string format: hex + - name: inputs_only + in: query + required: false + schema: + type: boolean + description: | + When true, skip calculation and only provide the model inputs (for debugging). Defaults to false. get: description: | Calculate tenant's size, which is a mixture of WAL (bytes) and logical_size (bytes). @@ -449,8 +456,9 @@ paths: format: hex size: type: integer + nullable: true description: | - Size metric in bytes. + Size metric in bytes or null if inputs_only=true was given. 
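To make the intended call pattern of the `RequestContext` API above concrete, here is a hedged sketch (not taken from the patch); the `TaskKind` variants are ones used elsewhere in this diff.

```rust
// Sketch of RequestContext usage as described in the module docs above.
use pageserver::context::{DownloadBehavior, RequestContext};
use pageserver::task_mgr::TaskKind;

fn example() {
    // A management request creates its root context.
    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);

    // Work fanned out on behalf of the same request inherits its settings.
    let per_query = ctx.attached_child();
    assert!(per_query.download_behavior() == DownloadBehavior::Warn);

    // Background work that may outlive the request gets its own task kind
    // and is allowed to download layer files.
    let _background =
        ctx.detached_child(TaskKind::CalculateSyntheticSize, DownloadBehavior::Download);
}
```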
"401": description: Unauthorized Error content: diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 1eb24c1507..a7802f3cbe 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -12,8 +12,11 @@ use super::models::{ StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TenantInfo, TimelineCreateRequest, TimelineInfo, }; +use crate::context::{DownloadBehavior, RequestContext}; use crate::pgdatadir_mapping::LsnForTimestamp; +use crate::task_mgr::TaskKind; use crate::tenant::config::TenantConfOpt; +use crate::tenant::mgr::TenantMapInsertError; use crate::tenant::{PageReconstructError, Timeline}; use crate::{config::PageServerConf, tenant::mgr}; use utils::{ @@ -81,18 +84,39 @@ fn check_permission(request: &Request, tenant_id: Option) -> Res fn apierror_from_prerror(err: PageReconstructError) -> ApiError { match err { PageReconstructError::Other(err) => ApiError::InternalServerError(err), + PageReconstructError::NeedsDownload(_, _) => { + // This shouldn't happen, because we use a RequestContext that requests to + // download any missing layer files on-demand. + ApiError::InternalServerError(anyhow::anyhow!("need to download remote layer file")) + } + PageReconstructError::Cancelled => { + ApiError::InternalServerError(anyhow::anyhow!("request was cancelled")) + } PageReconstructError::WalRedo(err) => { ApiError::InternalServerError(anyhow::Error::new(err)) } } } +fn apierror_from_tenant_map_insert_error(e: TenantMapInsertError) -> ApiError { + match e { + TenantMapInsertError::StillInitializing | TenantMapInsertError::ShuttingDown => { + ApiError::InternalServerError(anyhow::Error::new(e)) + } + TenantMapInsertError::TenantAlreadyExists(id, state) => { + ApiError::Conflict(format!("tenant {id} already exists, state: {state:?}")) + } + TenantMapInsertError::Closure(e) => ApiError::InternalServerError(e), + } +} + // Helper function to construct a TimelineInfo struct for a timeline async fn build_timeline_info( timeline: &Arc, include_non_incremental_logical_size: bool, + ctx: &RequestContext, ) -> anyhow::Result { - let mut info = build_timeline_info_common(timeline)?; + let mut info = build_timeline_info_common(timeline, ctx)?; if include_non_incremental_logical_size { // XXX we should be using spawn_ondemand_logical_size_calculation here. // Otherwise, if someone deletes the timeline / detaches the tenant while @@ -102,6 +126,7 @@ async fn build_timeline_info( .get_current_logical_size_non_incremental( info.last_record_lsn, CancellationToken::new(), + ctx, ) .await?, ); @@ -109,7 +134,10 @@ async fn build_timeline_info( Ok(info) } -fn build_timeline_info_common(timeline: &Arc) -> anyhow::Result { +fn build_timeline_info_common( + timeline: &Arc, + ctx: &RequestContext, +) -> anyhow::Result { let last_record_lsn = timeline.get_last_record_lsn(); let (wal_source_connstr, last_received_msg_lsn, last_received_msg_ts) = { let guard = timeline.last_received_wal.lock().unwrap(); @@ -129,7 +157,7 @@ fn build_timeline_info_common(timeline: &Arc) -> anyhow::Result None, lsn @ Lsn(_) => Some(lsn), }; - let current_logical_size = match timeline.get_current_logical_size() { + let current_logical_size = match timeline.get_current_logical_size(ctx) { Ok((size, _)) => Some(size), Err(err) => { error!("Timeline info creation failed to get current logical size: {err:?}"); @@ -180,6 +208,8 @@ async fn timeline_create_handler(mut request: Request) -> Result) -> Result { // Created. Construct a TimelineInfo for it. 
- let timeline_info = build_timeline_info_common(&new_timeline) + let timeline_info = build_timeline_info_common(&new_timeline, &ctx) .map_err(ApiError::InternalServerError)?; json_response(StatusCode::CREATED, timeline_info) } @@ -208,6 +239,8 @@ async fn timeline_list_handler(request: Request) -> Result, query_param_present(&request, "include-non-incremental-logical-size"); check_permission(&request, Some(tenant_id))?; + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); + let response_data = async { let tenant = mgr::get_tenant(tenant_id, true) .await @@ -217,7 +250,7 @@ async fn timeline_list_handler(request: Request) -> Result, let mut response_data = Vec::with_capacity(timelines.len()); for timeline in timelines { let timeline_info = - build_timeline_info(&timeline, include_non_incremental_logical_size) + build_timeline_info(&timeline, include_non_incremental_logical_size, &ctx) .await .context( "Failed to convert tenant timeline {timeline_id} into the local one: {e:?}", @@ -239,11 +272,7 @@ fn query_param_present(request: &Request, param: &str) -> bool { request .uri() .query() - .map(|v| { - url::form_urlencoded::parse(v.as_bytes()) - .into_owned() - .any(|(p, _)| p == param) - }) + .map(|v| url::form_urlencoded::parse(v.as_bytes()).any(|(p, _)| p == param)) .unwrap_or(false) } @@ -252,13 +281,12 @@ fn get_query_param(request: &Request, param_name: &str) -> Result) -> Result) -> Result(timeline_info) } @@ -304,12 +336,13 @@ async fn get_lsn_by_timestamp_handler(request: Request) -> Result) -> Result, let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); + info!("Handling tenant attach {tenant_id}"); let state = get_state(&request); if let Some(remote_storage) = &state.remote_storage { - // FIXME: distinguish between "Tenant already exists" and other errors - mgr::attach_tenant(state.conf, tenant_id, remote_storage.clone()) + mgr::attach_tenant(state.conf, tenant_id, remote_storage.clone(), &ctx) .instrument(info_span!("tenant_attach", tenant = %tenant_id)) .await - .map_err(ApiError::InternalServerError)?; + .map_err(apierror_from_tenant_map_insert_error)?; } else { return Err(ApiError::BadRequest(anyhow!( "attach_tenant is not possible because pageserver was configured without remote storage" @@ -351,7 +385,9 @@ async fn timeline_delete_handler(request: Request) -> Result) -> Result, A let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); + let state = get_state(&request); - mgr::load_tenant(state.conf, tenant_id, state.remote_storage.clone()) + mgr::load_tenant(state.conf, tenant_id, state.remote_storage.clone(), &ctx) .instrument(info_span!("load", tenant = %tenant_id)) .await - .map_err(ApiError::InternalServerError)?; + .map_err(apierror_from_tenant_map_insert_error)?; json_response(StatusCode::ACCEPTED, ()) } @@ -413,6 +451,8 @@ async fn tenant_list_handler(request: Request) -> Result, A let response_data = mgr::list_tenants() .instrument(info_span!("tenant_list")) .await + .map_err(anyhow::Error::new) + .map_err(ApiError::InternalServerError)? 
.iter() .map(|(id, state)| TenantInfo { id: *id, @@ -453,21 +493,40 @@ async fn tenant_status(request: Request) -> Result, ApiErro json_response(StatusCode::OK, tenant_info) } +/// HTTP endpoint to query the current tenant_size of a tenant. +/// +/// This is not used by consumption metrics under [`crate::consumption_metrics`], but can be used +/// to debug any of the calculations. Requires `tenant_id` request parameter, supports +/// `inputs_only=true|false` (default false) which supports debugging failure to calculate model +/// values. async fn tenant_size_handler(request: Request) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; + let inputs_only = if query_param_present(&request, "inputs_only") { + get_query_param(&request, "inputs_only")? + .parse() + .map_err(|_| ApiError::BadRequest(anyhow!("failed to parse inputs_only")))? + } else { + false + }; + + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); let tenant = mgr::get_tenant(tenant_id, true) .await .map_err(ApiError::InternalServerError)?; - // this can be long operation, it currently is not backed by any request coalescing or similar + // this can be long operation let inputs = tenant - .gather_size_inputs() + .gather_size_inputs(&ctx) .await .map_err(ApiError::InternalServerError)?; - let size = inputs.calculate().map_err(ApiError::InternalServerError)?; + let size = if !inputs_only { + Some(inputs.calculate().map_err(ApiError::InternalServerError)?) + } else { + None + }; /// Private response type with the additional "unstable" `inputs` field. /// @@ -479,7 +538,9 @@ async fn tenant_size_handler(request: Request) -> Result, A #[serde_as(as = "serde_with::DisplayFromStr")] id: TenantId, /// Size is a mixture of WAL and logical size, so the unit is bytes. - size: u64, + /// + /// Will be none if `?inputs_only=true` was given. + size: Option, inputs: crate::tenant::size::ModelInputs, } @@ -506,6 +567,8 @@ fn bad_duration<'a>(field_name: &'static str, value: &'a str) -> impl 'a + Fn() async fn tenant_create_handler(mut request: Request) -> Result, ApiError> { check_permission(&request, None)?; + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); + let request_data: TenantCreateRequest = json_request(&mut request).await?; let mut tenant_conf = TenantConfOpt::default(); @@ -583,34 +646,28 @@ async fn tenant_create_handler(mut request: Request) -> Result { - // We created the tenant. Existing API semantics are that the tenant - // is Active when this function returns. - if let res @ Err(_) = tenant.wait_to_become_active().await { - // This shouldn't happen because we just created the tenant directory - // in tenant::mgr::create_tenant, and there aren't any remote timelines - // to load, so, nothing can really fail during load. - // Don't do cleanup because we don't know how we got here. - // The tenant will likely be in `Broken` state and subsequent - // calls will fail. - res.context("created tenant failed to become active") - .map_err(ApiError::InternalServerError)?; - } - json_response( - StatusCode::CREATED, - TenantCreateResponse(tenant.tenant_id()), - )? - } - None => json_response(StatusCode::CONFLICT, ())?, - }) + // We created the tenant. Existing API semantics are that the tenant + // is Active when this function returns. 
+ if let res @ Err(_) = new_tenant.wait_to_become_active().await { + // This shouldn't happen because we just created the tenant directory + // in tenant::mgr::create_tenant, and there aren't any remote timelines + // to load, so, nothing can really fail during load. + // Don't do cleanup because we don't know how we got here. + // The tenant will likely be in `Broken` state and subsequent + // calls will fail. + res.context("created tenant failed to become active") + .map_err(ApiError::InternalServerError)?; + } + json_response( + StatusCode::CREATED, + TenantCreateResponse(new_tenant.tenant_id()), + ) } async fn tenant_config_handler(mut request: Request) -> Result, ApiError> { @@ -732,7 +789,8 @@ async fn timeline_gc_handler(mut request: Request) -> Result) -> Result) -> Result) -> Result Result<()> { let mut pg_control: Option = None; @@ -69,7 +71,7 @@ pub async fn import_timeline_from_postgres_datadir( let mut file = tokio::fs::File::open(absolute_path).await?; let len = metadata.len() as usize; if let Some(control_file) = - import_file(&mut modification, relative_path, &mut file, len).await? + import_file(&mut modification, relative_path, &mut file, len, ctx).await? { pg_control = Some(control_file); } @@ -99,6 +101,7 @@ pub async fn import_timeline_from_postgres_datadir( tline, Lsn(pg_control.checkPointCopy.redo), pgdata_lsn, + ctx, ) .await?; @@ -113,6 +116,7 @@ async fn import_rel( dboid: Oid, reader: &mut (impl AsyncRead + Send + Sync + Unpin), len: usize, + ctx: &RequestContext, ) -> anyhow::Result<()> { // Does it look like a relation file? trace!("importing rel file {}", path.display()); @@ -147,7 +151,10 @@ async fn import_rel( // FIXME: use proper error type for this, instead of parsing the error message. // Or better yet, keep track of which relations we've already created // https://github.com/neondatabase/neon/issues/3309 - if let Err(e) = modification.put_rel_creation(rel, nblocks as u32).await { + if let Err(e) = modification + .put_rel_creation(rel, nblocks as u32, ctx) + .await + { if e.to_string().contains("already exists") { debug!("relation {} already exists. we must be extending it", rel); } else { @@ -182,7 +189,7 @@ async fn import_rel( // // If we process rel segments out of order, // put_rel_extend will skip the update. 
- modification.put_rel_extend(rel, blknum).await?; + modification.put_rel_extend(rel, blknum, ctx).await?; Ok(()) } @@ -195,6 +202,7 @@ async fn import_slru( path: &Path, reader: &mut (impl AsyncRead + Send + Sync + Unpin), len: usize, + ctx: &RequestContext, ) -> anyhow::Result<()> { info!("importing slru file {path:?}"); @@ -211,7 +219,7 @@ async fn import_slru( ensure!(nblocks <= pg_constants::SLRU_PAGES_PER_SEGMENT as usize); modification - .put_slru_segment_creation(slru, segno, nblocks as u32) + .put_slru_segment_creation(slru, segno, nblocks as u32, ctx) .await?; let mut rpageno = 0; @@ -252,15 +260,15 @@ async fn import_wal( tline: &Timeline, startpoint: Lsn, endpoint: Lsn, + ctx: &RequestContext, ) -> anyhow::Result<()> { - use std::io::Read; let mut waldecoder = WalStreamDecoder::new(startpoint, tline.pg_version); let mut segno = startpoint.segment_number(WAL_SEGMENT_SIZE); let mut offset = startpoint.segment_offset(WAL_SEGMENT_SIZE); let mut last_lsn = startpoint; - let mut walingest = WalIngest::new(tline, startpoint).await?; + let mut walingest = WalIngest::new(tline, startpoint, ctx).await?; while last_lsn <= endpoint { // FIXME: assume postgresql tli 1 for now @@ -283,6 +291,7 @@ async fn import_wal( file.seek(std::io::SeekFrom::Start(offset as u64))?; } + use std::io::Read; let nread = file.read_to_end(&mut buf)?; if nread != WAL_SEGMENT_SIZE - offset { // Maybe allow this for .partial files? @@ -297,7 +306,7 @@ async fn import_wal( while last_lsn <= endpoint { if let Some((lsn, recdata)) = waldecoder.poll_decode()? { walingest - .ingest_record(recdata, lsn, &mut modification, &mut decoded) + .ingest_record(recdata, lsn, &mut modification, &mut decoded, ctx) .await?; last_lsn = lsn; @@ -326,6 +335,7 @@ pub async fn import_basebackup_from_tar( tline: &Timeline, reader: &mut (impl AsyncRead + Send + Sync + Unpin), base_lsn: Lsn, + ctx: &RequestContext, ) -> Result<()> { info!("importing base at {base_lsn}"); let mut modification = tline.begin_modification(base_lsn); @@ -344,7 +354,7 @@ pub async fn import_basebackup_from_tar( match header.entry_type() { tokio_tar::EntryType::Regular => { if let Some(res) = - import_file(&mut modification, file_path.as_ref(), &mut entry, len).await? + import_file(&mut modification, file_path.as_ref(), &mut entry, len, ctx).await? { // We found the pg_control file. pg_control = Some(res); @@ -376,13 +386,14 @@ pub async fn import_wal_from_tar( reader: &mut (impl AsyncRead + Send + Sync + Unpin), start_lsn: Lsn, end_lsn: Lsn, + ctx: &RequestContext, ) -> Result<()> { // Set up walingest mutable state let mut waldecoder = WalStreamDecoder::new(start_lsn, tline.pg_version); let mut segno = start_lsn.segment_number(WAL_SEGMENT_SIZE); let mut offset = start_lsn.segment_offset(WAL_SEGMENT_SIZE); let mut last_lsn = start_lsn; - let mut walingest = WalIngest::new(tline, start_lsn).await?; + let mut walingest = WalIngest::new(tline, start_lsn, ctx).await?; // Ingest wal until end_lsn info!("importing wal until {}", end_lsn); @@ -431,7 +442,7 @@ pub async fn import_wal_from_tar( while last_lsn <= end_lsn { if let Some((lsn, recdata)) = waldecoder.poll_decode()? 
{ walingest - .ingest_record(recdata, lsn, &mut modification, &mut decoded) + .ingest_record(recdata, lsn, &mut modification, &mut decoded, ctx) .await?; last_lsn = lsn; @@ -466,6 +477,7 @@ async fn import_file( file_path: &Path, reader: &mut (impl AsyncRead + Send + Sync + Unpin), len: usize, + ctx: &RequestContext, ) -> Result> { let file_name = match file_path.file_name() { Some(name) => name.to_string_lossy(), @@ -498,14 +510,16 @@ async fn import_file( } "pg_filenode.map" => { let bytes = read_all_bytes(reader).await?; - modification.put_relmap_file(spcnode, dbnode, bytes).await?; + modification + .put_relmap_file(spcnode, dbnode, bytes, ctx) + .await?; debug!("imported relmap file") } "PG_VERSION" => { debug!("ignored PG_VERSION file"); } _ => { - import_rel(modification, file_path, spcnode, dbnode, reader, len).await?; + import_rel(modification, file_path, spcnode, dbnode, reader, len, ctx).await?; debug!("imported rel creation"); } } @@ -521,38 +535,40 @@ async fn import_file( match file_name.as_ref() { "pg_filenode.map" => { let bytes = read_all_bytes(reader).await?; - modification.put_relmap_file(spcnode, dbnode, bytes).await?; + modification + .put_relmap_file(spcnode, dbnode, bytes, ctx) + .await?; debug!("imported relmap file") } "PG_VERSION" => { debug!("ignored PG_VERSION file"); } _ => { - import_rel(modification, file_path, spcnode, dbnode, reader, len).await?; + import_rel(modification, file_path, spcnode, dbnode, reader, len, ctx).await?; debug!("imported rel creation"); } } } else if file_path.starts_with("pg_xact") { let slru = SlruKind::Clog; - import_slru(modification, slru, file_path, reader, len).await?; + import_slru(modification, slru, file_path, reader, len, ctx).await?; debug!("imported clog slru"); } else if file_path.starts_with("pg_multixact/offsets") { let slru = SlruKind::MultiXactOffsets; - import_slru(modification, slru, file_path, reader, len).await?; + import_slru(modification, slru, file_path, reader, len, ctx).await?; debug!("imported multixact offsets slru"); } else if file_path.starts_with("pg_multixact/members") { let slru = SlruKind::MultiXactMembers; - import_slru(modification, slru, file_path, reader, len).await?; + import_slru(modification, slru, file_path, reader, len, ctx).await?; debug!("imported multixact members slru"); } else if file_path.starts_with("pg_twophase") { let xid = u32::from_str_radix(file_name.as_ref(), 16)?; let bytes = read_all_bytes(reader).await?; modification - .put_twophase_file(xid, Bytes::copy_from_slice(&bytes[..])) + .put_twophase_file(xid, Bytes::copy_from_slice(&bytes[..]), ctx) .await?; debug!("imported twophase file"); } else if file_path.starts_with("pg_wal") { diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 91cde477ad..09e21ae755 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -1,7 +1,9 @@ mod auth; pub mod basebackup; +pub mod broker_client; pub mod config; pub mod consumption_metrics; +pub mod context; pub mod http; pub mod import_datadir; pub mod keyspace; @@ -15,7 +17,6 @@ pub mod tenant; pub mod trace; pub mod virtual_file; pub mod walingest; -pub mod walreceiver; pub mod walrecord; pub mod walredo; diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index b61e64048b..6bd0eddbb5 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1,10 +1,12 @@ use metrics::core::{AtomicU64, GenericCounter}; use metrics::{ - register_histogram, register_histogram_vec, register_int_counter, register_int_counter_vec, - register_int_gauge, 
register_int_gauge_vec, register_uint_gauge_vec, Histogram, HistogramVec, - IntCounter, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec, + register_counter_vec, register_histogram, register_histogram_vec, register_int_counter, + register_int_counter_vec, register_int_gauge, register_int_gauge_vec, register_uint_gauge_vec, + Counter, CounterVec, Histogram, HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec, + UIntGauge, UIntGaugeVec, }; use once_cell::sync::Lazy; +use pageserver_api::models::state; use utils::id::{TenantId, TimelineId}; /// Prometheus histogram buckets (in seconds) that capture the majority of @@ -35,11 +37,29 @@ const STORAGE_TIME_OPERATIONS: &[&str] = &[ "gc", ]; -pub static STORAGE_TIME: Lazy = Lazy::new(|| { - register_histogram_vec!( - "pageserver_storage_operations_seconds", - "Time spent on storage operations", +pub static STORAGE_TIME_SUM_PER_TIMELINE: Lazy = Lazy::new(|| { + register_counter_vec!( + "pageserver_storage_operations_seconds_sum", + "Total time spent on storage operations with operation, tenant and timeline dimensions", &["operation", "tenant_id", "timeline_id"], + ) + .expect("failed to define a metric") +}); + +pub static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_storage_operations_seconds_count", + "Count of storage operations with operation, tenant and timeline dimensions", + &["operation", "tenant_id", "timeline_id"], + ) + .expect("failed to define a metric") +}); + +pub static STORAGE_TIME_GLOBAL: Lazy = Lazy::new(|| { + register_histogram_vec!( + "pageserver_storage_operations_seconds_global", + "Time spent on storage operations", + &["operation"], get_buckets_for_critical_operations(), ) .expect("failed to define a metric") @@ -112,6 +132,24 @@ static CURRENT_LOGICAL_SIZE: Lazy = Lazy::new(|| { .expect("failed to define current logical size metric") }); +// Metrics collected on tenant states. +const TENANT_STATE_OPTIONS: &[&str] = &[ + state::LOADING, + state::ATTACHING, + state::ACTIVE, + state::STOPPING, + state::BROKEN, +]; + +pub static TENANT_STATE_METRIC: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_tenant_states_count", + "Count of tenants per state", + &["tenant_id", "state"] + ) + .expect("Failed to register pageserver_tenant_states_count metric") +}); + // Metrics for cloud upload. These metrics reflect data uploaded to cloud storage, // or in testing they estimate how much we would upload if we did. static NUM_PERSISTENT_FILES_CREATED: Lazy = Lazy::new(|| { @@ -375,18 +413,81 @@ pub static WAL_REDO_RECORD_COUNTER: Lazy = Lazy::new(|| { .unwrap() }); +/// Similar to [`prometheus::HistogramTimer`] but does not record on drop. +pub struct StorageTimeMetricsTimer { + metrics: StorageTimeMetrics, + start: Instant, +} + +impl StorageTimeMetricsTimer { + fn new(metrics: StorageTimeMetrics) -> Self { + Self { + metrics, + start: Instant::now(), + } + } + + /// Record the time from creation to now. + pub fn stop_and_record(self) { + let duration = self.start.elapsed().as_secs_f64(); + self.metrics.timeline_sum.inc_by(duration); + self.metrics.timeline_count.inc(); + self.metrics.global_histogram.observe(duration); + } +} + +/// Timing facilities for a globally histogrammed metric, which is supported by per tenant and +/// timeline total sum and count.
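Before the struct definition that follows, a short crate-internal sketch of how this timer is meant to be used; unlike `prometheus::HistogramTimer`, nothing is recorded unless `stop_and_record()` is called. The function name and error type here are illustrative, not from the patch.

```rust
// Sketch: time one storage operation and record it only on the success path.
// Returning early on an error records nothing, by design.
use crate::metrics::StorageTimeMetrics;

fn compact_with_timing(metrics: &StorageTimeMetrics) -> anyhow::Result<()> {
    let timer = metrics.start_timer();

    // ... do the compaction work here; a `?` on an error would skip recording ...

    // Adds the elapsed seconds to the per-timeline sum/count counters and to
    // the global histogram defined above.
    timer.stop_and_record();
    Ok(())
}
```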
+#[derive(Clone, Debug)] +pub struct StorageTimeMetrics { + /// Sum of f64 seconds, per operation, tenant_id and timeline_id + timeline_sum: Counter, + /// Number of operations, per operation, tenant_id and timeline_id + timeline_count: IntCounter, + /// Global histogram having only the "operation" label. + global_histogram: Histogram, +} + +impl StorageTimeMetrics { + pub fn new(operation: &str, tenant_id: &str, timeline_id: &str) -> Self { + let timeline_sum = STORAGE_TIME_SUM_PER_TIMELINE + .get_metric_with_label_values(&[operation, tenant_id, timeline_id]) + .unwrap(); + let timeline_count = STORAGE_TIME_COUNT_PER_TIMELINE + .get_metric_with_label_values(&[operation, tenant_id, timeline_id]) + .unwrap(); + let global_histogram = STORAGE_TIME_GLOBAL + .get_metric_with_label_values(&[operation]) + .unwrap(); + + StorageTimeMetrics { + timeline_sum, + timeline_count, + global_histogram, + } + } + + /// Starts timing a new operation. + /// + /// Note: unlike [`prometheus::HistogramTimer`] the returned timer does not record on drop. + pub fn start_timer(&self) -> StorageTimeMetricsTimer { + StorageTimeMetricsTimer::new(self.clone()) + } +} + #[derive(Debug)] pub struct TimelineMetrics { tenant_id: String, timeline_id: String, pub reconstruct_time_histo: Histogram, pub materialized_page_cache_hit_counter: GenericCounter, - pub flush_time_histo: Histogram, - pub compact_time_histo: Histogram, - pub create_images_time_histo: Histogram, - pub init_logical_size_histo: Histogram, - pub logical_size_histo: Histogram, - pub load_layer_map_histo: Histogram, + pub flush_time_histo: StorageTimeMetrics, + pub compact_time_histo: StorageTimeMetrics, + pub create_images_time_histo: StorageTimeMetrics, + pub init_logical_size_histo: StorageTimeMetrics, + pub logical_size_histo: StorageTimeMetrics, + pub load_layer_map_histo: StorageTimeMetrics, + pub garbage_collect_histo: StorageTimeMetrics, pub last_record_gauge: IntGauge, pub wait_lsn_time_histo: Histogram, pub resident_physical_size_gauge: UIntGauge, @@ -406,24 +507,16 @@ impl TimelineMetrics { let materialized_page_cache_hit_counter = MATERIALIZED_PAGE_CACHE_HIT .get_metric_with_label_values(&[&tenant_id, &timeline_id]) .unwrap(); - let flush_time_histo = STORAGE_TIME - .get_metric_with_label_values(&["layer flush", &tenant_id, &timeline_id]) - .unwrap(); - let compact_time_histo = STORAGE_TIME - .get_metric_with_label_values(&["compact", &tenant_id, &timeline_id]) - .unwrap(); - let create_images_time_histo = STORAGE_TIME - .get_metric_with_label_values(&["create images", &tenant_id, &timeline_id]) - .unwrap(); - let init_logical_size_histo = STORAGE_TIME - .get_metric_with_label_values(&["init logical size", &tenant_id, &timeline_id]) - .unwrap(); - let logical_size_histo = STORAGE_TIME - .get_metric_with_label_values(&["logical size", &tenant_id, &timeline_id]) - .unwrap(); - let load_layer_map_histo = STORAGE_TIME - .get_metric_with_label_values(&["load layer map", &tenant_id, &timeline_id]) - .unwrap(); + let flush_time_histo = StorageTimeMetrics::new("layer flush", &tenant_id, &timeline_id); + let compact_time_histo = StorageTimeMetrics::new("compact", &tenant_id, &timeline_id); + let create_images_time_histo = + StorageTimeMetrics::new("create images", &tenant_id, &timeline_id); + let init_logical_size_histo = + StorageTimeMetrics::new("init logical size", &tenant_id, &timeline_id); + let logical_size_histo = StorageTimeMetrics::new("logical size", &tenant_id, &timeline_id); + let load_layer_map_histo = + StorageTimeMetrics::new("load layer 
map", &tenant_id, &timeline_id); + let garbage_collect_histo = StorageTimeMetrics::new("gc", &tenant_id, &timeline_id); let last_record_gauge = LAST_RECORD_LSN .get_metric_with_label_values(&[&tenant_id, &timeline_id]) .unwrap(); @@ -453,6 +546,7 @@ impl TimelineMetrics { create_images_time_histo, init_logical_size_histo, logical_size_histo, + garbage_collect_histo, load_layer_map_histo, last_record_gauge, wait_lsn_time_histo, @@ -478,7 +572,10 @@ impl Drop for TimelineMetrics { let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, timeline_id]); for op in STORAGE_TIME_OPERATIONS { - let _ = STORAGE_TIME.remove_label_values(&[op, tenant_id, timeline_id]); + let _ = + STORAGE_TIME_SUM_PER_TIMELINE.remove_label_values(&[op, tenant_id, timeline_id]); + let _ = + STORAGE_TIME_COUNT_PER_TIMELINE.remove_label_values(&[op, tenant_id, timeline_id]); } for op in STORAGE_IO_TIME_OPERATIONS { let _ = STORAGE_IO_TIME.remove_label_values(&[op, tenant_id, timeline_id]); @@ -495,7 +592,10 @@ impl Drop for TimelineMetrics { } pub fn remove_tenant_metrics(tenant_id: &TenantId) { - let _ = STORAGE_TIME.remove_label_values(&["gc", &tenant_id.to_string(), "-"]); + let tid = tenant_id.to_string(); + for state in TENANT_STATE_OPTIONS { + let _ = TENANT_STATE_METRIC.remove_label_values(&[&tid, state]); + } } use futures::Future; diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 344a8d1c00..878928ae06 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -13,6 +13,7 @@ use anyhow::Context; use bytes::Buf; use bytes::Bytes; use futures::{Stream, StreamExt}; +use pageserver_api::models::TenantState; use pageserver_api::models::{ PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse, PagestreamErrorResponse, PagestreamExistsRequest, PagestreamExistsResponse, @@ -30,19 +31,19 @@ use std::sync::Arc; use std::time::Duration; use tracing::*; use utils::id::ConnectionId; -use utils::postgres_backend_async::QueryError; use utils::{ auth::{Claims, JwtAuth, Scope}, id::{TenantId, TimelineId}, lsn::Lsn, postgres_backend::AuthType, - postgres_backend_async::{self, PostgresBackend}, + postgres_backend_async::{self, is_expected_io_error, PostgresBackend, QueryError}, simple_rcu::RcuReadGuard, }; use crate::auth::check_permission; use crate::basebackup; use crate::config::PageServerConf; +use crate::context::{DownloadBehavior, RequestContext}; use crate::import_datadir::import_wal_from_tar; use crate::metrics::{LIVE_CONNECTIONS_COUNT, SMGR_QUERY_TIME}; use crate::task_mgr; @@ -123,6 +124,7 @@ pub async fn libpq_listener_main( auth: Option>, listener: TcpListener, auth_type: AuthType, + listener_ctx: RequestContext, ) -> anyhow::Result<()> { listener.set_nonblocking(true)?; let tokio_listener = tokio::net::TcpListener::from_std(listener)?; @@ -146,6 +148,9 @@ pub async fn libpq_listener_main( debug!("accepted connection from {}", peer_addr); let local_auth = auth.clone(); + let connection_ctx = listener_ctx + .detached_child(TaskKind::PageRequestHandler, DownloadBehavior::Download); + // PageRequestHandler tasks are not associated with any particular // timeline in the task manager. 
In practice most connections will // only deal with a particular timeline, but we don't know which one @@ -157,7 +162,7 @@ pub async fn libpq_listener_main( None, "serving compute connection task", false, - page_service_conn_main(conf, local_auth, socket, auth_type), + page_service_conn_main(conf, local_auth, socket, auth_type, connection_ctx), ); } Err(err) => { @@ -177,6 +182,7 @@ async fn page_service_conn_main( auth: Option>, socket: tokio::net::TcpStream, auth_type: AuthType, + connection_ctx: RequestContext, ) -> anyhow::Result<()> { // Immediately increment the gauge, then create a job to decrement it on task exit. // One of the pros of `defer!` is that this will *most probably* @@ -191,24 +197,24 @@ async fn page_service_conn_main( .set_nodelay(true) .context("could not set TCP_NODELAY")?; - let mut conn_handler = PageServerHandler::new(conf, auth); + // XXX: pgbackend.run() should take the connection_ctx, + // and create a child per-query context when it invokes process_query. + // But it's in a shared crate, so, we store connection_ctx inside PageServerHandler + // and create the per-query context in process_query ourselves. + let mut conn_handler = PageServerHandler::new(conf, auth, connection_ctx); + let pgbackend = PostgresBackend::new(socket, auth_type, None)?; - let result = pgbackend + match pgbackend .run(&mut conn_handler, task_mgr::shutdown_watcher) - .await; - match result { + .await + { Ok(()) => { // we've been requested to shut down Ok(()) } Err(QueryError::Disconnected(ConnectionError::Socket(io_error))) => { - // `ConnectionReset` error happens when the Postgres client closes the connection. - // As this disconnection happens quite often and is expected, - // we decided to downgrade the logging level to `INFO`. - // See: https://github.com/neondatabase/neon/issues/1683. - if io_error.kind() == io::ErrorKind::ConnectionReset { - info!("Postgres client disconnected"); + if is_expected_io_error(&io_error) { + info!("Postgres client disconnected ({io_error})"); Ok(()) } else { Err(io_error).context("Postgres connection error") @@ -255,30 +261,42 @@ struct PageServerHandler { _conf: &'static PageServerConf, auth: Option>, claims: Option, + + /// The context created for the lifetime of the connection + /// serviced by this PageServerHandler. + /// For each query received over the connection, + /// `process_query` creates a child context from this one.
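// Illustrative sketch of the RequestContext hierarchy described above, assuming the
// context API this patch introduces (`detached_child`, `attached_child`); the function
// name here is hypothetical.
use crate::context::{DownloadBehavior, RequestContext};
use crate::task_mgr::TaskKind;

fn request_context_hierarchy_example(listener_ctx: &RequestContext) {
    // One detached child per accepted connection: its own task kind, and on-demand
    // downloads of missing layers are allowed while serving compute requests.
    let connection_ctx =
        listener_ctx.detached_child(TaskKind::PageRequestHandler, DownloadBehavior::Download);
    // process_query later derives a per-query child that stays attached to the
    // connection's context (stored in the `connection_ctx` field below).
    let _per_query_ctx = connection_ctx.attached_child();
}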
+ connection_ctx: RequestContext, } impl PageServerHandler { - pub fn new(conf: &'static PageServerConf, auth: Option>) -> Self { + pub fn new( + conf: &'static PageServerConf, + auth: Option>, + connection_ctx: RequestContext, + ) -> Self { PageServerHandler { _conf: conf, auth, claims: None, + connection_ctx, } } - #[instrument(skip(self, pgb))] + #[instrument(skip(self, pgb, ctx))] async fn handle_pagerequests( &self, pgb: &mut PostgresBackend, tenant_id: TenantId, timeline_id: TimelineId, + ctx: RequestContext, ) -> anyhow::Result<()> { // NOTE: pagerequests handler exits when connection is closed, // so there is no need to reset the association task_mgr::associate_with(Some(tenant_id), Some(timeline_id)); // Make request tracer if needed - let tenant = get_active_tenant_with_timeout(tenant_id).await?; + let tenant = get_active_tenant_with_timeout(tenant_id, &ctx).await?; let mut tracer = if tenant.get_trace_read_requests() { let connection_id = ConnectionId::generate(); let path = tenant @@ -329,22 +347,27 @@ impl PageServerHandler { let neon_fe_msg = PagestreamFeMessage::parse(&mut copy_data_bytes.reader())?; + // TODO: We could create a new per-request context here, with unique ID. + // Currently we use the same per-timeline context for all requests + let response = match neon_fe_msg { PagestreamFeMessage::Exists(req) => { let _timer = metrics.get_rel_exists.start_timer(); - self.handle_get_rel_exists_request(&timeline, &req).await + self.handle_get_rel_exists_request(&timeline, &req, &ctx) + .await } PagestreamFeMessage::Nblocks(req) => { let _timer = metrics.get_rel_size.start_timer(); - self.handle_get_nblocks_request(&timeline, &req).await + self.handle_get_nblocks_request(&timeline, &req, &ctx).await } PagestreamFeMessage::GetPage(req) => { let _timer = metrics.get_page_at_lsn.start_timer(); - self.handle_get_page_at_lsn_request(&timeline, &req).await + self.handle_get_page_at_lsn_request(&timeline, &req, &ctx) + .await } PagestreamFeMessage::DbSize(req) => { let _timer = metrics.get_db_size.start_timer(); - self.handle_db_size_request(&timeline, &req).await + self.handle_db_size_request(&timeline, &req, &ctx).await } }; @@ -363,7 +386,8 @@ impl PageServerHandler { Ok(()) } - #[instrument(skip(self, pgb))] + #[allow(clippy::too_many_arguments)] + #[instrument(skip(self, pgb, ctx))] async fn handle_import_basebackup( &self, pgb: &mut PostgresBackend, @@ -372,12 +396,13 @@ impl PageServerHandler { base_lsn: Lsn, _end_lsn: Lsn, pg_version: u32, + ctx: RequestContext, ) -> Result<(), QueryError> { task_mgr::associate_with(Some(tenant_id), Some(timeline_id)); // Create empty timeline info!("creating new timeline"); - let tenant = get_active_tenant_with_timeout(tenant_id).await?; - let timeline = tenant.create_empty_timeline(timeline_id, base_lsn, pg_version)?; + let tenant = get_active_tenant_with_timeout(tenant_id, &ctx).await?; + let timeline = tenant.create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx)?; // TODO mark timeline as not ready until it reaches end_lsn. 
// We might have some wal to import as well, and we should prevent compute @@ -396,7 +421,7 @@ impl PageServerHandler { let mut copyin_stream = Box::pin(copyin_stream(pgb)); timeline - .import_basebackup_from_tar(&mut copyin_stream, base_lsn) + .import_basebackup_from_tar(&mut copyin_stream, base_lsn, &ctx) .await?; // Drain the rest of the Copy data @@ -418,7 +443,7 @@ impl PageServerHandler { Ok(()) } - #[instrument(skip(self, pgb))] + #[instrument(skip(self, pgb, ctx))] async fn handle_import_wal( &self, pgb: &mut PostgresBackend, @@ -426,10 +451,11 @@ impl PageServerHandler { timeline_id: TimelineId, start_lsn: Lsn, end_lsn: Lsn, + ctx: RequestContext, ) -> Result<(), QueryError> { task_mgr::associate_with(Some(tenant_id), Some(timeline_id)); - let timeline = get_active_timeline_with_timeout(tenant_id, timeline_id).await?; + let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?; let last_record_lsn = timeline.get_last_record_lsn(); if last_record_lsn != start_lsn { return Err(QueryError::Other( @@ -446,7 +472,7 @@ impl PageServerHandler { pgb.flush().await?; let mut copyin_stream = Box::pin(copyin_stream(pgb)); let mut reader = tokio_util::io::StreamReader::new(&mut copyin_stream); - import_wal_from_tar(&timeline, &mut reader, start_lsn, end_lsn).await?; + import_wal_from_tar(&timeline, &mut reader, start_lsn, end_lsn, &ctx).await?; info!("wal import complete"); // Drain the rest of the Copy data @@ -492,6 +518,7 @@ impl PageServerHandler { mut lsn: Lsn, latest: bool, latest_gc_cutoff_lsn: &RcuReadGuard, + ctx: &RequestContext, ) -> anyhow::Result { if latest { // Latest page version was requested. If LSN is given, it is a hint @@ -515,7 +542,7 @@ impl PageServerHandler { if lsn <= last_record_lsn { lsn = last_record_lsn; } else { - timeline.wait_lsn(lsn).await?; + timeline.wait_lsn(lsn, ctx).await?; // Since we waited for 'lsn' to arrive, that is now the last // record LSN. 
(Or close enough for our purposes; the // last-record LSN can advance immediately after we return @@ -525,7 +552,7 @@ impl PageServerHandler { if lsn == Lsn(0) { anyhow::bail!("invalid LSN(0) in request"); } - timeline.wait_lsn(lsn).await?; + timeline.wait_lsn(lsn, ctx).await?; } anyhow::ensure!( lsn >= **latest_gc_cutoff_lsn, @@ -535,52 +562,60 @@ impl PageServerHandler { Ok(lsn) } - #[instrument(skip(self, timeline, req), fields(rel = %req.rel, req_lsn = %req.lsn))] + #[instrument(skip(self, timeline, req, ctx), fields(rel = %req.rel, req_lsn = %req.lsn))] async fn handle_get_rel_exists_request( &self, timeline: &Timeline, req: &PagestreamExistsRequest, + ctx: &RequestContext, ) -> anyhow::Result { let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); - let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn) - .await?; + let lsn = + Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx) + .await?; - let exists = timeline.get_rel_exists(req.rel, lsn, req.latest).await?; + let exists = timeline + .get_rel_exists(req.rel, lsn, req.latest, ctx) + .await?; Ok(PagestreamBeMessage::Exists(PagestreamExistsResponse { exists, })) } - #[instrument(skip(self, timeline, req), fields(rel = %req.rel, req_lsn = %req.lsn))] + #[instrument(skip(self, timeline, req, ctx), fields(rel = %req.rel, req_lsn = %req.lsn))] async fn handle_get_nblocks_request( &self, timeline: &Timeline, req: &PagestreamNblocksRequest, + ctx: &RequestContext, ) -> anyhow::Result { let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); - let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn) - .await?; + let lsn = + Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx) + .await?; - let n_blocks = timeline.get_rel_size(req.rel, lsn, req.latest).await?; + let n_blocks = timeline.get_rel_size(req.rel, lsn, req.latest, ctx).await?; Ok(PagestreamBeMessage::Nblocks(PagestreamNblocksResponse { n_blocks, })) } - #[instrument(skip(self, timeline, req), fields(dbnode = %req.dbnode, req_lsn = %req.lsn))] + #[instrument(skip(self, timeline, req, ctx), fields(dbnode = %req.dbnode, req_lsn = %req.lsn))] async fn handle_db_size_request( &self, timeline: &Timeline, req: &PagestreamDbSizeRequest, + ctx: &RequestContext, ) -> anyhow::Result { let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); - let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn) - .await?; + let lsn = + Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx) + .await?; let total_blocks = timeline - .get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, lsn, req.latest) + .get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, lsn, req.latest, ctx) .await?; let db_size = total_blocks as i64 * BLCKSZ as i64; @@ -589,15 +624,17 @@ impl PageServerHandler { })) } - #[instrument(skip(self, timeline, req), fields(rel = %req.rel, blkno = %req.blkno, req_lsn = %req.lsn))] + #[instrument(skip(self, timeline, req, ctx), fields(rel = %req.rel, blkno = %req.blkno, req_lsn = %req.lsn))] async fn handle_get_page_at_lsn_request( &self, timeline: &Timeline, req: &PagestreamGetPageRequest, + ctx: &RequestContext, ) -> anyhow::Result { let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); - let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn) - .await?; + let lsn = + Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, 
&latest_gc_cutoff_lsn, ctx) + .await?; /* // Add a 1s delay to some requests. The delay helps the requests to // hit the race condition from github issue #1047 more easily. @@ -608,7 +645,7 @@ impl PageServerHandler { */ let page = timeline - .get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest) + .get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest, ctx) .await?; Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse { @@ -616,23 +653,25 @@ impl PageServerHandler { })) } - #[instrument(skip(self, pgb))] + #[allow(clippy::too_many_arguments)] + #[instrument(skip(self, pgb, ctx))] async fn handle_basebackup_request( - &self, + &mut self, pgb: &mut PostgresBackend, tenant_id: TenantId, timeline_id: TimelineId, lsn: Option, prev_lsn: Option, full_backup: bool, + ctx: RequestContext, ) -> anyhow::Result<()> { // check that the timeline exists - let timeline = get_active_timeline_with_timeout(tenant_id, timeline_id).await?; + let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?; let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); if let Some(lsn) = lsn { // Backup was requested at a particular LSN. Wait for it to arrive. info!("waiting for {}", lsn); - timeline.wait_lsn(lsn).await?; + timeline.wait_lsn(lsn, &ctx).await?; timeline .check_lsn_is_in_scope(lsn, &latest_gc_cutoff_lsn) .context("invalid basebackup lsn")?; @@ -645,8 +684,15 @@ impl PageServerHandler { // Send a tarball of the latest layer on the timeline { let mut writer = pgb.copyout_writer(); - basebackup::send_basebackup_tarball(&mut writer, &timeline, lsn, prev_lsn, full_backup) - .await?; + basebackup::send_basebackup_tarball( + &mut writer, + &timeline, + lsn, + prev_lsn, + full_backup, + &ctx, + ) + .await?; } pgb.write_message(&BeMessage::CopyDone)?; @@ -717,6 +763,7 @@ impl postgres_backend_async::Handler for PageServerHandler { pgb: &mut PostgresBackend, query_string: &str, ) -> Result<(), QueryError> { + let ctx = self.connection_ctx.attached_child(); debug!("process query {query_string:?}"); if query_string.starts_with("pagestream ") { @@ -734,7 +781,7 @@ impl postgres_backend_async::Handler for PageServerHandler { self.check_permission(Some(tenant_id))?; - self.handle_pagerequests(pgb, tenant_id, timeline_id) + self.handle_pagerequests(pgb, tenant_id, timeline_id, ctx) .await?; } else if query_string.starts_with("basebackup ") { let (_, params_raw) = query_string.split_at("basebackup ".len()); @@ -763,7 +810,7 @@ impl postgres_backend_async::Handler for PageServerHandler { }; // Check that the timeline exists - self.handle_basebackup_request(pgb, tenant_id, timeline_id, lsn, None, false) + self.handle_basebackup_request(pgb, tenant_id, timeline_id, lsn, None, false, ctx) .await?; pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; } @@ -784,7 +831,7 @@ impl postgres_backend_async::Handler for PageServerHandler { .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?; self.check_permission(Some(tenant_id))?; - let timeline = get_active_timeline_with_timeout(tenant_id, timeline_id).await?; + let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?; let end_of_timeline = timeline.get_last_record_rlsn(); @@ -835,7 +882,7 @@ impl postgres_backend_async::Handler for PageServerHandler { self.check_permission(Some(tenant_id))?; // Check that the timeline exists - self.handle_basebackup_request(pgb, tenant_id, timeline_id, lsn, prev_lsn, true) + self.handle_basebackup_request(pgb, tenant_id, timeline_id, lsn, prev_lsn, true, ctx) 
.await?; pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; } else if query_string.starts_with("import basebackup ") { @@ -878,6 +925,7 @@ impl postgres_backend_async::Handler for PageServerHandler { base_lsn, end_lsn, pg_version, + ctx, ) .await { @@ -914,7 +962,7 @@ impl postgres_backend_async::Handler for PageServerHandler { self.check_permission(Some(tenant_id))?; match self - .handle_import_wal(pgb, tenant_id, timeline_id, start_lsn, end_lsn) + .handle_import_wal(pgb, tenant_id, timeline_id, start_lsn, end_lsn, ctx) .await { Ok(()) => pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?, @@ -944,7 +992,7 @@ impl postgres_backend_async::Handler for PageServerHandler { self.check_permission(Some(tenant_id))?; - let tenant = get_active_tenant_with_timeout(tenant_id).await?; + let tenant = get_active_tenant_with_timeout(tenant_id, &ctx).await?; pgb.write_message(&BeMessage::RowDescription(&[ RowDescriptor::int8_col(b"checkpoint_distance"), RowDescriptor::int8_col(b"checkpoint_timeout"), @@ -990,27 +1038,66 @@ impl postgres_backend_async::Handler for PageServerHandler { } } +#[derive(thiserror::Error, Debug)] +enum GetActiveTenantError { + #[error( + "Timed out waiting {wait_time:?} for tenant active state. Latest state: {latest_state:?}" + )] + WaitForActiveTimeout { + latest_state: TenantState, + wait_time: Duration, + }, + #[error(transparent)] + Other(#[from] anyhow::Error), +} + +impl From for QueryError { + fn from(e: GetActiveTenantError) -> Self { + match e { + GetActiveTenantError::WaitForActiveTimeout { .. } => QueryError::Disconnected( + ConnectionError::Socket(io::Error::new(io::ErrorKind::TimedOut, e.to_string())), + ), + GetActiveTenantError::Other(e) => QueryError::Other(e), + } + } +} + /// Get active tenant. /// /// If the tenant is Loading, waits for it to become Active, for up to 30 s. That /// ensures that queries don't fail immediately after pageserver startup, because /// all tenants are still loading. -async fn get_active_tenant_with_timeout(tenant_id: TenantId) -> anyhow::Result> { +async fn get_active_tenant_with_timeout( + tenant_id: TenantId, + _ctx: &RequestContext, /* requires a context to support cancellation in the future */ +) -> Result, GetActiveTenantError> { let tenant = mgr::get_tenant(tenant_id, false).await?; - match tokio::time::timeout(Duration::from_secs(30), tenant.wait_to_become_active()).await { - Ok(wait_result) => wait_result - // no .context(), the error message is good enough and some tests depend on it - .map(move |()| tenant), - Err(_) => anyhow::bail!("Timeout waiting for tenant {tenant_id} to become Active"), + let wait_time = Duration::from_secs(30); + match tokio::time::timeout(wait_time, tenant.wait_to_become_active()).await { + Ok(Ok(())) => Ok(tenant), + // no .context(), the error message is good enough and some tests depend on it + Ok(Err(wait_error)) => Err(GetActiveTenantError::Other(wait_error)), + Err(_) => { + let latest_state = tenant.current_state(); + if latest_state == TenantState::Active { + Ok(tenant) + } else { + Err(GetActiveTenantError::WaitForActiveTimeout { + latest_state, + wait_time, + }) + } + } } } /// Shorthand for getting a reference to a Timeline of an Active tenant.
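// Illustrative sketch (hypothetical handler, not from the diff): thanks to the
// `From<GetActiveTenantError> for QueryError` impl above, callers can use `?`, and a
// wait-for-active timeout surfaces to the client as a timed-out connection error
// rather than a generic internal error.
async fn active_tenant_caller_example(
    tenant_id: TenantId,
    ctx: &RequestContext,
) -> Result<(), QueryError> {
    // Waits up to 30 s for the tenant to leave Loading/Attaching; the final
    // current_state() re-check above avoids a spurious timeout if the tenant
    // became Active just as the timer fired.
    let _tenant = get_active_tenant_with_timeout(tenant_id, ctx).await?;
    Ok(())
}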
-async fn get_active_timeline_with_timeout( +async fn get_active_tenant_timeline( tenant_id: TenantId, timeline_id: TimelineId, -) -> anyhow::Result> { - get_active_tenant_with_timeout(tenant_id) - .await - .and_then(|tenant| tenant.get_timeline(timeline_id, true)) + ctx: &RequestContext, +) -> Result, GetActiveTenantError> { + let tenant = get_active_tenant_with_timeout(tenant_id, ctx).await?; + let timeline = tenant.get_timeline(timeline_id, true)?; + Ok(timeline) } diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index cc521c5e35..6f9035305d 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -7,6 +7,7 @@ //! Clarify that) //! use super::tenant::{PageReconstructError, Timeline}; +use crate::context::RequestContext; use crate::keyspace::{KeySpace, KeySpaceAccum}; use crate::repository::*; use crate::walrecord::NeonWalRecord; @@ -97,6 +98,7 @@ impl Timeline { blknum: BlockNumber, lsn: Lsn, latest: bool, + ctx: &RequestContext, ) -> Result { if tag.relnode == 0 { return Err(PageReconstructError::Other(anyhow::anyhow!( @@ -104,7 +106,7 @@ impl Timeline { ))); } - let nblocks = self.get_rel_size(tag, lsn, latest).await?; + let nblocks = self.get_rel_size(tag, lsn, latest, ctx).await?; if blknum >= nblocks { debug!( "read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page", @@ -114,7 +116,7 @@ impl Timeline { } let key = rel_block_to_key(tag, blknum); - self.get(key, lsn).await + self.get(key, lsn, ctx).await } // Get size of a database in blocks @@ -124,13 +126,14 @@ impl Timeline { dbnode: Oid, lsn: Lsn, latest: bool, + ctx: &RequestContext, ) -> Result { let mut total_blocks = 0; - let rels = self.list_rels(spcnode, dbnode, lsn).await?; + let rels = self.list_rels(spcnode, dbnode, lsn, ctx).await?; for rel in rels { - let n_blocks = self.get_rel_size(rel, lsn, latest).await?; + let n_blocks = self.get_rel_size(rel, lsn, latest, ctx).await?; total_blocks += n_blocks as usize; } Ok(total_blocks) @@ -142,6 +145,7 @@ impl Timeline { tag: RelTag, lsn: Lsn, latest: bool, + ctx: &RequestContext, ) -> Result { if tag.relnode == 0 { return Err(PageReconstructError::Other(anyhow::anyhow!( @@ -154,7 +158,7 @@ impl Timeline { } if (tag.forknum == FSM_FORKNUM || tag.forknum == VISIBILITYMAP_FORKNUM) - && !self.get_rel_exists(tag, lsn, latest).await? + && !self.get_rel_exists(tag, lsn, latest, ctx).await? 
{ // FIXME: Postgres sometimes calls smgrcreate() to create // FSM, and smgrnblocks() on it immediately afterwards, @@ -164,7 +168,7 @@ impl Timeline { } let key = rel_size_to_key(tag); - let mut buf = self.get(key, lsn).await?; + let mut buf = self.get(key, lsn, ctx).await?; let nblocks = buf.get_u32_le(); if latest { @@ -186,6 +190,7 @@ impl Timeline { tag: RelTag, lsn: Lsn, _latest: bool, + ctx: &RequestContext, ) -> Result { if tag.relnode == 0 { return Err(PageReconstructError::Other(anyhow::anyhow!( @@ -199,7 +204,7 @@ impl Timeline { } // fetch directory listing let key = rel_dir_to_key(tag.spcnode, tag.dbnode); - let buf = self.get(key, lsn).await?; + let buf = self.get(key, lsn, ctx).await?; match RelDirectory::des(&buf).context("deserialization failure") { Ok(dir) => { @@ -216,10 +221,11 @@ impl Timeline { spcnode: Oid, dbnode: Oid, lsn: Lsn, + ctx: &RequestContext, ) -> Result, PageReconstructError> { // fetch directory listing let key = rel_dir_to_key(spcnode, dbnode); - let buf = self.get(key, lsn).await?; + let buf = self.get(key, lsn, ctx).await?; match RelDirectory::des(&buf).context("deserialization failure") { Ok(dir) => { @@ -244,9 +250,10 @@ impl Timeline { segno: u32, blknum: BlockNumber, lsn: Lsn, + ctx: &RequestContext, ) -> Result { let key = slru_block_to_key(kind, segno, blknum); - self.get(key, lsn).await + self.get(key, lsn, ctx).await } /// Get size of an SLRU segment @@ -255,9 +262,10 @@ impl Timeline { kind: SlruKind, segno: u32, lsn: Lsn, + ctx: &RequestContext, ) -> Result { let key = slru_segment_size_to_key(kind, segno); - let mut buf = self.get(key, lsn).await?; + let mut buf = self.get(key, lsn, ctx).await?; Ok(buf.get_u32_le()) } @@ -267,10 +275,11 @@ impl Timeline { kind: SlruKind, segno: u32, lsn: Lsn, + ctx: &RequestContext, ) -> Result { // fetch directory listing let key = slru_dir_to_key(kind); - let buf = self.get(key, lsn).await?; + let buf = self.get(key, lsn, ctx).await?; match SlruSegmentDirectory::des(&buf).context("deserialization failure") { Ok(dir) => { @@ -291,6 +300,7 @@ impl Timeline { pub async fn find_lsn_for_timestamp( &self, search_timestamp: TimestampTz, + ctx: &RequestContext, ) -> Result { let gc_cutoff_lsn_guard = self.get_latest_gc_cutoff_lsn(); let min_lsn = *gc_cutoff_lsn_guard; @@ -313,6 +323,7 @@ impl Timeline { Lsn(mid * 8), &mut found_smaller, &mut found_larger, + ctx, ) .await?; @@ -362,14 +373,18 @@ impl Timeline { probe_lsn: Lsn, found_smaller: &mut bool, found_larger: &mut bool, + ctx: &RequestContext, ) -> Result { - for segno in self.list_slru_segments(SlruKind::Clog, probe_lsn).await? { + for segno in self + .list_slru_segments(SlruKind::Clog, probe_lsn, ctx) + .await? 
+ { let nblocks = self - .get_slru_segment_size(SlruKind::Clog, segno, probe_lsn) + .get_slru_segment_size(SlruKind::Clog, segno, probe_lsn, ctx) .await?; for blknum in (0..nblocks).rev() { let clog_page = self - .get_slru_page_at_lsn(SlruKind::Clog, segno, blknum, probe_lsn) + .get_slru_page_at_lsn(SlruKind::Clog, segno, blknum, probe_lsn, ctx) .await?; if clog_page.len() == BLCKSZ as usize + 8 { @@ -394,11 +409,12 @@ impl Timeline { &self, kind: SlruKind, lsn: Lsn, + ctx: &RequestContext, ) -> Result, PageReconstructError> { // fetch directory entry let key = slru_dir_to_key(kind); - let buf = self.get(key, lsn).await?; + let buf = self.get(key, lsn, ctx).await?; match SlruSegmentDirectory::des(&buf).context("deserialization failure") { Ok(dir) => Ok(dir.segments), Err(e) => Err(PageReconstructError::from(e)), @@ -410,18 +426,21 @@ impl Timeline { spcnode: Oid, dbnode: Oid, lsn: Lsn, + ctx: &RequestContext, ) -> Result { let key = relmap_file_key(spcnode, dbnode); - self.get(key, lsn).await + let buf = self.get(key, lsn, ctx).await?; + Ok(buf) } pub async fn list_dbdirs( &self, lsn: Lsn, + ctx: &RequestContext, ) -> Result, PageReconstructError> { // fetch directory entry - let buf = self.get(DBDIR_KEY, lsn).await?; + let buf = self.get(DBDIR_KEY, lsn, ctx).await?; match DbDirectory::des(&buf).context("deserialization failure") { Ok(dir) => Ok(dir.dbdirs), @@ -433,18 +452,20 @@ impl Timeline { &self, xid: TransactionId, lsn: Lsn, + ctx: &RequestContext, ) -> Result { let key = twophase_file_key(xid); - let buf = self.get(key, lsn).await?; + let buf = self.get(key, lsn, ctx).await?; Ok(buf) } pub async fn list_twophase_files( &self, lsn: Lsn, + ctx: &RequestContext, ) -> Result, PageReconstructError> { // fetch directory entry - let buf = self.get(TWOPHASEDIR_KEY, lsn).await?; + let buf = self.get(TWOPHASEDIR_KEY, lsn, ctx).await?; match TwoPhaseDirectory::des(&buf).context("deserialization failure") { Ok(dir) => Ok(dir.xids), @@ -452,12 +473,20 @@ impl Timeline { } } - pub async fn get_control_file(&self, lsn: Lsn) -> Result { - self.get(CONTROLFILE_KEY, lsn).await + pub async fn get_control_file( + &self, + lsn: Lsn, + ctx: &RequestContext, + ) -> Result { + self.get(CONTROLFILE_KEY, lsn, ctx).await } - pub async fn get_checkpoint(&self, lsn: Lsn) -> Result { - self.get(CHECKPOINT_KEY, lsn).await + pub async fn get_checkpoint( + &self, + lsn: Lsn, + ctx: &RequestContext, + ) -> Result { + self.get(CHECKPOINT_KEY, lsn, ctx).await } /// Does the same as get_current_logical_size but counted on demand. @@ -469,15 +498,16 @@ impl Timeline { &self, lsn: Lsn, cancel: CancellationToken, + ctx: &RequestContext, ) -> Result { // Fetch list of database dirs and iterate them - let buf = self.get(DBDIR_KEY, lsn).await.context("read dbdir")?; + let buf = self.get(DBDIR_KEY, lsn, ctx).await.context("read dbdir")?; let dbdir = DbDirectory::des(&buf).context("deserialize db directory")?; let mut total_size: u64 = 0; for (spcnode, dbnode) in dbdir.dbdirs.keys() { for rel in self - .list_rels(*spcnode, *dbnode, lsn) + .list_rels(*spcnode, *dbnode, lsn, ctx) .await .context("list rels")? 
{ @@ -486,9 +516,9 @@ impl Timeline { } let relsize_key = rel_size_to_key(rel); let mut buf = self - .get(relsize_key, lsn) + .get(relsize_key, lsn, ctx) .await - .context("read relation size of {rel:?}")?; + .with_context(|| format!("read relation size of {rel:?}"))?; let relsize = buf.get_u32_le(); total_size += relsize as u64; @@ -501,7 +531,11 @@ impl Timeline { /// Get a KeySpace that covers all the Keys that are in use at the given LSN. /// Anything that's not listed maybe removed from the underlying storage (from /// that LSN forwards). - pub async fn collect_keyspace(&self, lsn: Lsn) -> anyhow::Result { + pub async fn collect_keyspace( + &self, + lsn: Lsn, + ctx: &RequestContext, + ) -> anyhow::Result { // Iterate through key ranges, greedily packing them into partitions let mut result = KeySpaceAccum::new(); @@ -509,7 +543,7 @@ impl Timeline { result.add_key(DBDIR_KEY); // Fetch list of database dirs and iterate them - let buf = self.get(DBDIR_KEY, lsn).await?; + let buf = self.get(DBDIR_KEY, lsn, ctx).await?; let dbdir = DbDirectory::des(&buf).context("deserialization failure")?; let mut dbs: Vec<(Oid, Oid)> = dbdir.dbdirs.keys().cloned().collect(); @@ -519,14 +553,14 @@ impl Timeline { result.add_key(rel_dir_to_key(spcnode, dbnode)); let mut rels: Vec = self - .list_rels(spcnode, dbnode, lsn) + .list_rels(spcnode, dbnode, lsn, ctx) .await? .into_iter() .collect(); rels.sort_unstable(); for rel in rels { let relsize_key = rel_size_to_key(rel); - let mut buf = self.get(relsize_key, lsn).await?; + let mut buf = self.get(relsize_key, lsn, ctx).await?; let relsize = buf.get_u32_le(); result.add_range(rel_block_to_key(rel, 0)..rel_block_to_key(rel, relsize)); @@ -542,13 +576,13 @@ impl Timeline { ] { let slrudir_key = slru_dir_to_key(kind); result.add_key(slrudir_key); - let buf = self.get(slrudir_key, lsn).await?; + let buf = self.get(slrudir_key, lsn, ctx).await?; let dir = SlruSegmentDirectory::des(&buf).context("deserialization failure")?; let mut segments: Vec = dir.segments.iter().cloned().collect(); segments.sort_unstable(); for segno in segments { let segsize_key = slru_segment_size_to_key(kind, segno); - let mut buf = self.get(segsize_key, lsn).await?; + let mut buf = self.get(segsize_key, lsn, ctx).await?; let segsize = buf.get_u32_le(); result.add_range( @@ -560,7 +594,7 @@ impl Timeline { // Then pg_twophase result.add_key(TWOPHASEDIR_KEY); - let buf = self.get(TWOPHASEDIR_KEY, lsn).await?; + let buf = self.get(TWOPHASEDIR_KEY, lsn, ctx).await?; let twophase_dir = TwoPhaseDirectory::des(&buf).context("deserialization failure")?; let mut xids: Vec = twophase_dir.xids.iter().cloned().collect(); xids.sort_unstable(); @@ -723,9 +757,10 @@ impl<'a> DatadirModification<'a> { spcnode: Oid, dbnode: Oid, img: Bytes, + ctx: &RequestContext, ) -> anyhow::Result<()> { // Add it to the directory (if it doesn't exist already) - let buf = self.get(DBDIR_KEY).await?; + let buf = self.get(DBDIR_KEY, ctx).await?; let mut dbdir = DbDirectory::des(&buf)?; let r = dbdir.dbdirs.insert((spcnode, dbnode), true); @@ -755,9 +790,10 @@ impl<'a> DatadirModification<'a> { &mut self, xid: TransactionId, img: Bytes, + ctx: &RequestContext, ) -> anyhow::Result<()> { // Add it to the directory entry - let buf = self.get(TWOPHASEDIR_KEY).await?; + let buf = self.get(TWOPHASEDIR_KEY, ctx).await?; let mut dir = TwoPhaseDirectory::des(&buf)?; if !dir.xids.insert(xid) { anyhow::bail!("twophase file for xid {} already exists", xid); @@ -781,16 +817,21 @@ impl<'a> DatadirModification<'a> { Ok(()) } - pub async 
fn drop_dbdir(&mut self, spcnode: Oid, dbnode: Oid) -> anyhow::Result<()> { + pub async fn drop_dbdir( + &mut self, + spcnode: Oid, + dbnode: Oid, + ctx: &RequestContext, + ) -> anyhow::Result<()> { let req_lsn = self.tline.get_last_record_lsn(); let total_blocks = self .tline - .get_db_size(spcnode, dbnode, req_lsn, true) + .get_db_size(spcnode, dbnode, req_lsn, true, ctx) .await?; // Remove entry from dbdir - let buf = self.get(DBDIR_KEY).await?; + let buf = self.get(DBDIR_KEY, ctx).await?; let mut dir = DbDirectory::des(&buf)?; if dir.dbdirs.remove(&(spcnode, dbnode)).is_some() { let buf = DbDirectory::ser(&dir)?; @@ -817,11 +858,12 @@ impl<'a> DatadirModification<'a> { &mut self, rel: RelTag, nblocks: BlockNumber, + ctx: &RequestContext, ) -> anyhow::Result<()> { anyhow::ensure!(rel.relnode != 0, "invalid relnode"); // It's possible that this is the first rel for this db in this // tablespace. Create the reldir entry for it if so. - let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY).await?)?; + let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY, ctx).await?)?; let rel_dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode); let mut rel_dir = if dbdir.dbdirs.get(&(rel.spcnode, rel.dbnode)).is_none() { // Didn't exist. Update dbdir @@ -833,7 +875,7 @@ impl<'a> DatadirModification<'a> { RelDirectory::default() } else { // reldir already exists, fetch it - RelDirectory::des(&self.get(rel_dir_key).await?)? + RelDirectory::des(&self.get(rel_dir_key, ctx).await?)? }; // Add the new relation to the rel directory entry, and write it back @@ -865,13 +907,14 @@ impl<'a> DatadirModification<'a> { &mut self, rel: RelTag, nblocks: BlockNumber, + ctx: &RequestContext, ) -> anyhow::Result<()> { anyhow::ensure!(rel.relnode != 0, "invalid relnode"); let last_lsn = self.tline.get_last_record_lsn(); - if self.tline.get_rel_exists(rel, last_lsn, true).await? { + if self.tline.get_rel_exists(rel, last_lsn, true, ctx).await? { let size_key = rel_size_to_key(rel); // Fetch the old size first - let old_size = self.get(size_key).await?.get_u32_le(); + let old_size = self.get(size_key, ctx).await?.get_u32_le(); // Update the entry with the new size. let buf = nblocks.to_le_bytes(); @@ -895,12 +938,13 @@ impl<'a> DatadirModification<'a> { &mut self, rel: RelTag, nblocks: BlockNumber, + ctx: &RequestContext, ) -> anyhow::Result<()> { anyhow::ensure!(rel.relnode != 0, "invalid relnode"); // Put size let size_key = rel_size_to_key(rel); - let old_size = self.get(size_key).await?.get_u32_le(); + let old_size = self.get(size_key, ctx).await?.get_u32_le(); // only extend relation here. never decrease the size if nblocks > old_size { @@ -916,12 +960,12 @@ impl<'a> DatadirModification<'a> { } /// Drop a relation. 
- pub async fn put_rel_drop(&mut self, rel: RelTag) -> anyhow::Result<()> { + pub async fn put_rel_drop(&mut self, rel: RelTag, ctx: &RequestContext) -> anyhow::Result<()> { anyhow::ensure!(rel.relnode != 0, "invalid relnode"); // Remove it from the directory entry let dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode); - let buf = self.get(dir_key).await?; + let buf = self.get(dir_key, ctx).await?; let mut dir = RelDirectory::des(&buf)?; if dir.rels.remove(&(rel.relnode, rel.forknum)) { @@ -932,7 +976,7 @@ impl<'a> DatadirModification<'a> { // update logical size let size_key = rel_size_to_key(rel); - let old_size = self.get(size_key).await?.get_u32_le(); + let old_size = self.get(size_key, ctx).await?.get_u32_le(); self.pending_nblocks -= old_size as i64; // Remove enty from relation size cache @@ -949,10 +993,11 @@ impl<'a> DatadirModification<'a> { kind: SlruKind, segno: u32, nblocks: BlockNumber, + ctx: &RequestContext, ) -> anyhow::Result<()> { // Add it to the directory entry let dir_key = slru_dir_to_key(kind); - let buf = self.get(dir_key).await?; + let buf = self.get(dir_key, ctx).await?; let mut dir = SlruSegmentDirectory::des(&buf)?; if !dir.segments.insert(segno) { @@ -988,10 +1033,15 @@ impl<'a> DatadirModification<'a> { } /// This method is used for marking truncated SLRU files - pub async fn drop_slru_segment(&mut self, kind: SlruKind, segno: u32) -> anyhow::Result<()> { + pub async fn drop_slru_segment( + &mut self, + kind: SlruKind, + segno: u32, + ctx: &RequestContext, + ) -> anyhow::Result<()> { // Remove it from the directory entry let dir_key = slru_dir_to_key(kind); - let buf = self.get(dir_key).await?; + let buf = self.get(dir_key, ctx).await?; let mut dir = SlruSegmentDirectory::des(&buf)?; if !dir.segments.remove(&segno) { @@ -1015,9 +1065,13 @@ impl<'a> DatadirModification<'a> { } /// This method is used for marking truncated SLRU files - pub async fn drop_twophase_file(&mut self, xid: TransactionId) -> anyhow::Result<()> { + pub async fn drop_twophase_file( + &mut self, + xid: TransactionId, + ctx: &RequestContext, + ) -> anyhow::Result<()> { // Remove it from the directory entry - let buf = self.get(TWOPHASEDIR_KEY).await?; + let buf = self.get(TWOPHASEDIR_KEY, ctx).await?; let mut dir = TwoPhaseDirectory::des(&buf)?; if !dir.xids.remove(&xid) { @@ -1111,7 +1165,7 @@ impl<'a> DatadirModification<'a> { // Internal helper functions to batch the modifications - async fn get(&self, key: Key) -> Result { + async fn get(&self, key: Key, ctx: &RequestContext) -> Result { // Have we already updated the same key? Read the pending updated // version in that case. // @@ -1132,7 +1186,7 @@ impl<'a> DatadirModification<'a> { } } else { let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn); - self.tline.get(key, lsn).await + self.tline.get(key, lsn, ctx).await } } @@ -1542,10 +1596,11 @@ pub fn create_test_timeline( tenant: &crate::tenant::Tenant, timeline_id: utils::id::TimelineId, pg_version: u32, + ctx: &RequestContext, ) -> anyhow::Result> { let tline = tenant - .create_empty_timeline(timeline_id, Lsn(8), pg_version)? - .initialize()?; + .create_empty_timeline(timeline_id, Lsn(8), pg_version, ctx)? 
+ .initialize(ctx)?; let mut m = tline.begin_modification(Lsn(8)); m.init_empty()?; m.commit()?; @@ -1598,7 +1653,7 @@ mod tests { assert!(tline.list_rels(0, TESTDB, Lsn(0x30))?.contains(&TESTREL_A)); // Create a branch, check that the relation is visible there - repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x30))?; + repo.branch_timeline(&tline, NEW_TIMELINE_ID, Lsn(0x30))?; let newtline = match repo.get_timeline(NEW_TIMELINE_ID)?.local_timeline() { Some(timeline) => timeline, None => panic!("Should have a local timeline"), diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index 586fd20886..092503b7c5 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -37,6 +37,17 @@ impl Key { | self.field6 as i128 } + pub fn from_i128(x: i128) -> Self { + Key { + field1: ((x >> 120) & 0xf) as u8, + field2: ((x >> 104) & 0xFFFF) as u32, + field3: (x >> 72) as u32, + field4: (x >> 40) as u32, + field5: (x >> 32) as u8, + field6: x as u32, + } + } + pub fn next(&self) -> Key { self.add(1) } diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index 02e2e2ee14..09716ba0e0 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -171,6 +171,9 @@ task_local! { /// #[derive(Debug, PartialEq, Eq, Clone, Copy)] pub enum TaskKind { + // Pageserver startup, i.e., `main` + Startup, + // libpq listener task. It just accepts connection and spawns a // PageRequestHandler task for each connection. LibpqEndpointListener, @@ -183,13 +186,37 @@ pub enum TaskKind { // associated with one later, after receiving a command from the client. PageRequestHandler, - // Manages the WAL receiver connection for one timeline. It subscribes to - // events from storage_broker, decides which safekeeper to connect to. It spawns a - // separate WalReceiverConnection task to handle each connection. + /// Manages the WAL receiver connection for one timeline. + /// It subscribes to events from storage_broker and decides which safekeeper to connect to. + /// Once the decision has been made, it establishes the connection using the `tokio-postgres` library. + /// There is at most one connection at any given time. + /// + /// That `tokio-postgres` library represents a connection as two objects: a `Client` and a `Connection`. + /// The `Client` object is what library users use to make requests & get responses. + /// Internally, `Client` hands over requests to the `Connection` object. + /// The `Connection` object is responsible for speaking the wire protocol. + /// + /// Walreceiver uses its own abstraction called `TaskHandle` to represent the activity of establishing and handling a connection. + /// That abstraction doesn't use `task_mgr`. + /// The [`WalReceiverManager`] task ensures that this `TaskHandle` task does not outlive the [`WalReceiverManager`] task. + /// For the `RequestContext` that we hand to the TaskHandle, we use the [`WalReceiverConnectionHandler`] task kind. + /// + /// Once the connection is established, the `TaskHandle` task creates a + /// [`WalReceiverConnectionPoller`] task_mgr task that is responsible for polling + /// the `Connection` object. + /// A `CancellationToken` created by the `TaskHandle` task ensures + /// that the [`WalReceiverConnectionPoller`] task will cancel soon after the `TaskHandle` is dropped. WalReceiverManager, - // Handles a connection to a safekeeper, to stream WAL to a timeline.
- WalReceiverConnection, + /// The `TaskHandle` task that executes [`walreceiver_connection::handle_walreceiver_connection`]. + /// Not a `task_mgr` task, but we use this `TaskKind` for its `RequestContext`. + /// See the comment on [`WalReceiverManager`]. + WalReceiverConnectionHandler, + + /// The task that polls the `tokio-postgres::Connection` object. + /// Spawned by task [`WalReceiverConnectionHandler`]. + /// See the comment on [`WalReceiverManager`]. + WalReceiverConnectionPoller, // Garbage collection worker. One per tenant GarbageCollector, @@ -200,6 +227,8 @@ pub enum TaskKind { // Initial logical size calculation InitialLogicalSizeCalculation, + OndemandLogicalSizeCalculation, + // Task that flushes frozen in-memory layers to disk LayerFlushTask, @@ -222,6 +251,12 @@ pub enum TaskKind { DownloadAllRemoteLayers, // Task that calculates synthetis size for all active tenants CalculateSyntheticSize, + + // A request that comes in via the pageserver HTTP API. + MgmtRequest, + + #[cfg(test)] + UnitTest, } #[derive(Default)] diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index c18c645e5b..2f45fe0dfc 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -48,9 +48,10 @@ use std::time::{Duration, Instant}; use self::metadata::TimelineMetadata; use self::remote_timeline_client::RemoteTimelineClient; use crate::config::PageServerConf; +use crate::context::{DownloadBehavior, RequestContext}; use crate::import_datadir; use crate::is_uninit_mark; -use crate::metrics::{remove_tenant_metrics, STORAGE_TIME}; +use crate::metrics::{remove_tenant_metrics, TENANT_STATE_METRIC}; use crate::repository::GcResult; use crate::task_mgr; use crate::task_mgr::TaskKind; @@ -174,7 +175,7 @@ impl UninitializedTimeline<'_> { /// /// The new timeline is initialized in Active state, and its background jobs are /// started - pub fn initialize(self) -> anyhow::Result> { + pub fn initialize(self, _ctx: &RequestContext) -> anyhow::Result> { let mut timelines = self.owning_tenant.timelines.lock().unwrap(); self.initialize_with_lock(&mut timelines, true, true) } @@ -188,7 +189,7 @@ impl UninitializedTimeline<'_> { mut self, timelines: &mut HashMap>, load_layer_map: bool, - launch_wal_receiver: bool, + activate: bool, ) -> anyhow::Result> { let timeline_id = self.timeline_id; let tenant_id = self.owning_tenant.tenant_id; @@ -221,13 +222,12 @@ impl UninitializedTimeline<'_> { "Failed to remove uninit mark file for timeline {tenant_id}/{timeline_id}" ) })?; - new_timeline.set_state(TimelineState::Active); v.insert(Arc::clone(&new_timeline)); new_timeline.maybe_spawn_flush_loop(); - if launch_wal_receiver { - new_timeline.launch_wal_receiver(); + if activate { + new_timeline.activate(); } } } @@ -240,11 +240,12 @@ impl UninitializedTimeline<'_> { self, copyin_stream: &mut (impl Stream> + Sync + Send + Unpin), base_lsn: Lsn, + ctx: &RequestContext, ) -> anyhow::Result> { let raw_timeline = self.raw_timeline()?; let mut reader = tokio_util::io::StreamReader::new(copyin_stream); - import_datadir::import_basebackup_from_tar(raw_timeline, &mut reader, base_lsn) + import_datadir::import_basebackup_from_tar(raw_timeline, &mut reader, base_lsn, ctx) .await .context("Failed to import basebackup")?; @@ -262,9 +263,7 @@ impl UninitializedTimeline<'_> { .await .context("Failed to flush after basebackup import")?; - let timeline = self.initialize()?; - - Ok(timeline) + self.initialize(ctx) } fn raw_timeline(&self) -> anyhow::Result<&Arc> { @@ -450,6 +449,7 @@ impl Tenant { /// /// If the operation 
fails, the timeline is left in the tenant's hash map in Broken state. On success, /// it is marked as Active. + #[allow(clippy::too_many_arguments)] async fn timeline_init_and_sync( &self, timeline_id: TimelineId, @@ -458,6 +458,7 @@ impl Tenant { local_metadata: Option, ancestor: Option>, first_save: bool, + _ctx: &RequestContext, ) -> anyhow::Result<()> { let tenant_id = self.tenant_id; @@ -573,6 +574,7 @@ impl Tenant { conf: &'static PageServerConf, tenant_id: TenantId, remote_storage: GenericRemoteStorage, + ctx: &RequestContext, ) -> Arc { // XXX: Attach should provide the config, especially during tenant migration. // See https://github.com/neondatabase/neon/issues/1555 @@ -591,6 +593,7 @@ impl Tenant { // Do all the hard work in the background let tenant_clone = Arc::clone(&tenant); + let ctx = ctx.detached_child(TaskKind::Attach, DownloadBehavior::Warn); task_mgr::spawn( &tokio::runtime::Handle::current(), TaskKind::Attach, @@ -599,7 +602,7 @@ impl Tenant { "attach tenant", false, async move { - match tenant_clone.attach().await { + match tenant_clone.attach(ctx).await { Ok(_) => {} Err(e) => { tenant_clone.set_broken(&e.to_string()); @@ -615,8 +618,8 @@ impl Tenant { /// /// Background task that downloads all data for a tenant and brings it to Active state. /// - #[instrument(skip(self), fields(tenant_id=%self.tenant_id))] - async fn attach(self: &Arc) -> anyhow::Result<()> { + #[instrument(skip(self, ctx), fields(tenant_id=%self.tenant_id))] + async fn attach(self: &Arc, ctx: RequestContext) -> anyhow::Result<()> { // Create directory with marker file to indicate attaching state. // The load_local_tenants() function in tenant::mgr relies on the marker file // to determine whether a tenant has finished attaching. @@ -716,6 +719,7 @@ impl Tenant { index_parts.remove(&timeline_id).unwrap(), remote_metadata, remote_clients.remove(&timeline_id).unwrap(), + &ctx, ) .await .with_context(|| { @@ -765,6 +769,7 @@ impl Tenant { index_part: IndexPart, remote_metadata: TimelineMetadata, remote_client: RemoteTimelineClient, + ctx: &RequestContext, ) -> anyhow::Result<()> { info!("downloading index file for timeline {}", timeline_id); tokio::fs::create_dir_all(self.conf.timeline_path(&timeline_id, &self.tenant_id)) @@ -799,6 +804,7 @@ impl Tenant { local_metadata, ancestor, true, + ctx, ) .await } @@ -827,11 +833,12 @@ impl Tenant { /// If the loading fails for some reason, the Tenant will go into Broken /// state. /// - #[instrument(skip(conf, remote_storage), fields(tenant_id=%tenant_id))] + #[instrument(skip(conf, remote_storage, ctx), fields(tenant_id=%tenant_id))] pub fn spawn_load( conf: &'static PageServerConf, tenant_id: TenantId, remote_storage: Option, + ctx: &RequestContext, ) -> Arc { let tenant_conf = match Self::load_tenant_config(conf, tenant_id) { Ok(conf) => conf, @@ -855,6 +862,7 @@ impl Tenant { // Do all the hard work in a background task let tenant_clone = Arc::clone(&tenant); + let ctx = ctx.detached_child(TaskKind::InitialLoad, DownloadBehavior::Warn); let _ = task_mgr::spawn( &tokio::runtime::Handle::current(), TaskKind::InitialLoad, @@ -863,7 +871,7 @@ impl Tenant { "initial tenant load", false, async move { - match tenant_clone.load().await { + match tenant_clone.load(&ctx).await { Ok(()) => {} Err(err) => { tenant_clone.set_broken(&err.to_string()); @@ -884,8 +892,8 @@ impl Tenant { /// Background task to load in-memory data structures for this tenant, from /// files on disk. Used at pageserver startup. 
/// - #[instrument(skip(self), fields(tenant_id=%self.tenant_id))] - async fn load(self: &Arc) -> anyhow::Result<()> { + #[instrument(skip(self, ctx), fields(tenant_id=%self.tenant_id))] + async fn load(self: &Arc, ctx: &RequestContext) -> anyhow::Result<()> { info!("loading tenant task"); utils::failpoint_sleep_millis_async!("before-loading-tenant"); @@ -996,7 +1004,7 @@ impl Tenant { // 1. "Timeline has no ancestor and no layer files" for (timeline_id, local_metadata) in sorted_timelines { - self.load_local_timeline(timeline_id, local_metadata) + self.load_local_timeline(timeline_id, local_metadata, ctx) .await .with_context(|| format!("load local timeline {timeline_id}"))?; } @@ -1013,11 +1021,12 @@ impl Tenant { /// Subroutine of `load_tenant`, to load an individual timeline /// /// NB: The parent is assumed to be already loaded! - #[instrument(skip(self, local_metadata), fields(timeline_id=%timeline_id))] + #[instrument(skip(self, local_metadata, ctx), fields(timeline_id=%timeline_id))] async fn load_local_timeline( &self, timeline_id: TimelineId, local_metadata: TimelineMetadata, + ctx: &RequestContext, ) -> anyhow::Result<()> { let ancestor = if let Some(ancestor_timeline_id) = local_metadata.ancestor_timeline() { let ancestor_timeline = self.get_timeline(ancestor_timeline_id, false) @@ -1061,6 +1070,7 @@ impl Tenant { Some(local_metadata), ancestor, false, + ctx, ) .await } @@ -1112,6 +1122,7 @@ impl Tenant { new_timeline_id: TimelineId, initdb_lsn: Lsn, pg_version: u32, + _ctx: &RequestContext, ) -> anyhow::Result { anyhow::ensure!( self.is_active(), @@ -1153,6 +1164,7 @@ impl Tenant { ancestor_timeline_id: Option, mut ancestor_start_lsn: Option, pg_version: u32, + ctx: &RequestContext, ) -> anyhow::Result>> { anyhow::ensure!( self.is_active(), @@ -1190,13 +1202,16 @@ impl Tenant { // decoding the new WAL might need to look up previous pages, relation // sizes etc. and that would get confused if the previous page versions // are not in the repository yet. - ancestor_timeline.wait_lsn(*lsn).await?; + ancestor_timeline.wait_lsn(*lsn, ctx).await?; } - self.branch_timeline(ancestor_timeline_id, new_timeline_id, ancestor_start_lsn) + self.branch_timeline(&ancestor_timeline, new_timeline_id, ancestor_start_lsn, ctx) + .await? + } + None => { + self.bootstrap_timeline(new_timeline_id, pg_version, ctx) .await? } - None => self.bootstrap_timeline(new_timeline_id, pg_version).await?, }; Ok(Some(loaded_timeline)) @@ -1220,30 +1235,25 @@ impl Tenant { target_timeline_id: Option, horizon: u64, pitr: Duration, + ctx: &RequestContext, ) -> anyhow::Result { anyhow::ensure!( self.is_active(), "Cannot run GC iteration on inactive tenant" ); - let timeline_str = target_timeline_id - .map(|x| x.to_string()) - .unwrap_or_else(|| "-".to_string()); + let gc_result = self + .gc_iteration_internal(target_timeline_id, horizon, pitr, ctx) + .await; - { - let _timer = STORAGE_TIME - .with_label_values(&["gc", &self.tenant_id.to_string(), &timeline_str]) - .start_timer(); - self.gc_iteration_internal(target_timeline_id, horizon, pitr) - .await - } + gc_result } /// Perform one compaction iteration. /// This function is periodically called by compactor task. /// Also it can be explicitly requested per timeline through page server /// api's 'compact' command. 
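// Illustrative sketch (hypothetical background loop, not from the diff): after this
// change, periodic maintenance threads a RequestContext down into the tenant, e.g.
// via `compaction_iteration(&ctx)` below; the interval and error handling here are
// illustrative only.
async fn compaction_loop_example(tenant: &Tenant, ctx: &RequestContext) {
    loop {
        if let Err(err) = tenant.compaction_iteration(ctx).await {
            tracing::error!("compaction iteration failed: {err:#}");
        }
        tokio::time::sleep(std::time::Duration::from_secs(20)).await;
    }
}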
- pub async fn compaction_iteration(&self) -> anyhow::Result<()> { + pub async fn compaction_iteration(&self, ctx: &RequestContext) -> anyhow::Result<()> { anyhow::ensure!( self.is_active(), "Cannot run compaction iteration on inactive tenant" @@ -1265,7 +1275,7 @@ impl Tenant { for (timeline_id, timeline) in &timelines_to_compact { timeline - .compact() + .compact(ctx) .instrument(info_span!("compact_timeline", timeline = %timeline_id)) .await?; } @@ -1298,7 +1308,11 @@ impl Tenant { } /// Removes timeline-related in-memory data - pub async fn delete_timeline(&self, timeline_id: TimelineId) -> anyhow::Result<()> { + pub async fn delete_timeline( + &self, + timeline_id: TimelineId, + _ctx: &RequestContext, + ) -> anyhow::Result<()> { // Transition the timeline into TimelineState::Stopping. // This should prevent new operations from starting. let timeline = { @@ -1462,8 +1476,7 @@ impl Tenant { tasks::start_background_loops(self.tenant_id); for timeline in not_broken_timelines { - timeline.set_state(TimelineState::Active); - timeline.launch_wal_receiver(); + timeline.activate(); } } } @@ -1487,7 +1500,7 @@ impl Tenant { .values() .filter(|timeline| timeline.current_state() != TimelineState::Broken); for timeline in not_broken_timelines { - timeline.set_state(TimelineState::Suspended); + timeline.set_state(TimelineState::Stopping); } } TenantState::Broken => { @@ -1717,7 +1730,33 @@ impl Tenant { tenant_id: TenantId, remote_storage: Option, ) -> Tenant { - let (state, _) = watch::channel(state); + let (state, mut rx) = watch::channel(state); + + tokio::spawn(async move { + let current_state = *rx.borrow_and_update(); + let tid = tenant_id.to_string(); + TENANT_STATE_METRIC + .with_label_values(&[&tid, current_state.as_str()]) + .inc(); + loop { + match rx.changed().await { + Ok(()) => { + let new_state = *rx.borrow(); + TENANT_STATE_METRIC + .with_label_values(&[&tid, current_state.as_str()]) + .dec(); + TENANT_STATE_METRIC + .with_label_values(&[&tid, new_state.as_str()]) + .inc(); + } + Err(_sender_dropped_error) => { + info!("Tenant dropped the state updates sender, quitting waiting for tenant state change"); + return; + } + } + } + }); + Tenant { tenant_id, conf, @@ -1776,69 +1815,70 @@ impl Tenant { } pub(super) fn persist_tenant_config( + tenant_id: &TenantId, target_config_path: &Path, tenant_conf: TenantConfOpt, - first_save: bool, + creating_tenant: bool, ) -> anyhow::Result<()> { let _enter = info_span!("saving tenantconf").entered(); - info!("persisting tenantconf to {}", target_config_path.display()); - // TODO this will prepend comments endlessly ? - let mut conf_content = r#"# This file contains a specific per-tenant's config. -# It is read in case of pageserver restart. - -[tenant_config] -"# - .to_string(); - - // Convert the config to a toml file. 
- conf_content += &toml_edit::easy::to_string(&tenant_conf)?; - - let mut target_config_file = VirtualFile::open_with_options( - target_config_path, - OpenOptions::new() - .truncate(true) // This needed for overwriting with small config files - .write(true) - .create_new(first_save), - )?; - - target_config_file - .write(conf_content.as_bytes()) - .context("Failed to write toml bytes into file") - .and_then(|_| { - target_config_file - .sync_all() - .context("Faile to fsync config file") - }) - .with_context(|| { + // imitate a try-block with a closure + let do_persist = |target_config_path: &Path| -> anyhow::Result<()> { + let target_config_parent = target_config_path.parent().with_context(|| { format!( - "Failed to write config file into path '{}'", + "Config path does not have a parent: {}", target_config_path.display() ) })?; - // fsync the parent directory to ensure the directory entry is durable - if first_save { - target_config_path - .parent() - .context("Config file does not have a parent") - .and_then(|target_config_parent| { - File::open(target_config_parent).context("Failed to open config parent") - }) - .and_then(|tenant_dir| { - tenant_dir - .sync_all() - .context("Failed to fsync config parent") - }) - .with_context(|| { - format!( - "Failed to fsync on first save for config {}", - target_config_path.display() - ) - })?; - } + info!("persisting tenantconf to {}", target_config_path.display()); - Ok(()) + let mut conf_content = r#"# This file contains a specific per-tenant's config. +# It is read in case of pageserver restart. + +[tenant_config] +"# + .to_string(); + + // Convert the config to a toml file. + conf_content += &toml_edit::easy::to_string(&tenant_conf)?; + + let mut target_config_file = VirtualFile::open_with_options( + target_config_path, + OpenOptions::new() + .truncate(true) // This needed for overwriting with small config files + .write(true) + .create_new(creating_tenant) + // when creating a new tenant, first_save will be true and `.create(true)` will be + // ignored (per rust std docs). + // + // later when updating the config of created tenant, or persisting config for the + // first time for attached tenant, the `.create(true)` is used. + .create(true), + )?; + + target_config_file + .write(conf_content.as_bytes()) + .context("write toml bytes into file") + .and_then(|_| target_config_file.sync_all().context("fsync config file")) + .context("write config file")?; + + // fsync the parent directory to ensure the directory entry is durable. + // before this was done conditionally on creating_tenant, but these management actions are rare + // enough to just fsync it always. + + crashsafe::fsync(target_config_parent)?; + Ok(()) + }; + + // this function is called from creating the tenant and updating the tenant config, which + // would otherwise share this context, so keep it here in one place. 
+ do_persist(target_config_path).with_context(|| { + format!( + "write tenant {tenant_id} config to {}", + target_config_path.display() + ) + }) } // @@ -1871,12 +1911,13 @@ impl Tenant { target_timeline_id: Option, horizon: u64, pitr: Duration, + ctx: &RequestContext, ) -> anyhow::Result { let mut totals: GcResult = Default::default(); let now = Instant::now(); let gc_timelines = self - .refresh_gc_info_internal(target_timeline_id, horizon, pitr) + .refresh_gc_info_internal(target_timeline_id, horizon, pitr, ctx) .await?; utils::failpoint_sleep_millis_async!("gc_iteration_internal_after_getting_gc_timelines"); @@ -1917,7 +1958,10 @@ impl Tenant { /// [`Tenant::get_gc_horizon`]. /// /// This is usually executed as part of periodic gc, but can now be triggered more often. - pub async fn refresh_gc_info(&self) -> anyhow::Result>> { + pub async fn refresh_gc_info( + &self, + ctx: &RequestContext, + ) -> anyhow::Result>> { // since this method can now be called at different rates than the configured gc loop, it // might be that these configuration values get applied faster than what it was previously, // since these were only read from the gc task. @@ -1927,7 +1971,7 @@ impl Tenant { // refresh all timelines let target_timeline_id = None; - self.refresh_gc_info_internal(target_timeline_id, horizon, pitr) + self.refresh_gc_info_internal(target_timeline_id, horizon, pitr, ctx) .await } @@ -1936,6 +1980,7 @@ impl Tenant { target_timeline_id: Option, horizon: u64, pitr: Duration, + ctx: &RequestContext, ) -> anyhow::Result>> { // grab mutex to prevent new timelines from being created here. let gc_cs = self.gc_cs.lock().await; @@ -2007,7 +2052,9 @@ impl Tenant { )) .map(|&x| x.1) .collect(); - timeline.update_gc_info(branchpoints, cutoff, pitr).await?; + timeline + .update_gc_info(branchpoints, cutoff, pitr, ctx) + .await?; gc_timelines.push(timeline); } @@ -2019,53 +2066,53 @@ impl Tenant { /// Branch an existing timeline async fn branch_timeline( &self, - src: TimelineId, - dst: TimelineId, + src_timeline: &Arc, + dst_id: TimelineId, start_lsn: Option, + _ctx: &RequestContext, ) -> anyhow::Result> { - // We need to hold this lock to prevent GC from starting at the same time. GC scans the directory to learn - // about timelines, so otherwise a race condition is possible, where we create new timeline and GC - // concurrently removes data that is needed by the new timeline. - let _gc_cs = self.gc_cs.lock().await; - let timeline_uninit_mark = { - let timelines = self.timelines.lock().unwrap(); - self.create_timeline_uninit_mark(dst, &timelines)? - }; - - // In order for the branch creation task to not wait for GC/compaction, - // we need to make sure that the starting LSN of the child branch is not out of scope midway by - // - // 1. holding the GC lock to prevent overwritting timeline's GC data - // 2. checking both the latest GC cutoff LSN and latest GC info of the source timeline - // - // Step 2 is to avoid initializing the new branch using data removed by past GC iterations - // or in-queue GC iterations. 
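// Most of the signature changes above are mechanical plumbing: a RequestContext
// created at a task's entry point is passed by reference down every call that
// may read timeline data. A sketch of the pattern from a background loop's
// point of view; the TaskKind::Compaction / DownloadBehavior::Download variants
// are assumptions for illustration (only UnitTest / Error appear verbatim in
// this patch).
async fn compaction_tick(tenant: &Tenant) -> anyhow::Result<()> {
    let ctx = RequestContext::new(TaskKind::Compaction, DownloadBehavior::Download);
    // The same &ctx flows through compaction, GC refresh, size calculation, ...
    tenant.compaction_iteration(&ctx).await
}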
- - let src_timeline = self.get_timeline(src, false).with_context(|| { - format!( - "No ancestor {} found for timeline {}/{}", - src, self.tenant_id, dst - ) - })?; - - let latest_gc_cutoff_lsn = src_timeline.get_latest_gc_cutoff_lsn(); + let src_id = src_timeline.timeline_id; // If no start LSN is specified, we branch the new timeline from the source timeline's last record LSN let start_lsn = start_lsn.unwrap_or_else(|| { let lsn = src_timeline.get_last_record_lsn(); - info!("branching timeline {dst} from timeline {src} at last record LSN: {lsn}"); + info!("branching timeline {dst_id} from timeline {src_id} at last record LSN: {lsn}"); lsn }); - // Check if the starting LSN is out of scope because it is less than - // 1. the latest GC cutoff LSN or - // 2. the planned GC cutoff LSN, which is from an in-queue GC iteration. + // First acquire the GC lock so that another task cannot advance the GC + // cutoff in 'gc_info', and make 'start_lsn' invalid, while we are + // creating the branch. + let _gc_cs = self.gc_cs.lock().await; + + // Create a placeholder for the new branch. This will error + // out if the new timeline ID is already in use. + let timeline_uninit_mark = { + let timelines = self.timelines.lock().unwrap(); + self.create_timeline_uninit_mark(dst_id, &timelines)? + }; + + // Ensure that `start_lsn` is valid, i.e. the LSN is within the PITR + // horizon on the source timeline + // + // We check it against both the planned GC cutoff stored in 'gc_info', + // and the 'latest_gc_cutoff' of the last GC that was performed. The + // planned GC cutoff in 'gc_info' is normally larger than + // 'latest_gc_cutoff_lsn', but beware of corner cases like if you just + // changed the GC settings for the tenant to make the PITR window + // larger, but some of the data was already removed by an earlier GC + // iteration. + + // check against last actual 'latest_gc_cutoff' first + let latest_gc_cutoff_lsn = src_timeline.get_latest_gc_cutoff_lsn(); src_timeline .check_lsn_is_in_scope(start_lsn, &latest_gc_cutoff_lsn) .context(format!( "invalid branch start lsn: less than latest GC cutoff {}", *latest_gc_cutoff_lsn, ))?; + + // and then the planned GC cutoff { let gc_info = src_timeline.gc_info.read().unwrap(); let cutoff = min(gc_info.pitr_cutoff, gc_info.horizon_cutoff); @@ -2076,6 +2123,12 @@ impl Tenant { } } + // + // The branch point is valid, and we are still holding the 'gc_cs' lock + // so that GC cannot advance the GC cutoff until we are finished. + // Proceed with the branch creation. + // + // Determine prev-LSN for the new timeline. We can only determine it if // the timeline was branched at the current end of the source timeline. let RecordLsn { @@ -2094,7 +2147,7 @@ impl Tenant { let metadata = TimelineMetadata::new( start_lsn, dst_prev, - Some(src), + Some(src_id), start_lsn, *src_timeline.latest_gc_cutoff_lsn.read(), // FIXME: should we hold onto this guard longer? src_timeline.initdb_lsn, @@ -2103,15 +2156,15 @@ impl Tenant { let mut timelines = self.timelines.lock().unwrap(); let new_timeline = self .prepare_timeline( - dst, + dst_id, metadata, timeline_uninit_mark, false, - Some(src_timeline), + Some(Arc::clone(src_timeline)), )? 
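// The reordered checks above reduce to: take the GC lock, then require the
// requested branch point to be at or above both the last applied GC cutoff and
// the planned cutoff (the min of the PITR and horizon cutoffs in gc_info).
// A hedged sketch of that rule with plain u64 LSNs; the field names mirror the
// diff, the types are simplified:
fn validate_branch_point(
    start_lsn: u64,
    latest_gc_cutoff: u64,
    pitr_cutoff: u64,
    horizon_cutoff: u64,
) -> anyhow::Result<()> {
    anyhow::ensure!(
        start_lsn >= latest_gc_cutoff,
        "invalid branch start lsn: less than latest GC cutoff {latest_gc_cutoff}"
    );

    // The planned cutoff is normally ahead of the applied one, but not always,
    // e.g. right after the tenant's PITR window was enlarged.
    let planned_cutoff = std::cmp::min(pitr_cutoff, horizon_cutoff);
    anyhow::ensure!(
        start_lsn >= planned_cutoff,
        "invalid branch start lsn: less than planned GC cutoff {planned_cutoff}"
    );
    Ok(())
}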
.initialize_with_lock(&mut timelines, true, true)?; drop(timelines); - info!("branched timeline {dst} from {src} at {start_lsn}"); + info!("branched timeline {dst_id} from {src_id} at {start_lsn}"); Ok(new_timeline) } @@ -2122,6 +2175,7 @@ impl Tenant { &self, timeline_id: TimelineId, pg_version: u32, + ctx: &RequestContext, ) -> anyhow::Result> { let timeline_uninit_mark = { let timelines = self.timelines.lock().unwrap(); @@ -2181,6 +2235,7 @@ impl Tenant { unfinished_timeline, pgdata_path, pgdata_lsn, + ctx, ) .await .with_context(|| { @@ -2352,7 +2407,10 @@ impl Tenant { /// /// Future is cancellation safe. Only one calculation can be running at once per tenant. #[instrument(skip_all, fields(tenant_id=%self.tenant_id))] - pub async fn gather_size_inputs(&self) -> anyhow::Result { + pub async fn gather_size_inputs( + &self, + ctx: &RequestContext, + ) -> anyhow::Result { let logical_sizes_at_once = self .conf .concurrent_tenant_size_logical_size_queries @@ -2364,15 +2422,15 @@ impl Tenant { // See more for on the issue #2748 condenced out of the initial PR review. let mut shared_cache = self.cached_logical_sizes.lock().await; - size::gather_inputs(self, logical_sizes_at_once, &mut shared_cache).await + size::gather_inputs(self, logical_sizes_at_once, &mut shared_cache, ctx).await } /// Calculate synthetic tenant size /// This is periodically called by background worker. /// result is cached in tenant struct #[instrument(skip_all, fields(tenant_id=%self.tenant_id))] - pub async fn calculate_synthetic_size(&self) -> anyhow::Result { - let inputs = self.gather_size_inputs().await?; + pub async fn calculate_synthetic_size(&self, ctx: &RequestContext) -> anyhow::Result { + let inputs = self.gather_size_inputs(ctx).await?; let size = inputs.calculate()?; @@ -2475,26 +2533,19 @@ fn try_create_target_tenant_dir( target_tenant_directory, temporary_tenant_dir, ) - .with_context(|| format!("Failed to resolve tenant {tenant_id} temporary timelines dir"))?; + .with_context(|| format!("resolve tenant {tenant_id} temporary timelines dir"))?; let temporary_tenant_config_path = rebase_directory( &conf.tenant_config_path(tenant_id), target_tenant_directory, temporary_tenant_dir, ) - .with_context(|| format!("Failed to resolve tenant {tenant_id} temporary config path"))?; + .with_context(|| format!("resolve tenant {tenant_id} temporary config path"))?; + + Tenant::persist_tenant_config(&tenant_id, &temporary_tenant_config_path, tenant_conf, true)?; - Tenant::persist_tenant_config(&temporary_tenant_config_path, tenant_conf, true).with_context( - || { - format!( - "Failed to write tenant {} config to {}", - tenant_id, - temporary_tenant_config_path.display() - ) - }, - )?; crashsafe::create_dir(&temporary_tenant_timelines_dir).with_context(|| { format!( - "could not create tenant {} temporary timelines directory {}", + "create tenant {} temporary timelines directory {}", tenant_id, temporary_tenant_timelines_dir.display() ) @@ -2505,7 +2556,7 @@ fn try_create_target_tenant_dir( fs::rename(temporary_tenant_dir, target_tenant_directory).with_context(|| { format!( - "failed to move tenant {} temporary directory {} into the permanent one {}", + "move tenant {} temporary directory {} into the permanent one {}", tenant_id, temporary_tenant_dir.display(), target_tenant_directory.display() @@ -2513,14 +2564,14 @@ fn try_create_target_tenant_dir( })?; let target_dir_parent = target_tenant_directory.parent().with_context(|| { format!( - "Failed to get tenant {} dir parent for {}", + "get tenant {} dir parent for {}", 
tenant_id, target_tenant_directory.display() ) })?; crashsafe::fsync(target_dir_parent).with_context(|| { format!( - "Failed to fsync renamed directory's parent {} for tenant {}", + "fsync renamed directory's parent {} for tenant {}", target_dir_parent.display(), tenant_id, ) @@ -2743,11 +2794,17 @@ pub mod harness { }) } - pub async fn load(&self) -> Arc { - self.try_load().await.expect("failed to load test tenant") + pub async fn load(&self) -> (Arc, RequestContext) { + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); + ( + self.try_load(&ctx) + .await + .expect("failed to load test tenant"), + ctx, + ) } - pub async fn try_load(&self) -> anyhow::Result> { + pub async fn try_load(&self, ctx: &RequestContext) -> anyhow::Result> { let walredo_mgr = Arc::new(TestRedoManager); let tenant = Arc::new(Tenant::new( @@ -2775,8 +2832,7 @@ pub mod harness { timelines_to_load.insert(timeline_id, timeline_metadata); } // FIXME starts background jobs - tenant.load().await?; - + tenant.load(ctx).await?; Ok(tenant) } @@ -2833,10 +2889,9 @@ mod tests { #[tokio::test] async fn test_basic() -> anyhow::Result<()> { - let tenant = TenantHarness::create("test_basic")?.load().await; - let tline = tenant - .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? - .initialize()?; + let (tenant, ctx) = TenantHarness::create("test_basic")?.load().await; + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; + let tline = tline.initialize(&ctx)?; let writer = tline.writer(); writer.put(*TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10")))?; @@ -2849,15 +2904,15 @@ mod tests { drop(writer); assert_eq!( - tline.get(*TEST_KEY, Lsn(0x10)).await?, + tline.get(*TEST_KEY, Lsn(0x10), &ctx).await?, TEST_IMG("foo at 0x10") ); assert_eq!( - tline.get(*TEST_KEY, Lsn(0x1f)).await?, + tline.get(*TEST_KEY, Lsn(0x1f), &ctx).await?, TEST_IMG("foo at 0x10") ); assert_eq!( - tline.get(*TEST_KEY, Lsn(0x20)).await?, + tline.get(*TEST_KEY, Lsn(0x20), &ctx).await?, TEST_IMG("foo at 0x20") ); @@ -2866,14 +2921,14 @@ mod tests { #[tokio::test] async fn no_duplicate_timelines() -> anyhow::Result<()> { - let tenant = TenantHarness::create("no_duplicate_timelines")? + let (tenant, ctx) = TenantHarness::create("no_duplicate_timelines")? .load() .await; - let _ = tenant - .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? - .initialize()?; + let timeline = + tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; + let _ = timeline.initialize(&ctx)?; - match tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION) { + match tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx) { Ok(_) => panic!("duplicate timeline creation should fail"), Err(e) => assert_eq!( e.to_string(), @@ -2899,13 +2954,13 @@ mod tests { /// #[tokio::test] async fn test_branch() -> anyhow::Result<()> { - let tenant = TenantHarness::create("test_branch")?.load().await; - let tline = tenant - .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? 
- .initialize()?; - let writer = tline.writer(); use std::str::from_utf8; + let (tenant, ctx) = TenantHarness::create("test_branch")?.load().await; + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; + let tline = tline.initialize(&ctx)?; + let writer = tline.writer(); + #[allow(non_snake_case)] let TEST_KEY_A: Key = Key::from_hex("112222222233333333444444445500000001").unwrap(); #[allow(non_snake_case)] @@ -2925,7 +2980,7 @@ mod tests { // Branch the history, modify relation differently on the new timeline tenant - .branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x30))) + .branch_timeline(&tline, NEW_TIMELINE_ID, Some(Lsn(0x30)), &ctx) .await?; let newtline = tenant .get_timeline(NEW_TIMELINE_ID, true) @@ -2936,15 +2991,15 @@ mod tests { // Check page contents on both branches assert_eq!( - from_utf8(&tline.get(TEST_KEY_A, Lsn(0x40)).await?)?, + from_utf8(&tline.get(TEST_KEY_A, Lsn(0x40), &ctx).await?)?, "foo at 0x40" ); assert_eq!( - from_utf8(&newtline.get(TEST_KEY_A, Lsn(0x40)).await?)?, + from_utf8(&newtline.get(TEST_KEY_A, Lsn(0x40), &ctx).await?)?, "bar at 0x40" ); assert_eq!( - from_utf8(&newtline.get(TEST_KEY_B, Lsn(0x40)).await?)?, + from_utf8(&newtline.get(TEST_KEY_B, Lsn(0x40), &ctx).await?)?, "foobar at 0x20" ); @@ -2996,13 +3051,12 @@ mod tests { #[tokio::test] async fn test_prohibit_branch_creation_on_garbage_collected_data() -> anyhow::Result<()> { - let tenant = + let (tenant, ctx) = TenantHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")? .load() .await; - let tline = tenant - .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? - .initialize()?; + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; + let tline = tline.initialize(&ctx)?; make_some_layers(tline.as_ref(), Lsn(0x20)).await?; // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50 @@ -3010,12 +3064,12 @@ mod tests { // and compaction works. But it does set the 'cutoff' point so that the cross check // below should fail. tenant - .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO) + .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, &ctx) .await?; // try to branch at lsn 25, should fail because we already garbage collected the data match tenant - .branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25))) + .branch_timeline(&tline, NEW_TIMELINE_ID, Some(Lsn(0x25)), &ctx) .await { Ok(_) => panic!("branching should have failed"), @@ -3034,16 +3088,17 @@ mod tests { #[tokio::test] async fn test_prohibit_branch_creation_on_pre_initdb_lsn() -> anyhow::Result<()> { - let tenant = TenantHarness::create("test_prohibit_branch_creation_on_pre_initdb_lsn")? - .load() - .await; + let (tenant, ctx) = + TenantHarness::create("test_prohibit_branch_creation_on_pre_initdb_lsn")? + .load() + .await; - tenant - .create_empty_timeline(TIMELINE_ID, Lsn(0x50), DEFAULT_PG_VERSION)? - .initialize()?; + let tline = tenant + .create_empty_timeline(TIMELINE_ID, Lsn(0x50), DEFAULT_PG_VERSION, &ctx)? 
+ .initialize(&ctx)?; // try to branch at lsn 0x25, should fail because initdb lsn is 0x50 match tenant - .branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25))) + .branch_timeline(&tline, NEW_TIMELINE_ID, Some(Lsn(0x25)), &ctx) .await { Ok(_) => panic!("branching should have failed"), @@ -3085,40 +3140,40 @@ mod tests { #[tokio::test] async fn test_retain_data_in_parent_which_is_needed_for_child() -> anyhow::Result<()> { - let tenant = TenantHarness::create("test_retain_data_in_parent_which_is_needed_for_child")? - .load() - .await; - let tline = tenant - .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? - .initialize()?; + let (tenant, ctx) = + TenantHarness::create("test_retain_data_in_parent_which_is_needed_for_child")? + .load() + .await; + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; + let tline = tline.initialize(&ctx)?; make_some_layers(tline.as_ref(), Lsn(0x20)).await?; tenant - .branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40))) + .branch_timeline(&tline, NEW_TIMELINE_ID, Some(Lsn(0x40)), &ctx) .await?; let newtline = tenant .get_timeline(NEW_TIMELINE_ID, true) .expect("Should have a local timeline"); // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50 tenant - .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO) + .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, &ctx) .await?; - assert!(newtline.get(*TEST_KEY, Lsn(0x25)).await.is_ok()); + assert!(newtline.get(*TEST_KEY, Lsn(0x25), &ctx).await.is_ok()); Ok(()) } #[tokio::test] async fn test_parent_keeps_data_forever_after_branching() -> anyhow::Result<()> { - let tenant = TenantHarness::create("test_parent_keeps_data_forever_after_branching")? - .load() - .await; - let tline = tenant - .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? - .initialize()?; + let (tenant, ctx) = + TenantHarness::create("test_parent_keeps_data_forever_after_branching")? + .load() + .await; + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; + let tline = tline.initialize(&ctx)?; make_some_layers(tline.as_ref(), Lsn(0x20)).await?; tenant - .branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40))) + .branch_timeline(&tline, NEW_TIMELINE_ID, Some(Lsn(0x40)), &ctx) .await?; let newtline = tenant .get_timeline(NEW_TIMELINE_ID, true) @@ -3128,12 +3183,12 @@ mod tests { // run gc on parent tenant - .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO) + .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, &ctx) .await?; // Check that the data is still accessible on the branch. assert_eq!( - newtline.get(*TEST_KEY, Lsn(0x50)).await?, + newtline.get(*TEST_KEY, Lsn(0x50), &ctx).await?, TEST_IMG(&format!("foo at {}", Lsn(0x40))) ); @@ -3145,14 +3200,14 @@ mod tests { const TEST_NAME: &str = "timeline_load"; let harness = TenantHarness::create(TEST_NAME)?; { - let tenant = harness.load().await; - let tline = tenant - .create_empty_timeline(TIMELINE_ID, Lsn(0x8000), DEFAULT_PG_VERSION)? 
- .initialize()?; + let (tenant, ctx) = harness.load().await; + let tline = + tenant.create_empty_timeline(TIMELINE_ID, Lsn(0x8000), DEFAULT_PG_VERSION, &ctx)?; + let tline = tline.initialize(&ctx)?; make_some_layers(tline.as_ref(), Lsn(0x8000)).await?; } - let tenant = harness.load().await; + let (tenant, _ctx) = harness.load().await; tenant .get_timeline(TIMELINE_ID, true) .expect("cannot load timeline"); @@ -3166,15 +3221,15 @@ mod tests { let harness = TenantHarness::create(TEST_NAME)?; // create two timelines { - let tenant = harness.load().await; - let tline = tenant - .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? - .initialize()?; + let (tenant, ctx) = harness.load().await; + let tline = + tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; + let tline = tline.initialize(&ctx)?; make_some_layers(tline.as_ref(), Lsn(0x20)).await?; tenant - .branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40))) + .branch_timeline(&tline, NEW_TIMELINE_ID, Some(Lsn(0x40)), &ctx) .await?; let newtline = tenant @@ -3185,7 +3240,7 @@ mod tests { } // check that both of them are initially unloaded - let tenant = harness.load().await; + let (tenant, _ctx) = harness.load().await; // check that both, child and ancestor are loaded let _child_tline = tenant @@ -3203,11 +3258,11 @@ mod tests { async fn corrupt_metadata() -> anyhow::Result<()> { const TEST_NAME: &str = "corrupt_metadata"; let harness = TenantHarness::create(TEST_NAME)?; - let tenant = harness.load().await; + let (tenant, ctx) = harness.load().await; tenant - .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? - .initialize()?; + .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)? + .initialize(&ctx)?; drop(tenant); let metadata_path = harness.timeline_path(&TIMELINE_ID).join(METADATA_FILE_NAME); @@ -3219,7 +3274,7 @@ mod tests { metadata_bytes[8] ^= 1; std::fs::write(metadata_path, metadata_bytes)?; - let err = harness.try_load().await.err().expect("should fail"); + let err = harness.try_load(&ctx).await.err().expect("should fail"); assert!(err .to_string() .starts_with("Failed to parse metadata bytes from path")); @@ -3243,10 +3298,9 @@ mod tests { #[tokio::test] async fn test_images() -> anyhow::Result<()> { - let tenant = TenantHarness::create("test_images")?.load().await; - let tline = tenant - .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? 
- .initialize()?; + let (tenant, ctx) = TenantHarness::create("test_images")?.load().await; + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; + let tline = tline.initialize(&ctx)?; let writer = tline.writer(); writer.put(*TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10")))?; @@ -3254,7 +3308,7 @@ mod tests { drop(writer); tline.freeze_and_flush().await?; - tline.compact().await?; + tline.compact(&ctx).await?; let writer = tline.writer(); writer.put(*TEST_KEY, Lsn(0x20), &Value::Image(TEST_IMG("foo at 0x20")))?; @@ -3262,7 +3316,7 @@ mod tests { drop(writer); tline.freeze_and_flush().await?; - tline.compact().await?; + tline.compact(&ctx).await?; let writer = tline.writer(); writer.put(*TEST_KEY, Lsn(0x30), &Value::Image(TEST_IMG("foo at 0x30")))?; @@ -3270,7 +3324,7 @@ mod tests { drop(writer); tline.freeze_and_flush().await?; - tline.compact().await?; + tline.compact(&ctx).await?; let writer = tline.writer(); writer.put(*TEST_KEY, Lsn(0x40), &Value::Image(TEST_IMG("foo at 0x40")))?; @@ -3278,26 +3332,26 @@ mod tests { drop(writer); tline.freeze_and_flush().await?; - tline.compact().await?; + tline.compact(&ctx).await?; assert_eq!( - tline.get(*TEST_KEY, Lsn(0x10)).await?, + tline.get(*TEST_KEY, Lsn(0x10), &ctx).await?, TEST_IMG("foo at 0x10") ); assert_eq!( - tline.get(*TEST_KEY, Lsn(0x1f)).await?, + tline.get(*TEST_KEY, Lsn(0x1f), &ctx).await?, TEST_IMG("foo at 0x10") ); assert_eq!( - tline.get(*TEST_KEY, Lsn(0x20)).await?, + tline.get(*TEST_KEY, Lsn(0x20), &ctx).await?, TEST_IMG("foo at 0x20") ); assert_eq!( - tline.get(*TEST_KEY, Lsn(0x30)).await?, + tline.get(*TEST_KEY, Lsn(0x30), &ctx).await?, TEST_IMG("foo at 0x30") ); assert_eq!( - tline.get(*TEST_KEY, Lsn(0x40)).await?, + tline.get(*TEST_KEY, Lsn(0x40), &ctx).await?, TEST_IMG("foo at 0x40") ); @@ -3310,10 +3364,9 @@ mod tests { // #[tokio::test] async fn test_bulk_insert() -> anyhow::Result<()> { - let tenant = TenantHarness::create("test_bulk_insert")?.load().await; - let tline = tenant - .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? - .initialize()?; + let (tenant, ctx) = TenantHarness::create("test_bulk_insert")?.load().await; + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; + let tline = tline.initialize(&ctx)?; let mut lsn = Lsn(0x10); @@ -3342,10 +3395,10 @@ mod tests { let cutoff = tline.get_last_record_lsn(); tline - .update_gc_info(Vec::new(), cutoff, Duration::ZERO) + .update_gc_info(Vec::new(), cutoff, Duration::ZERO, &ctx) .await?; tline.freeze_and_flush().await?; - tline.compact().await?; + tline.compact(&ctx).await?; tline.gc().await?; } @@ -3354,10 +3407,9 @@ mod tests { #[tokio::test] async fn test_random_updates() -> anyhow::Result<()> { - let tenant = TenantHarness::create("test_random_updates")?.load().await; - let tline = tenant - .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? 
- .initialize()?; + let (tenant, ctx) = TenantHarness::create("test_random_updates")?.load().await; + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; + let tline = tline.initialize(&ctx)?; const NUM_KEYS: usize = 1000; @@ -3407,7 +3459,7 @@ mod tests { for (blknum, last_lsn) in updated.iter().enumerate() { test_key.field6 = blknum as u32; assert_eq!( - tline.get(test_key, lsn).await?, + tline.get(test_key, lsn, &ctx).await?, TEST_IMG(&format!("{} at {}", blknum, last_lsn)) ); } @@ -3415,10 +3467,10 @@ mod tests { // Perform a cycle of flush, compact, and GC let cutoff = tline.get_last_record_lsn(); tline - .update_gc_info(Vec::new(), cutoff, Duration::ZERO) + .update_gc_info(Vec::new(), cutoff, Duration::ZERO, &ctx) .await?; tline.freeze_and_flush().await?; - tline.compact().await?; + tline.compact(&ctx).await?; tline.gc().await?; } @@ -3427,12 +3479,12 @@ mod tests { #[tokio::test] async fn test_traverse_branches() -> anyhow::Result<()> { - let tenant = TenantHarness::create("test_traverse_branches")? + let (tenant, ctx) = TenantHarness::create("test_traverse_branches")? .load() .await; let mut tline = tenant - .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? - .initialize()?; + .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)? + .initialize(&ctx)?; const NUM_KEYS: usize = 1000; @@ -3462,16 +3514,14 @@ mod tests { keyspace.add_key(test_key); } - let mut tline_id = TIMELINE_ID; for _ in 0..50 { let new_tline_id = TimelineId::generate(); tenant - .branch_timeline(tline_id, new_tline_id, Some(lsn)) + .branch_timeline(&tline, new_tline_id, Some(lsn), &ctx) .await?; tline = tenant .get_timeline(new_tline_id, true) .expect("Should have the branched timeline"); - tline_id = new_tline_id; for _ in 0..NUM_KEYS { lsn = Lsn(lsn.0 + 0x10); @@ -3493,7 +3543,7 @@ mod tests { for (blknum, last_lsn) in updated.iter().enumerate() { test_key.field6 = blknum as u32; assert_eq!( - tline.get(test_key, lsn).await?, + tline.get(test_key, lsn, &ctx).await?, TEST_IMG(&format!("{} at {}", blknum, last_lsn)) ); } @@ -3501,10 +3551,10 @@ mod tests { // Perform a cycle of flush, compact, and GC let cutoff = tline.get_last_record_lsn(); tline - .update_gc_info(Vec::new(), cutoff, Duration::ZERO) + .update_gc_info(Vec::new(), cutoff, Duration::ZERO, &ctx) .await?; tline.freeze_and_flush().await?; - tline.compact().await?; + tline.compact(&ctx).await?; tline.gc().await?; } @@ -3513,12 +3563,12 @@ mod tests { #[tokio::test] async fn test_traverse_ancestors() -> anyhow::Result<()> { - let tenant = TenantHarness::create("test_traverse_ancestors")? + let (tenant, ctx) = TenantHarness::create("test_traverse_ancestors")? .load() .await; let mut tline = tenant - .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? - .initialize()?; + .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)? 
+ .initialize(&ctx)?; const NUM_KEYS: usize = 100; const NUM_TLINES: usize = 50; @@ -3528,18 +3578,16 @@ mod tests { let mut updated = [[Lsn(0); NUM_KEYS]; NUM_TLINES]; let mut lsn = Lsn(0); - let mut tline_id = TIMELINE_ID; #[allow(clippy::needless_range_loop)] for idx in 0..NUM_TLINES { let new_tline_id = TimelineId::generate(); tenant - .branch_timeline(tline_id, new_tline_id, Some(lsn)) + .branch_timeline(&tline, new_tline_id, Some(lsn), &ctx) .await?; tline = tenant .get_timeline(new_tline_id, true) .expect("Should have the branched timeline"); - tline_id = new_tline_id; for _ in 0..NUM_KEYS { lsn = Lsn(lsn.0 + 0x10); @@ -3568,7 +3616,7 @@ mod tests { println!("checking [{idx}][{blknum}] at {lsn}"); test_key.field6 = blknum as u32; assert_eq!( - tline.get(test_key, *lsn).await?, + tline.get(test_key, *lsn, &ctx).await?, TEST_IMG(&format!("{idx} {blknum} at {lsn}")) ); } diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index c95a98fbc7..e66ee0ae36 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -28,7 +28,12 @@ pub mod defaults { pub const DEFAULT_COMPACTION_THRESHOLD: usize = 10; pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024; - pub const DEFAULT_GC_PERIOD: &str = "100 s"; + + // Large DEFAULT_GC_PERIOD is fine as long as PITR_INTERVAL is larger. + // If there's a need to decrease this value, first make sure that GC + // doesn't hold a layer map write lock for non-trivial operations. + // Relevant: https://github.com/neondatabase/neon/issues/3394 + pub const DEFAULT_GC_PERIOD: &str = "1 hr"; pub const DEFAULT_IMAGE_CREATION_THRESHOLD: usize = 3; pub const DEFAULT_PITR_INTERVAL: &str = "7 days"; pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "2 seconds"; diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index 01c5359e88..ed1a32c8fd 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -9,24 +9,57 @@ //! are frozen, and it is split up into new image and delta layers and the //! corresponding files are written to disk. //! +//! Design overview: +//! +//! The `search` method of the layer map is on the read critical path, so we've +//! built an efficient data structure for fast reads, stored in `LayerMap::historic`. +//! Other read methods are less critical but still impact performance of background tasks. +//! +//! This data structure relies on a persistent/immutable binary search tree. See the +//! following lecture for an introduction https://www.youtube.com/watch?v=WqCWghETNDc&t=581s +//! Summary: A persistent/immutable BST (and persistent data structures in general) allows +//! you to modify the tree in such a way that each modification creates a new "version" +//! of the tree. When you modify it, you get a new version, but all previous versions are +//! still accessible too. So if someone is still holding a reference to an older version, +//! they continue to see the tree as it was then. The persistent BST stores all the +//! different versions in an efficient way. +//! +//! Our persistent BST maintains a map of which layer file "covers" each key. It has only +//! one dimension, the key. See `layer_coverage.rs`. We use the persistent/immutable property +//! to handle the LSN dimension. +//! +//! To build the layer map, we insert each layer to the persistent BST in LSN.start order, +//! starting from the oldest one. After each insertion, we grab a reference to that "version" +//! 
of the tree, and store it in another tree, a BtreeMap keyed by the LSN. See +//! `historic_layer_coverage.rs`. +//! +//! To search for a particular key-LSN pair, you first look up the right "version" in the +//! BTreeMap. Then you search that version of the BST with the key. +//! +//! The persistent BST keeps all the versions, but there is no way to change the old versions +//! afterwards. We can add layers as long as they have larger LSNs than any previous layer in +//! the map, but if we need to remove a layer, or insert anything with an older LSN, we need +//! to throw away most of the persistent BST and build a new one, starting from the oldest +//! LSN. See `LayerMap::flush_updates()`. +//! +mod historic_layer_coverage; +mod layer_coverage; + +use crate::keyspace::KeyPartitioning; use crate::metrics::NUM_ONDISK_LAYERS; use crate::repository::Key; -use crate::tenant::storage_layer::{range_eq, range_overlaps}; -use amplify_num::i256; +use crate::tenant::storage_layer::InMemoryLayer; +use crate::tenant::storage_layer::Layer; use anyhow::Result; -use num_traits::identities::{One, Zero}; -use num_traits::{Bounded, Num, Signed}; -use rstar::{RTree, RTreeObject, AABB}; -use std::cmp::Ordering; use std::collections::VecDeque; use std::ops::Range; -use std::ops::{Add, Div, Mul, Neg, Rem, Sub}; use std::sync::Arc; -use tracing::*; use utils::lsn::Lsn; -use super::storage_layer::{InMemoryLayer, Layer}; +use historic_layer_coverage::BufferedHistoricLayerCoverage; + +use super::storage_layer::range_eq; /// /// LayerMap tracks what layers exist on a timeline. @@ -51,8 +84,8 @@ pub struct LayerMap { /// pub frozen_layers: VecDeque>, - /// All the historic layers are kept here - historic_layers: RTree>, + /// Index of the historic layers optimized for search + historic: BufferedHistoricLayerCoverage>, /// L0 layers have key range Key::MIN..Key::MAX, and locating them using R-Tree search is very inefficient. /// So L0 layers are held in l0_delta_layers vector, in addition to the R-tree. @@ -65,177 +98,64 @@ impl Default for LayerMap { open_layer: None, next_open_layer_at: None, frozen_layers: VecDeque::default(), - historic_layers: RTree::default(), l0_delta_layers: Vec::default(), + historic: BufferedHistoricLayerCoverage::default(), } } } -struct LayerRTreeObject { - layer: Arc, - - envelope: AABB<[IntKey; 2]>, +/// The primary update API for the layer map. +/// +/// Batching historic layer insertions and removals is good for +/// performance and this struct helps us do that correctly. +#[must_use] +pub struct BatchedUpdates<'a, L: ?Sized + Layer> { + // While we hold this exclusive reference to the layer map the type checker + // will prevent us from accidentally reading any unflushed updates. + layer_map: &'a mut LayerMap, } -// Representation of Key as numeric type. -// We can not use native implementation of i128, because rstar::RTree -// doesn't handle properly integer overflow during area calculation: sum(Xi*Yi). -// Overflow will cause panic in debug mode and incorrect area calculation in release mode, -// which leads to non-optimally balanced R-Tree (but doesn't fit correctness of R-Tree work). -// By using i256 as the type, even though all the actual values would fit in i128, we can be -// sure that multiplication doesn't overflow. 
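// A toy model of the versioning scheme described in the module docs above:
// layers are inserted in lsn.start order, and after each insertion we remember
// a snapshot of the key coverage, keyed by that LSN. A lookup first picks the
// newest snapshot at or below the requested LSN, then queries it by key.
// Here a snapshot is a plain BTreeMap clone of "coverage change points"; the
// real code uses a persistent tree so taking and keeping snapshots is cheap.
use std::collections::BTreeMap;
use std::sync::Arc;

#[derive(Default)]
struct ToyHistoricCoverage {
    // lsn.start -> coverage snapshot taken right after that insertion
    versions: BTreeMap<u64, Arc<BTreeMap<i128, String>>>,
    head: BTreeMap<i128, String>,
}

impl ToyHistoricCoverage {
    fn insert(&mut self, key_start: i128, lsn_start: u64, layer: String) {
        assert!(
            self.versions.keys().next_back().map_or(true, |l| *l <= lsn_start),
            "no retroactive inserts: layers must arrive in lsn.start order"
        );
        // The value at a change point stands for "the layer covering keys from
        // here up to the next change point".
        self.head.insert(key_start, layer);
        self.versions.insert(lsn_start, Arc::new(self.head.clone()));
    }

    fn query(&self, key: i128, lsn: u64) -> Option<&String> {
        // Newest version at or below `lsn`, then the change point at or below `key`.
        let (_, version) = self.versions.range(..=lsn).next_back()?;
        version.range(..=key).next_back().map(|(_, layer)| layer)
    }
}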
-// - -#[derive(Clone, PartialEq, Eq, PartialOrd, Debug)] -struct IntKey(i256); - -impl Copy for IntKey {} - -impl IntKey { - fn from(i: i128) -> Self { - IntKey(i256::from(i)) - } -} - -impl Bounded for IntKey { - fn min_value() -> Self { - IntKey(i256::MIN) - } - fn max_value() -> Self { - IntKey(i256::MAX) - } -} - -impl Signed for IntKey { - fn is_positive(&self) -> bool { - self.0 > i256::ZERO - } - fn is_negative(&self) -> bool { - self.0 < i256::ZERO - } - fn signum(&self) -> Self { - match self.0.cmp(&i256::ZERO) { - Ordering::Greater => IntKey(i256::ONE), - Ordering::Less => IntKey(-i256::ONE), - Ordering::Equal => IntKey(i256::ZERO), - } - } - fn abs(&self) -> Self { - IntKey(self.0.abs()) - } - fn abs_sub(&self, other: &Self) -> Self { - if self.0 <= other.0 { - IntKey(i256::ZERO) - } else { - IntKey(self.0 - other.0) - } - } -} - -impl Neg for IntKey { - type Output = Self; - fn neg(self) -> Self::Output { - IntKey(-self.0) - } -} - -impl Rem for IntKey { - type Output = Self; - fn rem(self, rhs: Self) -> Self::Output { - IntKey(self.0 % rhs.0) - } -} - -impl Div for IntKey { - type Output = Self; - fn div(self, rhs: Self) -> Self::Output { - IntKey(self.0 / rhs.0) - } -} - -impl Add for IntKey { - type Output = Self; - fn add(self, rhs: Self) -> Self::Output { - IntKey(self.0 + rhs.0) - } -} - -impl Sub for IntKey { - type Output = Self; - fn sub(self, rhs: Self) -> Self::Output { - IntKey(self.0 - rhs.0) - } -} - -impl Mul for IntKey { - type Output = Self; - fn mul(self, rhs: Self) -> Self::Output { - IntKey(self.0 * rhs.0) - } -} - -impl One for IntKey { - fn one() -> Self { - IntKey(i256::ONE) - } -} - -impl Zero for IntKey { - fn zero() -> Self { - IntKey(i256::ZERO) - } - fn is_zero(&self) -> bool { - self.0 == i256::ZERO - } -} - -impl Num for IntKey { - type FromStrRadixErr = ::FromStrRadixErr; - fn from_str_radix(str: &str, radix: u32) -> Result { - Ok(IntKey(i256::from(i128::from_str_radix(str, radix)?))) - } -} - -impl PartialEq for LayerRTreeObject { - fn eq(&self, other: &Self) -> bool { - // FIXME: ptr_eq might fail to return true for 'dyn' - // references. Clippy complains about this. In practice it - // seems to work, the assertion below would be triggered - // otherwise but this ought to be fixed. - #[allow(clippy::vtable_address_comparisons)] - Arc::ptr_eq(&self.layer, &other.layer) - } -} - -impl RTreeObject for LayerRTreeObject -where - L: ?Sized, -{ - type Envelope = AABB<[IntKey; 2]>; - fn envelope(&self) -> Self::Envelope { - self.envelope - } -} - -impl LayerRTreeObject +/// Provide ability to batch more updates while hiding the read +/// API so we don't accidentally read without flushing. +impl BatchedUpdates<'_, L> where L: ?Sized + Layer, { - fn new(layer: Arc) -> Self { - let key_range = layer.get_key_range(); - let lsn_range = layer.get_lsn_range(); + /// + /// Insert an on-disk layer. + /// + pub fn insert_historic(&mut self, layer: Arc) { + self.layer_map.insert_historic_noflush(layer) + } - let envelope = AABB::from_corners( - [ - IntKey::from(key_range.start.to_i128()), - IntKey::from(lsn_range.start.0 as i128), - ], - [ - IntKey::from(key_range.end.to_i128() - 1), - IntKey::from(lsn_range.end.0 as i128 - 1), - ], // AABB::upper is inclusive, while `key_range.end` and `lsn_range.end` are exclusive - ); - LayerRTreeObject { layer, envelope } + /// + /// Remove an on-disk layer from the map. + /// + /// This should be called when the corresponding file on disk has been deleted. 
+ /// + pub fn remove_historic(&mut self, layer: Arc) { + self.layer_map.remove_historic_noflush(layer) + } + + // We will flush on drop anyway, but this method makes it + // more explicit that there is some work being done. + /// Apply all updates + pub fn flush(self) { + // Flush happens on drop + } +} + +// Ideally the flush() method should be called explicitly for more +// controlled execution. But if we forget we'd rather flush on drop +// than panic later or read without flushing. +// +// TODO maybe warn if flush hasn't explicitly been called +impl Drop for BatchedUpdates<'_, L> +where + L: ?Sized + Layer, +{ + fn drop(&mut self) { + self.layer_map.flush_updates(); } } @@ -281,125 +201,91 @@ where /// 'open' and 'frozen' layers! /// pub fn search(&self, key: Key, end_lsn: Lsn) -> Option> { - // Find the latest image layer that covers the given key - let mut latest_img: Option> = None; - let mut latest_img_lsn: Option = None; - let envelope = AABB::from_corners( - [IntKey::from(key.to_i128()), IntKey::from(0i128)], - [ - IntKey::from(key.to_i128()), - IntKey::from(end_lsn.0 as i128 - 1), - ], - ); - for e in self - .historic_layers - .locate_in_envelope_intersecting(&envelope) - { - let l = &e.layer; - if l.is_incremental() { - continue; - } - assert!(l.get_key_range().contains(&key)); - let img_lsn = l.get_lsn_range().start; - assert!(img_lsn < end_lsn); - if Lsn(img_lsn.0 + 1) == end_lsn { - // found exact match - return Some(SearchResult { - layer: Arc::clone(l), - lsn_floor: img_lsn, - }); - } - if img_lsn > latest_img_lsn.unwrap_or(Lsn(0)) { - latest_img = Some(Arc::clone(l)); - latest_img_lsn = Some(img_lsn); - } - } + let version = self.historic.get().unwrap().get_version(end_lsn.0 - 1)?; + let latest_delta = version.delta_coverage.query(key.to_i128()); + let latest_image = version.image_coverage.query(key.to_i128()); - // Search the delta layers - let mut latest_delta: Option> = None; - for e in self - .historic_layers - .locate_in_envelope_intersecting(&envelope) - { - let l = &e.layer; - if !l.is_incremental() { - continue; + match (latest_delta, latest_image) { + (None, None) => None, + (None, Some(image)) => { + let lsn_floor = image.get_lsn_range().start; + Some(SearchResult { + layer: image, + lsn_floor, + }) } - assert!(l.get_key_range().contains(&key)); - if l.get_lsn_range().start >= end_lsn { - info!( - "Candidate delta layer {}..{} is too new for lsn {}", - l.get_lsn_range().start, - l.get_lsn_range().end, - end_lsn - ); + (Some(delta), None) => { + let lsn_floor = delta.get_lsn_range().start; + Some(SearchResult { + layer: delta, + lsn_floor, + }) } - assert!(l.get_lsn_range().start < end_lsn); - if l.get_lsn_range().end >= end_lsn { - // this layer contains the requested point in the key/lsn space. - // No need to search any further - trace!( - "found layer {} for request on {key} at {end_lsn}", - l.short_id(), - ); - latest_delta.replace(Arc::clone(l)); - break; - } - if l.get_lsn_range().end > latest_img_lsn.unwrap_or(Lsn(0)) { - // this layer's end LSN is smaller than the requested point. If there's - // nothing newer, this is what we need to return. Remember this. 
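// Typical use of the batching API introduced above: open a batch, apply any
// number of insertions and removals, then flush once so the historic coverage
// is rebuilt a single time (flush also runs on drop, so a forgotten call costs
// performance, not correctness). Sketch only; `L` stands for whatever layer
// trait object the surrounding LayerMap is instantiated with.
fn swap_layer<L: ?Sized + Layer>(
    layer_map: &mut LayerMap<L>,
    old_layer: Arc<L>,
    new_layer: Arc<L>,
) {
    let mut updates = layer_map.batch_update();
    updates.remove_historic(old_layer);
    updates.insert_historic(new_layer);
    // One rebuild of the coverage index for the whole batch.
    updates.flush();
}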
- if let Some(old_candidate) = &latest_delta { - if l.get_lsn_range().end > old_candidate.get_lsn_range().end { - latest_delta.replace(Arc::clone(l)); - } + (Some(delta), Some(image)) => { + let img_lsn = image.get_lsn_range().start; + let image_is_newer = image.get_lsn_range().end >= delta.get_lsn_range().end; + let image_exact_match = img_lsn + 1 == end_lsn; + if image_is_newer || image_exact_match { + Some(SearchResult { + layer: image, + lsn_floor: img_lsn, + }) } else { - latest_delta.replace(Arc::clone(l)); + let lsn_floor = + std::cmp::max(delta.get_lsn_range().start, image.get_lsn_range().start + 1); + Some(SearchResult { + layer: delta, + lsn_floor, + }) } } } - if let Some(l) = latest_delta { - trace!( - "found (old) layer {} for request on {key} at {end_lsn}", - l.short_id(), - ); - let lsn_floor = std::cmp::max( - Lsn(latest_img_lsn.unwrap_or(Lsn(0)).0 + 1), - l.get_lsn_range().start, - ); - Some(SearchResult { - lsn_floor, - layer: l, - }) - } else if let Some(l) = latest_img { - trace!("found img layer and no deltas for request on {key} at {end_lsn}"); - Some(SearchResult { - lsn_floor: latest_img_lsn.unwrap(), - layer: l, - }) - } else { - trace!("no layer found for request on {key} at {end_lsn}"); - None - } + } + + /// Start a batch of updates, applied on drop + pub fn batch_update(&mut self) -> BatchedUpdates<'_, L> { + BatchedUpdates { layer_map: self } } /// /// Insert an on-disk layer /// - pub fn insert_historic(&mut self, layer: Arc) { - if layer.get_key_range() == (Key::MIN..Key::MAX) { - self.l0_delta_layers.push(layer.clone()); + /// Helper function for BatchedUpdates::insert_historic + /// + pub(self) fn insert_historic_noflush(&mut self, layer: Arc) { + let kr = layer.get_key_range(); + let lr = layer.get_lsn_range(); + self.historic.insert( + historic_layer_coverage::LayerKey { + key: kr.start.to_i128()..kr.end.to_i128(), + lsn: lr.start.0..lr.end.0, + is_image: !layer.is_incremental(), + }, + Arc::clone(&layer), + ); + + if Self::is_l0(&layer) { + self.l0_delta_layers.push(layer); } - self.historic_layers.insert(LayerRTreeObject::new(layer)); + NUM_ONDISK_LAYERS.inc(); } /// /// Remove an on-disk layer from the map. /// - /// This should be called when the corresponding file on disk has been deleted. + /// Helper function for BatchedUpdates::remove_historic /// - pub fn remove_historic(&mut self, layer: Arc) { - if layer.get_key_range() == (Key::MIN..Key::MAX) { + pub fn remove_historic_noflush(&mut self, layer: Arc) { + let kr = layer.get_key_range(); + let lr = layer.get_lsn_range(); + self.historic.remove(historic_layer_coverage::LayerKey { + key: kr.start.to_i128()..kr.end.to_i128(), + lsn: lr.start.0..lr.end.0, + is_image: !layer.is_incremental(), + }); + + if Self::is_l0(&layer) { let len_before = self.l0_delta_layers.len(); // FIXME: ptr_eq might fail to return true for 'dyn' @@ -411,98 +297,57 @@ where .retain(|other| !Arc::ptr_eq(other, &layer)); assert_eq!(self.l0_delta_layers.len(), len_before - 1); } - assert!(self - .historic_layers - .remove(&LayerRTreeObject::new(layer)) - .is_some()); + NUM_ONDISK_LAYERS.dec(); } + /// Helper function for BatchedUpdates::drop. + pub(self) fn flush_updates(&mut self) { + self.historic.rebuild(); + } + /// Is there a newer image layer for given key- and LSN-range? Or a set /// of image layers within the specified lsn range that cover the entire /// specified key range? /// /// This is used for garbage collection, to determine if an old layer can /// be deleted. 
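// The rewritten search above boils down to a small decision rule once the
// coverage maps have produced the latest delta and the latest image for the
// key. A sketch over plain u64 LSN ranges mirroring the match arms; it returns
// (picked_layer_is_image, lsn_floor).
fn pick_layer(
    end_lsn: u64,
    delta: Option<std::ops::Range<u64>>,
    image: Option<std::ops::Range<u64>>,
) -> Option<(bool, u64)> {
    match (delta, image) {
        (None, None) => None,
        (None, Some(img)) => Some((true, img.start)),
        (Some(del), None) => Some((false, del.start)),
        (Some(del), Some(img)) => {
            let image_is_newer = img.end >= del.end;
            let image_exact_match = img.start + 1 == end_lsn;
            if image_is_newer || image_exact_match {
                Some((true, img.start))
            } else {
                // Reconstruction starts from the delta, but only needs to go
                // back to just above the image, or to the delta's own start,
                // whichever is higher.
                Some((false, std::cmp::max(del.start, img.start + 1)))
            }
        }
    }
}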
- pub fn image_layer_exists( - &self, - key_range: &Range, - lsn_range: &Range, - ) -> Result { - let mut range_remain = key_range.clone(); + pub fn image_layer_exists(&self, key: &Range, lsn: &Range) -> Result { + if key.is_empty() { + // Vacuously true. There's a newer image for all 0 of the kerys in the range. + return Ok(true); + } - loop { - let mut made_progress = false; - let envelope = AABB::from_corners( - [ - IntKey::from(range_remain.start.to_i128()), - IntKey::from(lsn_range.start.0 as i128), - ], - [ - IntKey::from(range_remain.end.to_i128() - 1), - IntKey::from(lsn_range.end.0 as i128 - 1), - ], - ); - for e in self - .historic_layers - .locate_in_envelope_intersecting(&envelope) - { - let l = &e.layer; - if l.is_incremental() { - continue; - } - let img_lsn = l.get_lsn_range().start; - if l.get_key_range().contains(&range_remain.start) && lsn_range.contains(&img_lsn) { - made_progress = true; - let img_key_end = l.get_key_range().end; + let version = match self.historic.get().unwrap().get_version(lsn.end.0 - 1) { + Some(v) => v, + None => return Ok(false), + }; - if img_key_end >= range_remain.end { - return Ok(true); - } - range_remain.start = img_key_end; - } - } + let start = key.start.to_i128(); + let end = key.end.to_i128(); - if !made_progress { + let layer_covers = |layer: Option>| match layer { + Some(layer) => layer.get_lsn_range().start >= lsn.start, + None => false, + }; + + // Check the start is covered + if !layer_covers(version.image_coverage.query(start)) { + return Ok(false); + } + + // Check after all changes of coverage + for (_, change_val) in version.image_coverage.range(start..end) { + if !layer_covers(change_val) { return Ok(false); } } + + Ok(true) } pub fn iter_historic_layers(&self) -> impl '_ + Iterator> { - self.historic_layers.iter().map(|e| e.layer.clone()) - } - - /// Find the last image layer that covers 'key', ignoring any image layers - /// newer than 'lsn'. 
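// The GC-side question answered by image_layer_exists above, in isolation:
// "is every key in this range covered by some image layer whose LSN is at
// least `min_lsn`?" A simplified model where images are disjoint
// (key_start, key_end, image_lsn) tuples sorted by key_start; the real code
// asks the same thing of the image coverage at one particular version.
fn whole_range_has_new_image(
    images: &[(i128, i128, u64)],
    key: std::ops::Range<i128>,
    min_lsn: u64,
) -> bool {
    if key.is_empty() {
        // Vacuously true, as in the diff.
        return true;
    }
    let mut cursor = key.start;
    for &(start, end, lsn) in images {
        if end <= cursor || lsn < min_lsn {
            continue; // ends before our position, or too old to count
        }
        if start > cursor {
            return false; // a gap with no new-enough image
        }
        cursor = end;
        if cursor >= key.end {
            return true;
        }
    }
    false
}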
- fn find_latest_image(&self, key: Key, lsn: Lsn) -> Option> { - let mut candidate_lsn = Lsn(0); - let mut candidate = None; - let envelope = AABB::from_corners( - [IntKey::from(key.to_i128()), IntKey::from(0)], - [IntKey::from(key.to_i128()), IntKey::from(lsn.0 as i128)], - ); - for e in self - .historic_layers - .locate_in_envelope_intersecting(&envelope) - { - let l = &e.layer; - if l.is_incremental() { - continue; - } - - assert!(l.get_key_range().contains(&key)); - let this_lsn = l.get_lsn_range().start; - assert!(this_lsn <= lsn); - if this_lsn < candidate_lsn { - // our previous candidate was better - continue; - } - candidate_lsn = this_lsn; - candidate = Some(Arc::clone(l)); - } - - candidate + self.historic.iter() } /// @@ -518,94 +363,288 @@ where key_range: &Range, lsn: Lsn, ) -> Result, Option>)>> { - let mut points = vec![key_range.start]; - let envelope = AABB::from_corners( - [IntKey::from(key_range.start.to_i128()), IntKey::from(0)], - [ - IntKey::from(key_range.end.to_i128()), - IntKey::from(lsn.0 as i128), - ], - ); - for e in self - .historic_layers - .locate_in_envelope_intersecting(&envelope) - { - let l = &e.layer; - assert!(l.get_lsn_range().start <= lsn); - let range = l.get_key_range(); - if key_range.contains(&range.start) { - points.push(l.get_key_range().start); - } - if key_range.contains(&range.end) { - points.push(l.get_key_range().end); - } + let version = match self.historic.get().unwrap().get_version(lsn.0) { + Some(v) => v, + None => return Ok(vec![]), + }; + + let start = key_range.start.to_i128(); + let end = key_range.end.to_i128(); + + // Initialize loop variables + let mut coverage: Vec<(Range, Option>)> = vec![]; + let mut current_key = start; + let mut current_val = version.image_coverage.query(start); + + // Loop through the change events and push intervals + for (change_key, change_val) in version.image_coverage.range(start..end) { + let kr = Key::from_i128(current_key)..Key::from_i128(change_key); + coverage.push((kr, current_val.take())); + current_key = change_key; + current_val = change_val.clone(); } - points.push(key_range.end); - points.sort(); - points.dedup(); + // Add the final interval + let kr = Key::from_i128(current_key)..Key::from_i128(end); + coverage.push((kr, current_val.take())); - // Ok, we now have a list of "interesting" points in the key space - - // For each range between the points, find the latest image - let mut start = *points.first().unwrap(); - let mut ranges = Vec::new(); - for end in points[1..].iter() { - let img = self.find_latest_image(start, lsn); - - ranges.push((start..*end, img)); - - start = *end; - } - Ok(ranges) + Ok(coverage) } - /// Count the height of the tallest stack of deltas in this 2d region. + pub fn is_l0(layer: &L) -> bool { + range_eq(&layer.get_key_range(), &(Key::MIN..Key::MAX)) + } + + /// This function determines which layers are counted in `count_deltas`: + /// layers that should count towards deciding whether or not to reimage + /// a certain partition range. + /// + /// There are two kinds of layers we currently consider reimage-worthy: + /// + /// Case 1: Non-L0 layers are currently reimage-worthy by default. + /// TODO Some of these layers are very sparse and cover the entire key + /// range. Replacing 256MB of data (or less!) with terabytes of + /// images doesn't seem wise. 
We need a better heuristic, possibly + /// based on some of these factors: + /// a) whether this layer has any wal in this partition range + /// b) the size of the layer + /// c) the number of images needed to cover it + /// d) the estimated time until we'll have to reimage over it for GC + /// + /// Case 2: Since L0 layers by definition cover the entire key space, we consider + /// them reimage-worthy only when the entire key space can be covered by very few + /// images (currently 1). + /// TODO The optimal number should probably be slightly higher than 1, but to + /// implement that we need to plumb a lot more context into this function + /// than just the current partition_range. + pub fn is_reimage_worthy(layer: &L, partition_range: &Range) -> bool { + // Case 1 + if !Self::is_l0(layer) { + return true; + } + + // Case 2 + if range_eq(partition_range, &(Key::MIN..Key::MAX)) { + return true; + } + + false + } + + /// Count the height of the tallest stack of reimage-worthy deltas + /// in this 2d region. + /// + /// If `limit` is provided we don't try to count above that number. /// /// This number is used to compute the largest number of deltas that /// we'll need to visit for any page reconstruction in this region. /// We use this heuristic to decide whether to create an image layer. - /// - /// TODO currently we just return the total number of deltas in the - /// region, no matter if they're stacked on top of each other - /// or next to each other. - pub fn count_deltas(&self, key_range: &Range, lsn_range: &Range) -> Result { - let mut result = 0; - if lsn_range.start >= lsn_range.end { + pub fn count_deltas( + &self, + key: &Range, + lsn: &Range, + limit: Option, + ) -> Result { + // We get the delta coverage of the region, and for each part of the coverage + // we recurse right underneath the delta. The recursion depth is limited by + // the largest result this function could return, which is in practice between + // 3 and 10 (since we usually try to create an image when the number gets larger). + + if lsn.is_empty() || key.is_empty() || limit == Some(0) { return Ok(0); } - let envelope = AABB::from_corners( - [ - IntKey::from(key_range.start.to_i128()), - IntKey::from(lsn_range.start.0 as i128), - ], - [ - IntKey::from(key_range.end.to_i128() - 1), - IntKey::from(lsn_range.end.0 as i128 - 1), - ], - ); - for e in self - .historic_layers - .locate_in_envelope_intersecting(&envelope) - { - let l = &e.layer; - if !l.is_incremental() { - continue; - } - assert!(range_overlaps(&l.get_lsn_range(), lsn_range)); - assert!(range_overlaps(&l.get_key_range(), key_range)); - // We ignore level0 delta layers. 
Unless the whole keyspace fits - // into one partition - if !range_eq(key_range, &(Key::MIN..Key::MAX)) - && range_eq(&l.get_key_range(), &(Key::MIN..Key::MAX)) - { - continue; + let version = match self.historic.get().unwrap().get_version(lsn.end.0 - 1) { + Some(v) => v, + None => return Ok(0), + }; + + let start = key.start.to_i128(); + let end = key.end.to_i128(); + + // Initialize loop variables + let mut max_stacked_deltas = 0; + let mut current_key = start; + let mut current_val = version.delta_coverage.query(start); + + // Loop through the delta coverage and recurse on each part + for (change_key, change_val) in version.delta_coverage.range(start..end) { + // If there's a relevant delta in this part, add 1 and recurse down + if let Some(val) = current_val { + if val.get_lsn_range().end > lsn.start { + let kr = Key::from_i128(current_key)..Key::from_i128(change_key); + let lr = lsn.start..val.get_lsn_range().start; + if !kr.is_empty() { + let base_count = Self::is_reimage_worthy(&val, key) as usize; + let new_limit = limit.map(|l| l - base_count); + let max_stacked_deltas_underneath = + self.count_deltas(&kr, &lr, new_limit)?; + max_stacked_deltas = std::cmp::max( + max_stacked_deltas, + base_count + max_stacked_deltas_underneath, + ); + } + } } - result += 1; + current_key = change_key; + current_val = change_val.clone(); } - Ok(result) + + // Consider the last part + if let Some(val) = current_val { + if val.get_lsn_range().end > lsn.start { + let kr = Key::from_i128(current_key)..Key::from_i128(end); + let lr = lsn.start..val.get_lsn_range().start; + + if !kr.is_empty() { + let base_count = Self::is_reimage_worthy(&val, key) as usize; + let new_limit = limit.map(|l| l - base_count); + let max_stacked_deltas_underneath = self.count_deltas(&kr, &lr, new_limit)?; + max_stacked_deltas = std::cmp::max( + max_stacked_deltas, + base_count + max_stacked_deltas_underneath, + ); + } + } + } + + Ok(max_stacked_deltas) + } + + /// Count how many reimage-worthy layers we need to visit for given key-lsn pair. + /// + /// The `partition_range` argument is used as context for the reimage-worthiness decision. + /// + /// Used as a helper for correctness checks only. Performance not critical. + pub fn get_difficulty(&self, lsn: Lsn, key: Key, partition_range: &Range) -> usize { + match self.search(key, lsn) { + Some(search_result) => { + if search_result.layer.is_incremental() { + (Self::is_reimage_worthy(&search_result.layer, partition_range) as usize) + + self.get_difficulty(search_result.lsn_floor, key, partition_range) + } else { + 0 + } + } + None => 0, + } + } + + /// Used for correctness checking. Results are expected to be identical to + /// self.get_difficulty_map. Assumes self.search is correct. + pub fn get_difficulty_map_bruteforce( + &self, + lsn: Lsn, + partitioning: &KeyPartitioning, + ) -> Vec { + // Looking at the difficulty as a function of key, it could only increase + // when a delta layer starts or an image layer ends. 
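// The `limit` plumbing in the new count_deltas above is the usual "count, but
// stop once the caller's threshold is reached" trick for a recursive maximum.
// A stripped-down illustration on an explicit tree of stacked deltas; the real
// function derives the recursion from the delta coverage instead.
struct StackNode {
    reimage_worthy: bool,
    deltas_underneath: Vec<StackNode>,
}

fn max_stacked(node: &StackNode, limit: Option<usize>) -> usize {
    if limit == Some(0) {
        // The caller no longer cares about the exact number.
        return 0;
    }
    let base = node.reimage_worthy as usize;
    let child_limit = limit.map(|l| l - base);
    let below = node
        .deltas_underneath
        .iter()
        .map(|child| max_stacked(child, child_limit))
        .max()
        .unwrap_or(0);
    base + below
}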
Therefore it's sufficient + // to check the difficulties at: + // - the key.start for each non-empty part range + // - the key.start for each delta + // - the key.end for each image + let keys_iter: Box> = { + let mut keys: Vec = self + .iter_historic_layers() + .map(|layer| { + if layer.is_incremental() { + layer.get_key_range().start + } else { + layer.get_key_range().end + } + }) + .collect(); + keys.sort(); + Box::new(keys.into_iter()) + }; + let mut keys_iter = keys_iter.peekable(); + + // Iter the partition and keys together and query all the necessary + // keys, computing the max difficulty for each part. + partitioning + .parts + .iter() + .map(|part| { + let mut difficulty = 0; + // Partition ranges are assumed to be sorted and disjoint + // TODO assert it + for range in &part.ranges { + if !range.is_empty() { + difficulty = + std::cmp::max(difficulty, self.get_difficulty(lsn, range.start, range)); + } + while let Some(key) = keys_iter.peek() { + if key >= &range.end { + break; + } + let key = keys_iter.next().unwrap(); + if key < range.start { + continue; + } + difficulty = + std::cmp::max(difficulty, self.get_difficulty(lsn, key, range)); + } + } + difficulty + }) + .collect() + } + + /// For each part of a keyspace partitioning, return the maximum number of layers + /// that would be needed for page reconstruction in that part at the given LSN. + /// + /// If `limit` is provided we don't try to count above that number. + /// + /// This method is used to decide where to create new image layers. Computing the + /// result for the entire partitioning at once allows this function to be more + /// efficient, and further optimization is possible by using iterators instead, + /// to allow early return. + /// + /// TODO actually use this method instead of count_deltas. Currently we only use + /// it for benchmarks. + pub fn get_difficulty_map( + &self, + lsn: Lsn, + partitioning: &KeyPartitioning, + limit: Option, + ) -> Vec { + // TODO This is a naive implementation. Perf improvements to do: + // 1. Instead of calling self.image_coverage and self.count_deltas, + // iterate the image and delta coverage only once. + partitioning + .parts + .iter() + .map(|part| { + let mut difficulty = 0; + for range in &part.ranges { + if limit == Some(difficulty) { + break; + } + for (img_range, last_img) in self + .image_coverage(range, lsn) + .expect("why would this err?") + { + if limit == Some(difficulty) { + break; + } + let img_lsn = if let Some(last_img) = last_img { + last_img.get_lsn_range().end + } else { + Lsn(0) + }; + + if img_lsn < lsn { + let num_deltas = self + .count_deltas(&img_range, &(img_lsn..lsn), limit) + .expect("why would this err lol?"); + difficulty = std::cmp::max(difficulty, num_deltas); + } + } + } + difficulty + }) + .collect() } /// Return all L0 delta layers @@ -629,8 +668,8 @@ where } println!("historic_layers:"); - for e in self.historic_layers.iter() { - e.layer.dump(verbose)?; + for layer in self.iter_historic_layers() { + layer.dump(verbose)?; } println!("End dump LayerMap"); Ok(()) diff --git a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs new file mode 100644 index 0000000000..46821aef15 --- /dev/null +++ b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs @@ -0,0 +1,583 @@ +use std::collections::BTreeMap; +use std::ops::Range; + +use tracing::info; + +use super::layer_coverage::LayerCoverageTuple; + +/// Layers in this module are identified and indexed by this data. 
+/// +/// This is a helper struct to enable sorting layers by lsn.start. +/// +/// These three values are enough to uniquely identify a layer, since +/// a layer is obligated to contain all contents within range, so two +/// deltas (or images) with the same range have identical content. +#[derive(Debug, PartialEq, Eq, Clone)] +pub struct LayerKey { + // TODO I use i128 and u64 because it was easy for prototyping, + // testing, and benchmarking. If we can use the Lsn and Key + // types without overhead that would be preferable. + pub key: Range, + pub lsn: Range, + pub is_image: bool, +} + +impl PartialOrd for LayerKey { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for LayerKey { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + // NOTE we really care about comparing by lsn.start first + self.lsn + .start + .cmp(&other.lsn.start) + .then(self.lsn.end.cmp(&other.lsn.end)) + .then(self.key.start.cmp(&other.key.start)) + .then(self.key.end.cmp(&other.key.end)) + .then(self.is_image.cmp(&other.is_image)) + } +} + +/// Efficiently queryable layer coverage for each LSN. +/// +/// Allows answering layer map queries very efficiently, +/// but doesn't allow retroactive insertion, which is +/// sometimes necessary. See BufferedHistoricLayerCoverage. +pub struct HistoricLayerCoverage { + /// The latest state + head: LayerCoverageTuple, + + /// All previous states + historic: BTreeMap>, +} + +impl Default for HistoricLayerCoverage { + fn default() -> Self { + Self::new() + } +} + +impl HistoricLayerCoverage { + pub fn new() -> Self { + Self { + head: LayerCoverageTuple::default(), + historic: BTreeMap::default(), + } + } + + /// Add a layer + /// + /// Panics if new layer has older lsn.start than an existing layer. + /// See BufferedHistoricLayerCoverage for a more general insertion method. + pub fn insert(&mut self, layer_key: LayerKey, value: Value) { + // It's only a persistent map, not a retroactive one + if let Some(last_entry) = self.historic.iter().next_back() { + let last_lsn = last_entry.0; + if layer_key.lsn.start < *last_lsn { + panic!("unexpected retroactive insert"); + } + } + + // Insert into data structure + if layer_key.is_image { + self.head + .image_coverage + .insert(layer_key.key, layer_key.lsn.clone(), value); + } else { + self.head + .delta_coverage + .insert(layer_key.key, layer_key.lsn.clone(), value); + } + + // Remember history. Clone is O(1) + self.historic.insert(layer_key.lsn.start, self.head.clone()); + } + + /// Query at a particular LSN, inclusive + pub fn get_version(&self, lsn: u64) -> Option<&LayerCoverageTuple> { + match self.historic.range(..=lsn).next_back() { + Some((_, v)) => Some(v), + None => None, + } + } + + /// Remove all entries after a certain LSN (inclusive) + pub fn trim(&mut self, begin: &u64) { + self.historic.split_off(begin); + self.head = self + .historic + .iter() + .rev() + .next() + .map(|(_, v)| v.clone()) + .unwrap_or_default(); + } +} + +/// This is the most basic test that demonstrates intended usage. +/// All layers in this test have height 1. 
+#[test] +fn test_persistent_simple() { + let mut map = HistoricLayerCoverage::::new(); + map.insert( + LayerKey { + key: 0..5, + lsn: 100..101, + is_image: true, + }, + "Layer 1".to_string(), + ); + map.insert( + LayerKey { + key: 3..9, + lsn: 110..111, + is_image: true, + }, + "Layer 2".to_string(), + ); + map.insert( + LayerKey { + key: 5..6, + lsn: 120..121, + is_image: true, + }, + "Layer 3".to_string(), + ); + + // After Layer 1 insertion + let version = map.get_version(105).unwrap(); + assert_eq!(version.image_coverage.query(1), Some("Layer 1".to_string())); + assert_eq!(version.image_coverage.query(4), Some("Layer 1".to_string())); + + // After Layer 2 insertion + let version = map.get_version(115).unwrap(); + assert_eq!(version.image_coverage.query(4), Some("Layer 2".to_string())); + assert_eq!(version.image_coverage.query(8), Some("Layer 2".to_string())); + assert_eq!(version.image_coverage.query(11), None); + + // After Layer 3 insertion + let version = map.get_version(125).unwrap(); + assert_eq!(version.image_coverage.query(4), Some("Layer 2".to_string())); + assert_eq!(version.image_coverage.query(5), Some("Layer 3".to_string())); + assert_eq!(version.image_coverage.query(7), Some("Layer 2".to_string())); +} + +/// Cover simple off-by-one edge cases +#[test] +fn test_off_by_one() { + let mut map = HistoricLayerCoverage::::new(); + map.insert( + LayerKey { + key: 3..5, + lsn: 100..110, + is_image: true, + }, + "Layer 1".to_string(), + ); + + // Check different LSNs + let version = map.get_version(99); + assert!(version.is_none()); + let version = map.get_version(100).unwrap(); + assert_eq!(version.image_coverage.query(4), Some("Layer 1".to_string())); + let version = map.get_version(110).unwrap(); + assert_eq!(version.image_coverage.query(4), Some("Layer 1".to_string())); + + // Check different keys + let version = map.get_version(105).unwrap(); + assert_eq!(version.image_coverage.query(2), None); + assert_eq!(version.image_coverage.query(3), Some("Layer 1".to_string())); + assert_eq!(version.image_coverage.query(4), Some("Layer 1".to_string())); + assert_eq!(version.image_coverage.query(5), None); +} + +/// Cover edge cases where layers begin or end on the same key +#[test] +fn test_key_collision() { + let mut map = HistoricLayerCoverage::::new(); + + map.insert( + LayerKey { + key: 3..5, + lsn: 100..110, + is_image: true, + }, + "Layer 10".to_string(), + ); + map.insert( + LayerKey { + key: 5..8, + lsn: 100..110, + is_image: true, + }, + "Layer 11".to_string(), + ); + map.insert( + LayerKey { + key: 3..4, + lsn: 200..210, + is_image: true, + }, + "Layer 20".to_string(), + ); + + // Check after layer 11 + let version = map.get_version(105).unwrap(); + assert_eq!(version.image_coverage.query(2), None); + assert_eq!( + version.image_coverage.query(3), + Some("Layer 10".to_string()) + ); + assert_eq!( + version.image_coverage.query(5), + Some("Layer 11".to_string()) + ); + assert_eq!( + version.image_coverage.query(7), + Some("Layer 11".to_string()) + ); + assert_eq!(version.image_coverage.query(8), None); + + // Check after layer 20 + let version = map.get_version(205).unwrap(); + assert_eq!(version.image_coverage.query(2), None); + assert_eq!( + version.image_coverage.query(3), + Some("Layer 20".to_string()) + ); + assert_eq!( + version.image_coverage.query(5), + Some("Layer 11".to_string()) + ); + assert_eq!( + version.image_coverage.query(7), + Some("Layer 11".to_string()) + ); + assert_eq!(version.image_coverage.query(8), None); +} + +/// Test when rectangles have nontrivial 
height and possibly overlap +#[test] +fn test_persistent_overlapping() { + let mut map = HistoricLayerCoverage::::new(); + + // Add 3 key-disjoint layers with varying LSN ranges + map.insert( + LayerKey { + key: 1..2, + lsn: 100..200, + is_image: true, + }, + "Layer 1".to_string(), + ); + map.insert( + LayerKey { + key: 4..5, + lsn: 110..200, + is_image: true, + }, + "Layer 2".to_string(), + ); + map.insert( + LayerKey { + key: 7..8, + lsn: 120..300, + is_image: true, + }, + "Layer 3".to_string(), + ); + + // Add wide and short layer + map.insert( + LayerKey { + key: 0..9, + lsn: 130..199, + is_image: true, + }, + "Layer 4".to_string(), + ); + + // Add wide layer taller than some + map.insert( + LayerKey { + key: 0..9, + lsn: 140..201, + is_image: true, + }, + "Layer 5".to_string(), + ); + + // Add wide layer taller than all + map.insert( + LayerKey { + key: 0..9, + lsn: 150..301, + is_image: true, + }, + "Layer 6".to_string(), + ); + + // After layer 4 insertion + let version = map.get_version(135).unwrap(); + assert_eq!(version.image_coverage.query(0), Some("Layer 4".to_string())); + assert_eq!(version.image_coverage.query(1), Some("Layer 1".to_string())); + assert_eq!(version.image_coverage.query(2), Some("Layer 4".to_string())); + assert_eq!(version.image_coverage.query(4), Some("Layer 2".to_string())); + assert_eq!(version.image_coverage.query(5), Some("Layer 4".to_string())); + assert_eq!(version.image_coverage.query(7), Some("Layer 3".to_string())); + assert_eq!(version.image_coverage.query(8), Some("Layer 4".to_string())); + + // After layer 5 insertion + let version = map.get_version(145).unwrap(); + assert_eq!(version.image_coverage.query(0), Some("Layer 5".to_string())); + assert_eq!(version.image_coverage.query(1), Some("Layer 5".to_string())); + assert_eq!(version.image_coverage.query(2), Some("Layer 5".to_string())); + assert_eq!(version.image_coverage.query(4), Some("Layer 5".to_string())); + assert_eq!(version.image_coverage.query(5), Some("Layer 5".to_string())); + assert_eq!(version.image_coverage.query(7), Some("Layer 3".to_string())); + assert_eq!(version.image_coverage.query(8), Some("Layer 5".to_string())); + + // After layer 6 insertion + let version = map.get_version(155).unwrap(); + assert_eq!(version.image_coverage.query(0), Some("Layer 6".to_string())); + assert_eq!(version.image_coverage.query(1), Some("Layer 6".to_string())); + assert_eq!(version.image_coverage.query(2), Some("Layer 6".to_string())); + assert_eq!(version.image_coverage.query(4), Some("Layer 6".to_string())); + assert_eq!(version.image_coverage.query(5), Some("Layer 6".to_string())); + assert_eq!(version.image_coverage.query(7), Some("Layer 6".to_string())); + assert_eq!(version.image_coverage.query(8), Some("Layer 6".to_string())); +} + +/// Wrapper for HistoricLayerCoverage that allows us to hack around the lack +/// of support for retroactive insertion by rebuilding the map since the +/// change. +/// +/// Why is this needed? We most often insert new layers with newer LSNs, +/// but during compaction we create layers with non-latest LSN, and during +/// GC we delete historic layers. +/// +/// Even though rebuilding is an expensive (N log N) solution to the problem, +/// it's not critical since we do something equally expensive just to decide +/// whether or not to create new image layers. +/// TODO It's not expensive but it's not great to hold a layer map write lock +/// for that long. 
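One thing the coverage tests above do not exercise directly is trim(). A minimal sketch of its effect on the version history, assuming the `HistoricLayerCoverage` and `LayerKey` types from this module are in scope; `trim_example` is an invented name used only for illustration:

fn trim_example() {
    let mut map = HistoricLayerCoverage::<String>::new();
    map.insert(
        LayerKey {
            key: 0..10,
            lsn: 100..101,
            is_image: true,
        },
        "Old image".to_string(),
    );
    map.insert(
        LayerKey {
            key: 0..10,
            lsn: 110..111,
            is_image: true,
        },
        "New image".to_string(),
    );

    // Before trimming, the latest version wins.
    let version = map.get_version(115).unwrap();
    assert_eq!(version.image_coverage.query(5), Some("New image".to_string()));

    // trim() drops every stored version whose lsn.start is >= 110 and rewinds
    // `head`, so queries above that LSN now see the older coverage.
    map.trim(&110);
    let version = map.get_version(115).unwrap();
    assert_eq!(version.image_coverage.query(5), Some("Old image".to_string()));
}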
+/// +/// If this becomes an actual bottleneck, one solution would be to build a +/// segment tree that holds PersistentLayerMaps. Though this would mean that +/// we take an additional log(N) performance hit for queries, which will probably +/// still be more critical. +/// +/// See this for more on persistent and retroactive techniques: +/// https://www.youtube.com/watch?v=WqCWghETNDc&t=581s +pub struct BufferedHistoricLayerCoverage { + /// A persistent layer map that we rebuild when we need to retroactively update + historic_coverage: HistoricLayerCoverage, + + /// We buffer insertion into the PersistentLayerMap to decrease the number of rebuilds. + buffer: BTreeMap>, + + /// All current layers. This is not used for search. Only to make rebuilds easier. + layers: BTreeMap, +} + +impl std::fmt::Debug for BufferedHistoricLayerCoverage { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("RetroactiveLayerMap") + .field("buffer", &self.buffer) + .field("layers", &self.layers) + .finish() + } +} + +impl Default for BufferedHistoricLayerCoverage { + fn default() -> Self { + Self::new() + } +} + +impl BufferedHistoricLayerCoverage { + pub fn new() -> Self { + Self { + historic_coverage: HistoricLayerCoverage::::new(), + buffer: BTreeMap::new(), + layers: BTreeMap::new(), + } + } + + pub fn insert(&mut self, layer_key: LayerKey, value: Value) { + self.buffer.insert(layer_key, Some(value)); + } + + pub fn remove(&mut self, layer_key: LayerKey) { + self.buffer.insert(layer_key, None); + } + + pub fn rebuild(&mut self) { + // Find the first LSN that needs to be rebuilt + let rebuild_since: u64 = match self.buffer.iter().next() { + Some((LayerKey { lsn, .. }, _)) => lsn.start, + None => return, // No need to rebuild if buffer is empty + }; + + // Apply buffered updates to self.layers + let num_updates = self.buffer.len(); + self.buffer.retain(|layer_key, layer| { + match layer { + Some(l) => { + self.layers.insert(layer_key.clone(), l.clone()); + } + None => { + self.layers.remove(layer_key); + } + }; + false + }); + + // Rebuild + let mut num_inserted = 0; + self.historic_coverage.trim(&rebuild_since); + for (layer_key, layer) in self.layers.range( + LayerKey { + lsn: rebuild_since..0, + key: 0..0, + is_image: false, + }.., + ) { + self.historic_coverage + .insert(layer_key.clone(), layer.clone()); + num_inserted += 1; + } + + // TODO maybe only warn if ratio is at least 10 + info!( + "Rebuilt layer map. Did {} insertions to process a batch of {} updates.", + num_inserted, num_updates, + ) + } + + /// Iterate all the layers + pub fn iter(&self) -> impl '_ + Iterator { + // NOTE we can actually perform this without rebuilding, + // but it's not necessary for now. + if !self.buffer.is_empty() { + panic!("rebuild pls") + } + + self.layers.values().cloned() + } + + /// Return a reference to a queryable map, assuming all updates + /// have already been processed using self.rebuild() + pub fn get(&self) -> anyhow::Result<&HistoricLayerCoverage> { + // NOTE we error here instead of implicitly rebuilding because + // rebuilding is somewhat expensive. + // TODO maybe implicitly rebuild and log/sentry an error? 
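    // Illustrative calling pattern only (layer_key_a, layer_a, layer_key_b and
    // lsn below are placeholders, not items from this module): batch updates,
    // then rebuild once before reading.
    //
    //     let mut map = BufferedHistoricLayerCoverage::new();
    //     map.insert(layer_key_a, layer_a);
    //     map.remove(layer_key_b);
    //     assert!(map.get().is_err()); // buffered updates not applied yet
    //     map.rebuild();
    //     let version = map.get().unwrap().get_version(lsn);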
+ if !self.buffer.is_empty() { + anyhow::bail!("rebuild required") + } + + Ok(&self.historic_coverage) + } +} + +#[test] +fn test_retroactive_regression_1() { + let mut map = BufferedHistoricLayerCoverage::new(); + + map.insert( + LayerKey { + key: 0..21267647932558653966460912964485513215, + lsn: 23761336..23761457, + is_image: false, + }, + "sdfsdfs".to_string(), + ); + + map.rebuild(); + + let version = map.get().unwrap().get_version(23761457).unwrap(); + assert_eq!( + version.delta_coverage.query(100), + Some("sdfsdfs".to_string()) + ); +} + +#[test] +fn test_retroactive_simple() { + let mut map = BufferedHistoricLayerCoverage::new(); + + // Append some images in increasing LSN order + map.insert( + LayerKey { + key: 0..5, + lsn: 100..101, + is_image: true, + }, + "Image 1".to_string(), + ); + map.insert( + LayerKey { + key: 3..9, + lsn: 110..111, + is_image: true, + }, + "Image 2".to_string(), + ); + map.insert( + LayerKey { + key: 4..6, + lsn: 120..121, + is_image: true, + }, + "Image 3".to_string(), + ); + map.insert( + LayerKey { + key: 8..9, + lsn: 120..121, + is_image: true, + }, + "Image 4".to_string(), + ); + + // Add a delta layer out of order + map.insert( + LayerKey { + key: 2..5, + lsn: 105..106, + is_image: true, + }, + "Delta 1".to_string(), + ); + + // Rebuild so we can start querying + map.rebuild(); + + // Query key 4 + let version = map.get().unwrap().get_version(90); + assert!(version.is_none()); + let version = map.get().unwrap().get_version(102).unwrap(); + assert_eq!(version.image_coverage.query(4), Some("Image 1".to_string())); + let version = map.get().unwrap().get_version(107).unwrap(); + assert_eq!(version.image_coverage.query(4), Some("Delta 1".to_string())); + let version = map.get().unwrap().get_version(115).unwrap(); + assert_eq!(version.image_coverage.query(4), Some("Image 2".to_string())); + let version = map.get().unwrap().get_version(125).unwrap(); + assert_eq!(version.image_coverage.query(4), Some("Image 3".to_string())); + + // Remove Image 3 + map.remove(LayerKey { + key: 4..6, + lsn: 120..121, + is_image: true, + }); + map.rebuild(); + + // Check deletion worked + let version = map.get().unwrap().get_version(125).unwrap(); + assert_eq!(version.image_coverage.query(4), Some("Image 2".to_string())); + assert_eq!(version.image_coverage.query(8), Some("Image 4".to_string())); +} diff --git a/pageserver/src/tenant/layer_map/layer_coverage.rs b/pageserver/src/tenant/layer_map/layer_coverage.rs new file mode 100644 index 0000000000..4e3b4516dc --- /dev/null +++ b/pageserver/src/tenant/layer_map/layer_coverage.rs @@ -0,0 +1,154 @@ +use std::ops::Range; + +// TODO the `im` crate has 20x more downloads and also has +// persistent/immutable BTree. It also runs a bit faster but +// results are not the same on some tests. +use rpds::RedBlackTreeMapSync; + +/// Data structure that can efficiently: +/// - find the latest layer by lsn.end at a given key +/// - iterate the latest layers in a key range +/// - insert layers in non-decreasing lsn.start order +/// +/// The struct is parameterized over Value for easier +/// testing, but in practice it's some sort of layer. +pub struct LayerCoverage { + /// For every change in coverage (as we sweep the key space) + /// we store (lsn.end, value). + /// + /// We use an immutable/persistent tree so that we can keep historic + /// versions of this coverage without cloning the whole thing and + /// incurring quadratic memory cost. See HistoricLayerCoverage. 
+ /// + /// We use the Sync version of the map because we want Self to + /// be Sync. Using nonsync might be faster, if we can work with + /// that. + nodes: RedBlackTreeMapSync>, +} + +impl Default for LayerCoverage { + fn default() -> Self { + Self::new() + } +} + +impl LayerCoverage { + pub fn new() -> Self { + Self { + nodes: RedBlackTreeMapSync::default(), + } + } + + /// Helper function to subdivide the key range without changing any values + /// + /// Complexity: O(log N) + fn add_node(&mut self, key: i128) { + let value = match self.nodes.range(..=key).last() { + Some((_, Some(v))) => Some(v.clone()), + Some((_, None)) => None, + None => None, + }; + self.nodes.insert_mut(key, value); + } + + /// Insert a layer. + /// + /// Complexity: worst case O(N), in practice O(log N). See NOTE in implementation. + pub fn insert(&mut self, key: Range, lsn: Range, value: Value) { + // Add nodes at endpoints + // + // NOTE The order of lines is important. We add nodes at the start + // and end of the key range **before updating any nodes** in order + // to pin down the current coverage outside of the relevant key range. + // Only the coverage inside the layer's key range should change. + self.add_node(key.start); + self.add_node(key.end); + + // Raise the height where necessary + // + // NOTE This loop is worst case O(N), but amortized O(log N) in the special + // case when rectangles have no height. In practice I don't think we'll see + // the kind of layer intersections needed to trigger O(N) behavior. The worst + // case is N/2 horizontal layers overlapped with N/2 vertical layers in a + // grid pattern. + let mut to_update = Vec::new(); + let mut to_remove = Vec::new(); + let mut prev_covered = false; + for (k, node) in self.nodes.range(key.clone()) { + let needs_cover = match node { + None => true, + Some((h, _)) => h < &lsn.end, + }; + if needs_cover { + match prev_covered { + true => to_remove.push(*k), + false => to_update.push(*k), + } + } + prev_covered = needs_cover; + } + if !prev_covered { + to_remove.push(key.end); + } + for k in to_update { + self.nodes.insert_mut(k, Some((lsn.end, value.clone()))); + } + for k in to_remove { + self.nodes.remove_mut(&k); + } + } + + /// Get the latest (by lsn.end) layer at a given key + /// + /// Complexity: O(log N) + pub fn query(&self, key: i128) -> Option { + self.nodes + .range(..=key) + .rev() + .next()? + .1 + .as_ref() + .map(|(_, v)| v.clone()) + } + + /// Iterate the changes in layer coverage in a given range. You will likely + /// want to start with self.query(key.start), and then follow up with self.range + /// + /// Complexity: O(log N + result_size) + pub fn range(&self, key: Range) -> impl '_ + Iterator)> { + self.nodes + .range(key) + .map(|(k, v)| (*k, v.as_ref().map(|x| x.1.clone()))) + } + + /// O(1) clone + pub fn clone(&self) -> Self { + Self { + nodes: self.nodes.clone(), + } + } +} + +/// Image and delta coverage at a specific LSN. 
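A small usage sketch of the structure just defined, assuming `LayerCoverage` is in scope; `coverage_example` is an invented name, and the asserts restate the query()/range() semantics documented above:

fn coverage_example() {
    let mut coverage: LayerCoverage<String> = LayerCoverage::new();

    // A wide layer, then a narrower one that ends at a higher LSN.
    coverage.insert(0..10, 100..200, "wide".to_string());
    coverage.insert(3..6, 100..300, "narrow".to_string());

    // query() returns the latest layer (by lsn.end) covering the key.
    assert_eq!(coverage.query(1), Some("wide".to_string()));
    assert_eq!(coverage.query(4), Some("narrow".to_string()));
    assert_eq!(coverage.query(8), Some("wide".to_string()));
    assert_eq!(coverage.query(12), None);

    // range() yields the coverage changes inside a key range.
    let changes: Vec<_> = coverage.range(0..10).collect();
    assert_eq!(changes.len(), 3); // changes at keys 0, 3 and 6
}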
+pub struct LayerCoverageTuple { + pub image_coverage: LayerCoverage, + pub delta_coverage: LayerCoverage, +} + +impl Default for LayerCoverageTuple { + fn default() -> Self { + Self { + image_coverage: LayerCoverage::default(), + delta_coverage: LayerCoverage::default(), + } + } +} + +impl LayerCoverageTuple { + pub fn clone(&self) -> Self { + Self { + image_coverage: self.image_coverage.clone(), + delta_coverage: self.delta_coverage.clone(), + } + } +} diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index dce7cd8bae..a9edee3794 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -16,6 +16,7 @@ use remote_storage::GenericRemoteStorage; use utils::crashsafe; use crate::config::PageServerConf; +use crate::context::{DownloadBehavior, RequestContext}; use crate::task_mgr::{self, TaskKind}; use crate::tenant::config::TenantConfOpt; use crate::tenant::{Tenant, TenantState}; @@ -24,8 +25,35 @@ use crate::IGNORED_TENANT_FILE_NAME; use utils::fs_ext::PathExt; use utils::id::{TenantId, TimelineId}; -static TENANTS: Lazy>>> = - Lazy::new(|| RwLock::new(HashMap::new())); +/// The tenants known to the pageserver. +/// The enum variants are used to distinguish the different states that the pageserver can be in. +enum TenantsMap { + /// [`init_tenant_mgr`] is not done yet. + Initializing, + /// [`init_tenant_mgr`] is done, all on-disk tenants have been loaded. + /// New tenants can be added using [`tenant_map_insert`]. + Open(HashMap>), + /// The pageserver has entered shutdown mode via [`shutdown_all_tenants`]. + /// Existing tenants are still accessible, but no new tenants can be created. + ShuttingDown(HashMap>), +} + +impl TenantsMap { + fn get(&self, tenant_id: &TenantId) -> Option<&Arc> { + match self { + TenantsMap::Initializing => None, + TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m.get(tenant_id), + } + } + fn remove(&mut self, tenant_id: &TenantId) -> Option> { + match self { + TenantsMap::Initializing => None, + TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m.remove(tenant_id), + } + } +} + +static TENANTS: Lazy> = Lazy::new(|| RwLock::new(TenantsMap::Initializing)); /// Initialize repositories with locally available timelines. 
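To make the state machine concrete: a hypothetical module-internal helper (the name `peek_tenant_state` is invented) showing how a lookup behaves in each of the three states; it follows the same match shape as `TenantsMap::get` above:

async fn peek_tenant_state(tenant_id: TenantId) -> Option<TenantState> {
    let tenants = TENANTS.read().await;
    match &*tenants {
        // Before init_tenant_mgr() finishes, no tenant is visible at all.
        TenantsMap::Initializing => None,
        // Open and ShuttingDown keep existing entries visible, which is what
        // keeps the management API answering during shutdown.
        TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => {
            m.get(&tenant_id).map(|tenant| tenant.current_state())
        }
    }
}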
/// Timelines that are only partially available locally (remote storage has more data than this pageserver) @@ -36,13 +64,16 @@ pub async fn init_tenant_mgr( remote_storage: Option, ) -> anyhow::Result<()> { // Scan local filesystem for attached tenants - let mut number_of_tenants = 0; let tenants_dir = conf.tenants_path(); + let mut tenants = HashMap::new(); + let mut dir_entries = fs::read_dir(&tenants_dir) .await .with_context(|| format!("Failed to list tenants dir {tenants_dir:?}"))?; + let ctx = RequestContext::todo_child(TaskKind::Startup, DownloadBehavior::Warn); + loop { match dir_entries.next_entry().await { Ok(None) => break, @@ -86,10 +117,10 @@ pub async fn init_tenant_mgr( conf, &tenant_dir_path, remote_storage.clone(), + &ctx, ) { Ok(tenant) => { - TENANTS.write().await.insert(tenant.tenant_id(), tenant); - number_of_tenants += 1; + tenants.insert(tenant.tenant_id(), tenant); } Err(e) => { error!("Failed to collect tenant files from dir {tenants_dir:?} for entry {dir_entry:?}, reason: {e:#}"); @@ -108,7 +139,11 @@ pub async fn init_tenant_mgr( } } - info!("Processed {number_of_tenants} local tenants at startup"); + info!("Processed {} local tenants at startup", tenants.len()); + + let mut tenants_map = TENANTS.write().await; + assert!(matches!(&*tenants_map, &TenantsMap::Initializing)); + *tenants_map = TenantsMap::Open(tenants); Ok(()) } @@ -116,6 +151,7 @@ pub fn schedule_local_tenant_processing( conf: &'static PageServerConf, tenant_path: &Path, remote_storage: Option, + ctx: &RequestContext, ) -> anyhow::Result> { anyhow::ensure!( tenant_path.is_dir(), @@ -150,7 +186,7 @@ pub fn schedule_local_tenant_processing( let tenant = if conf.tenant_attaching_mark_file_path(&tenant_id).exists() { info!("tenant {tenant_id} has attaching mark file, resuming its attach operation"); if let Some(remote_storage) = remote_storage { - Tenant::spawn_attach(conf, tenant_id, remote_storage) + Tenant::spawn_attach(conf, tenant_id, remote_storage, ctx) } else { warn!("tenant {tenant_id} has attaching mark file, but pageserver has no remote storage configured"); Tenant::create_broken_tenant(conf, tenant_id) @@ -158,7 +194,7 @@ pub fn schedule_local_tenant_processing( } else { info!("tenant {tenant_id} is assumed to be loadable, starting load operation"); // Start loading the tenant into memory. It will initially be in Loading state. - Tenant::spawn_load(conf, tenant_id, remote_storage) + Tenant::spawn_load(conf, tenant_id, remote_storage, ctx) }; Ok(tenant) } @@ -166,21 +202,44 @@ pub fn schedule_local_tenant_processing( /// /// Shut down all tenants. This runs as part of pageserver shutdown. /// +/// NB: We leave the tenants in the map, so that they remain accessible through +/// the management API until we shut it down. If we removed the shut-down tenants +/// from the tenants map, the management API would return 404 for these tenants, +/// because TenantsMap::get() now returns `None`. +/// That could be easily misinterpreted by control plane, the consumer of the +/// management API. For example, it could attach the tenant on a different pageserver. +/// We would then be in split-brain once this pageserver restarts. pub async fn shutdown_all_tenants() { + // Prevent new tenants from being created. 
let tenants_to_shut_down = { let mut m = TENANTS.write().await; - let mut tenants_to_shut_down = Vec::with_capacity(m.len()); - for (_, tenant) in m.drain() { - if tenant.is_active() { - // updates tenant state, forbidding new GC and compaction iterations from starting - tenant.set_stopping(); - tenants_to_shut_down.push(tenant) + match &mut *m { + TenantsMap::Initializing => { + *m = TenantsMap::ShuttingDown(HashMap::default()); + info!("tenants map is empty"); + return; + } + TenantsMap::Open(tenants) => { + let tenants_clone = tenants.clone(); + *m = TenantsMap::ShuttingDown(std::mem::take(tenants)); + tenants_clone + } + TenantsMap::ShuttingDown(_) => { + error!("already shutting down, this function isn't supposed to be called more than once"); + return; } } - drop(m); - tenants_to_shut_down }; + let mut tenants_to_freeze_and_flush = Vec::with_capacity(tenants_to_shut_down.len()); + for (_, tenant) in tenants_to_shut_down { + if tenant.is_active() { + // updates tenant state, forbidding new GC and compaction iterations from starting + tenant.set_stopping(); + tenants_to_freeze_and_flush.push(tenant); + } + } + // Shut down all existing walreceiver connections and stop accepting the new ones. task_mgr::shutdown_tasks(Some(TaskKind::WalReceiverManager), None, None).await; @@ -192,7 +251,7 @@ pub async fn shutdown_all_tenants() { // should be no more activity in any of the repositories. // // On error, log it but continue with the shutdown for other tenants. - for tenant in tenants_to_shut_down { + for tenant in tenants_to_freeze_and_flush { let tenant_id = tenant.tenant_id(); debug!("shutdown tenant {tenant_id}"); @@ -207,27 +266,23 @@ pub async fn create_tenant( tenant_conf: TenantConfOpt, tenant_id: TenantId, remote_storage: Option, -) -> anyhow::Result>> { - match TENANTS.write().await.entry(tenant_id) { - hash_map::Entry::Occupied(_) => { - debug!("tenant {tenant_id} already exists"); - Ok(None) - } - hash_map::Entry::Vacant(v) => { - // Hold the write_tenants() lock, since all of this is local IO. - // If this section ever becomes contentious, introduce a new `TenantState::Creating`. - let tenant_directory = super::create_tenant_files(conf, tenant_conf, tenant_id)?; - let created_tenant = - schedule_local_tenant_processing(conf, &tenant_directory, remote_storage)?; - let crated_tenant_id = created_tenant.tenant_id(); - anyhow::ensure!( + ctx: &RequestContext, +) -> Result, TenantMapInsertError> { + tenant_map_insert(tenant_id, |vacant_entry| { + // We're holding the tenants lock in write mode while doing local IO. + // If this section ever becomes contentious, introduce a new `TenantState::Creating` + // and do the work in that state. + let tenant_directory = super::create_tenant_files(conf, tenant_conf, tenant_id)?; + let created_tenant = + schedule_local_tenant_processing(conf, &tenant_directory, remote_storage, ctx)?; + let crated_tenant_id = created_tenant.tenant_id(); + anyhow::ensure!( tenant_id == crated_tenant_id, "loaded created tenant has unexpected tenant id (expect {tenant_id} != actual {crated_tenant_id})", ); - v.insert(Arc::clone(&created_tenant)); - Ok(Some(created_tenant)) - } - } + vacant_entry.insert(Arc::clone(&created_tenant)); + Ok(created_tenant) + }).await } pub async fn update_tenant_config( @@ -236,10 +291,11 @@ pub async fn update_tenant_config( tenant_id: TenantId, ) -> anyhow::Result<()> { info!("configuring tenant {tenant_id}"); - get_tenant(tenant_id, true) - .await? 
- .update_tenant_config(tenant_conf); - Tenant::persist_tenant_config(&conf.tenant_config_path(tenant_id), tenant_conf, false)?; + let tenant = get_tenant(tenant_id, true).await?; + + tenant.update_tenant_config(tenant_conf); + let tenant_config_path = conf.tenant_config_path(tenant_id); + Tenant::persist_tenant_config(&tenant.tenant_id(), &tenant_config_path, tenant_conf, false)?; Ok(()) } @@ -260,10 +316,14 @@ pub async fn get_tenant(tenant_id: TenantId, active_only: bool) -> anyhow::Resul } } -pub async fn delete_timeline(tenant_id: TenantId, timeline_id: TimelineId) -> anyhow::Result<()> { +pub async fn delete_timeline( + tenant_id: TenantId, + timeline_id: TimelineId, + ctx: &RequestContext, +) -> anyhow::Result<()> { match get_tenant(tenant_id, true).await { Ok(tenant) => { - tenant.delete_timeline(timeline_id).await?; + tenant.delete_timeline(timeline_id, ctx).await?; } Err(e) => anyhow::bail!("Cannot access tenant {tenant_id} in local tenant state: {e:?}"), } @@ -291,8 +351,9 @@ pub async fn load_tenant( conf: &'static PageServerConf, tenant_id: TenantId, remote_storage: Option, -) -> anyhow::Result<()> { - run_if_no_tenant_in_memory(tenant_id, |vacant_entry| { + ctx: &RequestContext, +) -> Result<(), TenantMapInsertError> { + tenant_map_insert(tenant_id, |vacant_entry| { let tenant_path = conf.tenant_path(&tenant_id); let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(tenant_id); if tenant_ignore_mark.exists() { @@ -300,7 +361,7 @@ pub async fn load_tenant( .with_context(|| format!("Failed to remove tenant ignore mark {tenant_ignore_mark:?} during tenant loading"))?; } - let new_tenant = schedule_local_tenant_processing(conf, &tenant_path, remote_storage) + let new_tenant = schedule_local_tenant_processing(conf, &tenant_path, remote_storage, ctx) .with_context(|| { format!("Failed to schedule tenant processing in path {tenant_path:?}") })?; @@ -329,16 +390,24 @@ pub async fn ignore_tenant( .await } +#[derive(Debug, thiserror::Error)] +pub enum TenantMapListError { + #[error("tenant map is still initiailizing")] + Initializing, +} + /// /// Get list of tenants, for the mgmt API /// -pub async fn list_tenants() -> Vec<(TenantId, TenantState)> { - TENANTS - .read() - .await - .iter() +pub async fn list_tenants() -> Result, TenantMapListError> { + let tenants = TENANTS.read().await; + let m = match &*tenants { + TenantsMap::Initializing => return Err(TenantMapListError::Initializing), + TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m, + }; + Ok(m.iter() .map(|(id, tenant)| (*id, tenant.current_state())) - .collect() + .collect()) } /// Execute Attach mgmt API command. 
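A sketch of a caller of the new `list_tenants`; `print_tenants` is a hypothetical function, but the signature and error variant are the ones introduced above:

async fn print_tenants() {
    match list_tenants().await {
        Ok(tenants) => {
            for (tenant_id, state) in tenants {
                println!("{tenant_id}: {state:?}");
            }
        }
        // Previously this case was indistinguishable from "no tenants on disk".
        Err(TenantMapListError::Initializing) => {
            println!("tenant manager is still initializing, try again later");
        }
    }
}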
@@ -349,34 +418,62 @@ pub async fn attach_tenant( conf: &'static PageServerConf, tenant_id: TenantId, remote_storage: GenericRemoteStorage, -) -> anyhow::Result<()> { - run_if_no_tenant_in_memory(tenant_id, |vacant_entry| { + ctx: &RequestContext, +) -> Result<(), TenantMapInsertError> { + tenant_map_insert(tenant_id, |vacant_entry| { let tenant_path = conf.tenant_path(&tenant_id); anyhow::ensure!( !tenant_path.exists(), "Cannot attach tenant {tenant_id}, local tenant directory already exists" ); - let tenant = Tenant::spawn_attach(conf, tenant_id, remote_storage); + let tenant = Tenant::spawn_attach(conf, tenant_id, remote_storage, ctx); vacant_entry.insert(tenant); - Ok(()) }) .await } -async fn run_if_no_tenant_in_memory(tenant_id: TenantId, run: F) -> anyhow::Result +#[derive(Debug, thiserror::Error)] +pub enum TenantMapInsertError { + #[error("tenant map is still initializing")] + StillInitializing, + #[error("tenant map is shutting down")] + ShuttingDown, + #[error("tenant {0} already exists, state: {1:?}")] + TenantAlreadyExists(TenantId, TenantState), + #[error(transparent)] + Closure(#[from] anyhow::Error), +} + +/// Give the given closure access to the tenants map entry for the given `tenant_id`, iff that +/// entry is vacant. The closure is responsible for creating the tenant object and inserting +/// it into the tenants map through the vacnt entry that it receives as argument. +/// +/// NB: the closure should return quickly because the current implementation of tenants map +/// serializes access through an `RwLock`. +async fn tenant_map_insert( + tenant_id: TenantId, + insert_fn: F, +) -> Result where F: FnOnce(hash_map::VacantEntry>) -> anyhow::Result, { - match TENANTS.write().await.entry(tenant_id) { - hash_map::Entry::Occupied(e) => { - anyhow::bail!( - "tenant {tenant_id} already exists, state: {:?}", - e.get().current_state() - ) - } - hash_map::Entry::Vacant(v) => run(v), + let mut guard = TENANTS.write().await; + let m = match &mut *guard { + TenantsMap::Initializing => return Err(TenantMapInsertError::StillInitializing), + TenantsMap::ShuttingDown(_) => return Err(TenantMapInsertError::ShuttingDown), + TenantsMap::Open(m) => m, + }; + match m.entry(tenant_id) { + hash_map::Entry::Occupied(e) => Err(TenantMapInsertError::TenantAlreadyExists( + tenant_id, + e.get().current_state(), + )), + hash_map::Entry::Vacant(v) => match insert_fn(v) { + Ok(v) => Ok(v), + Err(e) => Err(TenantMapInsertError::Closure(e)), + }, } } @@ -449,9 +546,9 @@ pub async fn immediate_gc( tenant_id: TenantId, timeline_id: TimelineId, gc_req: TimelineGcRequest, + ctx: &RequestContext, ) -> Result>, ApiError> { let guard = TENANTS.read().await; - let tenant = guard .get(&tenant_id) .map(Arc::clone) @@ -462,7 +559,8 @@ pub async fn immediate_gc( // Use tenant's pitr setting let pitr = tenant.get_pitr_interval(); - // Run in task_mgr to avoid race with detach operation + // Run in task_mgr to avoid race with tenant_detach operation + let ctx = ctx.detached_child(TaskKind::GarbageCollector, DownloadBehavior::Download); let (task_done, wait_task_done) = tokio::sync::oneshot::channel(); task_mgr::spawn( &tokio::runtime::Handle::current(), @@ -474,7 +572,7 @@ pub async fn immediate_gc( async move { fail::fail_point!("immediate_gc_task_pre"); let result = tenant - .gc_iteration(Some(timeline_id), gc_horizon, pitr) + .gc_iteration(Some(timeline_id), gc_horizon, pitr, &ctx) .instrument(info_span!("manual_gc", tenant = %tenant_id, timeline = %timeline_id)) .await; // FIXME: `gc_iteration` can return an error 
for multiple reasons; we should handle it @@ -497,6 +595,7 @@ pub async fn immediate_gc( pub async fn immediate_compact( tenant_id: TenantId, timeline_id: TimelineId, + ctx: &RequestContext, ) -> Result>, ApiError> { let guard = TENANTS.read().await; @@ -510,7 +609,8 @@ pub async fn immediate_compact( .get_timeline(timeline_id, true) .map_err(ApiError::NotFound)?; - // Run in task_mgr to avoid race with detach operation + // Run in task_mgr to avoid race with tenant_detach operation + let ctx = ctx.detached_child(TaskKind::Compaction, DownloadBehavior::Download); let (task_done, wait_task_done) = tokio::sync::oneshot::channel(); task_mgr::spawn( &tokio::runtime::Handle::current(), @@ -523,7 +623,7 @@ pub async fn immediate_compact( false, async move { let result = timeline - .compact() + .compact(&ctx) .instrument( info_span!("manual_compact", tenant = %tenant_id, timeline = %timeline_id), ) diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 013591caee..3f69017160 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -1010,7 +1010,10 @@ impl RemoteTimelineClient { #[cfg(test)] mod tests { use super::*; - use crate::tenant::harness::{TenantHarness, TIMELINE_ID}; + use crate::{ + tenant::harness::{TenantHarness, TIMELINE_ID}, + DEFAULT_PG_VERSION, + }; use remote_storage::{RemoteStorageConfig, RemoteStorageKind}; use std::{collections::HashSet, path::Path}; use utils::lsn::Lsn; @@ -1064,9 +1067,19 @@ mod tests { // Test scheduling #[test] fn upload_scheduling() -> anyhow::Result<()> { + // Use a current-thread runtime in the test + let runtime = Box::leak(Box::new( + tokio::runtime::Builder::new_current_thread() + .enable_all() + .build()?, + )); + let _entered = runtime.enter(); + let harness = TenantHarness::create("upload_scheduling")?; + let (tenant, ctx) = runtime.block_on(harness.load()); + let _timeline = + tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; let timeline_path = harness.timeline_path(&TIMELINE_ID); - std::fs::create_dir_all(&timeline_path)?; let remote_fs_dir = harness.conf.workdir.join("remote_fs"); std::fs::create_dir_all(remote_fs_dir)?; @@ -1084,14 +1097,6 @@ mod tests { storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()), }; - // Use a current-thread runtime in the test - let runtime = Box::leak(Box::new( - tokio::runtime::Builder::new_current_thread() - .enable_all() - .build()?, - )); - let _entered = runtime.enter(); - // Test outline: // // Schedule upload of a bunch of layers. Check that they are started immediately, not queued diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs index 61cb32fc76..2fed4f88b3 100644 --- a/pageserver/src/tenant/size.rs +++ b/pageserver/src/tenant/size.rs @@ -6,6 +6,7 @@ use anyhow::Context; use tokio::sync::oneshot::error::RecvError; use tokio::sync::Semaphore; +use crate::context::RequestContext; use crate::pgdatadir_mapping::CalculateLogicalSizeError; use super::Tenant; @@ -181,6 +182,7 @@ pub(super) async fn gather_inputs( tenant: &Tenant, limit: &Arc, logical_size_cache: &mut HashMap<(TimelineId, Lsn), u64>, + ctx: &RequestContext, ) -> anyhow::Result { // with joinset, on drop, all of the tasks will just be de-scheduled, which we can use to // our advantage with `?` error handling. 
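Returning briefly to the `TenantMapInsertError` introduced for `tenant_map_insert` above: a hypothetical helper (the name `describe_insert_error` is invented, and any HTTP status mapping is left out) showing how a caller might fold the variants into a user-facing message:

fn describe_insert_error(err: &TenantMapInsertError) -> String {
    match err {
        // Both of these mean "try again later": the map is not accepting inserts.
        TenantMapInsertError::StillInitializing | TenantMapInsertError::ShuttingDown => {
            format!("pageserver is not accepting new tenants: {err}")
        }
        TenantMapInsertError::TenantAlreadyExists(tenant_id, state) => {
            format!("tenant {tenant_id} already exists, state: {state:?}")
        }
        // Errors returned by the insert closure itself (e.g. local IO failures).
        TenantMapInsertError::Closure(e) => format!("could not insert tenant: {e:#}"),
    }
}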
@@ -188,7 +190,7 @@ pub(super) async fn gather_inputs( // refresh is needed to update gc related pitr_cutoff and horizon_cutoff tenant - .refresh_gc_info() + .refresh_gc_info(ctx) .await .context("Failed to refresh gc_info before gathering inputs")?; @@ -329,7 +331,13 @@ pub(super) async fn gather_inputs( } else { let timeline = Arc::clone(&timeline); let parallel_size_calcs = Arc::clone(limit); - joinset.spawn(calculate_logical_size(parallel_size_calcs, timeline, *lsn)); + let ctx = ctx.attached_child(); + joinset.spawn(calculate_logical_size( + parallel_size_calcs, + timeline, + *lsn, + ctx, + )); } } @@ -387,6 +395,7 @@ pub(super) async fn gather_inputs( parallel_size_calcs, timeline.clone(), lsn, + ctx.attached_child(), )); if let Some(parent_id) = timeline.get_ancestor_timeline_id() { @@ -582,13 +591,14 @@ async fn calculate_logical_size( limit: Arc, timeline: Arc, lsn: utils::lsn::Lsn, + ctx: RequestContext, ) -> Result { let _permit = tokio::sync::Semaphore::acquire_owned(limit) .await .expect("global semaphore should not had been closed"); let size_res = timeline - .spawn_ondemand_logical_size_calculation(lsn) + .spawn_ondemand_logical_size_calculation(lsn, ctx) .instrument(info_span!("spawn_ondemand_logical_size_calculation")) .await?; Ok(TimelineAtLsnSizeResult(timeline, lsn, size_res)) diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 6aee8ce23c..2149fc7eb7 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -196,3 +196,50 @@ pub fn downcast_remote_layer( None } } + +impl std::fmt::Debug for dyn Layer { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("Layer") + .field("short_id", &self.short_id()) + .finish() + } +} + +/// Holds metadata about a layer without any content. Used mostly for testing. +pub struct LayerDescriptor { + pub key: Range, + pub lsn: Range, + pub is_incremental: bool, + pub short_id: String, +} + +impl Layer for LayerDescriptor { + fn get_key_range(&self) -> Range { + self.key.clone() + } + + fn get_lsn_range(&self) -> Range { + self.lsn.clone() + } + + fn is_incremental(&self) -> bool { + self.is_incremental + } + + fn get_value_reconstruct_data( + &self, + _key: Key, + _lsn_range: Range, + _reconstruct_data: &mut ValueReconstructState, + ) -> Result { + todo!("This method shouldn't be part of the Layer trait") + } + + fn short_id(&self) -> String { + self.short_id.clone() + } + + fn dump(&self, _verbose: bool) -> Result<()> { + todo!() + } +} diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index b7ad8fe791..b126545ee4 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -5,6 +5,7 @@ use std::ops::ControlFlow; use std::sync::Arc; use std::time::Duration; +use crate::context::{DownloadBehavior, RequestContext}; use crate::metrics::TENANT_TASK_EVENTS; use crate::task_mgr; use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME}; @@ -52,19 +53,20 @@ async fn compaction_loop(tenant_id: TenantId) { info!("starting"); TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); async { + let ctx = RequestContext::todo_child(TaskKind::Compaction, DownloadBehavior::Download); loop { trace!("waking up"); let tenant = tokio::select! 
{ _ = task_mgr::shutdown_watcher() => { info!("received cancellation request"); - return; + return; }, tenant_wait_result = wait_for_active_tenant(tenant_id, wait_duration) => match tenant_wait_result { ControlFlow::Break(()) => return, ControlFlow::Continue(tenant) => tenant, }, - }; + }; let mut sleep_duration = tenant.get_compaction_period(); if sleep_duration == Duration::ZERO { @@ -73,7 +75,7 @@ async fn compaction_loop(tenant_id: TenantId) { sleep_duration = Duration::from_secs(10); } else { // Run compaction - if let Err(e) = tenant.compaction_iteration().await { + if let Err(e) = tenant.compaction_iteration(&ctx).await { sleep_duration = wait_duration; error!("Compaction failed, retrying in {:?}: {e:?}", sleep_duration); } @@ -103,6 +105,9 @@ async fn gc_loop(tenant_id: TenantId) { info!("starting"); TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); async { + // GC might require downloading, to find the cutoff LSN that corresponds to the + // cutoff specified as time. + let ctx = RequestContext::todo_child(TaskKind::GarbageCollector, DownloadBehavior::Download); loop { trace!("waking up"); @@ -127,7 +132,7 @@ async fn gc_loop(tenant_id: TenantId) { } else { // Run gc if gc_horizon > 0 { - if let Err(e) = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval()).await + if let Err(e) = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &ctx).await { sleep_duration = wait_duration; error!("Gc failed, retrying in {:?}: {e:?}", sleep_duration); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index d59858f582..0ca8a0e491 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1,5 +1,7 @@ //! +mod walreceiver; + use anyhow::{anyhow, bail, ensure, Context}; use bytes::Bytes; use fail::fail_point; @@ -13,6 +15,7 @@ use pageserver_api::models::{ use tokio::sync::{oneshot, watch, Semaphore, TryAcquireError}; use tokio_util::sync::CancellationToken; use tracing::*; +use utils::id::TenantTimelineId; use std::cmp::{max, min, Ordering}; use std::collections::HashMap; @@ -23,6 +26,8 @@ use std::sync::atomic::{AtomicI64, Ordering as AtomicOrdering}; use std::sync::{Arc, Mutex, MutexGuard, RwLock, Weak}; use std::time::{Duration, Instant, SystemTime}; +use crate::broker_client::is_broker_client_initialized; +use crate::context::{DownloadBehavior, RequestContext}; use crate::tenant::remote_timeline_client::{self, index::LayerFileMetadata}; use crate::tenant::storage_layer::{ DeltaFileName, DeltaLayerWriter, ImageFileName, ImageLayerWriter, InMemoryLayer, LayerFileName, @@ -58,11 +63,11 @@ use crate::page_cache; use crate::repository::GcResult; use crate::repository::{Key, Value}; use crate::task_mgr::TaskKind; -use crate::walreceiver::{is_broker_client_initialized, spawn_connection_manager_task}; use crate::walredo::WalRedoManager; use crate::METADATA_FILE_NAME; use crate::ZERO_PAGE; use crate::{is_temporary, task_mgr}; +use walreceiver::spawn_connection_manager_task; use super::remote_timeline_client::index::IndexPart; use super::remote_timeline_client::RemoteTimelineClient; @@ -128,7 +133,6 @@ pub struct Timeline { ancestor_timeline: Option>, ancestor_lsn: Lsn, - // Metrics metrics: TimelineMetrics, /// Ensures layers aren't frozen by checkpointer between @@ -377,6 +381,12 @@ pub enum PageReconstructError { #[error(transparent)] Other(#[from] anyhow::Error), // source and Display delegate to anyhow::Error + /// The operation would require downloading a layer that is missing locally. 
+ NeedsDownload(TenantTimelineId, LayerFileName), + + /// The operation was cancelled + Cancelled, + /// An error happened replaying WAL records #[error(transparent)] WalRedo(#[from] crate::walredo::WalRedoError), @@ -386,6 +396,33 @@ impl std::fmt::Debug for PageReconstructError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> { match self { Self::Other(err) => err.fmt(f), + Self::NeedsDownload(tenant_timeline_id, layer_file_name) => { + write!( + f, + "layer {}/{} needs download", + tenant_timeline_id, + layer_file_name.file_name() + ) + } + Self::Cancelled => write!(f, "cancelled"), + Self::WalRedo(err) => err.fmt(f), + } + } +} + +impl std::fmt::Display for PageReconstructError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> { + match self { + Self::Other(err) => err.fmt(f), + Self::NeedsDownload(tenant_timeline_id, layer_file_name) => { + write!( + f, + "layer {}/{} needs download", + tenant_timeline_id, + layer_file_name.file_name() + ) + } + Self::Cancelled => write!(f, "cancelled"), Self::WalRedo(err) => err.fmt(f), } } @@ -422,11 +459,24 @@ impl Timeline { /// an ancestor branch, for example, or waste a lot of cycles chasing the /// non-existing key. /// - pub async fn get(&self, key: Key, lsn: Lsn) -> Result { + pub async fn get( + &self, + key: Key, + lsn: Lsn, + ctx: &RequestContext, + ) -> Result { if !lsn.is_valid() { return Err(PageReconstructError::Other(anyhow::anyhow!("Invalid LSN"))); } + // XXX: structured stats collection for layer eviction here. + trace!( + "get page request for {}@{} from task kind {:?}", + key, + lsn, + ctx.task_kind() + ); + // Check the page cache. We will get back the most recent page with lsn <= `lsn`. // The cached image can be returned directly if there is no WAL between the cached image // and requested LSN. The cached image can also be used to reduce the amount of WAL needed @@ -450,7 +500,7 @@ impl Timeline { img: cached_page_img, }; - self.get_reconstruct_data(key, lsn, &mut reconstruct_state) + self.get_reconstruct_data(key, lsn, &mut reconstruct_state, ctx) .await?; self.metrics @@ -513,13 +563,25 @@ impl Timeline { /// You should call this before any of the other get_* or list_* functions. Calling /// those functions with an LSN that has been processed yet is an error. /// - pub async fn wait_lsn(&self, lsn: Lsn) -> anyhow::Result<()> { + pub async fn wait_lsn( + &self, + lsn: Lsn, + _ctx: &RequestContext, /* Prepare for use by cancellation */ + ) -> anyhow::Result<()> { anyhow::ensure!(self.is_active(), "Cannot wait for Lsn on inactive timeline"); // This should never be called from the WAL receiver, because that could lead // to a deadlock. 
anyhow::ensure!( - task_mgr::current_task_kind() != Some(TaskKind::WalReceiverConnection), + task_mgr::current_task_kind() != Some(TaskKind::WalReceiverManager), + "wait_lsn cannot be called in WAL receiver" + ); + anyhow::ensure!( + task_mgr::current_task_kind() != Some(TaskKind::WalReceiverConnectionHandler), + "wait_lsn cannot be called in WAL receiver" + ); + anyhow::ensure!( + task_mgr::current_task_kind() != Some(TaskKind::WalReceiverConnectionPoller), "wait_lsn cannot be called in WAL receiver" ); @@ -558,7 +620,7 @@ impl Timeline { self.flush_frozen_layers_and_wait().await } - pub async fn compact(&self) -> anyhow::Result<()> { + pub async fn compact(&self, ctx: &RequestContext) -> anyhow::Result<()> { let last_record_lsn = self.get_last_record_lsn(); // Last record Lsn could be zero in case the timeline was just created @@ -616,14 +678,16 @@ impl Timeline { .repartition( self.get_last_record_lsn(), self.get_compaction_target_size(), + ctx, ) .await { Ok((partitioning, lsn)) => { // 2. Create new image layers for partitions that have been modified // "enough". - let layer_paths_to_upload = - self.create_image_layers(&partitioning, lsn, false).await?; + let layer_paths_to_upload = self + .create_image_layers(&partitioning, lsn, false, ctx) + .await?; if let Some(remote_client) = &self.remote_client { for (path, layer_metadata) in layer_paths_to_upload { remote_client.schedule_layer_file_upload(&path, &layer_metadata)?; @@ -673,7 +737,10 @@ impl Timeline { /// the initial size calculation has not been run (gets triggered on the first size access). /// /// return size and boolean flag that shows if the size is exact - pub fn get_current_logical_size(self: &Arc) -> anyhow::Result<(u64, bool)> { + pub fn get_current_logical_size( + self: &Arc, + ctx: &RequestContext, + ) -> anyhow::Result<(u64, bool)> { let current_size = self.current_logical_size.current_size()?; debug!("Current size: {current_size:?}"); @@ -683,7 +750,7 @@ impl Timeline { (current_size, self.current_logical_size.initial_part_end) { is_exact = false; - self.try_spawn_size_init_task(init_lsn); + self.try_spawn_size_init_task(init_lsn, ctx); } Ok((size, is_exact)) @@ -729,16 +796,24 @@ impl Timeline { Ok(()) } + pub fn activate(self: &Arc) { + self.set_state(TimelineState::Active); + self.launch_wal_receiver(); + } + pub fn set_state(&self, new_state: TimelineState) { match (self.current_state(), new_state) { (equal_state_1, equal_state_2) if equal_state_1 == equal_state_2 => { - debug!("Ignoring new state, equal to the existing one: {equal_state_2:?}"); + warn!("Ignoring new state, equal to the existing one: {equal_state_2:?}"); + } + (st, TimelineState::Loading) => { + error!("ignoring transition from {st:?} into Loading state"); } (TimelineState::Broken, _) => { error!("Ignoring state update {new_state:?} for broken tenant"); } (TimelineState::Stopping, TimelineState::Active) => { - debug!("Not activating a Stopping timeline"); + error!("Not activating a Stopping timeline"); } (_, new_state) => { self.state.send_replace(new_state); @@ -812,7 +887,7 @@ impl Timeline { pg_version: u32, ) -> Arc { let disk_consistent_lsn = metadata.disk_consistent_lsn(); - let (state, _) = watch::channel(TimelineState::Suspended); + let (state, _) = watch::channel(TimelineState::Loading); let (layer_flush_start_tx, _) = tokio::sync::watch::channel(0); let (layer_flush_done_tx, _) = tokio::sync::watch::channel((0, Ok(()))); @@ -884,6 +959,10 @@ impl Timeline { }; result.repartition_threshold = result.get_checkpoint_distance() / 10; result 
+ .metrics + .last_record_gauge + .set(disk_consistent_lsn.0 as i64); + result }) } @@ -909,22 +988,25 @@ impl Timeline { let layer_flush_start_rx = self.layer_flush_start_tx.subscribe(); let self_clone = Arc::clone(self); + info!("spawning flush loop"); task_mgr::spawn( - task_mgr::BACKGROUND_RUNTIME.handle(), - task_mgr::TaskKind::LayerFlushTask, - Some(self.tenant_id), - Some(self.timeline_id), - "layer flush task", - false, - async move { - self_clone.flush_loop(layer_flush_start_rx).await; - let mut flush_loop_state = self_clone.flush_loop_state.lock().unwrap(); - assert_eq!(*flush_loop_state, FlushLoopState::Running); - *flush_loop_state = FlushLoopState::Exited; - Ok(()) } - .instrument(info_span!(parent: None, "layer flush task", tenant = %self.tenant_id, timeline = %self.timeline_id)) - ); + task_mgr::BACKGROUND_RUNTIME.handle(), + task_mgr::TaskKind::LayerFlushTask, + Some(self.tenant_id), + Some(self.timeline_id), + "layer flush task", + false, + async move { + let background_ctx = RequestContext::todo_child(TaskKind::LayerFlushTask, DownloadBehavior::Error); + self_clone.flush_loop(layer_flush_start_rx, &background_ctx).await; + let mut flush_loop_state = self_clone.flush_loop_state.lock().unwrap(); + assert_eq!(*flush_loop_state, FlushLoopState::Running); + *flush_loop_state = FlushLoopState::Exited; + Ok(()) + } + .instrument(info_span!(parent: None, "layer flush task", tenant = %self.tenant_id, timeline = %self.timeline_id)) + ); *flush_loop_state = FlushLoopState::Running; } @@ -955,12 +1037,16 @@ impl Timeline { .unwrap_or(self.conf.default_tenant_conf.max_lsn_wal_lag); drop(tenant_conf_guard); let self_clone = Arc::clone(self); + let background_ctx = + // XXX: this is a detached_child. Plumb through the ctx from call sites. + RequestContext::todo_child(TaskKind::WalReceiverManager, DownloadBehavior::Error); spawn_connection_manager_task( self_clone, walreceiver_connect_timeout, lagging_wal_timeout, max_lsn_wal_lag, crate::config::SAFEKEEPER_AUTH_TOKEN.get().cloned(), + background_ctx, ); } @@ -970,6 +1056,7 @@ impl Timeline { /// pub(super) fn load_layer_map(&self, disk_consistent_lsn: Lsn) -> anyhow::Result<()> { let mut layers = self.layers.write().unwrap(); + let mut updates = layers.batch_update(); let mut num_layers = 0; let timer = self.metrics.load_layer_map_histo.start_timer(); @@ -1010,7 +1097,7 @@ impl Timeline { trace!("found layer {}", layer.path().display()); total_physical_size += file_size; - layers.insert_historic(Arc::new(layer)); + updates.insert_historic(Arc::new(layer)); num_layers += 1; } else if let Some(deltafilename) = DeltaFileName::parse_str(&fname) { // Create a DeltaLayer struct for each delta file. @@ -1041,7 +1128,7 @@ impl Timeline { trace!("found layer {}", layer.path().display()); total_physical_size += file_size; - layers.insert_historic(Arc::new(layer)); + updates.insert_historic(Arc::new(layer)); num_layers += 1; } else if fname == METADATA_FILE_NAME || fname.ends_with(".old") { // ignore these @@ -1067,6 +1154,7 @@ impl Timeline { } } + updates.flush(); layers.next_open_layer_at = Some(Lsn(disk_consistent_lsn.0) + 1); info!( @@ -1091,6 +1179,11 @@ impl Timeline { // Are we missing some files that are present in remote storage? // Create RemoteLayer instances for them. let mut local_only_layers = local_layers; + + // We're holding a layer map lock for a while but this + // method is only called during init so it's fine. 
+ let mut layer_map = self.layers.write().unwrap(); + let mut updates = layer_map.batch_update(); for remote_layer_name in &index_part.timeline_layers { let local_layer = local_only_layers.remove(remote_layer_name); @@ -1129,7 +1222,7 @@ impl Timeline { anyhow::bail!("could not rename file {local_layer_path:?}: {err:?}"); } else { self.metrics.resident_physical_size_gauge.sub(local_size); - self.layers.write().unwrap().remove_historic(local_layer); + updates.remove_historic(local_layer); // fall-through to adding the remote layer } } else { @@ -1171,7 +1264,7 @@ impl Timeline { ); let remote_layer = Arc::new(remote_layer); - self.layers.write().unwrap().insert_historic(remote_layer); + updates.insert_historic(remote_layer); } LayerFileName::Delta(deltafilename) => { // Create a RemoteLayer for the delta file. @@ -1194,13 +1287,14 @@ impl Timeline { &remote_layer_metadata, ); let remote_layer = Arc::new(remote_layer); - self.layers.write().unwrap().insert_historic(remote_layer); + updates.insert_historic(remote_layer); } #[cfg(test)] LayerFileName::Test(_) => unreachable!(), } } + updates.flush(); Ok(local_only_layers) } @@ -1280,7 +1374,7 @@ impl Timeline { Ok(()) } - fn try_spawn_size_init_task(self: &Arc, init_lsn: Lsn) { + fn try_spawn_size_init_task(self: &Arc, init_lsn: Lsn, ctx: &RequestContext) { let permit = match Arc::clone(&self.current_logical_size.initial_size_computation) .try_acquire_owned() { @@ -1296,8 +1390,18 @@ impl Timeline { .initial_logical_size .get() .is_none()); + + info!( + "spawning logical size computation from context of task kind {:?}", + ctx.task_kind() + ); // We need to start the computation task. + // It gets a separate context since it will outlive the request that called this function. let self_clone = Arc::clone(self); + let background_ctx = ctx.detached_child( + TaskKind::InitialLogicalSizeCalculation, + DownloadBehavior::Download, + ); task_mgr::spawn( task_mgr::BACKGROUND_RUNTIME.handle(), task_mgr::TaskKind::InitialLogicalSizeCalculation, @@ -1307,7 +1411,9 @@ impl Timeline { false, // NB: don't log errors here, task_mgr will do that. async move { - let calculated_size = match self_clone.logical_size_calculation_task(init_lsn).await + let calculated_size = match self_clone + .logical_size_calculation_task(init_lsn, &background_ctx) + .await { Ok(s) => s, Err(CalculateLogicalSizeError::Cancelled) => { @@ -1342,18 +1448,27 @@ impl Timeline { pub fn spawn_ondemand_logical_size_calculation( self: &Arc, lsn: Lsn, + ctx: RequestContext, ) -> oneshot::Receiver> { let (sender, receiver) = oneshot::channel(); let self_clone = Arc::clone(self); + // XXX if our caller loses interest, i.e., ctx is cancelled, + // we should stop the size calculation work and return an error. + // That would require restructuring this function's API to + // return the result directly, instead of a Receiver for the result. 
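        // Illustrative only: the restructured API hinted at in the XXX above
        // would let the caller await the result (and its cancellation) directly
        // instead of holding a oneshot Receiver, roughly along these lines
        // (hypothetical signature, not part of this change):
        //
        //     pub async fn ondemand_logical_size(
        //         self: &Arc<Self>,
        //         lsn: Lsn,
        //         ctx: &RequestContext,
        //     ) -> Result<u64, CalculateLogicalSizeError> {
        //         self.logical_size_calculation_task(lsn, ctx).await
        //     }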
+ let ctx = ctx.detached_child( + TaskKind::OndemandLogicalSizeCalculation, + DownloadBehavior::Download, + ); task_mgr::spawn( task_mgr::BACKGROUND_RUNTIME.handle(), - task_mgr::TaskKind::InitialLogicalSizeCalculation, + task_mgr::TaskKind::OndemandLogicalSizeCalculation, Some(self.tenant_id), Some(self.timeline_id), "ondemand logical size calculation", false, async move { - let res = self_clone.logical_size_calculation_task(lsn).await; + let res = self_clone.logical_size_calculation_task(lsn, &ctx).await; let _ = sender.send(res).ok(); Ok(()) // Receiver is responsible for handling errors }, @@ -1365,6 +1480,7 @@ impl Timeline { async fn logical_size_calculation_task( self: &Arc, init_lsn: Lsn, + ctx: &RequestContext, ) -> Result { let mut timeline_state_updates = self.subscribe_for_state_updates(); let self_calculation = Arc::clone(self); @@ -1372,12 +1488,13 @@ impl Timeline { let calculation = async { let cancel = cancel.child_token(); + let ctx = ctx.attached_child(); tokio::task::spawn_blocking(move || { // Run in a separate thread since this can do a lot of // synchronous file IO without .await inbetween // if there are no RemoteLayers that would require downloading. let h = tokio::runtime::Handle::current(); - h.block_on(self_calculation.calculate_logical_size(init_lsn, cancel)) + h.block_on(self_calculation.calculate_logical_size(init_lsn, cancel, &ctx)) }) .await .context("Failed to spawn calculation result task")? @@ -1392,7 +1509,7 @@ impl Timeline { TimelineState::Active => continue, TimelineState::Broken | TimelineState::Stopping - | TimelineState::Suspended => { + | TimelineState::Loading => { break format!("aborted because timeline became inactive (new state: {new_state:?})") } } @@ -1432,10 +1549,11 @@ impl Timeline { /// /// NOTE: counted incrementally, includes ancestors. This can be a slow operation, /// especially if we need to download remote layers. - async fn calculate_logical_size( + pub async fn calculate_logical_size( &self, up_to_lsn: Lsn, cancel: CancellationToken, + ctx: &RequestContext, ) -> Result { info!( "Calculating logical size for timeline {} at {}", @@ -1478,7 +1596,7 @@ impl Timeline { self.metrics.logical_size_histo.start_timer() }; let logical_size = self - .get_current_logical_size_non_incremental(up_to_lsn, cancel) + .get_current_logical_size_non_incremental(up_to_lsn, cancel, ctx) .await?; debug!("calculated logical size: {logical_size}"); timer.stop_and_record(); @@ -1555,6 +1673,7 @@ impl Timeline { key: Key, request_lsn: Lsn, reconstruct_state: &mut ValueReconstructState, + ctx: &RequestContext, ) -> Result<(), PageReconstructError> { // Start from the current timeline. let mut timeline_owned; @@ -1742,14 +1861,43 @@ impl Timeline { let remote_layer_as_persistent: Arc = Arc::clone(&remote_layer) as Arc; let id = remote_layer_as_persistent.traversal_id(); - info!("need remote layer {id}"); + info!( + "need remote layer {} for task kind {:?}", + id, + ctx.task_kind() + ); // The next layer doesn't exist locally. Need to download it. // (The control flow is a bit complicated here because we must drop the 'layers' // lock before awaiting on the Future.) 
- info!("on-demand downloading remote layer {id}"); - timeline.download_remote_layer(remote_layer).await?; - continue 'layer_map_search; + match ( + ctx.download_behavior(), + self.conf.ondemand_download_behavior_treat_error_as_warn, + ) { + (DownloadBehavior::Download, _) => { + info!( + "on-demand downloading remote layer {id} for task kind {:?}", + ctx.task_kind() + ); + timeline.download_remote_layer(remote_layer).await?; + continue 'layer_map_search; + } + (DownloadBehavior::Warn, _) | (DownloadBehavior::Error, true) => { + warn!( + "unexpectedly on-demand downloading remote layer {} for task kind {:?}", + id, + ctx.task_kind() + ); + timeline.download_remote_layer(remote_layer).await?; + continue 'layer_map_search; + } + (DownloadBehavior::Error, false) => { + return Err(PageReconstructError::NeedsDownload( + TenantTimelineId::new(self.tenant_id, self.timeline_id), + remote_layer.file_name.clone(), + )) + } + } } } } @@ -1871,7 +2019,11 @@ impl Timeline { } /// Layer flusher task's main loop. - async fn flush_loop(&self, mut layer_flush_start_rx: tokio::sync::watch::Receiver) { + async fn flush_loop( + &self, + mut layer_flush_start_rx: tokio::sync::watch::Receiver, + ctx: &RequestContext, + ) { info!("started flush loop"); loop { tokio::select! { @@ -1892,7 +2044,7 @@ impl Timeline { // drop 'layers' lock to allow concurrent reads and writes }; if let Some(layer_to_flush) = layer_to_flush { - if let Err(err) = self.flush_frozen_layer(layer_to_flush).await { + if let Err(err) = self.flush_frozen_layer(layer_to_flush, ctx).await { error!("could not flush frozen layer: {err:?}"); break Err(err); } @@ -1957,8 +2109,12 @@ impl Timeline { } /// Flush one frozen in-memory layer to disk, as a new delta layer. - #[instrument(skip(self, frozen_layer), fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id, layer=%frozen_layer.short_id()))] - async fn flush_frozen_layer(&self, frozen_layer: Arc) -> anyhow::Result<()> { + #[instrument(skip(self, frozen_layer, ctx), fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id, layer=%frozen_layer.short_id()))] + async fn flush_frozen_layer( + &self, + frozen_layer: Arc, + ctx: &RequestContext, + ) -> anyhow::Result<()> { // As a special case, when we have just imported an image into the repository, // instead of writing out a L0 delta layer, we directly write out image layer // files instead. This is possible as long as *all* the data imported into the @@ -1966,10 +2122,12 @@ impl Timeline { let lsn_range = frozen_layer.get_lsn_range(); let layer_paths_to_upload = if lsn_range.start == self.initdb_lsn && lsn_range.end == Lsn(self.initdb_lsn.0 + 1) { + // Note: The 'ctx' in use here has DownloadBehavior::Error. We should not + // require downloading anything during initial import. let (partitioning, _lsn) = self - .repartition(self.initdb_lsn, self.get_compaction_target_size()) + .repartition(self.initdb_lsn, self.get_compaction_target_size(), ctx) .await?; - self.create_image_layers(&partitioning, self.initdb_lsn, true) + self.create_image_layers(&partitioning, self.initdb_lsn, true, ctx) .await? } else { // normal case, write out a L0 delta layer file. 
@@ -2099,10 +2257,11 @@ impl Timeline { ])?; // Add it to the layer map - { - let mut layers = self.layers.write().unwrap(); - layers.insert_historic(Arc::new(new_delta)); - } + self.layers + .write() + .unwrap() + .batch_update() + .insert_historic(Arc::new(new_delta)); // update the timeline's physical size let sz = new_delta_path.metadata()?.len(); @@ -2119,6 +2278,7 @@ impl Timeline { &self, lsn: Lsn, partition_size: u64, + ctx: &RequestContext, ) -> anyhow::Result<(KeyPartitioning, Lsn)> { { let partitioning_guard = self.partitioning.lock().unwrap(); @@ -2129,7 +2289,7 @@ impl Timeline { return Ok((partitioning_guard.0.clone(), partitioning_guard.1)); } } - let keyspace = self.collect_keyspace(lsn).await?; + let keyspace = self.collect_keyspace(lsn, ctx).await?; let partitioning = keyspace.partition(partition_size); let mut partitioning_guard = self.partitioning.lock().unwrap(); @@ -2166,13 +2326,15 @@ impl Timeline { // are some delta layers *later* than current 'lsn', if more WAL was processed and flushed // after we read last_record_lsn, which is passed here in the 'lsn' argument. if img_lsn < lsn { - let num_deltas = layers.count_deltas(&img_range, &(img_lsn..lsn))?; + let threshold = self.get_image_creation_threshold(); + let num_deltas = + layers.count_deltas(&img_range, &(img_lsn..lsn), Some(threshold))?; debug!( "key range {}-{}, has {} deltas on this timeline in LSN range {}..{}", img_range.start, img_range.end, num_deltas, img_lsn, lsn ); - if num_deltas >= self.get_image_creation_threshold() { + if num_deltas >= threshold { return Ok(true); } } @@ -2187,6 +2349,7 @@ impl Timeline { partitioning: &KeyPartitioning, lsn: Lsn, force: bool, + ctx: &RequestContext, ) -> Result, PageReconstructError> { let timer = self.metrics.create_images_time_histo.start_timer(); let mut image_layers: Vec = Vec::new(); @@ -2211,7 +2374,7 @@ impl Timeline { for range in &partition.ranges { let mut key = range.start; while key < range.end { - let img = match self.get(key, lsn).await { + let img = match self.get(key, lsn, ctx).await { Ok(img) => img, Err(err) => { // If we fail to reconstruct a VM or FSM page, we can zero the @@ -2267,21 +2430,23 @@ impl Timeline { let mut layer_paths_to_upload = HashMap::with_capacity(image_layers.len()); let mut layers = self.layers.write().unwrap(); + let mut updates = layers.batch_update(); let timeline_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id); for l in image_layers { let path = l.filename(); let metadata = timeline_path .join(path.file_name()) .metadata() - .context("reading metadata of layer file {path}")?; + .with_context(|| format!("reading metadata of layer file {}", path.file_name()))?; layer_paths_to_upload.insert(path, LayerFileMetadata::new(metadata.len())); self.metrics .resident_physical_size_gauge .add(metadata.len()); - layers.insert_historic(Arc::new(l)); + updates.insert_historic(Arc::new(l)); } + updates.flush(); drop(layers); timer.stop_and_record(); @@ -2577,6 +2742,7 @@ impl Timeline { } let mut layers = self.layers.write().unwrap(); + let mut updates = layers.batch_update(); let mut new_layer_paths = HashMap::with_capacity(new_layers.len()); for l in new_layers { let new_delta_path = l.path(); @@ -2597,7 +2763,7 @@ impl Timeline { new_layer_paths.insert(new_delta_path, LayerFileMetadata::new(metadata.len())); let x: Arc = Arc::new(l); - layers.insert_historic(x); + updates.insert_historic(x); } // Now that we have reshuffled the data to set of new delta layers, we can @@ -2611,8 +2777,9 @@ impl Timeline { } 
layer_names_to_delete.push(l.filename()); l.delete()?; - layers.remove_historic(l); + updates.remove_historic(l); } + updates.flush(); drop(layers); // Also schedule the deletions in remote storage @@ -2662,6 +2829,7 @@ impl Timeline { retain_lsns: Vec, cutoff_horizon: Lsn, pitr: Duration, + ctx: &RequestContext, ) -> anyhow::Result<()> { // First, calculate pitr_cutoff_timestamp and then convert it to LSN. // @@ -2674,7 +2842,7 @@ impl Timeline { if let Some(pitr_cutoff_timestamp) = now.checked_sub(pitr) { let pitr_timestamp = to_pg_timestamp(pitr_cutoff_timestamp); - match self.find_lsn_for_timestamp(pitr_timestamp).await? { + match self.find_lsn_for_timestamp(pitr_timestamp, ctx).await? { LsnForTimestamp::Present(lsn) => lsn, LsnForTimestamp::Future(lsn) => { // The timestamp is in the future. That sounds impossible, @@ -2725,6 +2893,8 @@ impl Timeline { /// obsolete. /// pub(super) async fn gc(&self) -> anyhow::Result { + let timer = self.metrics.garbage_collect_histo.start_timer(); + fail_point!("before-timeline-gc"); let _layer_removal_cs = self.layer_removal_cs.lock().await; @@ -2745,11 +2915,17 @@ impl Timeline { let new_gc_cutoff = Lsn::min(horizon_cutoff, pitr_cutoff); - self.gc_timeline(horizon_cutoff, pitr_cutoff, retain_lsns, new_gc_cutoff) + let res = self + .gc_timeline(horizon_cutoff, pitr_cutoff, retain_lsns, new_gc_cutoff) .instrument( info_span!("gc_timeline", timeline = %self.timeline_id, cutoff = %new_gc_cutoff), ) - .await + .await?; + + // only record successes + timer.stop_and_record(); + + Ok(res) } async fn gc_timeline( @@ -2812,6 +2988,7 @@ impl Timeline { // 3. it doesn't need to be retained for 'retain_lsns'; // 4. newer on-disk image layers cover the layer's whole key range // + // TODO holding a write lock is too agressive and avoidable let mut layers = self.layers.write().unwrap(); 'outer: for l in layers.iter_historic_layers() { result.layers_total += 1; @@ -2843,6 +3020,8 @@ impl Timeline { // might be referenced by child branches forever. // We can track this in child timeline GC and delete parent layers when // they are no longer needed. This might be complicated with long inheritance chains. + // + // TODO Vec is not a great choice for `retain_lsns` for retain_lsn in &retain_lsns { // start_lsn is inclusive if &l.get_lsn_range().start <= retain_lsn { @@ -2896,6 +3075,7 @@ impl Timeline { layers_to_remove.push(Arc::clone(&l)); } + let mut updates = layers.batch_update(); if !layers_to_remove.is_empty() { // Persist the new GC cutoff value in the metadata file, before // we actually remove anything. @@ -2913,7 +3093,13 @@ impl Timeline { } layer_names_to_delete.push(doomed_layer.filename()); doomed_layer.delete()?; // FIXME: schedule succeeded deletions before returning? - layers.remove_historic(doomed_layer); + + // TODO Removing from the bottom of the layer map is expensive. + // Maybe instead discard all layer map historic versions that + // won't be needed for page reconstruction for this timeline, + // and mark what we can't delete yet as deleted from the layer + // map index without actually rebuilding the index. + updates.remove_historic(doomed_layer); result.layers_removed += 1; } @@ -2925,6 +3111,7 @@ impl Timeline { remote_client.schedule_layer_file_deletion(&layer_names_to_delete)?; } } + updates.flush(); info!( "GC completed removing {} layers, cutoff {}", @@ -3081,11 +3268,13 @@ impl Timeline { // Delta- or ImageLayer in the layer map. 
let new_layer = remote_layer.create_downloaded_layer(self_clone.conf, *size); let mut layers = self_clone.layers.write().unwrap(); + let mut updates = layers.batch_update(); { let l: Arc = remote_layer.clone(); - layers.remove_historic(l); + updates.remove_historic(l); } - layers.insert_historic(new_layer); + updates.insert_historic(new_layer); + updates.flush(); drop(layers); // Now that we've inserted the download into the layer map, diff --git a/pageserver/src/walreceiver.rs b/pageserver/src/tenant/timeline/walreceiver.rs similarity index 83% rename from pageserver/src/walreceiver.rs rename to pageserver/src/tenant/timeline/walreceiver.rs index fc9daadc5c..f33a12c5cc 100644 --- a/pageserver/src/walreceiver.rs +++ b/pageserver/src/tenant/timeline/walreceiver.rs @@ -23,58 +23,15 @@ mod connection_manager; mod walreceiver_connection; -use crate::config::PageServerConf; use crate::task_mgr::WALRECEIVER_RUNTIME; -use anyhow::Context; -use once_cell::sync::OnceCell; use std::future::Future; -use storage_broker::BrokerClientChannel; use tokio::sync::watch; use tokio_util::sync::CancellationToken; use tracing::*; pub use connection_manager::spawn_connection_manager_task; -static BROKER_CLIENT: OnceCell = OnceCell::new(); - -/// -/// Initialize the broker client. This must be called once at page server startup. -/// -pub async fn init_broker_client(conf: &'static PageServerConf) -> anyhow::Result<()> { - let broker_endpoint = conf.broker_endpoint.clone(); - - // Note: we do not attempt connecting here (but validate endpoints sanity). - let broker_client = - storage_broker::connect(broker_endpoint.clone(), conf.broker_keepalive_interval).context( - format!( - "Failed to create broker client to {}", - &conf.broker_endpoint - ), - )?; - - if BROKER_CLIENT.set(broker_client).is_err() { - panic!("broker already initialized"); - } - - info!( - "Initialized broker client with endpoints: {}", - broker_endpoint - ); - Ok(()) -} - -/// -/// Get a handle to the broker client -/// -pub fn get_broker_client() -> &'static BrokerClientChannel { - BROKER_CLIENT.get().expect("broker client not initialized") -} - -pub fn is_broker_client_initialized() -> bool { - BROKER_CLIENT.get().is_some() -} - /// A handle of an asynchronous task. /// The task has a channel that it can use to communicate its lifecycle events in a certain form, see [`TaskEvent`] /// and a cancellation token that it can listen to for earlier interrupts. 
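The block removed above is the process-global broker client that used to live in `walreceiver.rs`: a `OnceCell<BrokerClientChannel>` set once at startup and handed out by reference afterwards. That responsibility moves to the new `crate::broker_client` module (see the `use crate::broker_client::get_broker_client` import in the connection-manager hunk below). As a reference for the pattern only, here is a minimal sketch of such a once-initialized global; `Client` is a stand-in type, not the real `storage_broker` API:

```rust
// Minimal sketch of the once-initialized global used for the broker client.
// `Client` stands in for storage_broker::BrokerClientChannel; the real
// broker_client module created by this patch may differ in details.
use once_cell::sync::OnceCell;

#[derive(Clone)]
pub struct Client {
    pub endpoint: String,
}

static BROKER_CLIENT: OnceCell<Client> = OnceCell::new();

/// Must be called exactly once at pageserver startup, before any task
/// calls `get_broker_client()`.
pub fn init_broker_client(endpoint: &str) -> anyhow::Result<()> {
    let client = Client { endpoint: endpoint.to_owned() };
    BROKER_CLIENT
        .set(client)
        .map_err(|_| anyhow::anyhow!("broker client already initialized"))?;
    Ok(())
}

/// Panics if `init_broker_client` has not run yet, matching the removed code.
pub fn get_broker_client() -> &'static Client {
    BROKER_CLIENT.get().expect("broker client not initialized")
}
```

The removed `is_broker_client_initialized()` helper fits the same pattern as `BROKER_CLIENT.get().is_some()`.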
@@ -95,7 +52,6 @@ pub enum TaskEvent { #[derive(Debug, Clone)] pub enum TaskStateUpdate { - Init, Started, Progress(E), } diff --git a/pageserver/src/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs similarity index 96% rename from pageserver/src/walreceiver/connection_manager.rs rename to pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index 8b60e59305..cd7c7c51d2 100644 --- a/pageserver/src/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -11,10 +11,12 @@ use std::{collections::HashMap, num::NonZeroU64, ops::ControlFlow, sync::Arc, time::Duration}; -use crate::task_mgr::TaskKind; +use super::TaskStateUpdate; +use crate::broker_client::get_broker_client; +use crate::context::RequestContext; use crate::task_mgr::WALRECEIVER_RUNTIME; +use crate::task_mgr::{self, TaskKind}; use crate::tenant::Timeline; -use crate::{task_mgr, walreceiver::TaskStateUpdate}; use anyhow::Context; use chrono::{NaiveDateTime, Utc}; use pageserver_api::models::TimelineState; @@ -27,10 +29,7 @@ use storage_broker::Streaming; use tokio::{select, sync::watch}; use tracing::*; -use crate::{ - exponential_backoff, walreceiver::get_broker_client, DEFAULT_BASE_BACKOFF_SECONDS, - DEFAULT_MAX_BACKOFF_SECONDS, -}; +use crate::{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS}; use postgres_connection::{parse_host_port, PgConnectionConfig}; use utils::{ id::{NodeId, TenantTimelineId}, @@ -46,6 +45,7 @@ pub fn spawn_connection_manager_task( lagging_wal_timeout: Duration, max_lsn_wal_lag: NonZeroU64, auth_token: Option>, + ctx: RequestContext, ) { let mut broker_client = get_broker_client().clone(); @@ -78,6 +78,7 @@ pub fn spawn_connection_manager_task( loop_step_result = connection_manager_loop_step( &mut broker_client, &mut walreceiver_state, + &ctx, ) => match loop_step_result { ControlFlow::Continue(()) => continue, ControlFlow::Break(()) => { @@ -101,6 +102,7 @@ pub fn spawn_connection_manager_task( async fn connection_manager_loop_step( broker_client: &mut BrokerClientChannel, walreceiver_state: &mut WalreceiverState, + ctx: &RequestContext, ) -> ControlFlow<(), ()> { let mut timeline_state_updates = walreceiver_state.timeline.subscribe_for_state_updates(); @@ -145,7 +147,7 @@ async fn connection_manager_loop_step( let wal_connection = walreceiver_state.wal_connection.as_mut() .expect("Should have a connection, as checked by the corresponding select! 
guard"); match wal_connection_update { - TaskEvent::Update(TaskStateUpdate::Init | TaskStateUpdate::Started) => {}, + TaskEvent::Update(TaskStateUpdate::Started) => {}, TaskEvent::Update(TaskStateUpdate::Progress(new_status)) => { if new_status.has_processed_wal { // We have advanced last_record_lsn by processing the WAL received @@ -183,13 +185,23 @@ async fn connection_manager_loop_step( new_event = async { loop { + if walreceiver_state.timeline.current_state() == TimelineState::Loading { + warn!("wal connection manager should only be launched after timeline has become active"); + } match timeline_state_updates.changed().await { Ok(()) => { let new_state = walreceiver_state.timeline.current_state(); match new_state { // we're already active as walreceiver, no need to reactivate TimelineState::Active => continue, - TimelineState::Broken | TimelineState::Stopping | TimelineState::Suspended => return ControlFlow::Continue(new_state), + TimelineState::Broken | TimelineState::Stopping => { + info!("timeline entered terminal state {new_state:?}, stopping wal connection manager loop"); + return ControlFlow::Break(()); + } + TimelineState::Loading => { + warn!("timeline transitioned back to Loading state, that should not happen"); + return ControlFlow::Continue(new_state); + } } } Err(_sender_dropped_error) => return ControlFlow::Break(()), @@ -197,7 +209,7 @@ async fn connection_manager_loop_step( } } => match new_event { ControlFlow::Continue(new_state) => { - info!("Timeline became inactive (new state: {new_state:?}), dropping current connections until it reactivates"); + info!("observed timeline state change, new state is {new_state:?}"); return ControlFlow::Continue(()); } ControlFlow::Break(()) => { @@ -226,6 +238,7 @@ async fn connection_manager_loop_step( .change_connection( new_candidate.safekeeper_id, new_candidate.wal_source_connconf, + ctx, ) .await } @@ -289,7 +302,9 @@ async fn subscribe_for_timeline_updates( return resp.into_inner(); } Err(e) => { - warn!("Attempt #{attempt}, failed to subscribe for timeline {id} updates in broker: {e:#}"); + // Safekeeper nodes can stop pushing timeline updates to the broker, when no new writes happen and + // entire WAL is streamed. Keep this noticeable with logging, but do not warn/error. 
+ info!("Attempt #{attempt}, failed to subscribe for timeline {id} updates in broker: {e:#}"); continue; } } @@ -389,12 +404,17 @@ impl WalreceiverState { &mut self, new_sk_id: NodeId, new_wal_source_connconf: PgConnectionConfig, + ctx: &RequestContext, ) { self.drop_old_connection(true).await; let id = self.id; let connect_timeout = self.wal_connect_timeout; let timeline = Arc::clone(&self.timeline); + let ctx = ctx.detached_child( + TaskKind::WalReceiverConnectionHandler, + ctx.download_behavior(), + ); let connection_handle = TaskHandle::spawn(move |events_sender, cancellation| { async move { super::walreceiver_connection::handle_walreceiver_connection( @@ -403,6 +423,7 @@ impl WalreceiverState { events_sender, cancellation, connect_timeout, + ctx, ) .await .context("walreceiver connection handling failure") @@ -1233,18 +1254,18 @@ mod tests { const DUMMY_SAFEKEEPER_HOST: &str = "safekeeper_connstr"; async fn dummy_state(harness: &TenantHarness<'_>) -> WalreceiverState { + let (tenant, ctx) = harness.load().await; + let timeline = tenant + .create_empty_timeline(TIMELINE_ID, Lsn(0), crate::DEFAULT_PG_VERSION, &ctx) + .expect("Failed to create an empty timeline for dummy wal connection manager"); + let timeline = timeline.initialize(&ctx).unwrap(); + WalreceiverState { id: TenantTimelineId { tenant_id: harness.tenant_id, timeline_id: TIMELINE_ID, }, - timeline: harness - .load() - .await - .create_empty_timeline(TIMELINE_ID, Lsn(0), crate::DEFAULT_PG_VERSION) - .expect("Failed to create an empty timeline for dummy wal connection manager") - .initialize() - .unwrap(), + timeline, wal_connect_timeout: Duration::from_secs(1), lagging_wal_timeout: Duration::from_secs(1), max_lsn_wal_lag: NonZeroU64::new(1024 * 1024).unwrap(), diff --git a/pageserver/src/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs similarity index 94% rename from pageserver/src/walreceiver/walreceiver_connection.rs rename to pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index 1b9e4923fb..7e06c398af 100644 --- a/pageserver/src/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -22,7 +22,9 @@ use tokio_postgres::{replication::ReplicationStream, Client}; use tokio_util::sync::CancellationToken; use tracing::{debug, error, info, trace, warn}; -use crate::{metrics::LIVE_CONNECTIONS_COUNT, walreceiver::TaskStateUpdate}; +use super::TaskStateUpdate; +use crate::context::RequestContext; +use crate::metrics::LIVE_CONNECTIONS_COUNT; use crate::{ task_mgr, task_mgr::TaskKind, @@ -62,6 +64,7 @@ pub async fn handle_walreceiver_connection( events_sender: watch::Sender>, cancellation: CancellationToken, connect_timeout: Duration, + ctx: RequestContext, ) -> anyhow::Result<()> { // Connect to the database in replication mode. info!("connecting to {wal_source_connconf:?}"); @@ -77,9 +80,13 @@ pub async fn handle_walreceiver_connection( info!("DB connection stream finished: {expected_error}"); return Ok(()); } - Err(elapsed) => anyhow::bail!( - "Timed out while waiting {elapsed} for walreceiver connection to open" - ), + Err(_) => { + // Timing out to connect to a safekeeper node could happen long time, due to + // many reasons that pageserver cannot control. + // Do not produce an error, but make it visible, that timeouts happen by logging the `event. 
+ info!("Timed out while waiting {connect_timeout:?} for walreceiver connection to open"); + return Ok(()); + } } }; @@ -99,10 +106,14 @@ pub async fn handle_walreceiver_connection( // The connection object performs the actual communication with the database, // so spawn it off to run on its own. + let _connection_ctx = ctx.detached_child( + TaskKind::WalReceiverConnectionPoller, + ctx.download_behavior(), + ); let connection_cancellation = cancellation.clone(); task_mgr::spawn( WALRECEIVER_RUNTIME.handle(), - TaskKind::WalReceiverConnection, + TaskKind::WalReceiverConnectionPoller, Some(timeline.tenant_id), Some(timeline.timeline_id), "walreceiver connection", @@ -117,7 +128,7 @@ pub async fn handle_walreceiver_connection( } } }, - + // Future: replace connection_cancellation with connection_ctx cancellation _ = connection_cancellation.cancelled() => info!("Connection cancelled"), } Ok(()) @@ -180,7 +191,7 @@ pub async fn handle_walreceiver_connection( let mut waldecoder = WalStreamDecoder::new(startpoint, timeline.pg_version); - let mut walingest = WalIngest::new(timeline.as_ref(), startpoint).await?; + let mut walingest = WalIngest::new(timeline.as_ref(), startpoint, &ctx).await?; while let Some(replication_message) = { select! { @@ -251,7 +262,7 @@ pub async fn handle_walreceiver_connection( ensure!(lsn.is_aligned()); walingest - .ingest_record(recdata.clone(), lsn, &mut modification, &mut decoded) + .ingest_record(recdata, lsn, &mut modification, &mut decoded, &ctx) .await .with_context(|| format!("could not ingest record at {lsn}"))?; @@ -329,7 +340,7 @@ pub async fn handle_walreceiver_connection( // Send the replication feedback message. // Regular standby_status_update fields are put into this message. let (timeline_logical_size, _) = timeline - .get_current_logical_size() + .get_current_logical_size(&ctx) .context("Status update creation failed to get current logical size")?; let status_update = ReplicationFeedback { current_timeline_size: timeline_logical_size, diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 0de2e6654d..3761c65668 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -29,6 +29,7 @@ use anyhow::Result; use bytes::{Buf, Bytes, BytesMut}; use tracing::*; +use crate::context::RequestContext; use crate::pgdatadir_mapping::*; use crate::tenant::PageReconstructError; use crate::tenant::Timeline; @@ -52,10 +53,14 @@ pub struct WalIngest<'a> { } impl<'a> WalIngest<'a> { - pub async fn new(timeline: &Timeline, startpoint: Lsn) -> anyhow::Result { + pub async fn new( + timeline: &'a Timeline, + startpoint: Lsn, + ctx: &'_ RequestContext, + ) -> anyhow::Result> { // Fetch the latest checkpoint into memory, so that we can compare with it // quickly in `ingest_record` and update it when it changes. 
- let checkpoint_bytes = timeline.get_checkpoint(startpoint).await?; + let checkpoint_bytes = timeline.get_checkpoint(startpoint, ctx).await?; let checkpoint = CheckPoint::decode(&checkpoint_bytes)?; trace!("CheckPoint.nextXid = {}", checkpoint.nextXid.value); @@ -80,6 +85,7 @@ impl<'a> WalIngest<'a> { lsn: Lsn, modification: &mut DatadirModification<'_>, decoded: &mut DecodedWALRecord, + ctx: &RequestContext, ) -> anyhow::Result<()> { modification.lsn = lsn; decode_wal_record(recdata, decoded, self.timeline.pg_version)?; @@ -97,7 +103,7 @@ impl<'a> WalIngest<'a> { if decoded.xl_rmid == pg_constants::RM_HEAP_ID || decoded.xl_rmid == pg_constants::RM_HEAP2_ID { - self.ingest_heapam_record(&mut buf, modification, decoded) + self.ingest_heapam_record(&mut buf, modification, decoded, ctx) .await?; } // Handle other special record types @@ -106,13 +112,14 @@ impl<'a> WalIngest<'a> { == pg_constants::XLOG_SMGR_CREATE { let create = XlSmgrCreate::decode(&mut buf); - self.ingest_xlog_smgr_create(modification, &create).await?; + self.ingest_xlog_smgr_create(modification, &create, ctx) + .await?; } else if decoded.xl_rmid == pg_constants::RM_SMGR_ID && (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) == pg_constants::XLOG_SMGR_TRUNCATE { let truncate = XlSmgrTruncate::decode(&mut buf); - self.ingest_xlog_smgr_truncate(modification, &truncate) + self.ingest_xlog_smgr_truncate(modification, &truncate, ctx) .await?; } else if decoded.xl_rmid == pg_constants::RM_DBASE_ID { debug!( @@ -126,7 +133,7 @@ impl<'a> WalIngest<'a> { let createdb = XlCreateDatabase::decode(&mut buf); debug!("XLOG_DBASE_CREATE v14"); - self.ingest_xlog_dbase_create(modification, &createdb) + self.ingest_xlog_dbase_create(modification, &createdb, ctx) .await?; } else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) == postgres_ffi::v14::bindings::XLOG_DBASE_DROP @@ -134,7 +141,9 @@ impl<'a> WalIngest<'a> { let dropdb = XlDropDatabase::decode(&mut buf); for tablespace_id in dropdb.tablespace_ids { trace!("Drop db {}, {}", tablespace_id, dropdb.db_id); - modification.drop_dbdir(tablespace_id, dropdb.db_id).await?; + modification + .drop_dbdir(tablespace_id, dropdb.db_id, ctx) + .await?; } } } else if self.timeline.pg_version == 15 { @@ -150,7 +159,7 @@ impl<'a> WalIngest<'a> { // So we can reuse XlCreateDatabase here. 
debug!("XLOG_DBASE_CREATE_FILE_COPY"); let createdb = XlCreateDatabase::decode(&mut buf); - self.ingest_xlog_dbase_create(modification, &createdb) + self.ingest_xlog_dbase_create(modification, &createdb, ctx) .await?; } else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) == postgres_ffi::v15::bindings::XLOG_DBASE_DROP @@ -158,7 +167,9 @@ impl<'a> WalIngest<'a> { let dropdb = XlDropDatabase::decode(&mut buf); for tablespace_id in dropdb.tablespace_ids { trace!("Drop db {}, {}", tablespace_id, dropdb.db_id); - modification.drop_dbdir(tablespace_id, dropdb.db_id).await?; + modification + .drop_dbdir(tablespace_id, dropdb.db_id, ctx) + .await?; } } } @@ -176,12 +187,13 @@ impl<'a> WalIngest<'a> { segno, rpageno, ZERO_PAGE.clone(), + ctx, ) .await?; } else { assert!(info == pg_constants::CLOG_TRUNCATE); let xlrec = XlClogTruncate::decode(&mut buf); - self.ingest_clog_truncate_record(modification, &xlrec) + self.ingest_clog_truncate_record(modification, &xlrec, ctx) .await?; } } else if decoded.xl_rmid == pg_constants::RM_XACT_ID { @@ -193,6 +205,7 @@ impl<'a> WalIngest<'a> { modification, &parsed_xact, info == pg_constants::XLOG_XACT_COMMIT, + ctx, ) .await?; } else if info == pg_constants::XLOG_XACT_COMMIT_PREPARED @@ -204,6 +217,7 @@ impl<'a> WalIngest<'a> { modification, &parsed_xact, info == pg_constants::XLOG_XACT_COMMIT_PREPARED, + ctx, ) .await?; // Remove twophase file. see RemoveTwoPhaseFile() in postgres code @@ -213,10 +227,12 @@ impl<'a> WalIngest<'a> { parsed_xact.xid, lsn, ); - modification.drop_twophase_file(parsed_xact.xid).await?; + modification + .drop_twophase_file(parsed_xact.xid, ctx) + .await?; } else if info == pg_constants::XLOG_XACT_PREPARE { modification - .put_twophase_file(decoded.xl_xid, Bytes::copy_from_slice(&buf[..])) + .put_twophase_file(decoded.xl_xid, Bytes::copy_from_slice(&buf[..]), ctx) .await?; } } else if decoded.xl_rmid == pg_constants::RM_MULTIXACT_ID { @@ -232,6 +248,7 @@ impl<'a> WalIngest<'a> { segno, rpageno, ZERO_PAGE.clone(), + ctx, ) .await?; } else if info == pg_constants::XLOG_MULTIXACT_ZERO_MEM_PAGE { @@ -244,6 +261,7 @@ impl<'a> WalIngest<'a> { segno, rpageno, ZERO_PAGE.clone(), + ctx, ) .await?; } else if info == pg_constants::XLOG_MULTIXACT_CREATE_ID { @@ -251,12 +269,12 @@ impl<'a> WalIngest<'a> { self.ingest_multixact_create_record(modification, &xlrec)?; } else if info == pg_constants::XLOG_MULTIXACT_TRUNCATE_ID { let xlrec = XlMultiXactTruncate::decode(&mut buf); - self.ingest_multixact_truncate_record(modification, &xlrec) + self.ingest_multixact_truncate_record(modification, &xlrec, ctx) .await?; } } else if decoded.xl_rmid == pg_constants::RM_RELMAP_ID { let xlrec = XlRelmapUpdate::decode(&mut buf); - self.ingest_relmap_page(modification, &xlrec, decoded) + self.ingest_relmap_page(modification, &xlrec, decoded, ctx) .await?; } else if decoded.xl_rmid == pg_constants::RM_XLOG_ID { let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; @@ -292,7 +310,7 @@ impl<'a> WalIngest<'a> { // Iterate through all the blocks that the record modifies, and // "put" a separate copy of the record for each block. 
for blk in decoded.blocks.iter() { - self.ingest_decoded_block(modification, lsn, decoded, blk) + self.ingest_decoded_block(modification, lsn, decoded, blk, ctx) .await?; } @@ -317,6 +335,7 @@ impl<'a> WalIngest<'a> { lsn: Lsn, decoded: &DecodedWALRecord, blk: &DecodedBkpBlock, + ctx: &RequestContext, ) -> Result<(), PageReconstructError> { let rel = RelTag { spcnode: blk.rnode_spcnode, @@ -359,14 +378,14 @@ impl<'a> WalIngest<'a> { page_set_lsn(&mut image, lsn) } assert_eq!(image.len(), BLCKSZ as usize); - self.put_rel_page_image(modification, rel, blk.blkno, image.freeze()) + self.put_rel_page_image(modification, rel, blk.blkno, image.freeze(), ctx) .await?; } else { let rec = NeonWalRecord::Postgres { will_init: blk.will_init || blk.apply_image, rec: decoded.record.clone(), }; - self.put_rel_wal_record(modification, rel, blk.blkno, rec) + self.put_rel_wal_record(modification, rel, blk.blkno, rec, ctx) .await?; } Ok(()) @@ -377,6 +396,7 @@ impl<'a> WalIngest<'a> { buf: &mut Bytes, modification: &mut DatadirModification<'_>, decoded: &mut DecodedWALRecord, + ctx: &RequestContext, ) -> anyhow::Result<()> { // Handle VM bit updates that are implicitly part of heap records. @@ -456,7 +476,7 @@ impl<'a> WalIngest<'a> { // replaying it would fail to find the previous image of the page, because // it doesn't exist. So check if the VM page(s) exist, and skip the WAL // record if it doesn't. - let vm_size = self.get_relsize(vm_rel, modification.lsn).await?; + let vm_size = self.get_relsize(vm_rel, modification.lsn, ctx).await?; if let Some(blknum) = new_vm_blk { if blknum >= vm_size { new_vm_blk = None; @@ -481,6 +501,7 @@ impl<'a> WalIngest<'a> { old_heap_blkno, flags: pg_constants::VISIBILITYMAP_VALID_BITS, }, + ctx, ) .await?; } else { @@ -496,6 +517,7 @@ impl<'a> WalIngest<'a> { old_heap_blkno: None, flags: pg_constants::VISIBILITYMAP_VALID_BITS, }, + ctx, ) .await?; } @@ -509,6 +531,7 @@ impl<'a> WalIngest<'a> { old_heap_blkno, flags: pg_constants::VISIBILITYMAP_VALID_BITS, }, + ctx, ) .await?; } @@ -524,6 +547,7 @@ impl<'a> WalIngest<'a> { &mut self, modification: &mut DatadirModification<'_>, rec: &XlCreateDatabase, + ctx: &RequestContext, ) -> anyhow::Result<()> { let db_id = rec.db_id; let tablespace_id = rec.tablespace_id; @@ -539,7 +563,7 @@ impl<'a> WalIngest<'a> { let rels = modification .tline - .list_rels(src_tablespace_id, src_db_id, req_lsn) + .list_rels(src_tablespace_id, src_db_id, req_lsn, ctx) .await?; debug!("ingest_xlog_dbase_create: {} rels", rels.len()); @@ -547,10 +571,10 @@ impl<'a> WalIngest<'a> { // Copy relfilemap let filemap = modification .tline - .get_relmap_file(src_tablespace_id, src_db_id, req_lsn) + .get_relmap_file(src_tablespace_id, src_db_id, req_lsn, ctx) .await?; modification - .put_relmap_file(tablespace_id, db_id, filemap) + .put_relmap_file(tablespace_id, db_id, filemap, ctx) .await?; let mut num_rels_copied = 0; @@ -561,7 +585,7 @@ impl<'a> WalIngest<'a> { let nblocks = modification .tline - .get_rel_size(src_rel, req_lsn, true) + .get_rel_size(src_rel, req_lsn, true, ctx) .await?; let dst_rel = RelTag { spcnode: tablespace_id, @@ -570,7 +594,7 @@ impl<'a> WalIngest<'a> { forknum: src_rel.forknum, }; - modification.put_rel_creation(dst_rel, nblocks).await?; + modification.put_rel_creation(dst_rel, nblocks, ctx).await?; // Copy content debug!("copying rel {} to {}, {} blocks", src_rel, dst_rel, nblocks); @@ -579,7 +603,7 @@ impl<'a> WalIngest<'a> { let content = modification .tline - .get_rel_page_at_lsn(src_rel, blknum, req_lsn, true) + 
.get_rel_page_at_lsn(src_rel, blknum, req_lsn, true, ctx) .await?; modification.put_rel_page_image(dst_rel, blknum, content)?; num_blocks_copied += 1; @@ -599,6 +623,7 @@ impl<'a> WalIngest<'a> { &mut self, modification: &mut DatadirModification<'_>, rec: &XlSmgrCreate, + ctx: &RequestContext, ) -> anyhow::Result<()> { let rel = RelTag { spcnode: rec.rnode.spcnode, @@ -606,7 +631,7 @@ impl<'a> WalIngest<'a> { relnode: rec.rnode.relnode, forknum: rec.forknum, }; - self.put_rel_creation(modification, rel).await?; + self.put_rel_creation(modification, rel, ctx).await?; Ok(()) } @@ -617,6 +642,7 @@ impl<'a> WalIngest<'a> { &mut self, modification: &mut DatadirModification<'_>, rec: &XlSmgrTruncate, + ctx: &RequestContext, ) -> anyhow::Result<()> { let spcnode = rec.rnode.spcnode; let dbnode = rec.rnode.dbnode; @@ -629,7 +655,7 @@ impl<'a> WalIngest<'a> { relnode, forknum: MAIN_FORKNUM, }; - self.put_rel_truncation(modification, rel, rec.blkno) + self.put_rel_truncation(modification, rel, rec.blkno, ctx) .await?; } if (rec.flags & pg_constants::SMGR_TRUNCATE_FSM) != 0 { @@ -648,10 +674,10 @@ impl<'a> WalIngest<'a> { modification.put_rel_page_image(rel, fsm_physical_page_no, ZERO_PAGE.clone())?; fsm_physical_page_no += 1; } - let nblocks = self.get_relsize(rel, modification.lsn).await?; + let nblocks = self.get_relsize(rel, modification.lsn, ctx).await?; if nblocks > fsm_physical_page_no { // check if something to do: FSM is larger than truncate position - self.put_rel_truncation(modification, rel, fsm_physical_page_no) + self.put_rel_truncation(modification, rel, fsm_physical_page_no, ctx) .await?; } } @@ -670,10 +696,10 @@ impl<'a> WalIngest<'a> { modification.put_rel_page_image(rel, vm_page_no, ZERO_PAGE.clone())?; vm_page_no += 1; } - let nblocks = self.get_relsize(rel, modification.lsn).await?; + let nblocks = self.get_relsize(rel, modification.lsn, ctx).await?; if nblocks > vm_page_no { // check if something to do: VM is larger than truncate position - self.put_rel_truncation(modification, rel, vm_page_no) + self.put_rel_truncation(modification, rel, vm_page_no, ctx) .await?; } } @@ -687,6 +713,7 @@ impl<'a> WalIngest<'a> { modification: &mut DatadirModification<'_>, parsed: &XlXactParsedRecord, is_commit: bool, + ctx: &RequestContext, ) -> anyhow::Result<()> { // Record update of CLOG pages let mut pageno = parsed.xid / pg_constants::CLOG_XACTS_PER_PAGE; @@ -745,10 +772,10 @@ impl<'a> WalIngest<'a> { let last_lsn = self.timeline.get_last_record_lsn(); if modification .tline - .get_rel_exists(rel, last_lsn, true) + .get_rel_exists(rel, last_lsn, true, ctx) .await? { - self.put_rel_drop(modification, rel).await?; + self.put_rel_drop(modification, rel, ctx).await?; } } } @@ -759,6 +786,7 @@ impl<'a> WalIngest<'a> { &mut self, modification: &mut DatadirModification<'_>, xlrec: &XlClogTruncate, + ctx: &RequestContext, ) -> anyhow::Result<()> { info!( "RM_CLOG_ID truncate pageno {} oldestXid {} oldestXidDB {}", @@ -799,16 +827,15 @@ impl<'a> WalIngest<'a> { // it. So we use the previous record's LSN in the get calls // instead. let req_lsn = modification.tline.get_last_record_lsn(); - - let slru_segments = modification + for segno in modification .tline - .list_slru_segments(SlruKind::Clog, req_lsn) - .await?; - for segno in slru_segments { + .list_slru_segments(SlruKind::Clog, req_lsn, ctx) + .await? 
+ { let segpage = segno * pg_constants::SLRU_PAGES_PER_SEGMENT; if slru_may_delete_clogsegment(segpage, xlrec.pageno) { modification - .drop_slru_segment(SlruKind::Clog, segno) + .drop_slru_segment(SlruKind::Clog, segno, ctx) .await?; trace!("Drop CLOG segment {:>04X}", segno); } @@ -900,6 +927,7 @@ impl<'a> WalIngest<'a> { &mut self, modification: &mut DatadirModification<'_>, xlrec: &XlMultiXactTruncate, + ctx: &RequestContext, ) -> Result<()> { self.checkpoint.oldestMulti = xlrec.end_trunc_off; self.checkpoint.oldestMultiDB = xlrec.oldest_multi_db; @@ -915,7 +943,7 @@ impl<'a> WalIngest<'a> { // contain, possibly partially, valid data. while segment != endsegment { modification - .drop_slru_segment(SlruKind::MultiXactMembers, segment as u32) + .drop_slru_segment(SlruKind::MultiXactMembers, segment as u32, ctx) .await?; /* move to next segment, handling wraparound correctly */ @@ -937,6 +965,7 @@ impl<'a> WalIngest<'a> { modification: &mut DatadirModification<'_>, xlrec: &XlRelmapUpdate, decoded: &DecodedWALRecord, + ctx: &RequestContext, ) -> Result<()> { let mut buf = decoded.record.clone(); buf.advance(decoded.main_data_offset); @@ -944,18 +973,22 @@ impl<'a> WalIngest<'a> { buf.advance(12); modification - .put_relmap_file(xlrec.tsid, xlrec.dbid, Bytes::copy_from_slice(&buf[..])) - .await?; - - Ok(()) + .put_relmap_file( + xlrec.tsid, + xlrec.dbid, + Bytes::copy_from_slice(&buf[..]), + ctx, + ) + .await } async fn put_rel_creation( &mut self, modification: &mut DatadirModification<'_>, rel: RelTag, + ctx: &RequestContext, ) -> Result<()> { - modification.put_rel_creation(rel, 0).await?; + modification.put_rel_creation(rel, 0, ctx).await?; Ok(()) } @@ -965,8 +998,10 @@ impl<'a> WalIngest<'a> { rel: RelTag, blknum: BlockNumber, img: Bytes, - ) -> anyhow::Result<()> { - self.handle_rel_extend(modification, rel, blknum).await?; + ctx: &RequestContext, + ) -> Result<(), PageReconstructError> { + self.handle_rel_extend(modification, rel, blknum, ctx) + .await?; modification.put_rel_page_image(rel, blknum, img)?; Ok(()) } @@ -977,8 +1012,10 @@ impl<'a> WalIngest<'a> { rel: RelTag, blknum: BlockNumber, rec: NeonWalRecord, - ) -> anyhow::Result<()> { - self.handle_rel_extend(modification, rel, blknum).await?; + ctx: &RequestContext, + ) -> Result<()> { + self.handle_rel_extend(modification, rel, blknum, ctx) + .await?; modification.put_rel_wal_record(rel, blknum, rec)?; Ok(()) } @@ -988,8 +1025,9 @@ impl<'a> WalIngest<'a> { modification: &mut DatadirModification<'_>, rel: RelTag, nblocks: BlockNumber, + ctx: &RequestContext, ) -> anyhow::Result<()> { - modification.put_rel_truncation(rel, nblocks).await?; + modification.put_rel_truncation(rel, nblocks, ctx).await?; Ok(()) } @@ -997,17 +1035,22 @@ impl<'a> WalIngest<'a> { &mut self, modification: &mut DatadirModification<'_>, rel: RelTag, + ctx: &RequestContext, ) -> Result<()> { - modification.put_rel_drop(rel).await?; + modification.put_rel_drop(rel, ctx).await?; Ok(()) } - async fn get_relsize(&mut self, rel: RelTag, lsn: Lsn) -> anyhow::Result { - let exists = self.timeline.get_rel_exists(rel, lsn, true).await?; - let nblocks = if !exists { + async fn get_relsize( + &mut self, + rel: RelTag, + lsn: Lsn, + ctx: &RequestContext, + ) -> anyhow::Result { + let nblocks = if !self.timeline.get_rel_exists(rel, lsn, true, ctx).await? { 0 } else { - self.timeline.get_rel_size(rel, lsn, true).await? + self.timeline.get_rel_size(rel, lsn, true, ctx).await? 
}; Ok(nblocks) } @@ -1017,23 +1060,28 @@ impl<'a> WalIngest<'a> { modification: &mut DatadirModification<'_>, rel: RelTag, blknum: BlockNumber, - ) -> anyhow::Result<()> { + ctx: &RequestContext, + ) -> Result<(), PageReconstructError> { let new_nblocks = blknum + 1; // Check if the relation exists. We implicitly create relations on first // record. // TODO: would be nice if to be more explicit about it let last_lsn = modification.lsn; - let old_nblocks = if !self.timeline.get_rel_exists(rel, last_lsn, true).await? { + let old_nblocks = if !self + .timeline + .get_rel_exists(rel, last_lsn, true, ctx) + .await? + { // create it with 0 size initially, the logic below will extend it - modification.put_rel_creation(rel, 0).await?; + modification.put_rel_creation(rel, 0, ctx).await?; 0 } else { - self.timeline.get_rel_size(rel, last_lsn, true).await? + self.timeline.get_rel_size(rel, last_lsn, true, ctx).await? }; if new_nblocks > old_nblocks { //info!("extending {} {} to {}", rel, old_nblocks, new_nblocks); - modification.put_rel_extend(rel, new_nblocks).await?; + modification.put_rel_extend(rel, new_nblocks, ctx).await?; // fill the gap with zeros for gap_blknum in old_nblocks..blknum { @@ -1050,8 +1098,9 @@ impl<'a> WalIngest<'a> { segno: u32, blknum: BlockNumber, img: Bytes, - ) -> anyhow::Result<()> { - self.handle_slru_extend(modification, kind, segno, blknum) + ctx: &RequestContext, + ) -> Result<()> { + self.handle_slru_extend(modification, kind, segno, blknum, ctx) .await?; modification.put_slru_page_image(kind, segno, blknum, img)?; Ok(()) @@ -1063,6 +1112,7 @@ impl<'a> WalIngest<'a> { kind: SlruKind, segno: u32, blknum: BlockNumber, + ctx: &RequestContext, ) -> anyhow::Result<()> { // we don't use a cache for this like we do for relations. SLRUS are explcitly // extended with ZEROPAGE records, not with commit records, so it happens @@ -1075,17 +1125,17 @@ impl<'a> WalIngest<'a> { let last_lsn = self.timeline.get_last_record_lsn(); let old_nblocks = if !self .timeline - .get_slru_segment_exists(kind, segno, last_lsn) + .get_slru_segment_exists(kind, segno, last_lsn, ctx) .await? { // create it with 0 size initially, the logic below will extend it modification - .put_slru_segment_creation(kind, segno, 0) + .put_slru_segment_creation(kind, segno, 0, ctx) .await?; 0 } else { self.timeline - .get_slru_segment_size(kind, segno, last_lsn) + .get_slru_segment_size(kind, segno, last_lsn, ctx) .await? 
}; @@ -1134,41 +1184,44 @@ mod tests { static ZERO_CHECKPOINT: Bytes = Bytes::from_static(&[0u8; SIZEOF_CHECKPOINT]); - async fn init_walingest_test(tline: &Timeline) -> Result { + async fn init_walingest_test<'a>( + tline: &'a Timeline, + ctx: &RequestContext, + ) -> Result> { let mut m = tline.begin_modification(Lsn(0x10)); m.put_checkpoint(ZERO_CHECKPOINT.clone())?; - m.put_relmap_file(0, 111, Bytes::from("")).await?; // dummy relmapper file + m.put_relmap_file(0, 111, Bytes::from(""), ctx).await?; // dummy relmapper file m.commit()?; - let walingest = WalIngest::new(tline, Lsn(0x10)).await?; + let walingest = WalIngest::new(tline, Lsn(0x10), ctx).await?; Ok(walingest) } #[tokio::test] async fn test_relsize() -> Result<()> { - let tenant = TenantHarness::create("test_relsize")?.load().await; - let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?; - let mut walingest = init_walingest_test(&tline).await?; + let (tenant, ctx) = TenantHarness::create("test_relsize")?.load().await; + let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION, &ctx)?; + let mut walingest = init_walingest_test(&tline, &ctx).await?; let mut m = tline.begin_modification(Lsn(0x20)); - walingest.put_rel_creation(&mut m, TESTREL_A).await?; + walingest.put_rel_creation(&mut m, TESTREL_A, &ctx).await?; walingest - .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2")) + .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"), &ctx) .await?; m.commit()?; let mut m = tline.begin_modification(Lsn(0x30)); walingest - .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 3")) + .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 3"), &ctx) .await?; m.commit()?; let mut m = tline.begin_modification(Lsn(0x40)); walingest - .put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1 at 4")) + .put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1 at 4"), &ctx) .await?; m.commit()?; let mut m = tline.begin_modification(Lsn(0x50)); walingest - .put_rel_page_image(&mut m, TESTREL_A, 2, TEST_IMG("foo blk 2 at 5")) + .put_rel_page_image(&mut m, TESTREL_A, 2, TEST_IMG("foo blk 2 at 5"), &ctx) .await?; m.commit()?; @@ -1176,120 +1229,157 @@ mod tests { // The relation was created at LSN 2, not visible at LSN 1 yet. 
assert_eq!( - tline.get_rel_exists(TESTREL_A, Lsn(0x10), false).await?, + tline + .get_rel_exists(TESTREL_A, Lsn(0x10), false, &ctx) + .await?, false ); assert!(tline - .get_rel_size(TESTREL_A, Lsn(0x10), false) + .get_rel_size(TESTREL_A, Lsn(0x10), false, &ctx) .await .is_err()); - assert_eq!( - tline.get_rel_exists(TESTREL_A, Lsn(0x20), false).await?, + tline + .get_rel_exists(TESTREL_A, Lsn(0x20), false, &ctx) + .await?, true ); - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x20), false).await?, 1); - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x50), false).await?, 3); + assert_eq!( + tline + .get_rel_size(TESTREL_A, Lsn(0x20), false, &ctx) + .await?, + 1 + ); + assert_eq!( + tline + .get_rel_size(TESTREL_A, Lsn(0x50), false, &ctx) + .await?, + 3 + ); // Check page contents at each LSN assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x20), false) + .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x20), false, &ctx) .await?, TEST_IMG("foo blk 0 at 2") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x30), false) + .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x30), false, &ctx) .await?, TEST_IMG("foo blk 0 at 3") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x40), false) + .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x40), false, &ctx) .await?, TEST_IMG("foo blk 0 at 3") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x40), false) + .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x40), false, &ctx) .await?, TEST_IMG("foo blk 1 at 4") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x50), false) + .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x50), false, &ctx) .await?, TEST_IMG("foo blk 0 at 3") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x50), false) + .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x50), false, &ctx) .await?, TEST_IMG("foo blk 1 at 4") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50), false) + .get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50), false, &ctx) .await?, TEST_IMG("foo blk 2 at 5") ); // Truncate last block let mut m = tline.begin_modification(Lsn(0x60)); - walingest.put_rel_truncation(&mut m, TESTREL_A, 2).await?; + walingest + .put_rel_truncation(&mut m, TESTREL_A, 2, &ctx) + .await?; m.commit()?; assert_current_logical_size(&tline, Lsn(0x60)); // Check reported size and contents after truncation - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x60), false).await?, 2); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x60), false) + .get_rel_size(TESTREL_A, Lsn(0x60), false, &ctx) + .await?, + 2 + ); + assert_eq!( + tline + .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x60), false, &ctx) .await?, TEST_IMG("foo blk 0 at 3") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x60), false) + .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x60), false, &ctx) .await?, TEST_IMG("foo blk 1 at 4") ); // should still see the truncated block with older LSN - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x50), false).await?, 3); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50), false) + .get_rel_size(TESTREL_A, Lsn(0x50), false, &ctx) + .await?, + 3 + ); + assert_eq!( + tline + .get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50), false, &ctx) .await?, TEST_IMG("foo blk 2 at 5") ); // Truncate to zero length let mut m = tline.begin_modification(Lsn(0x68)); - walingest.put_rel_truncation(&mut m, TESTREL_A, 0).await?; + walingest + .put_rel_truncation(&mut m, TESTREL_A, 0, &ctx) + .await?; m.commit()?; - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x68), false).await?, 0); + assert_eq!( + 
tline + .get_rel_size(TESTREL_A, Lsn(0x68), false, &ctx) + .await?, + 0 + ); // Extend from 0 to 2 blocks, leaving a gap let mut m = tline.begin_modification(Lsn(0x70)); walingest - .put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1")) + .put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1"), &ctx) .await?; m.commit()?; - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x70), false).await?, 2); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x70), false) + .get_rel_size(TESTREL_A, Lsn(0x70), false, &ctx) + .await?, + 2 + ); + assert_eq!( + tline + .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x70), false, &ctx) .await?, ZERO_PAGE ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x70), false) + .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x70), false, &ctx) .await?, TEST_IMG("foo blk 1") ); @@ -1297,21 +1387,26 @@ mod tests { // Extend a lot more, leaving a big gap that spans across segments let mut m = tline.begin_modification(Lsn(0x80)); walingest - .put_rel_page_image(&mut m, TESTREL_A, 1500, TEST_IMG("foo blk 1500")) + .put_rel_page_image(&mut m, TESTREL_A, 1500, TEST_IMG("foo blk 1500"), &ctx) .await?; m.commit()?; - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x80), false).await?, 1501); + assert_eq!( + tline + .get_rel_size(TESTREL_A, Lsn(0x80), false, &ctx) + .await?, + 1501 + ); for blk in 2..1500 { assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blk, Lsn(0x80), false) + .get_rel_page_at_lsn(TESTREL_A, blk, Lsn(0x80), false, &ctx) .await?, ZERO_PAGE ); } assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1500, Lsn(0x80), false) + .get_rel_page_at_lsn(TESTREL_A, 1500, Lsn(0x80), false, &ctx) .await?, TEST_IMG("foo blk 1500") ); @@ -1323,31 +1418,40 @@ mod tests { // and then created it again within the same layer. 
#[tokio::test] async fn test_drop_extend() -> Result<()> { - let tenant = TenantHarness::create("test_drop_extend")?.load().await; - let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?; - let mut walingest = init_walingest_test(&tline).await?; + let (tenant, ctx) = TenantHarness::create("test_drop_extend")?.load().await; + let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION, &ctx)?; + let mut walingest = init_walingest_test(&tline, &ctx).await?; let mut m = tline.begin_modification(Lsn(0x20)); walingest - .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2")) + .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"), &ctx) .await?; m.commit()?; // Check that rel exists and size is correct assert_eq!( - tline.get_rel_exists(TESTREL_A, Lsn(0x20), false).await?, + tline + .get_rel_exists(TESTREL_A, Lsn(0x20), false, &ctx) + .await?, true ); - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x20), false).await?, 1); + assert_eq!( + tline + .get_rel_size(TESTREL_A, Lsn(0x20), false, &ctx) + .await?, + 1 + ); // Drop rel let mut m = tline.begin_modification(Lsn(0x30)); - walingest.put_rel_drop(&mut m, TESTREL_A).await?; + walingest.put_rel_drop(&mut m, TESTREL_A, &ctx).await?; m.commit()?; // Check that rel is not visible anymore assert_eq!( - tline.get_rel_exists(TESTREL_A, Lsn(0x30), false).await?, + tline + .get_rel_exists(TESTREL_A, Lsn(0x30), false, &ctx) + .await?, false ); @@ -1357,16 +1461,23 @@ mod tests { // Re-create it let mut m = tline.begin_modification(Lsn(0x40)); walingest - .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 4")) + .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 4"), &ctx) .await?; m.commit()?; // Check that rel exists and size is correct assert_eq!( - tline.get_rel_exists(TESTREL_A, Lsn(0x40), false).await?, + tline + .get_rel_exists(TESTREL_A, Lsn(0x40), false, &ctx) + .await?, true ); - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x40), false).await?, 1); + assert_eq!( + tline + .get_rel_size(TESTREL_A, Lsn(0x40), false, &ctx) + .await?, + 1 + ); Ok(()) } @@ -1376,9 +1487,9 @@ mod tests { // and then extended it again within the same layer. #[tokio::test] async fn test_truncate_extend() -> Result<()> { - let tenant = TenantHarness::create("test_truncate_extend")?.load().await; - let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?; - let mut walingest = init_walingest_test(&tline).await?; + let (tenant, ctx) = TenantHarness::create("test_truncate_extend")?.load().await; + let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION, &ctx)?; + let mut walingest = init_walingest_test(&tline, &ctx).await?; // Create a 20 MB relation (the size is arbitrary) let relsize = 20 * 1024 * 1024 / 8192; @@ -1386,27 +1497,33 @@ mod tests { for blkno in 0..relsize { let data = format!("foo blk {} at {}", blkno, Lsn(0x20)); walingest - .put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data)) + .put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data), &ctx) .await?; } m.commit()?; // The relation was created at LSN 20, not visible at LSN 1 yet. 
assert_eq!( - tline.get_rel_exists(TESTREL_A, Lsn(0x10), false).await?, + tline + .get_rel_exists(TESTREL_A, Lsn(0x10), false, &ctx) + .await?, false ); assert!(tline - .get_rel_size(TESTREL_A, Lsn(0x10), false) + .get_rel_size(TESTREL_A, Lsn(0x10), false, &ctx) .await .is_err()); assert_eq!( - tline.get_rel_exists(TESTREL_A, Lsn(0x20), false).await?, + tline + .get_rel_exists(TESTREL_A, Lsn(0x20), false, &ctx) + .await?, true ); assert_eq!( - tline.get_rel_size(TESTREL_A, Lsn(0x20), false).await?, + tline + .get_rel_size(TESTREL_A, Lsn(0x20), false, &ctx) + .await?, relsize ); @@ -1416,7 +1533,7 @@ mod tests { let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blkno, lsn, false) + .get_rel_page_at_lsn(TESTREL_A, blkno, lsn, false, &ctx) .await?, TEST_IMG(&data) ); @@ -1425,18 +1542,25 @@ mod tests { // Truncate relation so that second segment was dropped // - only leave one page let mut m = tline.begin_modification(Lsn(0x60)); - walingest.put_rel_truncation(&mut m, TESTREL_A, 1).await?; + walingest + .put_rel_truncation(&mut m, TESTREL_A, 1, &ctx) + .await?; m.commit()?; // Check reported size and contents after truncation - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x60), false).await?, 1); + assert_eq!( + tline + .get_rel_size(TESTREL_A, Lsn(0x60), false, &ctx) + .await?, + 1 + ); for blkno in 0..1 { let lsn = Lsn(0x20); let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x60), false) + .get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x60), false, &ctx) .await?, TEST_IMG(&data) ); @@ -1444,7 +1568,9 @@ mod tests { // should still see all blocks with older LSN assert_eq!( - tline.get_rel_size(TESTREL_A, Lsn(0x50), false).await?, + tline + .get_rel_size(TESTREL_A, Lsn(0x50), false, &ctx) + .await?, relsize ); for blkno in 0..relsize { @@ -1452,7 +1578,7 @@ mod tests { let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x50), false) + .get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x50), false, &ctx) .await?, TEST_IMG(&data) ); @@ -1465,17 +1591,21 @@ mod tests { for blkno in 0..relsize { let data = format!("foo blk {} at {}", blkno, lsn); walingest - .put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data)) + .put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data), &ctx) .await?; } m.commit()?; assert_eq!( - tline.get_rel_exists(TESTREL_A, Lsn(0x80), false).await?, + tline + .get_rel_exists(TESTREL_A, Lsn(0x80), false, &ctx) + .await?, true ); assert_eq!( - tline.get_rel_size(TESTREL_A, Lsn(0x80), false).await?, + tline + .get_rel_size(TESTREL_A, Lsn(0x80), false, &ctx) + .await?, relsize ); // Check relation content @@ -1484,7 +1614,7 @@ mod tests { let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x80), false) + .get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x80), false, &ctx) .await?, TEST_IMG(&data) ); @@ -1497,9 +1627,9 @@ mod tests { /// split into multiple 1 GB segments in Postgres. 
#[tokio::test] async fn test_large_rel() -> Result<()> { - let tenant = TenantHarness::create("test_large_rel")?.load().await; - let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?; - let mut walingest = init_walingest_test(&tline).await?; + let (tenant, ctx) = TenantHarness::create("test_large_rel")?.load().await; + let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION, &ctx)?; + let mut walingest = init_walingest_test(&tline, &ctx).await?; let mut lsn = 0x10; for blknum in 0..RELSEG_SIZE + 1 { @@ -1507,7 +1637,7 @@ mod tests { let mut m = tline.begin_modification(Lsn(lsn)); let img = TEST_IMG(&format!("foo blk {} at {}", blknum, Lsn(lsn))); walingest - .put_rel_page_image(&mut m, TESTREL_A, blknum as BlockNumber, img) + .put_rel_page_image(&mut m, TESTREL_A, blknum as BlockNumber, img, &ctx) .await?; m.commit()?; } @@ -1515,7 +1645,7 @@ mod tests { assert_current_logical_size(&tline, Lsn(lsn)); assert_eq!( - tline.get_rel_size(TESTREL_A, Lsn(lsn), false).await?, + tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?, RELSEG_SIZE + 1 ); @@ -1523,11 +1653,11 @@ mod tests { lsn += 0x10; let mut m = tline.begin_modification(Lsn(lsn)); walingest - .put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE) + .put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE, &ctx) .await?; m.commit()?; assert_eq!( - tline.get_rel_size(TESTREL_A, Lsn(lsn), false).await?, + tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?, RELSEG_SIZE ); assert_current_logical_size(&tline, Lsn(lsn)); @@ -1536,11 +1666,11 @@ mod tests { lsn += 0x10; let mut m = tline.begin_modification(Lsn(lsn)); walingest - .put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE - 1) + .put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE - 1, &ctx) .await?; m.commit()?; assert_eq!( - tline.get_rel_size(TESTREL_A, Lsn(lsn), false).await?, + tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?, RELSEG_SIZE - 1 ); assert_current_logical_size(&tline, Lsn(lsn)); @@ -1552,11 +1682,11 @@ mod tests { lsn += 0x10; let mut m = tline.begin_modification(Lsn(lsn)); walingest - .put_rel_truncation(&mut m, TESTREL_A, size as BlockNumber) + .put_rel_truncation(&mut m, TESTREL_A, size as BlockNumber, &ctx) .await?; m.commit()?; assert_eq!( - tline.get_rel_size(TESTREL_A, Lsn(lsn), false).await?, + tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?, size as BlockNumber ); diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index fd0524016f..c943bf0a27 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -22,16 +22,18 @@ use byteorder::{ByteOrder, LittleEndian}; use bytes::{BufMut, Bytes, BytesMut}; use nix::poll::*; use serde::Serialize; +use std::collections::VecDeque; use std::fs::OpenOptions; use std::io::prelude::*; use std::io::{Error, ErrorKind}; use std::ops::{Deref, DerefMut}; +use std::os::fd::RawFd; use std::os::unix::io::AsRawFd; use std::os::unix::prelude::CommandExt; use std::path::PathBuf; use std::process::Stdio; use std::process::{Child, ChildStderr, ChildStdin, ChildStdout, Command}; -use std::sync::Mutex; +use std::sync::{Mutex, MutexGuard}; use std::time::Duration; use std::time::Instant; use std::{fs, io}; @@ -90,6 +92,20 @@ pub trait WalRedoManager: Send + Sync { ) -> Result; } +struct ProcessInput { + child: NoLeakChild, + stdin: ChildStdin, + stderr_fd: RawFd, + stdout_fd: RawFd, + n_requests: usize, +} + +struct ProcessOutput { + stdout: ChildStdout, + pending_responses: VecDeque>, + n_processed_responses: usize, +} + /// /// This is the 
real implementation that uses a Postgres process to /// perform WAL replay. Only one thread can use the process at a time, @@ -101,7 +117,9 @@ pub struct PostgresRedoManager { tenant_id: TenantId, conf: &'static PageServerConf, - process: Mutex<Option<PostgresRedoProcess>>, + stdout: Mutex<Option<ProcessOutput>>, + stdin: Mutex<Option<ProcessInput>>, + stderr: Mutex<Option<ChildStderr>>, } /// Can this request be served by neon redo functions @@ -209,16 +227,17 @@ impl PostgresRedoManager { PostgresRedoManager { tenant_id, conf, - process: Mutex::new(None), + stdin: Mutex::new(None), + stdout: Mutex::new(None), + stderr: Mutex::new(None), } } /// Launch process pre-emptively. Should not be needed except for benchmarking. - pub fn launch_process(&mut self, pg_version: u32) -> anyhow::Result<()> { - let inner = self.process.get_mut().unwrap(); - if inner.is_none() { - let p = PostgresRedoProcess::launch(self.conf, self.tenant_id, pg_version)?; - *inner = Some(p); + pub fn launch_process(&self, pg_version: u32) -> anyhow::Result<()> { + let mut proc = self.stdin.lock().unwrap(); + if proc.is_none() { + self.launch(&mut proc, pg_version)?; } Ok(()) } @@ -241,22 +260,19 @@ impl PostgresRedoManager { let start_time = Instant::now(); - let mut process_guard = self.process.lock().unwrap(); + let mut proc = self.stdin.lock().unwrap(); let lock_time = Instant::now(); // launch the WAL redo process on first use - if process_guard.is_none() { - let p = PostgresRedoProcess::launch(self.conf, self.tenant_id, pg_version)?; - *process_guard = Some(p); + if proc.is_none() { + self.launch(&mut proc, pg_version)?; } - let process = process_guard.as_mut().unwrap(); - WAL_REDO_WAIT_TIME.observe(lock_time.duration_since(start_time).as_secs_f64()); // Relational WAL records are applied using wal-redo-postgres let buf_tag = BufferTag { rel, blknum }; - let result = process - .apply_wal_records(buf_tag, base_img, records, wal_redo_timeout) + let result = self + .apply_wal_records(proc, buf_tag, base_img, records, wal_redo_timeout) .map_err(WalRedoError::IoError); let end_time = Instant::now(); @@ -295,8 +311,22 @@ impl PostgresRedoManager { base_img_lsn, lsn ); - let process = process_guard.take().unwrap(); - process.kill(); + // self.stdin holds the child's stdin plus only the raw fds of its stdout and stderr. + // Dropping it as part of take() doesn't close them. + // The owning objects (ChildStdout and ChildStderr) are stored in + // self.stdout and self.stderr, respectively. + // We intentionally keep them open here to avoid a race between + // currently running `apply_wal_records()` and a `launch()` call + // after we return here. + // The currently running `apply_wal_records()` must not read from + // the newly launched process. + // By keeping self.stdout and self.stderr open here, `launch()` will + // get different file descriptors for the new child's stdout and stderr, + // and hence the current `apply_wal_records()` calls will observe + // `output.stdout.as_raw_fd() != stdout_fd`. + if let Some(proc) = self.stdin.lock().unwrap().take() { + proc.child.kill_and_wait(); + } } result } @@ -595,32 +625,23 @@ impl<C: CommandExt> CloseFileDescriptors for C { } } -/// -/// Handle to the Postgres WAL redo process -/// -struct PostgresRedoProcess { - tenant_id: TenantId, - child: NoLeakChild, - stdin: ChildStdin, - stdout: ChildStdout, - stderr: ChildStderr, -} - -impl PostgresRedoProcess { +impl PostgresRedoManager { // // Start postgres binary in special WAL redo mode.
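// A minimal, self-contained sketch (illustrative names, not the pageserver API)
// of the crash-detection idea described in the comment above: a reader records
// the raw fd of the child's stdout at the time it sends its request, and a
// relaunch is detected later because the newly created pipe gets a different fd
// while the old ChildStdout is still held open.
use std::io::{Error, ErrorKind, Read};
use std::os::unix::io::{AsRawFd, RawFd};
use std::process::ChildStdout;
use std::sync::Mutex;

fn read_page(
    stdout_slot: &Mutex<Option<ChildStdout>>,
    fd_at_send_time: RawFd,
    buf: &mut [u8],
) -> Result<usize, Error> {
    let mut guard = stdout_slot.lock().unwrap();
    let stdout = guard
        .as_mut()
        .ok_or_else(|| Error::new(ErrorKind::BrokenPipe, "WAL redo process not running"))?;
    if stdout.as_raw_fd() != fd_at_send_time {
        // The child was relaunched after our request was sent; its new stdout
        // carries some other request's data, so bail out instead of reading it.
        return Err(Error::new(ErrorKind::BrokenPipe, "WAL redo process restarted"));
    }
    stdout.read(buf)
}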
// - #[instrument(skip_all,fields(tenant_id=%tenant_id, pg_version=pg_version))] + #[instrument(skip_all,fields(tenant_id=%self.tenant_id, pg_version=pg_version))] fn launch( - conf: &PageServerConf, - tenant_id: TenantId, + &self, + input: &mut MutexGuard>, pg_version: u32, - ) -> Result { + ) -> Result<(), Error> { // FIXME: We need a dummy Postgres cluster to run the process in. Currently, we // just create one with constant name. That fails if you try to launch more than // one WAL redo manager concurrently. let datadir = path_with_suffix_extension( - conf.tenant_path(&tenant_id).join("wal-redo-datadir"), + self.conf + .tenant_path(&self.tenant_id) + .join("wal-redo-datadir"), TEMP_FILE_SUFFIX, ); @@ -634,10 +655,12 @@ impl PostgresRedoProcess { ) })?; } - let pg_bin_dir_path = conf + let pg_bin_dir_path = self + .conf .pg_bin_dir(pg_version) .map_err(|e| Error::new(ErrorKind::Other, format!("incorrect pg_bin_dir path: {e}")))?; - let pg_lib_dir_path = conf + let pg_lib_dir_path = self + .conf .pg_lib_dir(pg_version) .map_err(|e| Error::new(ErrorKind::Other, format!("incorrect pg_lib_dir path: {e}")))?; @@ -723,27 +746,31 @@ impl PostgresRedoProcess { // all fallible operations post-spawn are complete, so get rid of the guard let child = scopeguard::ScopeGuard::into_inner(child); - Ok(PostgresRedoProcess { - tenant_id, + **input = Some(ProcessInput { child, + stdout_fd: stdout.as_raw_fd(), + stderr_fd: stderr.as_raw_fd(), stdin, + n_requests: 0, + }); + + *self.stdout.lock().unwrap() = Some(ProcessOutput { stdout, - stderr, - }) + pending_responses: VecDeque::new(), + n_processed_responses: 0, + }); + *self.stderr.lock().unwrap() = Some(stderr); + + Ok(()) } - #[instrument(skip_all, fields(tenant_id=%self.tenant_id, pid=%self.child.id()))] - fn kill(self) { - self.child.kill_and_wait(); - } - - // // Apply given WAL records ('records') over an old page image. Returns // new page image. // - #[instrument(skip_all, fields(tenant_id=%self.tenant_id, pid=%self.child.id()))] + #[instrument(skip_all, fields(tenant_id=%self.tenant_id, pid=%input.as_ref().unwrap().child.id()))] fn apply_wal_records( - &mut self, + &self, + mut input: MutexGuard>, tag: BufferTag, base_img: Option, records: &[(Lsn, NeonWalRecord)], @@ -780,33 +807,23 @@ impl PostgresRedoProcess { build_get_page_msg(tag, &mut writebuf); WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64); - // The input is now in 'writebuf'. Do a blind write first, writing as much as - // we can, before calling poll(). That skips one call to poll() if the stdin is - // already available for writing, which it almost certainly is because the - // process is idle. - let mut nwrite = self.stdin.write(&writebuf)?; - - // We expect the WAL redo process to respond with an 8k page image. We read it - // into this buffer. 
- let mut resultbuf = vec![0; BLCKSZ.into()]; - let mut nresult: usize = 0; // # of bytes read into 'resultbuf' so far + let proc = input.as_mut().unwrap(); + let mut nwrite = 0usize; + let stdout_fd = proc.stdout_fd; // Prepare for calling poll() let mut pollfds = [ - PollFd::new(self.stdout.as_raw_fd(), PollFlags::POLLIN), - PollFd::new(self.stderr.as_raw_fd(), PollFlags::POLLIN), - PollFd::new(self.stdin.as_raw_fd(), PollFlags::POLLOUT), + PollFd::new(proc.stdin.as_raw_fd(), PollFlags::POLLOUT), + PollFd::new(proc.stderr_fd, PollFlags::POLLIN), + PollFd::new(stdout_fd, PollFlags::POLLIN), ]; - // We do three things simultaneously: send the old base image and WAL records to - // the child process's stdin, read the result from child's stdout, and forward any logging + // We do two things simultaneously: send the old base image and WAL records to + // the child process's stdin and forward any logging // information that the child writes to its stderr to the page server's log. - while nresult < BLCKSZ.into() { - // If we have more data to write, wake up if 'stdin' becomes writeable or - // we have data to read. Otherwise only wake up if there's data to read. - let nfds = if nwrite < writebuf.len() { 3 } else { 2 }; + while nwrite < writebuf.len() { let n = loop { - match nix::poll::poll(&mut pollfds[0..nfds], wal_redo_timeout.as_millis() as i32) { + match nix::poll::poll(&mut pollfds[0..2], wal_redo_timeout.as_millis() as i32) { Err(e) if e == nix::errno::Errno::EINTR => continue, res => break res, } @@ -820,14 +837,16 @@ impl PostgresRedoProcess { let err_revents = pollfds[1].revents().unwrap(); if err_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() { let mut errbuf: [u8; 16384] = [0; 16384]; - let n = self.stderr.read(&mut errbuf)?; + let mut stderr_guard = self.stderr.lock().unwrap(); + let stderr = stderr_guard.as_mut().unwrap(); + let len = stderr.read(&mut errbuf)?; // The message might not be split correctly into lines here. But this is // good enough, the important thing is to get the message to the log. - if n > 0 { + if len > 0 { error!( "wal-redo-postgres: {}", - String::from_utf8_lossy(&errbuf[0..n]) + String::from_utf8_lossy(&errbuf[0..len]) ); // To make sure we capture all log from the process if it fails, keep @@ -841,33 +860,157 @@ impl PostgresRedoProcess { )); } - // If we have more data to write and 'stdin' is writeable, do write. - if nwrite < writebuf.len() { - let in_revents = pollfds[2].revents().unwrap(); - if in_revents & (PollFlags::POLLERR | PollFlags::POLLOUT) != PollFlags::empty() { - nwrite += self.stdin.write(&writebuf[nwrite..])?; - } else if in_revents.contains(PollFlags::POLLHUP) { - // We still have more data to write, but the process closed the pipe. - return Err(Error::new( - ErrorKind::BrokenPipe, - "WAL redo process closed its stdin unexpectedly", - )); - } - } - - // If we have some data in stdout, read it to the result buffer. - let out_revents = pollfds[0].revents().unwrap(); - if out_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() { - nresult += self.stdout.read(&mut resultbuf[nresult..])?; - } else if out_revents.contains(PollFlags::POLLHUP) { + // If 'stdin' is writeable, do write. + let in_revents = pollfds[0].revents().unwrap(); + if in_revents & (PollFlags::POLLERR | PollFlags::POLLOUT) != PollFlags::empty() { + nwrite += proc.stdin.write(&writebuf[nwrite..])?; + } else if in_revents.contains(PollFlags::POLLHUP) { + // We still have more data to write, but the process closed the pipe. 
return Err(Error::new( ErrorKind::BrokenPipe, - "WAL redo process closed its stdout unexpectedly", + "WAL redo process closed its stdin unexpectedly", )); } } + let request_no = proc.n_requests; + proc.n_requests += 1; + drop(input); - Ok(Bytes::from(resultbuf)) + // To improve walredo performance we separate sending requests and receiving + // responses. They are protected by different mutexes (input and output). + // If threads T1, T2, T3 send requests D1, D2, D3 to the walredo process, + // there is no guarantee that T1 will be the first to acquire the output mutex. + // To address this we maintain the number of sent requests, the number of + // processed responses, and a ring buffer of pending responses. After sending + // a request (under the input mutex), a thread remembers its request number. + // It then releases the input mutex, locks the output mutex, and reads responses + // into the ring buffer until its own request number is covered. It then takes + // the corresponding element from the pending-responses ring buffer and truncates + // any empty elements from the front, advancing the processed-responses counter. + + let mut output_guard = self.stdout.lock().unwrap(); + let output = output_guard.as_mut().unwrap(); + if output.stdout.as_raw_fd() != stdout_fd { + // If the stdout file descriptor has changed, the walredo process has crashed and been restarted. + // Because ProcessInput and ProcessOutput are protected by different mutexes, + // it could happen that we send a request to one process and wait for the response from another. + // To prevent that situation we compare stdout file descriptors. + // Since the old stdout pipe is destroyed only after the new one is created, + // the new pipe cannot reuse the same file descriptor, so this check is safe. + // + // Cross-read this with the comment in apply_batch_postgres if result.is_err(). + // That's where we kill the child process. + return Err(Error::new( + ErrorKind::BrokenPipe, + "WAL redo process closed its stdout unexpectedly", + )); + } + let n_processed_responses = output.n_processed_responses; + while n_processed_responses + output.pending_responses.len() <= request_no { + // We expect the WAL redo process to respond with an 8k page image. We read it + // into this buffer. + let mut resultbuf = vec![0; BLCKSZ.into()]; + let mut nresult: usize = 0; // # of bytes read into 'resultbuf' so far + while nresult < BLCKSZ.into() { + // We do two things simultaneously: read the response from stdout + // and forward any logging information that the child writes to its stderr to the page server's log. + let n = loop { + match nix::poll::poll(&mut pollfds[1..3], wal_redo_timeout.as_millis() as i32) { + Err(e) if e == nix::errno::Errno::EINTR => continue, + res => break res, + } + }?; + + if n == 0 { + return Err(Error::new(ErrorKind::Other, "WAL redo timed out")); + } + + // If we have some messages in stderr, forward them to the log. + let err_revents = pollfds[1].revents().unwrap(); + if err_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() { + let mut errbuf: [u8; 16384] = [0; 16384]; + let mut stderr_guard = self.stderr.lock().unwrap(); + let stderr = stderr_guard.as_mut().unwrap(); + let len = stderr.read(&mut errbuf)?; + + // The message might not be split correctly into lines here. But this is + // good enough, the important thing is to get the message to the log.
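// A standalone sketch of the request/response pipelining scheme described in the
// comment above, reduced to just the bookkeeping (no child process, no polling).
// `Input`, `Output`, `Pipeline` and `read_next` are illustrative stand-ins, not
// the actual pageserver types; responses are Strings instead of page images.
use std::collections::VecDeque;
use std::sync::Mutex;

#[derive(Default)]
struct Input {
    n_requests: usize,
}

#[derive(Default)]
struct Output {
    pending_responses: VecDeque<Option<String>>,
    n_processed_responses: usize,
}

struct Pipeline {
    input: Mutex<Input>,
    output: Mutex<Output>,
}

impl Pipeline {
    // Send phase: under the input mutex, claim the next request number.
    // (The real code also writes the serialized request to the child's stdin here.)
    fn send(&self) -> usize {
        let mut input = self.input.lock().unwrap();
        let request_no = input.n_requests;
        input.n_requests += 1;
        request_no
    }

    // Receive phase: under the output mutex, pull responses into the ring buffer
    // until our own request number is covered, take our slot, then drain any
    // leading `None`s so the buffer does not grow without bound.
    fn receive(&self, request_no: usize, mut read_next: impl FnMut() -> String) -> String {
        let mut output = self.output.lock().unwrap();
        while output.n_processed_responses + output.pending_responses.len() <= request_no {
            let response = read_next(); // real code: poll() + read the child's stdout
            output.pending_responses.push_back(Some(response));
        }
        let idx = request_no - output.n_processed_responses;
        let res = output.pending_responses[idx]
            .take()
            .expect("we own this request_no, nobody else takes it");
        while let Some(front) = output.pending_responses.front() {
            if front.is_none() {
                output.pending_responses.pop_front();
                output.n_processed_responses += 1;
            } else {
                break;
            }
        }
        res
    }
}
// Because a response slot is taken exactly once, by the thread that owns the
// request number, a thread that grabs the output mutex "too early" simply reads
// and buffers other threads' responses without consuming them.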
+ if len > 0 { + error!( + "wal-redo-postgres: {}", + String::from_utf8_lossy(&errbuf[0..len]) + ); + + // To make sure we capture all log output from the process if it fails, keep + // reading from the stderr, before checking the stdout. + continue; + } + } else if err_revents.contains(PollFlags::POLLHUP) { + return Err(Error::new( + ErrorKind::BrokenPipe, + "WAL redo process closed its stderr unexpectedly", + )); + } + + // If we have some data in stdout, read it to the result buffer. + let out_revents = pollfds[2].revents().unwrap(); + if out_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() { + nresult += output.stdout.read(&mut resultbuf[nresult..])?; + } else if out_revents.contains(PollFlags::POLLHUP) { + return Err(Error::new( + ErrorKind::BrokenPipe, + "WAL redo process closed its stdout unexpectedly", + )); + } + } + output + .pending_responses + .push_back(Some(Bytes::from(resultbuf))); + } + // Replace our request's response with None in `pending_responses`. + // Then make space in the ring buffer by clearing out any sequence of contiguous + // `None`'s from the front of `pending_responses`. + // NB: We can't pop_front() other requests' responses, because another + // requester might have grabbed the output mutex before us: + // T1: grab input mutex + // T1: send request_no 23 + // T1: release input mutex + // T2: grab input mutex + // T2: send request_no 24 + // T2: release input mutex + // T2: grab output mutex + // T2: n_processed_responses + output.pending_responses.len() <= request_no + // 23 0 24 + // T2: enters poll loop that reads stdout + // T2: put response for 23 into pending_responses + // T2: put response for 24 into pending_responses + // pending_responses now looks like this: Front Some(response_23) Some(response_24) Back + // T2: takes its response_24 + // pending_responses now looks like this: Front Some(response_23) None Back + // T2: does the while loop below + // pending_responses now looks like this: Front Some(response_23) None Back + // T2: releases output mutex + // T1: grabs output mutex + // T1: n_processed_responses + output.pending_responses.len() > request_no + // 23 2 23 + // T1: skips poll loop that reads stdout + // T1: takes its response_23 + // pending_responses now looks like this: Front None None Back + // T1: does the while loop below + // pending_responses now looks like this: Front Back + // n_processed_responses now has value 25 + let res = output.pending_responses[request_no - n_processed_responses] + .take() + .expect("we own this request_no, nobody else is supposed to take it"); + while let Some(front) = output.pending_responses.front() { + if front.is_none() { + output.pending_responses.pop_front(); + output.n_processed_responses += 1; + } else { + break; + } + } + Ok(res) } } diff --git a/poetry.lock b/poetry.lock index edbcddd576..fc37124184 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,3 +1,21 @@ +[[package]] +name = "aiohttp" +version = "3.7.0" +description = "Async http client/server framework (asyncio)" +category = "main" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +async-timeout = ">=3.0,<4.0" +attrs = ">=17.3.0" +chardet = ">=2.0,<4.0" +multidict = ">=4.5,<7.0" +yarl = ">=1.0,<2.0" + +[package.extras] +speedups = ["aiodns", "brotlipy", "cchardet"] + [[package]] name = "aiopg" version = "1.3.4" @@ -41,11 +59,11 @@ six = ">=1.9.0" [[package]] name = "async-timeout" -version = "4.0.2" +version = "3.0.1" description = "Timeout context manager for asyncio programs" category = 
"main" optional = false -python-versions = ">=3.6" +python-versions = ">=3.5.3" [[package]] name = "asyncpg" @@ -560,6 +578,14 @@ networkx = ">=2.4,<3.0" pyyaml = ">5.4" sarif-om = ">=1.0.4,<1.1.0" +[[package]] +name = "chardet" +version = "3.0.4" +description = "Universal encoding detector for Python 2 and 3" +category = "main" +optional = false +python-versions = "*" + [[package]] name = "charset-normalizer" version = "2.1.0" @@ -939,6 +965,14 @@ server = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.4.0)" ssm = ["PyYAML (>=5.1)", "dataclasses"] xray = ["aws-xray-sdk (>=0.93,!=0.96)", "setuptools"] +[[package]] +name = "multidict" +version = "6.0.4" +description = "multidict implementation" +category = "main" +optional = false +python-versions = ">=3.7" + [[package]] name = "mypy" version = "0.991" @@ -1580,6 +1614,18 @@ category = "main" optional = false python-versions = ">=3.4" +[[package]] +name = "yarl" +version = "1.8.2" +description = "Yet another URL library" +category = "main" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +idna = ">=2.0" +multidict = ">=4.0" + [[package]] name = "zipp" version = "3.8.1" @@ -1595,9 +1641,44 @@ testing = ["func-timeout", "jaraco.itertools", "pytest (>=6)", "pytest-black (>= [metadata] lock-version = "1.1" python-versions = "^3.9" -content-hash = "af44b269c235a6fd59dacb4ff9e05cbc13a79b57254a8d5d4bde934bd5691a70" +content-hash = "0f7289ef9439d1d7cd36b07efb53741b773669b0f860189c800270b7def0c241" [metadata.files] +aiohttp = [ + {file = "aiohttp-3.7.0-cp36-cp36m-macosx_10_14_x86_64.whl", hash = "sha256:72fe89f7e14939e896d984c4b592580f8cdfa7497feb1c0c24639a9c60be3eb9"}, + {file = "aiohttp-3.7.0-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:fdf778d4c4bf976e69a37213fe8083613d0851976ddcf485bd7c0650a43d3852"}, + {file = "aiohttp-3.7.0-cp36-cp36m-manylinux2014_aarch64.whl", hash = "sha256:fee7b5e68939ffc09f9b29f167ed49c8b50de3eee0a1d8108b439ddd9963af46"}, + {file = "aiohttp-3.7.0-cp36-cp36m-manylinux2014_i686.whl", hash = "sha256:dd64634713be409202058f2ea267dfbcdd74b387b8793425f21ef0266d45d0e9"}, + {file = "aiohttp-3.7.0-cp36-cp36m-manylinux2014_ppc64le.whl", hash = "sha256:713dd7fd70ddda9dc8d014c49dd0e55b58afe4e0cddb8722c7501f53edf30c3f"}, + {file = "aiohttp-3.7.0-cp36-cp36m-manylinux2014_s390x.whl", hash = "sha256:d31c43f7c4948ce01957f9a1ceee0784e067778477557ebccdf805398331c1a1"}, + {file = "aiohttp-3.7.0-cp36-cp36m-manylinux2014_x86_64.whl", hash = "sha256:5e26d6003eb6df304608d9fd9c9437065a8532d869a3ffcbd8113a3d710f8239"}, + {file = "aiohttp-3.7.0-cp36-cp36m-win_amd64.whl", hash = "sha256:bf08462cddd10ddd8ffe5cb5c1638bfa051290909ebedb31c06e46578b9b7529"}, + {file = "aiohttp-3.7.0-cp37-cp37m-macosx_10_14_x86_64.whl", hash = "sha256:07bacf6721db51a4c6160ed3031a2a97910647969dafd7c653f600f3b542f463"}, + {file = "aiohttp-3.7.0-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:245b58e30bc889d18b783db2f09ef1d814f466e15c84325410827451297003a0"}, + {file = "aiohttp-3.7.0-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:b392e5c3e122586c49cd8b9426f577bf4d51958933b839d158d28b69515af74e"}, + {file = "aiohttp-3.7.0-cp37-cp37m-manylinux2014_i686.whl", hash = "sha256:5b5c320621a171aa85f96909af28fbb5286bd6842066db3062b083ba92261256"}, + {file = "aiohttp-3.7.0-cp37-cp37m-manylinux2014_ppc64le.whl", hash = "sha256:97d2341d1360dbe2c5b1d94922f7d68f9ce2ded1daab88b9bdeb49ce419cdc1b"}, + {file = "aiohttp-3.7.0-cp37-cp37m-manylinux2014_s390x.whl", hash = "sha256:beda23f292716887532661dc19abb9db2302ccfbd671a080cd8f4be7463d0841"}, + 
{file = "aiohttp-3.7.0-cp37-cp37m-manylinux2014_x86_64.whl", hash = "sha256:cbcaae9a6f14f762348d19b2dce8162772c0b0a1739314e18492a308a22caf96"}, + {file = "aiohttp-3.7.0-cp37-cp37m-win_amd64.whl", hash = "sha256:7a49ef7b691babc83db126db874fbf26ba2f781899b91399f9ff8b235f059245"}, + {file = "aiohttp-3.7.0-cp38-cp38-macosx_10_14_x86_64.whl", hash = "sha256:f56892f57310415cf6a179eec3ea6c7a82a9d37fbc00894943ea3154011a6d2a"}, + {file = "aiohttp-3.7.0-cp38-cp38-manylinux1_i686.whl", hash = "sha256:df1274b7620c32d3b15bfb0a8fb3165dd6cdc9c39f4db74d162f051c80826542"}, + {file = "aiohttp-3.7.0-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:a04ba359dc5f2e21b96bfc90c4a7665441441ba61b52e992b7799493889a3419"}, + {file = "aiohttp-3.7.0-cp38-cp38-manylinux2014_i686.whl", hash = "sha256:f548d7976d168f0f45ac5909ca5f606ae3f6f7aa1725b22504004a053b29a7d0"}, + {file = "aiohttp-3.7.0-cp38-cp38-manylinux2014_ppc64le.whl", hash = "sha256:deef02e2a9f5095463098c7c22d5566f20a6e4e14fc0996c0c2efc74d461b680"}, + {file = "aiohttp-3.7.0-cp38-cp38-manylinux2014_s390x.whl", hash = "sha256:fe44c96bc380588d36729392b602470d88a7c18e646e95dd4348cafe3900d91d"}, + {file = "aiohttp-3.7.0-cp38-cp38-manylinux2014_x86_64.whl", hash = "sha256:9210532e6e95b40d22a33415bb84423eef3f633b2d2339b97f3b26438eebc466"}, + {file = "aiohttp-3.7.0-cp38-cp38-win_amd64.whl", hash = "sha256:a586e476a251483d222c73dfb2f27df90bc4ea1b8c7da9396236510e0d4046c8"}, + {file = "aiohttp-3.7.0-cp39-cp39-macosx_10_14_x86_64.whl", hash = "sha256:900012c5f12ff72b1453229afe288ddc9135176df8b3b3cc5b8f6cfde912aaa4"}, + {file = "aiohttp-3.7.0-cp39-cp39-manylinux1_i686.whl", hash = "sha256:064d5f0738bcbab3e0c0ecf85c93b5ee1e07e124f994eaa03bf73687f3ecd9da"}, + {file = "aiohttp-3.7.0-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:0a2edf27865e66a33f64fa793cd14d0aae8127ce20a858539e97c25b600556dc"}, + {file = "aiohttp-3.7.0-cp39-cp39-manylinux2014_i686.whl", hash = "sha256:eaa8ae734639d5a0a3b5e33a154b8bfef384cdc090706f95c387cae8b21af764"}, + {file = "aiohttp-3.7.0-cp39-cp39-manylinux2014_ppc64le.whl", hash = "sha256:a8a42f05491d9c04a77806875a68f84fea9af7a59d47b7897cb166632f74606c"}, + {file = "aiohttp-3.7.0-cp39-cp39-manylinux2014_s390x.whl", hash = "sha256:b19ded3f6957693b97ba8372aacb5b0021639bbd5e77b1e960796bcef5431969"}, + {file = "aiohttp-3.7.0-cp39-cp39-manylinux2014_x86_64.whl", hash = "sha256:cefbd7ce7d1f1db43749a077e4970e29e2b631f367c9eff3862c3c886b4218dd"}, + {file = "aiohttp-3.7.0-cp39-cp39-win_amd64.whl", hash = "sha256:7d64f7dfd4e326d9b0d11b07fcd5ebf78844ba3c8f7699f38b50b0e0db0ae68f"}, + {file = "aiohttp-3.7.0.tar.gz", hash = "sha256:176f1d2b2bc07044f4ed583216578a72a2bd35dffdeb92e0517d0aaa29d29549"}, +] aiopg = [ {file = "aiopg-1.3.4-py3-none-any.whl", hash = "sha256:b5b74a124831aad71608c3c203479db90bac4a7eb3f8982bc48c3d3e6f1e57bf"}, {file = "aiopg-1.3.4.tar.gz", hash = "sha256:23f9e4cd9f28e9d91a6de3b4fb517e8bed25511cd954acccba9fe3a702d9b7d0"}, @@ -1611,8 +1692,8 @@ allure-python-commons = [ {file = "allure_python_commons-2.10.0-py3-none-any.whl", hash = "sha256:2a717e8ca8d296bf89cd57f38fc3c21893bd7ea8cd02a6ae5420e6d1a6eda5d0"}, ] async-timeout = [ - {file = "async-timeout-4.0.2.tar.gz", hash = "sha256:2163e1640ddb52b7a8c80d0a67a08587e5d245cc9c553a74a847056bc2976b15"}, - {file = "async_timeout-4.0.2-py3-none-any.whl", hash = "sha256:8ca1e4fcf50d07413d66d1a5e416e42cfdf5851c981d679a09851a6853383b3c"}, + {file = "async-timeout-3.0.1.tar.gz", hash = "sha256:0c3c816a028d47f659d6ff5c745cb2acf1f966da1fe5c19c77a70282b25f4c5f"}, + {file = 
"async_timeout-3.0.1-py3-none-any.whl", hash = "sha256:4291ca197d287d274d0b6cb5d6f8f8f82d434ed288f962539ff18cc9012f9ea3"}, ] asyncpg = [ {file = "asyncpg-0.27.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:fca608d199ffed4903dce1bcd97ad0fe8260f405c1c225bdf0002709132171c2"}, @@ -1787,6 +1868,10 @@ cfn-lint = [ {file = "cfn-lint-0.61.3.tar.gz", hash = "sha256:3806e010d77901f5e935496df690c10e39676434a738fce1a1161cf9c7bd36a2"}, {file = "cfn_lint-0.61.3-py3-none-any.whl", hash = "sha256:8e9522fad0c7c98b31ecbdd4724f8d8a5787457cc0f71e62ae0d11104d6e52ab"}, ] +chardet = [ + {file = "chardet-3.0.4-py2.py3-none-any.whl", hash = "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691"}, + {file = "chardet-3.0.4.tar.gz", hash = "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae"}, +] charset-normalizer = [ {file = "charset-normalizer-2.1.0.tar.gz", hash = "sha256:575e708016ff3a5e3681541cb9d79312c416835686d054a23accb873b254f413"}, {file = "charset_normalizer-2.1.0-py3-none-any.whl", hash = "sha256:5189b6f22b01957427f35b6a08d9a0bc45b46d3788ef5a92e978433c7a35f8a5"}, @@ -1960,6 +2045,82 @@ moto = [ {file = "moto-3.1.18-py3-none-any.whl", hash = "sha256:b6eb096e7880c46ac44d6d90988c0043e31462115cfdc913a0ee8f470bd9555c"}, {file = "moto-3.1.18.tar.gz", hash = "sha256:1e05276a62aa5a4aa821b441647c2cbaa2ea175388980b10d5de88d41b327cf7"}, ] +multidict = [ + {file = "multidict-6.0.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:0b1a97283e0c85772d613878028fec909f003993e1007eafa715b24b377cb9b8"}, + {file = "multidict-6.0.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:eeb6dcc05e911516ae3d1f207d4b0520d07f54484c49dfc294d6e7d63b734171"}, + {file = "multidict-6.0.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d6d635d5209b82a3492508cf5b365f3446afb65ae7ebd755e70e18f287b0adf7"}, + {file = "multidict-6.0.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c048099e4c9e9d615545e2001d3d8a4380bd403e1a0578734e0d31703d1b0c0b"}, + {file = "multidict-6.0.4-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ea20853c6dbbb53ed34cb4d080382169b6f4554d394015f1bef35e881bf83547"}, + {file = "multidict-6.0.4-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:16d232d4e5396c2efbbf4f6d4df89bfa905eb0d4dc5b3549d872ab898451f569"}, + {file = "multidict-6.0.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:36c63aaa167f6c6b04ef2c85704e93af16c11d20de1d133e39de6a0e84582a93"}, + {file = "multidict-6.0.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:64bdf1086b6043bf519869678f5f2757f473dee970d7abf6da91ec00acb9cb98"}, + {file = "multidict-6.0.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:43644e38f42e3af682690876cff722d301ac585c5b9e1eacc013b7a3f7b696a0"}, + {file = "multidict-6.0.4-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:7582a1d1030e15422262de9f58711774e02fa80df0d1578995c76214f6954988"}, + {file = "multidict-6.0.4-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:ddff9c4e225a63a5afab9dd15590432c22e8057e1a9a13d28ed128ecf047bbdc"}, + {file = "multidict-6.0.4-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:ee2a1ece51b9b9e7752e742cfb661d2a29e7bcdba2d27e66e28a99f1890e4fa0"}, + {file = "multidict-6.0.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:a2e4369eb3d47d2034032a26c7a80fcb21a2cb22e1173d761a162f11e562caa5"}, + {file = "multidict-6.0.4-cp310-cp310-win32.whl", hash = 
"sha256:574b7eae1ab267e5f8285f0fe881f17efe4b98c39a40858247720935b893bba8"}, + {file = "multidict-6.0.4-cp310-cp310-win_amd64.whl", hash = "sha256:4dcbb0906e38440fa3e325df2359ac6cb043df8e58c965bb45f4e406ecb162cc"}, + {file = "multidict-6.0.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:0dfad7a5a1e39c53ed00d2dd0c2e36aed4650936dc18fd9a1826a5ae1cad6f03"}, + {file = "multidict-6.0.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:64da238a09d6039e3bd39bb3aee9c21a5e34f28bfa5aa22518581f910ff94af3"}, + {file = "multidict-6.0.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ff959bee35038c4624250473988b24f846cbeb2c6639de3602c073f10410ceba"}, + {file = "multidict-6.0.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:01a3a55bd90018c9c080fbb0b9f4891db37d148a0a18722b42f94694f8b6d4c9"}, + {file = "multidict-6.0.4-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c5cb09abb18c1ea940fb99360ea0396f34d46566f157122c92dfa069d3e0e982"}, + {file = "multidict-6.0.4-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:666daae833559deb2d609afa4490b85830ab0dfca811a98b70a205621a6109fe"}, + {file = "multidict-6.0.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:11bdf3f5e1518b24530b8241529d2050014c884cf18b6fc69c0c2b30ca248710"}, + {file = "multidict-6.0.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7d18748f2d30f94f498e852c67d61261c643b349b9d2a581131725595c45ec6c"}, + {file = "multidict-6.0.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:458f37be2d9e4c95e2d8866a851663cbc76e865b78395090786f6cd9b3bbf4f4"}, + {file = "multidict-6.0.4-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:b1a2eeedcead3a41694130495593a559a668f382eee0727352b9a41e1c45759a"}, + {file = "multidict-6.0.4-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:7d6ae9d593ef8641544d6263c7fa6408cc90370c8cb2bbb65f8d43e5b0351d9c"}, + {file = "multidict-6.0.4-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:5979b5632c3e3534e42ca6ff856bb24b2e3071b37861c2c727ce220d80eee9ed"}, + {file = "multidict-6.0.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:dcfe792765fab89c365123c81046ad4103fcabbc4f56d1c1997e6715e8015461"}, + {file = "multidict-6.0.4-cp311-cp311-win32.whl", hash = "sha256:3601a3cece3819534b11d4efc1eb76047488fddd0c85a3948099d5da4d504636"}, + {file = "multidict-6.0.4-cp311-cp311-win_amd64.whl", hash = "sha256:81a4f0b34bd92df3da93315c6a59034df95866014ac08535fc819f043bfd51f0"}, + {file = "multidict-6.0.4-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:67040058f37a2a51ed8ea8f6b0e6ee5bd78ca67f169ce6122f3e2ec80dfe9b78"}, + {file = "multidict-6.0.4-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:853888594621e6604c978ce2a0444a1e6e70c8d253ab65ba11657659dcc9100f"}, + {file = "multidict-6.0.4-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:39ff62e7d0f26c248b15e364517a72932a611a9b75f35b45be078d81bdb86603"}, + {file = "multidict-6.0.4-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:af048912e045a2dc732847d33821a9d84ba553f5c5f028adbd364dd4765092ac"}, + {file = "multidict-6.0.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b1e8b901e607795ec06c9e42530788c45ac21ef3aaa11dbd0c69de543bfb79a9"}, + {file = "multidict-6.0.4-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:62501642008a8b9871ddfccbf83e4222cf8ac0d5aeedf73da36153ef2ec222d2"}, + {file = "multidict-6.0.4-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:99b76c052e9f1bc0721f7541e5e8c05db3941eb9ebe7b8553c625ef88d6eefde"}, + {file = "multidict-6.0.4-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:509eac6cf09c794aa27bcacfd4d62c885cce62bef7b2c3e8b2e49d365b5003fe"}, + {file = "multidict-6.0.4-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:21a12c4eb6ddc9952c415f24eef97e3e55ba3af61f67c7bc388dcdec1404a067"}, + {file = "multidict-6.0.4-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:5cad9430ab3e2e4fa4a2ef4450f548768400a2ac635841bc2a56a2052cdbeb87"}, + {file = "multidict-6.0.4-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:ab55edc2e84460694295f401215f4a58597f8f7c9466faec545093045476327d"}, + {file = "multidict-6.0.4-cp37-cp37m-win32.whl", hash = "sha256:5a4dcf02b908c3b8b17a45fb0f15b695bf117a67b76b7ad18b73cf8e92608775"}, + {file = "multidict-6.0.4-cp37-cp37m-win_amd64.whl", hash = "sha256:6ed5f161328b7df384d71b07317f4d8656434e34591f20552c7bcef27b0ab88e"}, + {file = "multidict-6.0.4-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:5fc1b16f586f049820c5c5b17bb4ee7583092fa0d1c4e28b5239181ff9532e0c"}, + {file = "multidict-6.0.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1502e24330eb681bdaa3eb70d6358e818e8e8f908a22a1851dfd4e15bc2f8161"}, + {file = "multidict-6.0.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:b692f419760c0e65d060959df05f2a531945af31fda0c8a3b3195d4efd06de11"}, + {file = "multidict-6.0.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:45e1ecb0379bfaab5eef059f50115b54571acfbe422a14f668fc8c27ba410e7e"}, + {file = "multidict-6.0.4-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ddd3915998d93fbcd2566ddf9cf62cdb35c9e093075f862935573d265cf8f65d"}, + {file = "multidict-6.0.4-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:59d43b61c59d82f2effb39a93c48b845efe23a3852d201ed2d24ba830d0b4cf2"}, + {file = "multidict-6.0.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cc8e1d0c705233c5dd0c5e6460fbad7827d5d36f310a0fadfd45cc3029762258"}, + {file = "multidict-6.0.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d6aa0418fcc838522256761b3415822626f866758ee0bc6632c9486b179d0b52"}, + {file = "multidict-6.0.4-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:6748717bb10339c4760c1e63da040f5f29f5ed6e59d76daee30305894069a660"}, + {file = "multidict-6.0.4-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:4d1a3d7ef5e96b1c9e92f973e43aa5e5b96c659c9bc3124acbbd81b0b9c8a951"}, + {file = "multidict-6.0.4-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:4372381634485bec7e46718edc71528024fcdc6f835baefe517b34a33c731d60"}, + {file = "multidict-6.0.4-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:fc35cb4676846ef752816d5be2193a1e8367b4c1397b74a565a9d0389c433a1d"}, + {file = "multidict-6.0.4-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:4b9d9e4e2b37daddb5c23ea33a3417901fa7c7b3dee2d855f63ee67a0b21e5b1"}, + {file = "multidict-6.0.4-cp38-cp38-win32.whl", hash = "sha256:e41b7e2b59679edfa309e8db64fdf22399eec4b0b24694e1b2104fb789207779"}, + {file = "multidict-6.0.4-cp38-cp38-win_amd64.whl", hash = "sha256:d6c254ba6e45d8e72739281ebc46ea5eb5f101234f3ce171f0e9f5cc86991480"}, + {file = "multidict-6.0.4-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:16ab77bbeb596e14212e7bab8429f24c1579234a3a462105cda4a66904998664"}, + 
{file = "multidict-6.0.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:bc779e9e6f7fda81b3f9aa58e3a6091d49ad528b11ed19f6621408806204ad35"}, + {file = "multidict-6.0.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:4ceef517eca3e03c1cceb22030a3e39cb399ac86bff4e426d4fc6ae49052cc60"}, + {file = "multidict-6.0.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:281af09f488903fde97923c7744bb001a9b23b039a909460d0f14edc7bf59706"}, + {file = "multidict-6.0.4-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:52f2dffc8acaba9a2f27174c41c9e57f60b907bb9f096b36b1a1f3be71c6284d"}, + {file = "multidict-6.0.4-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b41156839806aecb3641f3208c0dafd3ac7775b9c4c422d82ee2a45c34ba81ca"}, + {file = "multidict-6.0.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d5e3fc56f88cc98ef8139255cf8cd63eb2c586531e43310ff859d6bb3a6b51f1"}, + {file = "multidict-6.0.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8316a77808c501004802f9beebde51c9f857054a0c871bd6da8280e718444449"}, + {file = "multidict-6.0.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:f70b98cd94886b49d91170ef23ec5c0e8ebb6f242d734ed7ed677b24d50c82cf"}, + {file = "multidict-6.0.4-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:bf6774e60d67a9efe02b3616fee22441d86fab4c6d335f9d2051d19d90a40063"}, + {file = "multidict-6.0.4-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:e69924bfcdda39b722ef4d9aa762b2dd38e4632b3641b1d9a57ca9cd18f2f83a"}, + {file = "multidict-6.0.4-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:6b181d8c23da913d4ff585afd1155a0e1194c0b50c54fcfe286f70cdaf2b7176"}, + {file = "multidict-6.0.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:52509b5be062d9eafc8170e53026fbc54cf3b32759a23d07fd935fb04fc22d95"}, + {file = "multidict-6.0.4-cp39-cp39-win32.whl", hash = "sha256:27c523fbfbdfd19c6867af7346332b62b586eed663887392cff78d614f9ec313"}, + {file = "multidict-6.0.4-cp39-cp39-win_amd64.whl", hash = "sha256:33029f5734336aa0d4c0384525da0387ef89148dc7191aae00ca5fb23d7aafc2"}, + {file = "multidict-6.0.4.tar.gz", hash = "sha256:3666906492efb76453c0e7b97f2cf459b0682e7402c0489a95484965dbc1da49"}, +] mypy = [ {file = "mypy-0.991-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:7d17e0a9707d0772f4a7b878f04b4fd11f6f5bcb9b3813975a9b13c9332153ab"}, {file = "mypy-0.991-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0714258640194d75677e86c786e80ccf294972cc76885d3ebbb560f11db0003d"}, @@ -2412,6 +2573,82 @@ xmltodict = [ {file = "xmltodict-0.13.0-py2.py3-none-any.whl", hash = "sha256:aa89e8fd76320154a40d19a0df04a4695fb9dc5ba977cbb68ab3e4eb225e7852"}, {file = "xmltodict-0.13.0.tar.gz", hash = "sha256:341595a488e3e01a85a9d8911d8912fd922ede5fecc4dce437eb4b6c8d037e56"}, ] +yarl = [ + {file = "yarl-1.8.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:bb81f753c815f6b8e2ddd2eef3c855cf7da193b82396ac013c661aaa6cc6b0a5"}, + {file = "yarl-1.8.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:47d49ac96156f0928f002e2424299b2c91d9db73e08c4cd6742923a086f1c863"}, + {file = "yarl-1.8.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:3fc056e35fa6fba63248d93ff6e672c096f95f7836938241ebc8260e062832fe"}, + {file = "yarl-1.8.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:58a3c13d1c3005dbbac5c9f0d3210b60220a65a999b1833aa46bd6677c69b08e"}, + {file = 
"yarl-1.8.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:10b08293cda921157f1e7c2790999d903b3fd28cd5c208cf8826b3b508026996"}, + {file = "yarl-1.8.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:de986979bbd87272fe557e0a8fcb66fd40ae2ddfe28a8b1ce4eae22681728fef"}, + {file = "yarl-1.8.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c4fcfa71e2c6a3cb568cf81aadc12768b9995323186a10827beccf5fa23d4f8"}, + {file = "yarl-1.8.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ae4d7ff1049f36accde9e1ef7301912a751e5bae0a9d142459646114c70ecba6"}, + {file = "yarl-1.8.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:bf071f797aec5b96abfc735ab97da9fd8f8768b43ce2abd85356a3127909d146"}, + {file = "yarl-1.8.2-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:74dece2bfc60f0f70907c34b857ee98f2c6dd0f75185db133770cd67300d505f"}, + {file = "yarl-1.8.2-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:df60a94d332158b444301c7f569659c926168e4d4aad2cfbf4bce0e8fb8be826"}, + {file = "yarl-1.8.2-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:63243b21c6e28ec2375f932a10ce7eda65139b5b854c0f6b82ed945ba526bff3"}, + {file = "yarl-1.8.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:cfa2bbca929aa742b5084fd4663dd4b87c191c844326fcb21c3afd2d11497f80"}, + {file = "yarl-1.8.2-cp310-cp310-win32.whl", hash = "sha256:b05df9ea7496df11b710081bd90ecc3a3db6adb4fee36f6a411e7bc91a18aa42"}, + {file = "yarl-1.8.2-cp310-cp310-win_amd64.whl", hash = "sha256:24ad1d10c9db1953291f56b5fe76203977f1ed05f82d09ec97acb623a7976574"}, + {file = "yarl-1.8.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:2a1fca9588f360036242f379bfea2b8b44cae2721859b1c56d033adfd5893634"}, + {file = "yarl-1.8.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f37db05c6051eff17bc832914fe46869f8849de5b92dc4a3466cd63095d23dfd"}, + {file = "yarl-1.8.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:77e913b846a6b9c5f767b14dc1e759e5aff05502fe73079f6f4176359d832581"}, + {file = "yarl-1.8.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0978f29222e649c351b173da2b9b4665ad1feb8d1daa9d971eb90df08702668a"}, + {file = "yarl-1.8.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:388a45dc77198b2460eac0aca1efd6a7c09e976ee768b0d5109173e521a19daf"}, + {file = "yarl-1.8.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2305517e332a862ef75be8fad3606ea10108662bc6fe08509d5ca99503ac2aee"}, + {file = "yarl-1.8.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:42430ff511571940d51e75cf42f1e4dbdded477e71c1b7a17f4da76c1da8ea76"}, + {file = "yarl-1.8.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3150078118f62371375e1e69b13b48288e44f6691c1069340081c3fd12c94d5b"}, + {file = "yarl-1.8.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:c15163b6125db87c8f53c98baa5e785782078fbd2dbeaa04c6141935eb6dab7a"}, + {file = "yarl-1.8.2-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:4d04acba75c72e6eb90745447d69f84e6c9056390f7a9724605ca9c56b4afcc6"}, + {file = "yarl-1.8.2-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:e7fd20d6576c10306dea2d6a5765f46f0ac5d6f53436217913e952d19237efc4"}, + {file = "yarl-1.8.2-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:75c16b2a900b3536dfc7014905a128a2bea8fb01f9ee26d2d7d8db0a08e7cb2c"}, + {file = 
"yarl-1.8.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:6d88056a04860a98341a0cf53e950e3ac9f4e51d1b6f61a53b0609df342cc8b2"}, + {file = "yarl-1.8.2-cp311-cp311-win32.whl", hash = "sha256:fb742dcdd5eec9f26b61224c23baea46c9055cf16f62475e11b9b15dfd5c117b"}, + {file = "yarl-1.8.2-cp311-cp311-win_amd64.whl", hash = "sha256:8c46d3d89902c393a1d1e243ac847e0442d0196bbd81aecc94fcebbc2fd5857c"}, + {file = "yarl-1.8.2-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:ceff9722e0df2e0a9e8a79c610842004fa54e5b309fe6d218e47cd52f791d7ef"}, + {file = "yarl-1.8.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3f6b4aca43b602ba0f1459de647af954769919c4714706be36af670a5f44c9c1"}, + {file = "yarl-1.8.2-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1684a9bd9077e922300ecd48003ddae7a7474e0412bea38d4631443a91d61077"}, + {file = "yarl-1.8.2-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ebb78745273e51b9832ef90c0898501006670d6e059f2cdb0e999494eb1450c2"}, + {file = "yarl-1.8.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3adeef150d528ded2a8e734ebf9ae2e658f4c49bf413f5f157a470e17a4a2e89"}, + {file = "yarl-1.8.2-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:57a7c87927a468e5a1dc60c17caf9597161d66457a34273ab1760219953f7f4c"}, + {file = "yarl-1.8.2-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:efff27bd8cbe1f9bd127e7894942ccc20c857aa8b5a0327874f30201e5ce83d0"}, + {file = "yarl-1.8.2-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:a783cd344113cb88c5ff7ca32f1f16532a6f2142185147822187913eb989f739"}, + {file = "yarl-1.8.2-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:705227dccbe96ab02c7cb2c43e1228e2826e7ead880bb19ec94ef279e9555b5b"}, + {file = "yarl-1.8.2-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:34c09b43bd538bf6c4b891ecce94b6fa4f1f10663a8d4ca589a079a5018f6ed7"}, + {file = "yarl-1.8.2-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:a48f4f7fea9a51098b02209d90297ac324241bf37ff6be6d2b0149ab2bd51b37"}, + {file = "yarl-1.8.2-cp37-cp37m-win32.whl", hash = "sha256:0414fd91ce0b763d4eadb4456795b307a71524dbacd015c657bb2a39db2eab89"}, + {file = "yarl-1.8.2-cp37-cp37m-win_amd64.whl", hash = "sha256:d881d152ae0007809c2c02e22aa534e702f12071e6b285e90945aa3c376463c5"}, + {file = "yarl-1.8.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:5df5e3d04101c1e5c3b1d69710b0574171cc02fddc4b23d1b2813e75f35a30b1"}, + {file = "yarl-1.8.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:7a66c506ec67eb3159eea5096acd05f5e788ceec7b96087d30c7d2865a243918"}, + {file = "yarl-1.8.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:2b4fa2606adf392051d990c3b3877d768771adc3faf2e117b9de7eb977741229"}, + {file = "yarl-1.8.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1e21fb44e1eff06dd6ef971d4bdc611807d6bd3691223d9c01a18cec3677939e"}, + {file = "yarl-1.8.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:93202666046d9edadfe9f2e7bf5e0782ea0d497b6d63da322e541665d65a044e"}, + {file = "yarl-1.8.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fc77086ce244453e074e445104f0ecb27530d6fd3a46698e33f6c38951d5a0f1"}, + {file = "yarl-1.8.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:64dd68a92cab699a233641f5929a40f02a4ede8c009068ca8aa1fe87b8c20ae3"}, + {file = 
"yarl-1.8.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1b372aad2b5f81db66ee7ec085cbad72c4da660d994e8e590c997e9b01e44901"}, + {file = "yarl-1.8.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:e6f3515aafe0209dd17fb9bdd3b4e892963370b3de781f53e1746a521fb39fc0"}, + {file = "yarl-1.8.2-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:dfef7350ee369197106805e193d420b75467b6cceac646ea5ed3049fcc950a05"}, + {file = "yarl-1.8.2-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:728be34f70a190566d20aa13dc1f01dc44b6aa74580e10a3fb159691bc76909d"}, + {file = "yarl-1.8.2-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:ff205b58dc2929191f68162633d5e10e8044398d7a45265f90a0f1d51f85f72c"}, + {file = "yarl-1.8.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:baf211dcad448a87a0d9047dc8282d7de59473ade7d7fdf22150b1d23859f946"}, + {file = "yarl-1.8.2-cp38-cp38-win32.whl", hash = "sha256:272b4f1599f1b621bf2aabe4e5b54f39a933971f4e7c9aa311d6d7dc06965165"}, + {file = "yarl-1.8.2-cp38-cp38-win_amd64.whl", hash = "sha256:326dd1d3caf910cd26a26ccbfb84c03b608ba32499b5d6eeb09252c920bcbe4f"}, + {file = "yarl-1.8.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:f8ca8ad414c85bbc50f49c0a106f951613dfa5f948ab69c10ce9b128d368baf8"}, + {file = "yarl-1.8.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:418857f837347e8aaef682679f41e36c24250097f9e2f315d39bae3a99a34cbf"}, + {file = "yarl-1.8.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ae0eec05ab49e91a78700761777f284c2df119376e391db42c38ab46fd662b77"}, + {file = "yarl-1.8.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:009a028127e0a1755c38b03244c0bea9d5565630db9c4cf9572496e947137a87"}, + {file = "yarl-1.8.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3edac5d74bb3209c418805bda77f973117836e1de7c000e9755e572c1f7850d0"}, + {file = "yarl-1.8.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:da65c3f263729e47351261351b8679c6429151ef9649bba08ef2528ff2c423b2"}, + {file = "yarl-1.8.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0ef8fb25e52663a1c85d608f6dd72e19bd390e2ecaf29c17fb08f730226e3a08"}, + {file = "yarl-1.8.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bcd7bb1e5c45274af9a1dd7494d3c52b2be5e6bd8d7e49c612705fd45420b12d"}, + {file = "yarl-1.8.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:44ceac0450e648de86da8e42674f9b7077d763ea80c8ceb9d1c3e41f0f0a9951"}, + {file = "yarl-1.8.2-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:97209cc91189b48e7cfe777237c04af8e7cc51eb369004e061809bcdf4e55220"}, + {file = "yarl-1.8.2-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:48dd18adcf98ea9cd721a25313aef49d70d413a999d7d89df44f469edfb38a06"}, + {file = "yarl-1.8.2-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:e59399dda559688461762800d7fb34d9e8a6a7444fd76ec33220a926c8be1516"}, + {file = "yarl-1.8.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:d617c241c8c3ad5c4e78a08429fa49e4b04bedfc507b34b4d8dceb83b4af3588"}, + {file = "yarl-1.8.2-cp39-cp39-win32.whl", hash = "sha256:cb6d48d80a41f68de41212f3dfd1a9d9898d7841c8f7ce6696cf2fd9cb57ef83"}, + {file = "yarl-1.8.2-cp39-cp39-win_amd64.whl", hash = "sha256:6604711362f2dbf7160df21c416f81fac0de6dbcf0b5445a2ef25478ecc4c778"}, + {file = "yarl-1.8.2.tar.gz", hash = "sha256:49d43402c6e3013ad0978602bf6bf5328535c48d192304b91b97a3c6790b1562"}, +] zipp = [ {file = 
"zipp-3.8.1-py3-none-any.whl", hash = "sha256:47c40d7fe183a6f21403a199b3e4192cca5774656965b0a4988ad2f8feb5f009"}, {file = "zipp-3.8.1.tar.gz", hash = "sha256:05b45f1ee8f807d0cc928485ca40a07cb491cf092ff587c0df9cb1fd154848d2"}, diff --git a/proxy/src/main.rs b/proxy/src/main.rs index 5d44774df9..1b61ab108f 100644 --- a/proxy/src/main.rs +++ b/proxy/src/main.rs @@ -30,7 +30,7 @@ use std::{borrow::Cow, future::Future, net::SocketAddr}; use tokio::{net::TcpListener, task::JoinError}; use tracing::{info, info_span, Instrument}; use utils::project_git_version; -use utils::sentry_init::{init_sentry, release_name}; +use utils::sentry_init::init_sentry; project_git_version!(GIT_VERSION); @@ -49,7 +49,7 @@ async fn main() -> anyhow::Result<()> { .init(); // initialize sentry if SENTRY_DSN is provided - let _sentry_guard = init_sentry(release_name!(), &[]); + let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]); let arg_matches = cli().get_matches(); diff --git a/pyproject.toml b/pyproject.toml index b4fb7a9e7d..a817e9dda5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,6 +33,7 @@ psutil = "^5.9.4" types-psutil = "^5.9.5.4" types-toml = "^0.10.8" pytest-httpserver = "^1.0.6" +aiohttp = "3.7" [tool.poetry.dev-dependencies] flake8 = "^5.0.4" diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index b130ea86bd..1a068412c8 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -38,7 +38,7 @@ use utils::{ id::NodeId, logging::{self, LogFormat}, project_git_version, - sentry_init::{init_sentry, release_name}, + sentry_init::init_sentry, signals, tcp_listener, }; @@ -173,7 +173,10 @@ fn main() -> anyhow::Result<()> { }; // initialize sentry if SENTRY_DSN is provided - let _sentry_guard = init_sentry(release_name!(), &[("node_id", &conf.my_id.to_string())]); + let _sentry_guard = init_sentry( + Some(GIT_VERSION.into()), + &[("node_id", &conf.my_id.to_string())], + ); start_safekeeper(conf) } diff --git a/scripts/force_layer_download.py b/scripts/force_layer_download.py new file mode 100644 index 0000000000..5472d86d8f --- /dev/null +++ b/scripts/force_layer_download.py @@ -0,0 +1,324 @@ +import argparse +import asyncio +import json +import logging +import signal +import sys +from collections import defaultdict +from dataclasses import dataclass +from typing import Any, Awaitable, Dict, List, Tuple + +import aiohttp + + +class ClientException(Exception): + pass + + +class Client: + def __init__(self, pageserver_api_endpoint: str, max_concurrent_layer_downloads: int): + self.endpoint = pageserver_api_endpoint + self.max_concurrent_layer_downloads = max_concurrent_layer_downloads + self.sess = aiohttp.ClientSession() + + async def close(self): + await self.sess.close() + + async def __aenter__(self): + return self + + async def __aexit__(self, exc_t, exc_v, exc_tb): + await self.close() + + async def parse_response(self, resp, expected_type): + body = await resp.json() + if not resp.ok: + raise ClientException(f"Response: {resp} Body: {body}") + + if not isinstance(body, expected_type): + raise ClientException(f"expecting {expected_type.__name__}") + return body + + async def get_tenant_ids(self): + resp = await self.sess.get(f"{self.endpoint}/v1/tenant") + payload = await self.parse_response(resp=resp, expected_type=list) + return [t["id"] for t in payload] + + async def get_timeline_ids(self, tenant_id): + resp = await self.sess.get(f"{self.endpoint}/v1/tenant/{tenant_id}/timeline") + payload = await self.parse_response(resp=resp, 
expected_type=list) + return [t["timeline_id"] for t in payload] + + async def timeline_spawn_download_remote_layers(self, tenant_id, timeline_id, ongoing_ok=False): + resp = await self.sess.post( + f"{self.endpoint}/v1/tenant/{tenant_id}/timeline/{timeline_id}/download_remote_layers", + json={"max_concurrent_downloads": self.max_concurrent_layer_downloads}, + ) + body = await resp.json() + if resp.status == 409: + if not ongoing_ok: + raise ClientException("download already ongoing") + # response body has same shape for ongoing and newly created + elif not resp.ok: + raise ClientException(f"Response: {resp} Body: {body}") + + if not isinstance(body, dict): + raise ClientException("expecting dict") + + return body + + async def timeline_poll_download_remote_layers_status( + self, + tenant_id, + timeline_id, + ): + resp = await self.sess.get( + f"{self.endpoint}/v1/tenant/{tenant_id}/timeline/{timeline_id}/download_remote_layers", + ) + body = await resp.json() + + if resp.status == 404: + return None + elif not resp.ok: + raise ClientException(f"Response: {resp} Body: {body}") + + return body + + +@dataclass +class Completed: + """The status dict returned by the API""" + + status: Dict[str, Any] + + +sigint_received = asyncio.Event() + + +async def do_timeline(client: Client, tenant_id, timeline_id): + """ + Spawn download_remote_layers task for given timeline, + then poll until the download has reached a terminal state. + + If the terminal state is not 'Completed', the method raises an exception. + The caller is responsible for inspecting `failed_download_count`. + + If there is already a task going on when this method is invoked, + it raises an exception. + """ + + # Don't start new downloads if user pressed SIGINT. + # This task will show up as "raised_exception" in the report. + if sigint_received.is_set(): + raise Exception("not starting because SIGINT received") + + # run downloads to completion + + status = await client.timeline_poll_download_remote_layers_status(tenant_id, timeline_id) + if status is not None and status["state"] == "Running": + raise Exception("download is already running") + + spawned = await client.timeline_spawn_download_remote_layers( + tenant_id, timeline_id, ongoing_ok=False + ) + + while True: + st = await client.timeline_poll_download_remote_layers_status(tenant_id, timeline_id) + logging.info(f"{tenant_id}:{timeline_id} state is: {st}") + + if spawned["task_id"] != st["task_id"]: + raise ClientException("download task ids changed while polling") + + if st["state"] == "Running": + await asyncio.sleep(10) + continue + + if st["state"] != "Completed": + raise ClientException( + f"download task reached terminal state != Completed: {st['state']}" + ) + + return Completed(st) + + +def handle_sigint(): + logging.info("SIGINT received, asyncio event set. 
Will not start new downloads.") + global sigint_received + sigint_received.set() + + +async def main(args): + async with Client(args.pageserver_http_endpoint, args.max_concurrent_layer_downloads) as client: + exit_code = await main_impl(args, args.report_output, client) + + return exit_code + + +async def taskq_handler(task_q, result_q): + while True: + try: + (id, fut) = task_q.get_nowait() + except asyncio.QueueEmpty: + logging.debug("taskq_handler observed empty task_q, returning") + return + logging.info(f"starting task {id}") + try: + res = await fut + except Exception as e: + res = e + result_q.put_nowait((id, res)) + + +async def print_progress(result_q, tasks): + while True: + await asyncio.sleep(10) + logging.info(f"{result_q.qsize()} / {len(tasks)} tasks done") + + +async def main_impl(args, report_out, client: Client): + """ + Returns OS exit status. + """ + tenant_and_timline_ids: List[Tuple[str, str]] = [] + # fill tenant_and_timline_ids based on spec + for spec in args.what: + comps = spec.split(":") + if comps == ["ALL"]: + logging.info("get tenant list") + tenant_ids = await client.get_tenant_ids() + get_timeline_id_coros = [client.get_timeline_ids(tenant_id) for tenant_id in tenant_ids] + gathered = await asyncio.gather(*get_timeline_id_coros, return_exceptions=True) + assert len(tenant_ids) == len(gathered) + tenant_and_timline_ids = [] + for tid, tlids in zip(tenant_ids, gathered): + for tlid in tlids: + tenant_and_timline_ids.append((tid, tlid)) + elif len(comps) == 1: + tid = comps[0] + tlids = await client.get_timeline_ids(tid) + for tlid in tlids: + tenant_and_timline_ids.append((tid, tlid)) + elif len(comps) == 2: + tenant_and_timline_ids.append((comps[0], comps[1])) + else: + raise ValueError(f"invalid what-spec: {spec}") + + logging.info("expanded spec:") + for tid, tlid in tenant_and_timline_ids: + logging.info(f"{tid}:{tlid}") + + logging.info("remove duplicates after expanding spec") + tmp = list(set(tenant_and_timline_ids)) + assert len(tmp) <= len(tenant_and_timline_ids) + if len(tmp) != len(tenant_and_timline_ids): + logging.info(f"spec had {len(tenant_and_timline_ids) - len(tmp)} duplicates") + tenant_and_timline_ids = tmp + + logging.info("create tasks and process them at specified concurrency") + task_q: asyncio.Queue[Tuple[str, Awaitable[Any]]] = asyncio.Queue() + tasks = { + f"{tid}:{tlid}": do_timeline(client, tid, tlid) for tid, tlid in tenant_and_timline_ids + } + for task in tasks.items(): + task_q.put_nowait(task) + + result_q: asyncio.Queue[Tuple[str, Any]] = asyncio.Queue() + taskq_handlers = [] + for _ in range(0, args.concurrent_tasks): + taskq_handlers.append(taskq_handler(task_q, result_q)) + + print_progress_task = asyncio.create_task(print_progress(result_q, tasks)) + + await asyncio.gather(*taskq_handlers) + print_progress_task.cancel() + + logging.info("all tasks handled, generating report") + + results = [] + while True: + try: + results.append(result_q.get_nowait()) + except asyncio.QueueEmpty: + break + assert task_q.empty() + + report = defaultdict(list) + for id, result in results: + logging.info(f"result for {id}: {result}") + if isinstance(result, Completed): + if result.status["failed_download_count"] == 0: + report["completed_without_errors"].append(id) + else: + report["completed_with_download_errors"].append(id) + elif isinstance(result, Exception): + report["raised_exception"].append(id) + else: + raise ValueError("unexpected result type") + json.dump(report, report_out) + + 
logging.info("--------------------------------------------------------------------------------") + + report_success = len(report["completed_without_errors"]) == len(tenant_and_timline_ids) + if not report_success: + logging.error("One or more tasks encountered errors.") + else: + logging.info("All tasks reported success.") + logging.info("Inspect log for details and report file for JSON summary.") + + return report_success + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--report-output", + type=argparse.FileType("w"), + default="-", + help="where to write report output (default: stdout)", + ) + parser.add_argument( + "--pageserver-http-endpoint", + default="http://localhost:9898", + help="pageserver http endpoint, (default http://localhost:9898)", + ) + parser.add_argument( + "--concurrent-tasks", + required=False, + default=5, + type=int, + help="Max concurrent download tasks created & polled by this script", + ) + parser.add_argument( + "--max-concurrent-layer-downloads", + dest="max_concurrent_layer_downloads", + required=False, + default=8, + type=int, + help="Max concurrent download tasks spawned by pageserver. Each layer is a separate task.", + ) + + parser.add_argument( + "what", + nargs="+", + help="what to download: ALL|tenant_id|tenant_id:timeline_id", + ) + parser.add_argument( + "--verbose", + action="store_true", + help="enable verbose logging", + ) + args = parser.parse_args() + + level = logging.INFO + if args.verbose: + level = logging.DEBUG + logging.basicConfig( + format="%(asctime)s,%(msecs)03d %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s", + datefmt="%Y-%m-%d:%H:%M:%S", + level=level, + ) + + loop = asyncio.get_event_loop() + + loop.add_signal_handler(signal.SIGINT, handle_sigint) + sys.exit(asyncio.run(main(args))) diff --git a/storage_broker/src/bin/storage_broker.rs b/storage_broker/src/bin/storage_broker.rs index 6d80e96bf1..e33369bbb1 100644 --- a/storage_broker/src/bin/storage_broker.rs +++ b/storage_broker/src/bin/storage_broker.rs @@ -45,7 +45,7 @@ use storage_broker::{ use utils::id::TenantTimelineId; use utils::logging::{self, LogFormat}; use utils::project_git_version; -use utils::sentry_init::{init_sentry, release_name}; +use utils::sentry_init::init_sentry; project_git_version!(GIT_VERSION); @@ -425,7 +425,7 @@ async fn http1_handler( #[tokio::main] async fn main() -> Result<(), Box> { // initialize sentry if SENTRY_DSN is provided - let _sentry_guard = init_sentry(release_name!(), &[]); + let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]); let args = Args::parse(); diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index 8b78e06c22..bdaaa95216 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -46,6 +46,12 @@ PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS: Tuple[str, ...] = ( "pageserver_remote_physical_size", ) +PAGESERVER_GLOBAL_METRICS: Tuple[str, ...] = ( + "pageserver_storage_operations_seconds_global_count", + "pageserver_storage_operations_seconds_global_sum", + "pageserver_storage_operations_seconds_global_bucket", +) + PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = ( "pageserver_current_logical_size", "pageserver_resident_physical_size", @@ -61,13 +67,13 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] 
= ( "pageserver_smgr_query_seconds_bucket", "pageserver_smgr_query_seconds_count", "pageserver_smgr_query_seconds_sum", - "pageserver_storage_operations_seconds_bucket", - "pageserver_storage_operations_seconds_count", - "pageserver_storage_operations_seconds_sum", + "pageserver_storage_operations_seconds_count_total", + "pageserver_storage_operations_seconds_sum_total", "pageserver_wait_lsn_seconds_bucket", "pageserver_wait_lsn_seconds_count", "pageserver_wait_lsn_seconds_sum", "pageserver_created_persistent_files_total", "pageserver_written_persistent_bytes_total", + "pageserver_tenant_states_count", *PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS, ) diff --git a/test_runner/regress/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py index 29cdcb18ce..cbbf01a285 100644 --- a/test_runner/regress/test_tenant_conf.py +++ b/test_runner/regress/test_tenant_conf.py @@ -2,7 +2,15 @@ from contextlib import closing import psycopg2.extras from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.neon_fixtures import ( + LocalFsStorage, + NeonEnvBuilder, + RemoteStorageKind, + assert_tenant_status, + wait_for_upload, +) +from fixtures.types import Lsn +from fixtures.utils import wait_until def test_tenant_config(neon_env_builder: NeonEnvBuilder): @@ -57,7 +65,7 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}""" "compaction_period": 20, "compaction_threshold": 10, "gc_horizon": 67108864, - "gc_period": 100, + "gc_period": 60 * 60, "image_creation_threshold": 3, "pitr_interval": 604800, # 7 days }.items() @@ -158,3 +166,46 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}""" "pitr_interval": 60, }.items() ) + + +def test_creating_tenant_conf_after_attach(neon_env_builder: NeonEnvBuilder): + neon_env_builder.enable_remote_storage( + remote_storage_kind=RemoteStorageKind.LOCAL_FS, + test_name="test_creating_tenant_conf_after_attach", + ) + + env = neon_env_builder.init_start() + assert isinstance(env.remote_storage, LocalFsStorage) + + # tenant is created with defaults, as in without config file + (tenant_id, timeline_id) = env.neon_cli.create_tenant() + config_path = env.repo_dir / "tenants" / str(tenant_id) / "config" + assert config_path.exists(), "config file is always initially created" + + http_client = env.pageserver.http_client() + + detail = http_client.timeline_detail(tenant_id, timeline_id) + last_record_lsn = Lsn(detail["last_record_lsn"]) + assert last_record_lsn.lsn_int != 0, "initdb must have executed" + + wait_for_upload(http_client, tenant_id, timeline_id, last_record_lsn) + + http_client.tenant_detach(tenant_id) + + assert not config_path.exists(), "detach did not remove config file" + + http_client.tenant_attach(tenant_id) + wait_until( + number_of_iterations=5, + interval=1, + func=lambda: assert_tenant_status(http_client, tenant_id, "Active"), + ) + + env.neon_cli.config_tenant(tenant_id, {"gc_horizon": "1000000"}) + contents_first = config_path.read_text() + env.neon_cli.config_tenant(tenant_id, {"gc_horizon": "0"}) + contents_later = config_path.read_text() + + # dont test applying the setting here, we have that another test case to show it + # we just care about being able to create the file + assert len(contents_first) > len(contents_later) diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index db5bb679f2..6c3454b79b 100644 --- a/test_runner/regress/test_tenant_detach.py +++ 
b/test_runner/regress/test_tenant_detach.py @@ -6,6 +6,7 @@ from threading import Thread import asyncpg import pytest from fixtures.log_helper import log +from fixtures.metrics import parse_metrics from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, @@ -59,11 +60,11 @@ def test_tenant_reattach( # create new tenant tenant_id, timeline_id = env.neon_cli.create_tenant() - pg = env.postgres.create_start("main", tenant_id=tenant_id) - with pg.cursor() as cur: - cur.execute("CREATE TABLE t(key int primary key, value text)") - cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'") - current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) + with env.postgres.create_start("main", tenant_id=tenant_id) as pg: + with pg.cursor() as cur: + cur.execute("CREATE TABLE t(key int primary key, value text)") + cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'") + current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) # Wait for all the data to be processed by the pageserver and uploaded in remote storage wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, current_lsn) @@ -78,15 +79,34 @@ def test_tenant_reattach( ".*failed to perform remote task UploadMetadata.*, will retry.*" ) + ps_metrics = parse_metrics(pageserver_http.get_metrics(), "pageserver") + tenant_metric_filter = { + "tenant_id": str(tenant_id), + "timeline_id": str(timeline_id), + } + pageserver_last_record_lsn_before_detach = int( + ps_metrics.query_one("pageserver_last_record_lsn", filter=tenant_metric_filter).value + ) + pageserver_http.tenant_detach(tenant_id) pageserver_http.tenant_attach(tenant_id) - with pg.cursor() as cur: - assert query_scalar(cur, "SELECT count(*) FROM t") == 100000 + time.sleep(1) # for metrics propagation - # Check that we had to retry the downloads - assert env.pageserver.log_contains(".*list prefixes.*failed, will retry.*") - assert env.pageserver.log_contains(".*download.*failed, will retry.*") + ps_metrics = parse_metrics(pageserver_http.get_metrics(), "pageserver") + pageserver_last_record_lsn = int( + ps_metrics.query_one("pageserver_last_record_lsn", filter=tenant_metric_filter).value + ) + + assert pageserver_last_record_lsn_before_detach == pageserver_last_record_lsn + + with env.postgres.create_start("main", tenant_id=tenant_id) as pg: + with pg.cursor() as cur: + assert query_scalar(cur, "SELECT count(*) FROM t") == 100000 + + # Check that we had to retry the downloads + assert env.pageserver.log_contains(".*list prefixes.*failed, will retry.*") + assert env.pageserver.log_contains(".*download.*failed, will retry.*") num_connections = 10 @@ -237,7 +257,7 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() pageserver_http = env.pageserver.http_client() - env.pageserver.allowed_errors.append(".*NotFound\\(Tenant .* not found") + env.pageserver.allowed_errors.append(".*NotFound: Tenant .* not found") # first check for non existing tenant tenant_id = TenantId.generate() @@ -272,8 +292,10 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): bogus_timeline_id = TimelineId.generate() pageserver_http.timeline_gc(tenant_id, bogus_timeline_id, 0) - # the error will be printed to the log too + # the error will be printed to the log too env.pageserver.allowed_errors.append(".*gc target timeline does not exist.*") + # Timelines get stopped during detach; ignore the gc calls that fail because the timeline is stopping +
env.pageserver.allowed_errors.append(".*InternalServerError\\(timeline is Stopping.*") # Detach while running manual GC. # It should wait for manual GC to finish because it runs in a task associated with the tenant. diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 9477ae3c25..e56bb1b469 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -1,5 +1,6 @@ import os import shutil +import time from contextlib import closing from datetime import datetime from pathlib import Path @@ -8,6 +9,7 @@ from typing import List import pytest from fixtures.log_helper import log from fixtures.metrics import ( + PAGESERVER_GLOBAL_METRICS, PAGESERVER_PER_TENANT_METRICS, PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS, parse_metrics, @@ -160,6 +162,14 @@ def test_metrics_normal_work(neon_env_builder: NeonEnvBuilder): f"process_start_time_seconds (UTC): {datetime.fromtimestamp(metrics.query_one('process_start_time_seconds').value)}" ) + # Test (a subset of) pageserver global metrics + for metric in PAGESERVER_GLOBAL_METRICS: + ps_samples = ps_metrics.query_all(metric, {}) + assert len(ps_samples) > 0 + for sample in ps_samples: + labels = ",".join([f'{key}="{value}"' for key, value in sample.labels.items()]) + log.info(f"{sample.name}{{{labels}}} {sample.value}") + @pytest.mark.parametrize( "remote_storage_kind", @@ -259,7 +269,7 @@ def test_pageserver_with_empty_tenants( files_in_timelines_dir == 0 ), f"Tenant {tenant_with_empty_timelines_dir} should have an empty timelines/ directory" - # Trigger timeline reinitialization after pageserver restart + # Trigger timeline re-initialization after pageserver restart env.postgres.stop_all() env.pageserver.stop() @@ -278,7 +288,51 @@ def test_pageserver_with_empty_tenants( broken_tenant["state"] == "Broken" ), f"Tenant {tenant_without_timelines_dir} without timelines dir should be broken" + broken_tenant_status = client.tenant_status(tenant_without_timelines_dir) + assert ( + broken_tenant_status["state"] == "Broken" + ), f"Tenant {tenant_without_timelines_dir} without timelines dir should be broken" + + assert env.pageserver.log_contains(".*Setting tenant as Broken state, reason:.*") + [loaded_tenant] = [t for t in tenants if t["id"] == str(tenant_with_empty_timelines_dir)] assert ( loaded_tenant["state"] == "Active" ), "Tenant {tenant_with_empty_timelines_dir} with empty timelines dir should be active and ready for timeline creation" + + loaded_tenant_status = client.tenant_status(tenant_with_empty_timelines_dir) + assert ( + loaded_tenant_status["state"] == "Active" + ), f"Tenant {tenant_with_empty_timelines_dir} without timelines dir should be active" + + time.sleep(1) # to allow metrics propagation + + ps_metrics = parse_metrics(client.get_metrics(), "pageserver") + broken_tenants_metric_filter = { + "tenant_id": str(tenant_without_timelines_dir), + "state": "broken", + } + active_tenants_metric_filter = { + "tenant_id": str(tenant_with_empty_timelines_dir), + "state": "active", + } + + tenant_active_count = int( + ps_metrics.query_one( + "pageserver_tenant_states_count", filter=active_tenants_metric_filter + ).value + ) + + assert ( + tenant_active_count == 1 + ), f"Tenant {tenant_with_empty_timelines_dir} should have metric as active" + + tenant_broken_count = int( + ps_metrics.query_one( + "pageserver_tenant_states_count", filter=broken_tenants_metric_filter + ).value + ) + + assert ( + tenant_broken_count == 1 + ), f"Tenant {tenant_without_timelines_dir} should have 
metric as broken" diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index f4b71ae9b7..3a852b2207 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -20,7 +20,9 @@ clap = { version = "4", features = ["derive", "string"] } crossbeam-utils = { version = "0.8" } either = { version = "1" } fail = { version = "0.5", default-features = false, features = ["failpoints"] } +futures = { version = "0.3" } futures-channel = { version = "0.3", features = ["sink"] } +futures-executor = { version = "0.3" } futures-task = { version = "0.3", default-features = false, features = ["std"] } futures-util = { version = "0.3", features = ["channel", "io", "sink"] } indexmap = { version = "1", default-features = false, features = ["std"] } @@ -31,17 +33,21 @@ memchr = { version = "2" } nom = { version = "7" } num-bigint = { version = "0.4" } num-integer = { version = "0.1", features = ["i128"] } -num-traits = { version = "0.2", features = ["i128", "libm"] } +num-traits = { version = "0.2", features = ["i128"] } prost = { version = "0.11" } rand = { version = "0.8", features = ["small_rng"] } regex = { version = "1" } regex-syntax = { version = "0.6" } +reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] } +ring = { version = "0.16", features = ["std"] } +rustls = { version = "0.20", features = ["dangerous_configuration"] } scopeguard = { version = "1" } serde = { version = "1", features = ["alloc", "derive"] } serde_json = { version = "1", features = ["raw_value"] } socket2 = { version = "0.4", default-features = false, features = ["all"] } tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "sync", "time"] } tokio-util = { version = "0.7", features = ["codec", "io"] } +tonic = { version = "0.8", features = ["tls-roots"] } tower = { version = "0.4", features = ["balance", "buffer", "limit", "retry", "timeout", "util"] } tracing = { version = "0.1", features = ["log"] } tracing-core = { version = "0.1" }