Merge pull request #3656 from neondatabase/releases/2023-02-21

Release 2023-02-21
2026-06-04 22:10:39 +00:00 · 2023-02-21 16:03:48 +01:00
parent a41b5244a8 6508540561
commit acbf4148ea
80 changed files with 3051 additions and 2097 deletions
--- a/.dockerignore
+++ b/.dockerignore
@@ -21,3 +21,4 @@
 !workspace_hack/
 !neon_local/
 !scripts/ninstall.sh
+!vm-cgconfig.conf
--- a/.github/ansible/prod.ap-southeast-1.hosts.yaml
+++ b/.github/ansible/prod.ap-southeast-1.hosts.yaml
@@ -2,11 +2,11 @@ storage:
  vars:
    bucket_name: neon-prod-storage-ap-southeast-1
    bucket_region: ap-southeast-1
-    console_mgmt_base_url: http://console-release.local
+    console_mgmt_base_url: http://neon-internal-api.aws.neon.tech
    broker_endpoint: http://storage-broker-lb.epsilon.ap-southeast-1.internal.aws.neon.tech:50051
    pageserver_config_stub:
      pg_distrib_dir: /usr/local
-      metric_collection_endpoint: http://console-release.local/billing/api/v1/usage_events
+      metric_collection_endpoint: http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events
      metric_collection_interval: 10min
      remote_storage:
        bucket_name: "{{ bucket_name }}"
--- a/.github/ansible/prod.eu-central-1.hosts.yaml
+++ b/.github/ansible/prod.eu-central-1.hosts.yaml
@@ -2,11 +2,11 @@ storage:
  vars:
    bucket_name: neon-prod-storage-eu-central-1
    bucket_region: eu-central-1
-    console_mgmt_base_url: http://console-release.local
+    console_mgmt_base_url: http://neon-internal-api.aws.neon.tech
    broker_endpoint: http://storage-broker-lb.gamma.eu-central-1.internal.aws.neon.tech:50051
    pageserver_config_stub:
      pg_distrib_dir: /usr/local
-      metric_collection_endpoint: http://console-release.local/billing/api/v1/usage_events
+      metric_collection_endpoint: http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events
      metric_collection_interval: 10min
      remote_storage:
        bucket_name: "{{ bucket_name }}"
--- a/.github/ansible/prod.us-east-2.hosts.yaml
+++ b/.github/ansible/prod.us-east-2.hosts.yaml
@@ -2,11 +2,11 @@ storage:
  vars:
    bucket_name: neon-prod-storage-us-east-2
    bucket_region: us-east-2
-    console_mgmt_base_url: http://console-release.local
+    console_mgmt_base_url: http://neon-internal-api.aws.neon.tech
    broker_endpoint: http://storage-broker-lb.delta.us-east-2.internal.aws.neon.tech:50051
    pageserver_config_stub:
      pg_distrib_dir: /usr/local
-      metric_collection_endpoint: http://console-release.local/billing/api/v1/usage_events
+      metric_collection_endpoint: http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events
      metric_collection_interval: 10min
      remote_storage:
        bucket_name: "{{ bucket_name }}"
--- a/.github/ansible/prod.us-west-2.hosts.yaml
+++ b/.github/ansible/prod.us-west-2.hosts.yaml
@@ -2,11 +2,11 @@ storage:
  vars:
    bucket_name: neon-prod-storage-us-west-2
    bucket_region: us-west-2
-    console_mgmt_base_url: http://console-release.local
+    console_mgmt_base_url: http://neon-internal-api.aws.neon.tech
    broker_endpoint: http://storage-broker-lb.eta.us-west-2.internal.aws.neon.tech:50051
    pageserver_config_stub:
      pg_distrib_dir: /usr/local
-      metric_collection_endpoint: http://console-release.local/billing/api/v1/usage_events
+      metric_collection_endpoint: http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events
      metric_collection_interval: 10min
      remote_storage:
        bucket_name: "{{ bucket_name }}"
--- a/.github/ansible/production.hosts.yaml
+++ b/.github/ansible/production.hosts.yaml
@@ -1,40 +0,0 @@
---
-storage:
-  vars:
-    console_mgmt_base_url: http://console-release.local
-    bucket_name: zenith-storage-oregon
-    bucket_region: us-west-2
-    broker_endpoint: http://storage-broker.prod.local:50051
-    pageserver_config_stub:
-      pg_distrib_dir: /usr/local
-      metric_collection_endpoint: http://console-release.local/billing/api/v1/usage_events
-      metric_collection_interval: 10min
-      remote_storage:
-        bucket_name: "{{ bucket_name }}"
-        bucket_region: "{{ bucket_region }}"
-        prefix_in_bucket: "{{ inventory_hostname }}"
-    safekeeper_s3_prefix: prod-1/wal
-    hostname_suffix: ".local"
-    remote_user: admin
-    sentry_environment: production
-
-  children:
-    pageservers:
-      hosts:
-        zenith-1-ps-2:
-          console_region_id: aws-us-west-2
-        zenith-1-ps-3:
-          console_region_id: aws-us-west-2
-        zenith-1-ps-4:
-          console_region_id: aws-us-west-2
-        zenith-1-ps-5:
-          console_region_id: aws-us-west-2
-
-    safekeepers:
-      hosts:
-        zenith-1-sk-1:
-          console_region_id: aws-us-west-2
-        zenith-1-sk-2:
-          console_region_id: aws-us-west-2
-        zenith-1-sk-4:
-          console_region_id: aws-us-west-2
--- a/.github/ansible/staging.eu-west-1.hosts.yaml
+++ b/.github/ansible/staging.eu-west-1.hosts.yaml
@@ -2,12 +2,17 @@ storage:
  vars:
    bucket_name: neon-dev-storage-eu-west-1
    bucket_region: eu-west-1
-    console_mgmt_base_url: http://console-staging.local
+    console_mgmt_base_url: http://neon-internal-api.aws.neon.build
    broker_endpoint: http://storage-broker-lb.zeta.eu-west-1.internal.aws.neon.build:50051
    pageserver_config_stub:
      pg_distrib_dir: /usr/local
-      metric_collection_endpoint: http://console-staging.local/billing/api/v1/usage_events
+      metric_collection_endpoint: http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events
      metric_collection_interval: 10min
+      tenant_config:
+        eviction_policy:
+          kind: "LayerAccessThreshold"
+          period: "20m"
+          threshold: "20m"
      remote_storage:
        bucket_name: "{{ bucket_name }}"
        bucket_region: "{{ bucket_region }}"
--- a/.github/ansible/staging.us-east-2.hosts.yaml
+++ b/.github/ansible/staging.us-east-2.hosts.yaml
@@ -2,12 +2,17 @@ storage:
  vars:
    bucket_name: neon-staging-storage-us-east-2
    bucket_region: us-east-2
-    console_mgmt_base_url: http://console-staging.local
+    console_mgmt_base_url: http://neon-internal-api.aws.neon.build
    broker_endpoint: http://storage-broker-lb.beta.us-east-2.internal.aws.neon.build:50051
    pageserver_config_stub:
      pg_distrib_dir: /usr/local
-      metric_collection_endpoint: http://console-staging.local/billing/api/v1/usage_events
+      metric_collection_endpoint: http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events
      metric_collection_interval: 10min
+      tenant_config:
+        eviction_policy:
+          kind: "LayerAccessThreshold"
+          period: "20m"
+          threshold: "20m"
      remote_storage:
        bucket_name: "{{ bucket_name }}"
        bucket_region: "{{ bucket_region }}"
--- a/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml
+++ b/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml
@@ -6,11 +6,11 @@ image:

 settings:
  authBackend: "console"
-  authEndpoint: "http://console-staging.local/management/api/v2"
+  authEndpoint: "http://neon-internal-api.aws.neon.build/management/api/v2"
  domain: "*.eu-west-1.aws.neon.build"
  sentryEnvironment: "staging"
  wssPort: 8443
-  metricCollectionEndpoint: "http://console-staging.local/billing/api/v1/usage_events"
+  metricCollectionEndpoint: "http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events"
  metricCollectionInterval: "1min"

 # -- Additional labels for neon-proxy pods
--- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml
+++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml
@@ -10,7 +10,7 @@ settings:
  uri: "https://console.stage.neon.tech/psql_session/"
  domain: "pg.neon.build"
  sentryEnvironment: "staging"
-  metricCollectionEndpoint: "http://console-staging.local/billing/api/v1/usage_events"
+  metricCollectionEndpoint: "http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events"
  metricCollectionInterval: "1min"

 # -- Additional labels for neon-proxy-link pods
--- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml
+++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml
@@ -6,11 +6,11 @@ image:

 settings:
  authBackend: "console"
-  authEndpoint: "http://console-staging.local/management/api/v2"
+  authEndpoint: "http://neon-internal-api.aws.neon.build/management/api/v2"
  domain: "*.cloud.stage.neon.tech"
  sentryEnvironment: "staging"
  wssPort: 8443
-  metricCollectionEndpoint: "http://console-staging.local/billing/api/v1/usage_events"
+  metricCollectionEndpoint: "http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events"
  metricCollectionInterval: "1min"

 # -- Additional labels for neon-proxy pods
--- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml
+++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml
@@ -6,11 +6,11 @@ image:

 settings:
  authBackend: "console"
-  authEndpoint: "http://console-staging.local/management/api/v2"
+  authEndpoint: "http://neon-internal-api.aws.neon.build/management/api/v2"
  domain: "*.us-east-2.aws.neon.build"
  sentryEnvironment: "staging"
  wssPort: 8443
-  metricCollectionEndpoint: "http://console-staging.local/billing/api/v1/usage_events"
+  metricCollectionEndpoint: "http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events"
  metricCollectionInterval: "1min"

 # -- Additional labels for neon-proxy pods
--- a/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml
+++ b/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml
@@ -6,11 +6,11 @@ image:

 settings:
  authBackend: "console"
-  authEndpoint: "http://console-release.local/management/api/v2"
+  authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2"
  domain: "*.ap-southeast-1.aws.neon.tech"
  sentryEnvironment: "production"
  wssPort: 8443
-  metricCollectionEndpoint: "http://console-release.local/billing/api/v1/usage_events"
+  metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events"
  metricCollectionInterval: "10min"

 # -- Additional labels for neon-proxy pods
--- a/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml
+++ b/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml
@@ -6,11 +6,11 @@ image:

 settings:
  authBackend: "console"
-  authEndpoint: "http://console-release.local/management/api/v2"
+  authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2"
  domain: "*.eu-central-1.aws.neon.tech"
  sentryEnvironment: "production"
  wssPort: 8443
-  metricCollectionEndpoint: "http://console-release.local/billing/api/v1/usage_events"
+  metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events"
  metricCollectionInterval: "10min"

 # -- Additional labels for neon-proxy pods
--- a/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml
+++ b/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml
@@ -6,11 +6,11 @@ image:

 settings:
  authBackend: "console"
-  authEndpoint: "http://console-release.local/management/api/v2"
+  authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2"
  domain: "*.us-east-2.aws.neon.tech"
  sentryEnvironment: "production"
  wssPort: 8443
-  metricCollectionEndpoint: "http://console-release.local/billing/api/v1/usage_events"
+  metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events"
  metricCollectionInterval: "10min"

 # -- Additional labels for neon-proxy pods
--- a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram-legacy.yaml
+++ b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram-legacy.yaml
@@ -6,11 +6,11 @@ image:

 settings:
  authBackend: "console"
-  authEndpoint: "http://console-release.local/management/api/v2"
+  authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2"
  domain: "*.cloud.neon.tech"
  sentryEnvironment: "production"
  wssPort: 8443
-  metricCollectionEndpoint: "http://console-release.local/billing/api/v1/usage_events"
+  metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events"
  metricCollectionInterval: "10min"

 # -- Additional labels for neon-proxy pods
--- a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml
+++ b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml
@@ -6,11 +6,11 @@ image:

 settings:
  authBackend: "console"
-  authEndpoint: "http://console-release.local/management/api/v2"
+  authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2"
  domain: "*.us-west-2.aws.neon.tech"
  sentryEnvironment: "production"
  wssPort: 8443
-  metricCollectionEndpoint: "http://console-release.local/billing/api/v1/usage_events"
+  metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events"
  metricCollectionInterval: "10min"

 # -- Additional labels for neon-proxy pods
--- a/.github/helm-values/production.neon-storage-broker.yaml
+++ b/.github/helm-values/production.neon-storage-broker.yaml
@@ -1,56 +0,0 @@
-# Helm chart values for neon-storage-broker
-podLabels:
-  neon_env: production
-  neon_service: storage-broker
-
-# Use L4 LB
-service:
-  # service.annotations -- Annotations to add to the service
-  annotations:
-    service.beta.kubernetes.io/aws-load-balancer-type: external  # use newer AWS Load Balancer Controller
-    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
-    service.beta.kubernetes.io/aws-load-balancer-scheme: internal  # deploy LB to private subnet
-    # assign service to this name at external-dns
-    external-dns.alpha.kubernetes.io/hostname: storage-broker.prod.local
-  # service.type -- Service type
-  type: LoadBalancer
-  # service.port -- broker listen port
-  port: 50051
-
-ingress:
-  enabled: false
-
-metrics:
-  enabled: true
-  serviceMonitor:
-    enabled: true
-    selector:
-      release: kube-prometheus-stack
-
-extraManifests:
-  - apiVersion: operator.victoriametrics.com/v1beta1
-    kind: VMServiceScrape
-    metadata:
-      name: "{{ include \"neon-storage-broker.fullname\" . }}"
-      labels:
-        helm.sh/chart: neon-storage-broker-{{ .Chart.Version }}
-        app.kubernetes.io/name: neon-storage-broker
-        app.kubernetes.io/instance: neon-storage-broker
-        app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
-        app.kubernetes.io/managed-by: Helm
-      namespace: "{{ .Release.Namespace }}"
-    spec:
-      selector:
-        matchLabels:
-          app.kubernetes.io/name: "neon-storage-broker"
-      endpoints:
-        - port: broker
-          path: /metrics
-          interval: 10s
-          scrapeTimeout: 10s
-      namespaceSelector:
-        matchNames:
-          - "{{ .Release.Namespace }}"
-
-settings:
-  sentryEnvironment: "production"
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -611,34 +611,31 @@ jobs:
      run:
        shell: sh -eu {0}
    env:
-      VM_INFORMANT_VERSION: 0.1.1
+      VM_BUILDER_VERSION: v0.4.6

    steps:
-      - name: Downloading latest vm-builder
+      - name: Checkout
+        uses: actions/checkout@v1
+        with:
+          fetch-depth: 0
+
+      - name: Downloading vm-builder
        run: |
-          curl -L https://github.com/neondatabase/neonvm/releases/latest/download/vm-builder -o vm-builder
+          curl -L https://github.com/neondatabase/neonvm/releases/download/$VM_BUILDER_VERSION/vm-builder -o vm-builder
          chmod +x vm-builder

      - name: Pulling compute-node image
        run: |
          docker pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}

-      - name: Downloading VM informant version ${{ env.VM_INFORMANT_VERSION }}
+      - name: Building VM compute-node rootfs
        run: |
-          curl -fL https://github.com/neondatabase/autoscaling/releases/download/${{ env.VM_INFORMANT_VERSION }}/vm-informant -o vm-informant
-          chmod +x vm-informant
-
-      - name: Adding VM informant to compute-node image
-        run: |
-          ID=$(docker create 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}})
-          docker cp vm-informant $ID:/bin/vm-informant
-          docker commit $ID temp-vm-compute-node
-          docker rm -f $ID
+          docker build -t temp-vm-compute-node --build-arg SRC_IMAGE=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} -f Dockerfile.vm-compute-node .

      - name: Build vm image
        run: |
          # note: as of 2023-01-12, vm-builder requires a trailing ":latest" for local images
-          ./vm-builder -src=temp-vm-compute-node:latest -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
+          ./vm-builder -use-inittab -src=temp-vm-compute-node:latest -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}

      - name: Pushing vm-compute-node image
        run: |
--- a/.github/workflows/deploy-prod.yml
+++ b/.github/workflows/deploy-prod.yml
@@ -165,80 +165,3 @@ jobs:
      - name: Deploy storage-broker
        run:
          helm upgrade neon-storage-broker-lb neondatabase/neon-storage-broker --namespace neon-storage-broker-lb --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${{ inputs.dockerTag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s
-
-  # Deploy to old account below          
-
-  deploy:
-    runs-on: prod
-    container:
-      image: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
-      options: --user root --privileged
-    if: inputs.deployStorage && inputs.disclamerAcknowledged
-    defaults:
-      run:
-        shell: bash
-    environment:
-      name: prod-old
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v3
-        with:
-          submodules: true
-          fetch-depth: 0
-          ref: ${{ inputs.branch }}
-
-      - name: Redeploy
-        run: |
-          export DOCKER_TAG=${{ inputs.dockerTag }}
-          cd "$(pwd)/.github/ansible"
-
-          ./get_binaries.sh
-
-          eval $(ssh-agent)
-          echo "${{ secrets.TELEPORT_SSH_KEY }}"  | tr -d '\n'| base64 --decode >ssh-key
-          echo "${{ secrets.TELEPORT_SSH_CERT }}" | tr -d '\n'| base64 --decode >ssh-key-cert.pub
-          chmod 0600 ssh-key
-          ssh-add ssh-key
-          rm -f ssh-key ssh-key-cert.pub
-          ANSIBLE_CONFIG=./ansible.cfg ansible-galaxy collection install sivel.toiletwater
-          ANSIBLE_CONFIG=./ansible.cfg ansible-playbook deploy.yaml -i production.hosts.yaml -e CONSOLE_API_TOKEN=${{ secrets.NEON_PRODUCTION_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }}
-          rm -f neon_install.tar.gz .neon_current_version
-
-      # Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ansible/collections': Permission denied
-      - name: Cleanup ansible folder
-        run: rm -rf ~/.ansible
-
-  deploy-storage-broker:
-    name: deploy storage broker on old staging and old prod
-    runs-on: [ self-hosted, gen3, small ]
-    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
-    if: inputs.deployStorageBroker && inputs.disclamerAcknowledged
-    defaults:
-      run:
-        shell: bash
-    environment:
-      name: prod-old
-    env:
-      KUBECONFIG: .kubeconfig
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v3
-        with:
-          submodules: true
-          fetch-depth: 0
-          ref: ${{ inputs.branch }}
-
-      - name: Store kubeconfig file
-        run: |
-          echo "${{ secrets.PRODUCTION_KUBECONFIG_DATA }}" | base64 --decode > ${KUBECONFIG}
-          chmod 0600 ${KUBECONFIG}
-
-      - name: Add neon helm chart
-        run: helm repo add neondatabase https://neondatabase.github.io/helm-charts
-
-      - name: Deploy storage-broker
-        run:
-          helm upgrade neon-storage-broker neondatabase/neon-storage-broker --namespace neon-storage-broker --create-namespace --install --atomic -f .github/helm-values/production.neon-storage-broker.yaml --set image.tag=${{ inputs.dockerTag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s
-
-      - name: Cleanup helm folder
-        run: rm -rf ~/.cache
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2110,6 +2110,16 @@ version = "0.3.16"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2a60c7ce501c71e03a9c9c0d35b861413ae925bd979cc7a4e30d060069aaac8d"

+[[package]]
+name = "mime_guess"
+version = "2.0.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4192263c238a5f0d0c6bfd21f336a313a4ce1c450542449ca191bb657b4642ef"
+dependencies = [
+ "mime",
+ "unicase",
+]
+
 [[package]]
 name = "minimal-lexical"
 version = "0.2.1"
@@ -2884,6 +2894,7 @@ dependencies = [
 "md5",
 "metrics",
 "once_cell",
+ "opentelemetry",
 "parking_lot",
 "pin-project-lite",
 "pq_proto",
@@ -2892,6 +2903,8 @@ dependencies = [
 "rcgen",
 "regex",
 "reqwest",
+ "reqwest-middleware",
+ "reqwest-tracing",
 "routerify",
 "rstest",
 "rustls",
@@ -2909,7 +2922,9 @@ dependencies = [
 "tokio-postgres-rustls",
 "tokio-rustls",
 "tracing",
+ "tracing-opentelemetry",
 "tracing-subscriber",
+ "tracing-utils",
 "url",
 "utils",
 "uuid",
@@ -3079,6 +3094,7 @@ dependencies = [
 "js-sys",
 "log",
 "mime",
+ "mime_guess",
 "once_cell",
 "percent-encoding",
 "pin-project-lite",
@@ -3098,6 +3114,36 @@ dependencies = [
 "winreg",
 ]

+[[package]]
+name = "reqwest-middleware"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4a1c03e9011a8c59716ad13115550469e081e2e9892656b0ba6a47c907921894"
+dependencies = [
+ "anyhow",
+ "async-trait",
+ "http",
+ "reqwest",
+ "serde",
+ "task-local-extensions",
+ "thiserror",
+]
+
+[[package]]
+name = "reqwest-tracing"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b739d87a6b2cf4743968ad2b4cef648fbe0204c19999509824425babb2097bce"
+dependencies = [
+ "async-trait",
+ "opentelemetry",
+ "reqwest",
+ "reqwest-middleware",
+ "task-local-extensions",
+ "tracing",
+ "tracing-opentelemetry",
+]
+
 [[package]]
 name = "ring"
 version = "0.16.20"
@@ -3790,6 +3836,15 @@ dependencies = [
 "xattr",
 ]

+[[package]]
+name = "task-local-extensions"
+version = "0.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4167afbec18ae012de40f8cf1b9bf48420abb390678c34821caa07d924941cc4"
+dependencies = [
+ "tokio",
+]
+
 [[package]]
 name = "tempfile"
 version = "3.3.0"
@@ -3809,6 +3864,8 @@ name = "tenant_size_model"
 version = "0.1.0"
 dependencies = [
 "anyhow",
+ "serde",
+ "serde_json",
 "workspace_hack",
 ]

@@ -4358,6 +4415,15 @@ dependencies = [
 "libc",
 ]

+[[package]]
+name = "unicase"
+version = "2.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "50f37be617794602aabbeee0be4f259dc1778fabe05e2d67ee8f79326d5cb4f6"
+dependencies = [
+ "version_check",
+]
+
 [[package]]
 name = "unicode-bidi"
 version = "0.3.10"
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -76,6 +76,8 @@ prost = "0.11"
 rand = "0.8"
 regex = "1.4"
 reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] }
+reqwest-tracing = { version = "0.4.0", features = ["opentelemetry_0_18"] }
+reqwest-middleware = "0.2.0"
 routerify = "3"
 rpds = "0.12.0"
 rustls = "0.20"
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -11,7 +11,7 @@ FROM debian:bullseye-slim AS build-deps
 RUN apt update &&  \
    apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \
    zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libssl-dev \
-    libicu-dev
+    libicu-dev libxslt1-dev

 #########################################################################################
 #
@@ -23,7 +23,8 @@ FROM build-deps AS pg-build
 ARG PG_VERSION
 COPY vendor/postgres-${PG_VERSION} postgres
 RUN cd postgres && \
-    ./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp --with-icu && \
+    ./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp --with-icu \
+    --with-libxml --with-libxslt && \
    make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \
    make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \
    # Install headers
@@ -34,7 +35,8 @@ RUN cd postgres && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrowlocks.control && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/intagg.control && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgstattuple.control && \
-    echo 'trusted = true' >> /usr/local/pgsql/share/extension/earthdistance.control
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/earthdistance.control && \
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/xml2.control

 #########################################################################################
 #
@@ -50,15 +52,15 @@ RUN apt update && \
    libcgal-dev libgdal-dev libgmp-dev libmpfr-dev libopenscenegraph-dev libprotobuf-c-dev \
    protobuf-c-compiler xsltproc

-RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar.gz && \
-    tar zxvf SFCGAL-v1.3.10.tar.gz && \
-    cd SFCGAL-v1.3.10 && cmake . && make -j $(getconf _NPROCESSORS_ONLN) && \
+# SFCGAL > 1.3 requires CGAL > 5.2, Bullseye's libcgal-dev is 5.2
+RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar.gz -O SFCGAL.tar.gz && \
+    mkdir sfcgal-src && cd sfcgal-src && tar xvzf ../SFCGAL.tar.gz --strip-components=1 -C . && \
+    cmake . && make -j $(getconf _NPROCESSORS_ONLN) && \
    DESTDIR=/sfcgal make install -j $(getconf _NPROCESSORS_ONLN) && \
    make clean && cp -R /sfcgal/* /

-RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.1.tar.gz && \
-    tar xvzf postgis-3.3.1.tar.gz && \
-    cd postgis-3.3.1 && \
+RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.2.tar.gz -O postgis.tar.gz && \
+    mkdir postgis-src && cd postgis-src && tar xvzf ../postgis.tar.gz --strip-components=1 -C . && \
    ./autogen.sh && \
    export PATH="/usr/local/pgsql/bin:$PATH" && \
    ./configure --with-sfcgal=/usr/local/bin/sfcgal-config && \
@@ -83,29 +85,14 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.1.tar.gz && \
 FROM build-deps AS plv8-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 RUN apt update && \
-    apt install -y ninja-build python3-dev libc++-dev libc++abi-dev libncurses5 binutils
+    apt install -y ninja-build python3-dev libncurses5 binutils clang

-# https://github.com/plv8/plv8/issues/475:
-#   v8 uses gold for linking and sets `--thread-count=4` which breaks
-#   gold version <= 1.35 (https://sourceware.org/bugzilla/show_bug.cgi?id=23607)
-# Install newer gold version manually as debian-testing binutils version updates
-# libc version, which in turn breaks other extension built against non-testing libc.
-RUN wget https://ftp.gnu.org/gnu/binutils/binutils-2.38.tar.gz && \
-    tar xvzf binutils-2.38.tar.gz && \
-    cd binutils-2.38 && \
-    cd libiberty && ./configure && make -j $(getconf _NPROCESSORS_ONLN) && \
-    cd ../bfd && ./configure && make bfdver.h && \
-    cd ../gold && ./configure && make -j $(getconf _NPROCESSORS_ONLN) && make install && \
-    cp /usr/local/bin/ld.gold /usr/bin/gold
-
-# Sed is used to patch for https://github.com/plv8/plv8/issues/503
-RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \
-    tar xvzf v3.1.4.tar.gz && \
-    cd plv8-3.1.4 && \
+RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.5.tar.gz -O plv8.tar.gz && \
+    mkdir plv8-src && cd plv8-src && tar xvzf ../plv8.tar.gz --strip-components=1 -C . && \
    export PATH="/usr/local/pgsql/bin:$PATH" && \
-    sed -i 's/MemoryContextAlloc(/MemoryContextAllocZero(/' plv8.cc && \
    make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \
    rm -rf /plv8-* && \
+    find /usr/local/pgsql/ -name "plv8-*.so" | xargs strip && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/plcoffee.control && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/plls.control
@@ -126,20 +113,17 @@ RUN wget https://github.com/Kitware/CMake/releases/download/v3.24.2/cmake-3.24.2
      && /tmp/cmake-install.sh --skip-license --prefix=/usr/local/ \
      && rm /tmp/cmake-install.sh

-RUN wget https://github.com/uber/h3/archive/refs/tags/v4.0.1.tar.gz -O h3.tgz && \
-    tar xvzf h3.tgz  && \
-    cd h3-4.0.1 && \
-    mkdir build && \
-    cd build && \
+RUN wget https://github.com/uber/h3/archive/refs/tags/v4.1.0.tar.gz -O h3.tar.gz && \
+    mkdir h3-src && cd h3-src && tar xvzf ../h3.tar.gz --strip-components=1 -C . && \
+    mkdir build && cd build && \
    cmake .. -DCMAKE_BUILD_TYPE=Release && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    DESTDIR=/h3 make install && \
    cp -R /h3/usr / && \
    rm -rf build

-RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.0.1.tar.gz -O h3-pg.tgz && \
-    tar xvzf h3-pg.tgz && \
-    cd h3-pg-4.0.1 && \
+RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.2.tar.gz -O h3-pg.tar.gz && \
+    mkdir h3-pg-src && cd h3-pg-src && tar xvzf ../h3-pg.tar.gz --strip-components=1 -C . && \
    export PATH="/usr/local/pgsql/bin:$PATH" && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
@@ -155,9 +139,8 @@ RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.0.1.tar.gz -O h3
 FROM build-deps AS unit-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz && \
-    tar xvzf 7.7.tar.gz && \
-    cd postgresql-unit-7.7 && \
+RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -O postgresql-unit.tar.gz && \
+    mkdir postgresql-unit-src && cd postgresql-unit-src && tar xvzf ../postgresql-unit.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    # unit extension's "create extension" script relies on absolute install path to fill some reference tables.
@@ -176,12 +159,27 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz &
 FROM build-deps AS vector-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-RUN git clone --branch v0.4.0 https://github.com/pgvector/pgvector.git && \
-    cd pgvector && \
+RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.4.0.tar.gz -O pgvector.tar.gz && \
+    mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/vector.control

+#########################################################################################
+#
+# Layer "pgjwt-pg-build"
+# compile pgjwt extension
+#
+#########################################################################################
+FROM build-deps AS pgjwt-pg-build
+COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
+
+# 9742dab1b2f297ad3811120db7b21451bca2d3c9 made on 13/11/2021
+RUN wget https://github.com/michelp/pgjwt/archive/9742dab1b2f297ad3811120db7b21451bca2d3c9.tar.gz -O pgjwt.tar.gz && \
+    mkdir pgjwt-src && cd pgjwt-src && tar xvzf ../pgjwt.tar.gz --strip-components=1 -C . && \
+    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgjwt.control
+
 #########################################################################################
 #
 # Layer "neon-pg-ext-build"
@@ -196,6 +194,7 @@ COPY --from=h3-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=h3-pg-build /h3/usr /
 COPY --from=unit-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=vector-pg-build /usr/local/pgsql/ /usr/local/pgsql/
+COPY --from=pgjwt-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY pgxn/ pgxn/

 RUN make -j $(getconf _NPROCESSORS_ONLN) \
@@ -258,6 +257,7 @@ COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-deb
 # libicu67, locales for collations (including ICU)
 # libossp-uuid16 for extension ossp-uuid
 # libgeos, libgdal, libsfcgal1, libproj and libprotobuf-c1 for PostGIS
+# libxml2, libxslt1.1 for xml2
 RUN apt update &&  \
    apt install --no-install-recommends -y \
        locales \
@@ -269,6 +269,8 @@ RUN apt update &&  \
        libproj19 \
        libprotobuf-c1 \
        libsfcgal1 \
+        libxml2 \
+        libxslt1.1 \
        gdb && \
    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
    localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8
--- a/Dockerfile.vm-compute-node
+++ b/Dockerfile.vm-compute-node
@@ -0,0 +1,32 @@
+# Note: this file *mostly* just builds on Dockerfile.compute-node
+
+ARG SRC_IMAGE
+ARG VM_INFORMANT_VERSION=v0.1.6
+
+# Pull VM informant and set up inittab
+FROM neondatabase/vm-informant:$VM_INFORMANT_VERSION as informant
+
+RUN set -e \
+	&& rm -f /etc/inittab \
+	&& touch /etc/inittab
+
+ADD vm-cgconfig.conf /etc/cgconfig.conf
+RUN set -e \
+	&& echo "::sysinit:cgconfigparser -l /etc/cgconfig.conf -s 1664" >> /etc/inittab \
+	&& echo "::respawn:su vm-informant -c '/usr/local/bin/vm-informant --auto-restart --cgroup=neon-postgres'" >> /etc/inittab
+
+# Combine, starting from non-VM compute node image.
+FROM $SRC_IMAGE as base
+
+# Temporarily set user back to root so we can run apt update and adduser
+USER root
+RUN apt update && \
+	apt install --no-install-recommends -y \
+        cgroup-tools
+RUN adduser vm-informant --disabled-password --no-create-home
+USER postgres
+
+COPY --from=informant /etc/inittab /etc/inittab
+COPY --from=informant /usr/bin/vm-informant /usr/local/bin/vm-informant
+
+ENTRYPOINT ["/usr/sbin/cgexec", "-g", "*:neon-postgres", "/usr/local/bin/compute_ctl"]
--- a/12
+++ b/12
@@ -136,9 +136,15 @@ neon-pg-ext-%: postgres-%

 .PHONY: neon-pg-ext-clean-%
 neon-pg-ext-clean-%:
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/pgxn/neon-$* -f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile clean
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/pgxn/neon_walredo-$* -f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile clean
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/pgxn/neon_test_utils-$* -f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile clean
+	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \
+	-C $(POSTGRES_INSTALL_DIR)/build/neon-$* \
+	-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile clean
+	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \
+	-C $(POSTGRES_INSTALL_DIR)/build/neon-walredo-$* \
+	-f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile clean
+	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \
+	-C $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-$* \
+	-f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile clean

 .PHONY: neon-pg-ext
 neon-pg-ext: \
--- a/README.md
+++ b/README.md
@@ -34,6 +34,11 @@ dnf install flex bison readline-devel zlib-devel openssl-devel \
  libseccomp-devel perl clang cmake postgresql postgresql-contrib protobuf-compiler \
  protobuf-devel
 ```
+* On Arch based systems, these packages are needed:
+```bash
+pacman -S base-devel readline zlib libseccomp openssl clang \
+postgresql-libs cmake postgresql protobuf
+```

 2. [Install Rust](https://www.rust-lang.org/tools/install)
 ```
@@ -83,9 +88,10 @@ cd neon

 # The preferred and default is to make a debug build. This will create a
 # demonstrably slower build than a release build. For a release build,
-# use "BUILD_TYPE=release make -j`nproc`"
+# use "BUILD_TYPE=release make -j`nproc` -s"
+# Remove -s for the verbose build log

-make -j`nproc`
+make -j`nproc` -s
 ```

 #### Building on OSX
@@ -99,9 +105,10 @@ cd neon

 # The preferred and default is to make a debug build. This will create a
 # demonstrably slower build than a release build. For a release build,
-# use "BUILD_TYPE=release make -j`sysctl -n hw.logicalcpu`"
+# use "BUILD_TYPE=release make -j`sysctl -n hw.logicalcpu` -s"
+# Remove -s for the verbose build log

-make -j`sysctl -n hw.logicalcpu`
+make -j`sysctl -n hw.logicalcpu` -s
 ```

 #### Dependency installation notes
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -44,7 +44,6 @@ use tracing::{error, info};

 use compute_tools::compute::{ComputeMetrics, ComputeNode, ComputeState, ComputeStatus};
 use compute_tools::http::api::launch_http_server;
-use compute_tools::informant::spawn_vm_informant_if_present;
 use compute_tools::logger::*;
 use compute_tools::monitor::launch_monitor;
 use compute_tools::params::*;
@@ -141,8 +140,6 @@ fn main() -> Result<()> {
    // requests, while configuration is still in progress.
    let _http_handle = launch_http_server(&compute).expect("cannot launch http endpoint thread");
    let _monitor_handle = launch_monitor(&compute).expect("cannot launch compute monitor thread");
-    // Also spawn the thread responsible for handling the VM informant -- if it's present
-    let _vm_informant_handle = spawn_vm_informant_if_present().expect("cannot launch VM informant");

    // Start Postgres
    let mut delay_exit = false;
--- a/compute_tools/src/informant.rs
+++ b/compute_tools/src/informant.rs
@@ -1,50 +0,0 @@
-use std::path::Path;
-use std::process;
-use std::thread;
-use std::time::Duration;
-use tracing::{info, warn};
-
-use anyhow::{Context, Result};
-
-const VM_INFORMANT_PATH: &str = "/bin/vm-informant";
-const RESTART_INFORMANT_AFTER_MILLIS: u64 = 5000;
-
-/// Launch a thread to start the VM informant if it's present (and restart, on failure)
-pub fn spawn_vm_informant_if_present() -> Result<Option<thread::JoinHandle<()>>> {
-    let exists = Path::new(VM_INFORMANT_PATH)
-        .try_exists()
-        .context("could not check if path exists")?;
-
-    if !exists {
-        return Ok(None);
-    }
-
-    Ok(Some(
-        thread::Builder::new()
-            .name("run-vm-informant".into())
-            .spawn(move || run_informant())?,
-    ))
-}
-
-fn run_informant() -> ! {
-    let restart_wait = Duration::from_millis(RESTART_INFORMANT_AFTER_MILLIS);
-
-    info!("starting VM informant");
-
-    loop {
-        let mut cmd = process::Command::new(VM_INFORMANT_PATH);
-        // Block on subprocess:
-        let result = cmd.status();
-
-        match result {
-            Err(e) => warn!("failed to run VM informant at {VM_INFORMANT_PATH:?}: {e}"),
-            Ok(status) if !status.success() => {
-                warn!("{VM_INFORMANT_PATH} exited with code {status:?}, retrying")
-            }
-            Ok(_) => info!("{VM_INFORMANT_PATH} ended gracefully (unexpectedly). Retrying"),
-        }
-
-        // Wait before retrying
-        thread::sleep(restart_wait);
-    }
-}
--- a/compute_tools/src/lib.rs
+++ b/compute_tools/src/lib.rs
@@ -8,7 +8,6 @@ pub mod http;
 #[macro_use]
 pub mod logger;
 pub mod compute;
-pub mod informant;
 pub mod monitor;
 pub mod params;
 pub mod pg_helpers;
--- a/docs/settings.md
+++ b/docs/settings.md
@@ -16,7 +16,7 @@ listen_http_addr = '127.0.0.1:9898'
 checkpoint_distance = '268435456' # in bytes
 checkpoint_timeout = '10m'

-gc_period = '100 s'
+gc_period = '1 hour'
 gc_horizon = '67108864'

 max_file_descriptors = '100'
@@ -101,7 +101,7 @@ away.

 #### gc_period

-Interval at which garbage collection is triggered. Default is 100 s.
+Interval at which garbage collection is triggered. Default is 1 hour.

 #### image_creation_threshold

@@ -109,7 +109,7 @@ L0 delta layer threshold for L1 image layer creation. Default is 3.

 #### pitr_interval

-WAL retention duration for PITR branching. Default is 30 days.
+WAL retention duration for PITR branching. Default is 7 days.

 #### walreceiver_connect_timeout

--- a/docs/synthetic-size.md
+++ b/docs/synthetic-size.md
@@ -37,7 +37,7 @@ The synthetic size is designed to:

 ## Terms & assumptions

- logical size is the size of a database *at a given point in
+- logical size is the size of a branch *at a given point in
  time*. It's the total size of all tables in all databases, as you
  see with "\l+" in psql for example, plus the Postgres SLRUs and some
  small amount of metadata. NOTE that currently, Neon does not include
--- a/libs/pq_proto/src/lib.rs
+++ b/libs/pq_proto/src/lib.rs
@@ -75,27 +75,36 @@ impl StartupMessageParams {
    /// taking into account all escape sequences but leaving them as-is.
    /// [`None`] means that there's no `options` in [`Self`].
    pub fn options_raw(&self) -> Option<impl Iterator<Item = &str>> {
-        // See `postgres: pg_split_opts`.
-        let mut last_was_escape = false;
-        let iter = self
-            .get("options")?
-            .split(move |c: char| {
-                // We split by non-escaped whitespace symbols.
-                let should_split = c.is_ascii_whitespace() && !last_was_escape;
-                last_was_escape = c == '\\' && !last_was_escape;
-                should_split
-            })
-            .filter(|s| !s.is_empty());
-
-        Some(iter)
+        self.get("options").map(Self::parse_options_raw)
    }

    /// Split command-line options according to PostgreSQL's logic,
    /// applying all escape sequences (using owned strings as needed).
    /// [`None`] means that there's no `options` in [`Self`].
    pub fn options_escaped(&self) -> Option<impl Iterator<Item = Cow<'_, str>>> {
+        self.get("options").map(Self::parse_options_escaped)
+    }
+
+    /// Split command-line options according to PostgreSQL's logic,
+    /// taking into account all escape sequences but leaving them as-is.
+    pub fn parse_options_raw(input: &str) -> impl Iterator<Item = &str> {
        // See `postgres: pg_split_opts`.
-        let iter = self.options_raw()?.map(|s| {
+        let mut last_was_escape = false;
+        input
+            .split(move |c: char| {
+                // We split by non-escaped whitespace symbols.
+                let should_split = c.is_ascii_whitespace() && !last_was_escape;
+                last_was_escape = c == '\\' && !last_was_escape;
+                should_split
+            })
+            .filter(|s| !s.is_empty())
+    }
+
+    /// Split command-line options according to PostgreSQL's logic,
+    /// applying all escape sequences (using owned strings as needed).
+    pub fn parse_options_escaped(input: &str) -> impl Iterator<Item = Cow<'_, str>> {
+        // See `postgres: pg_split_opts`.
+        Self::parse_options_raw(input).map(|s| {
            let mut preserve_next_escape = false;
            let escape = |c| {
                // We should remove '\\' unless it's preceded by '\\'.
@@ -108,9 +117,12 @@ impl StartupMessageParams {
                true => Cow::Owned(s.replace(escape, "")),
                false => Cow::Borrowed(s),
            }
-        });
+        })
+    }

-        Some(iter)
+    /// Iterate through key-value pairs in an arbitrary order.
+    pub fn iter(&self) -> impl Iterator<Item = (&str, &str)> {
+        self.params.iter().map(|(k, v)| (k.as_str(), v.as_str()))
    }

    // This function is mostly useful in tests.
--- a/libs/tenant_size_model/Cargo.toml
+++ b/libs/tenant_size_model/Cargo.toml
@@ -7,5 +7,7 @@ license.workspace = true

 [dependencies]
 anyhow.workspace = true
+serde.workspace = true
+serde_json.workspace = true

 workspace_hack.workspace = true
--- a/libs/tenant_size_model/src/calculation.rs
+++ b/libs/tenant_size_model/src/calculation.rs
@@ -0,0 +1,219 @@
+use crate::{SegmentMethod, SegmentSizeResult, SizeResult, StorageModel};
+
+//
+//                 *-g--*---D--->
+//                /
+//               /
+//              /                 *---b----*-B--->
+//             /                 /
+//            /                 /
+//      -----*--e---*-----f----* C
+//           E                  \
+//                               \
+//                                *--a---*---A-->
+//
+// If A and B need to be retained, is it cheaper to store
+// snapshot at C+a+b, or snapshots at A and B ?
+//
+// If D also needs to be retained, which is cheaper:
+//
+// 1. E+g+e+f+a+b
+// 2. D+C+a+b
+// 3. D+A+B
+
+/// [`Segment`] which has had it's size calculated.
+#[derive(Clone, Debug)]
+struct SegmentSize {
+    method: SegmentMethod,
+
+    // calculated size of this subtree, using this method
+    accum_size: u64,
+
+    seg_id: usize,
+    children: Vec<SegmentSize>,
+}
+
+struct SizeAlternatives {
+    // cheapest alternative if parent is available.
+    incremental: SegmentSize,
+
+    // cheapest alternative if parent node is not available
+    non_incremental: Option<SegmentSize>,
+}
+
+impl StorageModel {
+    pub fn calculate(&self) -> SizeResult {
+        // Build adjacency list. 'child_list' is indexed by segment id. Each entry
+        // contains a list of all child segments of the segment.
+        let mut roots: Vec<usize> = Vec::new();
+        let mut child_list: Vec<Vec<usize>> = Vec::new();
+        child_list.resize(self.segments.len(), Vec::new());
+
+        for (seg_id, seg) in self.segments.iter().enumerate() {
+            if let Some(parent_id) = seg.parent {
+                child_list[parent_id].push(seg_id);
+            } else {
+                roots.push(seg_id);
+            }
+        }
+
+        let mut segment_results = Vec::new();
+        segment_results.resize(
+            self.segments.len(),
+            SegmentSizeResult {
+                method: SegmentMethod::Skipped,
+                accum_size: 0,
+            },
+        );
+
+        let mut total_size = 0;
+        for root in roots {
+            if let Some(selected) = self.size_here(root, &child_list).non_incremental {
+                StorageModel::fill_selected_sizes(&selected, &mut segment_results);
+                total_size += selected.accum_size;
+            } else {
+                // Couldn't find any way to get this root. Error?
+            }
+        }
+
+        SizeResult {
+            total_size,
+            segments: segment_results,
+        }
+    }
+
+    fn fill_selected_sizes(selected: &SegmentSize, result: &mut Vec<SegmentSizeResult>) {
+        result[selected.seg_id] = SegmentSizeResult {
+            method: selected.method,
+            accum_size: selected.accum_size,
+        };
+        // recurse to children
+        for child in selected.children.iter() {
+            StorageModel::fill_selected_sizes(child, result);
+        }
+    }
+
+    //
+    // This is the core of the sizing calculation.
+    //
+    // This is a recursive function, that for each Segment calculates the best way
+    // to reach all the Segments that are marked as needed in this subtree, under two
+    // different conditions:
+    // a) when the parent of this segment is available (as a snaphot or through WAL), and
+    // b) when the parent of this segment is not available.
+    //
+    fn size_here(&self, seg_id: usize, child_list: &Vec<Vec<usize>>) -> SizeAlternatives {
+        let seg = &self.segments[seg_id];
+        // First figure out the best way to get each child
+        let mut children = Vec::new();
+        for child_id in &child_list[seg_id] {
+            children.push(self.size_here(*child_id, child_list))
+        }
+
+        // Method 1. If this node is not needed, we can skip it as long as we
+        // take snapshots later in each sub-tree
+        let snapshot_later = if !seg.needed {
+            let mut snapshot_later = SegmentSize {
+                seg_id,
+                method: SegmentMethod::Skipped,
+                accum_size: 0,
+                children: Vec::new(),
+            };
+
+            let mut possible = true;
+            for child in children.iter() {
+                if let Some(non_incremental) = &child.non_incremental {
+                    snapshot_later.accum_size += non_incremental.accum_size;
+                    snapshot_later.children.push(non_incremental.clone())
+                } else {
+                    possible = false;
+                    break;
+                }
+            }
+            if possible {
+                Some(snapshot_later)
+            } else {
+                None
+            }
+        } else {
+            None
+        };
+
+        // Method 2. Get a snapshot here. This assumed to be possible, if the 'size' of
+        // this Segment was given.
+        let snapshot_here = if !seg.needed || seg.parent.is_none() {
+            if let Some(snapshot_size) = seg.size {
+                let mut snapshot_here = SegmentSize {
+                    seg_id,
+                    method: SegmentMethod::SnapshotHere,
+                    accum_size: snapshot_size,
+                    children: Vec::new(),
+                };
+                for child in children.iter() {
+                    snapshot_here.accum_size += child.incremental.accum_size;
+                    snapshot_here.children.push(child.incremental.clone())
+                }
+                Some(snapshot_here)
+            } else {
+                None
+            }
+        } else {
+            None
+        };
+
+        // Method 3. Use WAL to get here from parent
+        let wal_here = {
+            let mut wal_here = SegmentSize {
+                seg_id,
+                method: SegmentMethod::Wal,
+                accum_size: if let Some(parent_id) = seg.parent {
+                    seg.lsn - self.segments[parent_id].lsn
+                } else {
+                    0
+                },
+                children: Vec::new(),
+            };
+            for child in children {
+                wal_here.accum_size += child.incremental.accum_size;
+                wal_here.children.push(child.incremental)
+            }
+            wal_here
+        };
+
+        // If the parent is not available, what's the cheapest method involving
+        // a snapshot here or later?
+        let mut cheapest_non_incremental: Option<SegmentSize> = None;
+        if let Some(snapshot_here) = snapshot_here {
+            cheapest_non_incremental = Some(snapshot_here);
+        }
+        if let Some(snapshot_later) = snapshot_later {
+            // Use <=, to prefer skipping if the size is equal
+            if let Some(parent) = &cheapest_non_incremental {
+                if snapshot_later.accum_size <= parent.accum_size {
+                    cheapest_non_incremental = Some(snapshot_later);
+                }
+            } else {
+                cheapest_non_incremental = Some(snapshot_later);
+            }
+        }
+
+        // And what's the cheapest method, if the parent is available?
+        let cheapest_incremental = if let Some(cheapest_non_incremental) = &cheapest_non_incremental
+        {
+            // Is it cheaper to use a snapshot here or later, anyway?
+            // Use <, to prefer Wal over snapshot if the cost is the same
+            if wal_here.accum_size < cheapest_non_incremental.accum_size {
+                wal_here
+            } else {
+                cheapest_non_incremental.clone()
+            }
+        } else {
+            wal_here
+        };
+
+        SizeAlternatives {
+            incremental: cheapest_incremental,
+            non_incremental: cheapest_non_incremental,
+        }
+    }
+}
--- a/libs/tenant_size_model/src/lib.rs
+++ b/libs/tenant_size_model/src/lib.rs
@@ -1,401 +1,70 @@
-use std::borrow::Cow;
-use std::collections::HashMap;
+//! Synthetic size calculation

-use anyhow::Context;
+mod calculation;
+pub mod svg;

-/// Pricing model or history size builder.
+/// StorageModel is the input to the synthetic size calculation. It represents
+/// a tree of timelines, with just the information that's needed for the
+/// calculation. This doesn't track timeline names or where each timeline
+/// begins and ends, for example. Instead, it consists of "points of interest"
+/// on the timelines. A point of interest could be the timeline start or end point,
+/// the oldest point on a timeline that needs to be retained because of PITR
+/// cutoff, or snapshot points named by the user. For each such point, and the
+/// edge connecting the points (implicit in Segment), we store information about
+/// whether we need to be able to recover to the point, and if known, the logical
+/// size at the point.
 ///
-/// Maintains knowledge of the branches and their modifications. Generic over the branch name key
-/// type.
-pub struct Storage<K: 'static> {
-    segments: Vec<Segment>,
-
-    /// Mapping from the branch name to the index of a segment describing it's latest state.
-    branches: HashMap<K, usize>,
+/// The segments must form a well-formed tree, with no loops.
+#[derive(serde::Serialize)]
+pub struct StorageModel {
+    pub segments: Vec<Segment>,
 }

-/// Snapshot of a branch.
-#[derive(Clone, Debug, Eq, PartialEq)]
+/// Segment represents one point in the tree of branches, *and* the edge that leads
+/// to it (if any). We don't need separate structs for points and edges, because each
+/// point can have only one parent.
+///
+/// When 'needed' is true, it means that we need to be able to reconstruct
+/// any version between 'parent.lsn' and 'lsn'. If you want to represent that only
+/// a single point is needed, create two Segments with the same lsn, and mark only
+/// the child as needed.
+///
+#[derive(Clone, Debug, Eq, PartialEq, serde::Serialize, serde::Deserialize)]
 pub struct Segment {
    /// Previous segment index into ['Storage::segments`], if any.
-    parent: Option<usize>,
+    pub parent: Option<usize>,

-    /// Description of how did we get to this state.
-    ///
-    /// Mainly used in the original scenarios 1..=4 with insert, delete and update. Not used when
-    /// modifying a branch directly.
-    pub op: Cow<'static, str>,
+    /// LSN at this point
+    pub lsn: u64,

-    /// LSN before this state
-    start_lsn: u64,
+    /// Logical size at this node, if known.
+    pub size: Option<u64>,

-    /// LSN at this state
-    pub end_lsn: u64,
-
-    /// Logical size before this state
-    start_size: u64,
-
-    /// Logical size at this state. Can be None in the last Segment of a branch.
-    pub end_size: Option<u64>,
-
-    /// Indices to [`Storage::segments`]
-    ///
-    /// FIXME: this could be an Option<usize>
-    children_after: Vec<usize>,
-
-    /// Determined by `retention_period` given to [`Storage::calculate`]
+    /// If true, the segment from parent to this node is needed by `retention_period`
    pub needed: bool,
 }

-//
-//
-//
-//
-//                 *-g--*---D--->
-//                /
-//               /
-//              /                 *---b----*-B--->
-//             /                 /
-//            /                 /
-//      -----*--e---*-----f----* C
-//           E                  \
-//                               \
-//                                *--a---*---A-->
-//
-// If A and B need to be retained, is it cheaper to store
-// snapshot at C+a+b, or snapshots at A and B ?
-//
-// If D also needs to be retained, which is cheaper:
-//
-// 1. E+g+e+f+a+b
-// 2. D+C+a+b
-// 3. D+A+B
+/// Result of synthetic size calculation. Returned by StorageModel::calculate()
+pub struct SizeResult {
+    pub total_size: u64,

-/// [`Segment`] which has had it's size calculated.
-pub struct SegmentSize {
-    pub seg_id: usize,
-
-    pub method: SegmentMethod,
-
-    this_size: u64,
-
-    pub children: Vec<SegmentSize>,
+    // This has same length as the StorageModel::segments vector in the input.
+    // Each entry in this array corresponds to the entry with same index in
+    // StorageModel::segments.
+    pub segments: Vec<SegmentSizeResult>,
 }

-impl SegmentSize {
-    fn total(&self) -> u64 {
-        self.this_size + self.children.iter().fold(0, |acc, x| acc + x.total())
-    }
-
-    pub fn total_children(&self) -> u64 {
-        if self.method == SnapshotAfter {
-            self.this_size + self.children.iter().fold(0, |acc, x| acc + x.total())
-        } else {
-            self.children.iter().fold(0, |acc, x| acc + x.total())
-        }
-    }
+#[derive(Clone, Debug, Eq, PartialEq, serde::Serialize, serde::Deserialize)]
+pub struct SegmentSizeResult {
+    pub method: SegmentMethod,
+    // calculated size of this subtree, using this method
+    pub accum_size: u64,
 }

 /// Different methods to retain history from a particular state
-#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+#[derive(Clone, Copy, Debug, Eq, PartialEq, serde::Serialize, serde::Deserialize)]
 pub enum SegmentMethod {
-    SnapshotAfter,
-    Wal,
-    WalNeeded,
+    SnapshotHere, // A logical snapshot is needed after this segment
+    Wal,          // Keep WAL leading up to this node
    Skipped,
 }
-
-use SegmentMethod::*;
-
-impl<K: std::hash::Hash + Eq + 'static> Storage<K> {
-    /// Creates a new storage with the given default branch name.
-    pub fn new(initial_branch: K) -> Storage<K> {
-        let init_segment = Segment {
-            op: "".into(),
-            needed: false,
-            parent: None,
-            start_lsn: 0,
-            end_lsn: 0,
-            start_size: 0,
-            end_size: Some(0),
-            children_after: Vec::new(),
-        };
-
-        Storage {
-            segments: vec![init_segment],
-            branches: HashMap::from([(initial_branch, 0)]),
-        }
-    }
-
-    /// Advances the branch with a new point, at given LSN.
-    pub fn insert_point<Q: ?Sized>(
-        &mut self,
-        branch: &Q,
-        op: Cow<'static, str>,
-        lsn: u64,
-        size: Option<u64>,
-    ) -> anyhow::Result<()>
-    where
-        K: std::borrow::Borrow<Q>,
-        Q: std::hash::Hash + Eq + std::fmt::Debug,
-    {
-        let Some(lastseg_id) = self.branches.get(branch).copied() else { anyhow::bail!("branch not found: {branch:?}") };
-        let newseg_id = self.segments.len();
-        let lastseg = &mut self.segments[lastseg_id];
-
-        assert!(lsn > lastseg.end_lsn);
-
-        let Some(start_size) = lastseg.end_size else { anyhow::bail!("no end_size on latest segment for {branch:?}") };
-
-        let newseg = Segment {
-            op,
-            parent: Some(lastseg_id),
-            start_lsn: lastseg.end_lsn,
-            end_lsn: lsn,
-            start_size,
-            end_size: size,
-            children_after: Vec::new(),
-            needed: false,
-        };
-        lastseg.children_after.push(newseg_id);
-
-        self.segments.push(newseg);
-        *self.branches.get_mut(branch).expect("read already") = newseg_id;
-
-        Ok(())
-    }
-
-    /// Advances the branch with the named operation, by the relative LSN and logical size bytes.
-    pub fn modify_branch<Q: ?Sized>(
-        &mut self,
-        branch: &Q,
-        op: Cow<'static, str>,
-        lsn_bytes: u64,
-        size_bytes: i64,
-    ) -> anyhow::Result<()>
-    where
-        K: std::borrow::Borrow<Q>,
-        Q: std::hash::Hash + Eq + std::fmt::Debug,
-    {
-        let Some(lastseg_id) = self.branches.get(branch).copied() else { anyhow::bail!("branch not found: {branch:?}") };
-        let newseg_id = self.segments.len();
-        let lastseg = &mut self.segments[lastseg_id];
-
-        let Some(last_end_size) = lastseg.end_size else { anyhow::bail!("no end_size on latest segment for {branch:?}") };
-
-        let newseg = Segment {
-            op,
-            parent: Some(lastseg_id),
-            start_lsn: lastseg.end_lsn,
-            end_lsn: lastseg.end_lsn + lsn_bytes,
-            start_size: last_end_size,
-            end_size: Some((last_end_size as i64 + size_bytes) as u64),
-            children_after: Vec::new(),
-            needed: false,
-        };
-        lastseg.children_after.push(newseg_id);
-
-        self.segments.push(newseg);
-        *self.branches.get_mut(branch).expect("read already") = newseg_id;
-        Ok(())
-    }
-
-    pub fn insert<Q: ?Sized>(&mut self, branch: &Q, bytes: u64) -> anyhow::Result<()>
-    where
-        K: std::borrow::Borrow<Q>,
-        Q: std::hash::Hash + Eq + std::fmt::Debug,
-    {
-        self.modify_branch(branch, "insert".into(), bytes, bytes as i64)
-    }
-
-    pub fn update<Q: ?Sized>(&mut self, branch: &Q, bytes: u64) -> anyhow::Result<()>
-    where
-        K: std::borrow::Borrow<Q>,
-        Q: std::hash::Hash + Eq + std::fmt::Debug,
-    {
-        self.modify_branch(branch, "update".into(), bytes, 0i64)
-    }
-
-    pub fn delete<Q: ?Sized>(&mut self, branch: &Q, bytes: u64) -> anyhow::Result<()>
-    where
-        K: std::borrow::Borrow<Q>,
-        Q: std::hash::Hash + Eq + std::fmt::Debug,
-    {
-        self.modify_branch(branch, "delete".into(), bytes, -(bytes as i64))
-    }
-
-    pub fn branch<Q: ?Sized>(&mut self, parent: &Q, name: K) -> anyhow::Result<()>
-    where
-        K: std::borrow::Borrow<Q> + std::fmt::Debug,
-        Q: std::hash::Hash + Eq + std::fmt::Debug,
-    {
-        // Find the right segment
-        let branchseg_id = *self.branches.get(parent).with_context(|| {
-            format!(
-                "should had found the parent {:?} by key. in branches {:?}",
-                parent, self.branches
-            )
-        })?;
-
-        let _branchseg = &mut self.segments[branchseg_id];
-
-        // Create branch name for it
-        self.branches.insert(name, branchseg_id);
-        Ok(())
-    }
-
-    pub fn calculate(&mut self, retention_period: u64) -> anyhow::Result<SegmentSize> {
-        // Phase 1: Mark all the segments that need to be retained
-        for (_branch, &last_seg_id) in self.branches.iter() {
-            let last_seg = &self.segments[last_seg_id];
-            let cutoff_lsn = last_seg.start_lsn.saturating_sub(retention_period);
-            let mut seg_id = last_seg_id;
-            loop {
-                let seg = &mut self.segments[seg_id];
-                if seg.end_lsn < cutoff_lsn {
-                    break;
-                }
-                seg.needed = true;
-                if let Some(prev_seg_id) = seg.parent {
-                    seg_id = prev_seg_id;
-                } else {
-                    break;
-                }
-            }
-        }
-
-        // Phase 2: For each oldest segment in a chain that needs to be retained,
-        // calculate if we should store snapshot or WAL
-        self.size_from_snapshot_later(0)
-    }
-
-    fn size_from_wal(&self, seg_id: usize) -> anyhow::Result<SegmentSize> {
-        let seg = &self.segments[seg_id];
-
-        let this_size = seg.end_lsn - seg.start_lsn;
-
-        let mut children = Vec::new();
-
-        // try both ways
-        for &child_id in seg.children_after.iter() {
-            // try each child both ways
-            let child = &self.segments[child_id];
-            let p1 = self.size_from_wal(child_id)?;
-
-            let p = if !child.needed {
-                let p2 = self.size_from_snapshot_later(child_id)?;
-                if p1.total() < p2.total() {
-                    p1
-                } else {
-                    p2
-                }
-            } else {
-                p1
-            };
-            children.push(p);
-        }
-        Ok(SegmentSize {
-            seg_id,
-            method: if seg.needed { WalNeeded } else { Wal },
-            this_size,
-            children,
-        })
-    }
-
-    fn size_from_snapshot_later(&self, seg_id: usize) -> anyhow::Result<SegmentSize> {
-        // If this is needed, then it's time to do the snapshot and continue
-        // with wal method.
-        let seg = &self.segments[seg_id];
-        //eprintln!("snap: seg{}: {} needed: {}", seg_id, seg.children_after.len(), seg.needed);
-        if seg.needed {
-            let mut children = Vec::new();
-
-            for &child_id in seg.children_after.iter() {
-                // try each child both ways
-                let child = &self.segments[child_id];
-                let p1 = self.size_from_wal(child_id)?;
-
-                let p = if !child.needed {
-                    let p2 = self.size_from_snapshot_later(child_id)?;
-                    if p1.total() < p2.total() {
-                        p1
-                    } else {
-                        p2
-                    }
-                } else {
-                    p1
-                };
-                children.push(p);
-            }
-            Ok(SegmentSize {
-                seg_id,
-                method: WalNeeded,
-                this_size: seg.start_size,
-                children,
-            })
-        } else {
-            // If any of the direct children are "needed", need to be able to reconstruct here
-            let mut children_needed = false;
-            for &child in seg.children_after.iter() {
-                let seg = &self.segments[child];
-                if seg.needed {
-                    children_needed = true;
-                    break;
-                }
-            }
-
-            let method1 = if !children_needed {
-                let mut children = Vec::new();
-                for child in seg.children_after.iter() {
-                    children.push(self.size_from_snapshot_later(*child)?);
-                }
-                Some(SegmentSize {
-                    seg_id,
-                    method: Skipped,
-                    this_size: 0,
-                    children,
-                })
-            } else {
-                None
-            };
-
-            // If this a junction, consider snapshotting here
-            let method2 = if children_needed || seg.children_after.len() >= 2 {
-                let mut children = Vec::new();
-                for child in seg.children_after.iter() {
-                    children.push(self.size_from_wal(*child)?);
-                }
-                let Some(this_size) = seg.end_size else { anyhow::bail!("no end_size at junction {seg_id}") };
-                Some(SegmentSize {
-                    seg_id,
-                    method: SnapshotAfter,
-                    this_size,
-                    children,
-                })
-            } else {
-                None
-            };
-
-            Ok(match (method1, method2) {
-                (None, None) => anyhow::bail!(
-                    "neither method was applicable: children_after={}, children_needed={}",
-                    seg.children_after.len(),
-                    children_needed
-                ),
-                (Some(method), None) => method,
-                (None, Some(method)) => method,
-                (Some(method1), Some(method2)) => {
-                    if method1.total() < method2.total() {
-                        method1
-                    } else {
-                        method2
-                    }
-                }
-            })
-        }
-    }
-
-    pub fn into_segments(self) -> Vec<Segment> {
-        self.segments
-    }
-}
--- a/libs/tenant_size_model/src/main.rs
+++ b/libs/tenant_size_model/src/main.rs
@@ -1,269 +0,0 @@
-//! Tenant size model testing ground.
-//!
-//! Has a number of scenarios and a `main` for invoking these by number, calculating the history
-//! size, outputs graphviz graph. Makefile in directory shows how to use graphviz to turn scenarios
-//! into pngs.
-
-use tenant_size_model::{Segment, SegmentSize, Storage};
-
-// Main branch only. Some updates on it.
-fn scenario_1() -> anyhow::Result<(Vec<Segment>, SegmentSize)> {
-    // Create main branch
-    let mut storage = Storage::new("main");
-
-    // Bulk load 5 GB of data to it
-    storage.insert("main", 5_000)?;
-
-    // Stream of updates
-    for _ in 0..5 {
-        storage.update("main", 1_000)?;
-    }
-
-    let size = storage.calculate(1000)?;
-
-    Ok((storage.into_segments(), size))
-}
-
-// Main branch only. Some updates on it.
-fn scenario_2() -> anyhow::Result<(Vec<Segment>, SegmentSize)> {
-    // Create main branch
-    let mut storage = Storage::new("main");
-
-    // Bulk load 5 GB of data to it
-    storage.insert("main", 5_000)?;
-
-    // Stream of updates
-    for _ in 0..5 {
-        storage.update("main", 1_000)?;
-    }
-
-    // Branch
-    storage.branch("main", "child")?;
-    storage.update("child", 1_000)?;
-
-    // More updates on parent
-    storage.update("main", 1_000)?;
-
-    let size = storage.calculate(1000)?;
-
-    Ok((storage.into_segments(), size))
-}
-
-// Like 2, but more updates on main
-fn scenario_3() -> anyhow::Result<(Vec<Segment>, SegmentSize)> {
-    // Create main branch
-    let mut storage = Storage::new("main");
-
-    // Bulk load 5 GB of data to it
-    storage.insert("main", 5_000)?;
-
-    // Stream of updates
-    for _ in 0..5 {
-        storage.update("main", 1_000)?;
-    }
-
-    // Branch
-    storage.branch("main", "child")?;
-    storage.update("child", 1_000)?;
-
-    // More updates on parent
-    for _ in 0..5 {
-        storage.update("main", 1_000)?;
-    }
-
-    let size = storage.calculate(1000)?;
-
-    Ok((storage.into_segments(), size))
-}
-
-// Diverged branches
-fn scenario_4() -> anyhow::Result<(Vec<Segment>, SegmentSize)> {
-    // Create main branch
-    let mut storage = Storage::new("main");
-
-    // Bulk load 5 GB of data to it
-    storage.insert("main", 5_000)?;
-
-    // Stream of updates
-    for _ in 0..5 {
-        storage.update("main", 1_000)?;
-    }
-
-    // Branch
-    storage.branch("main", "child")?;
-    storage.update("child", 1_000)?;
-
-    // More updates on parent
-    for _ in 0..8 {
-        storage.update("main", 1_000)?;
-    }
-
-    let size = storage.calculate(1000)?;
-
-    Ok((storage.into_segments(), size))
-}
-
-fn scenario_5() -> anyhow::Result<(Vec<Segment>, SegmentSize)> {
-    let mut storage = Storage::new("a");
-    storage.insert("a", 5000)?;
-    storage.branch("a", "b")?;
-    storage.update("b", 4000)?;
-    storage.update("a", 2000)?;
-    storage.branch("a", "c")?;
-    storage.insert("c", 4000)?;
-    storage.insert("a", 2000)?;
-
-    let size = storage.calculate(5000)?;
-
-    Ok((storage.into_segments(), size))
-}
-
-fn scenario_6() -> anyhow::Result<(Vec<Segment>, SegmentSize)> {
-    use std::borrow::Cow;
-
-    const NO_OP: Cow<'static, str> = Cow::Borrowed("");
-
-    let branches = [
-        Some(0x7ff1edab8182025f15ae33482edb590a_u128),
-        Some(0xb1719e044db05401a05a2ed588a3ad3f),
-        Some(0xb68d6691c895ad0a70809470020929ef),
-    ];
-
-    // compared to other scenarios, this one uses bytes instead of kB
-
-    let mut storage = Storage::new(None);
-
-    storage.branch(&None, branches[0])?; // at 0
-    storage.modify_branch(&branches[0], NO_OP, 108951064, 43696128)?; // at 108951064
-    storage.branch(&branches[0], branches[1])?; // at 108951064
-    storage.modify_branch(&branches[1], NO_OP, 15560408, -1851392)?; // at 124511472
-    storage.modify_branch(&branches[0], NO_OP, 174464360, -1531904)?; // at 283415424
-    storage.branch(&branches[0], branches[2])?; // at 283415424
-    storage.modify_branch(&branches[2], NO_OP, 15906192, 8192)?; // at 299321616
-    storage.modify_branch(&branches[0], NO_OP, 18909976, 32768)?; // at 302325400
-
-    let size = storage.calculate(100_000)?;
-
-    Ok((storage.into_segments(), size))
-}
-
-fn main() {
-    let args: Vec<String> = std::env::args().collect();
-
-    let scenario = if args.len() < 2 { "1" } else { &args[1] };
-
-    let (segments, size) = match scenario {
-        "1" => scenario_1(),
-        "2" => scenario_2(),
-        "3" => scenario_3(),
-        "4" => scenario_4(),
-        "5" => scenario_5(),
-        "6" => scenario_6(),
-        other => {
-            eprintln!("invalid scenario {}", other);
-            std::process::exit(1);
-        }
-    }
-    .unwrap();
-
-    graphviz_tree(&segments, &size);
-}
-
-fn graphviz_recurse(segments: &[Segment], node: &SegmentSize) {
-    use tenant_size_model::SegmentMethod::*;
-
-    let seg_id = node.seg_id;
-    let seg = segments.get(seg_id).unwrap();
-    let lsn = seg.end_lsn;
-    let size = seg.end_size.unwrap_or(0);
-    let method = node.method;
-
-    println!("  {{");
-    println!("    node [width=0.1 height=0.1 shape=oval]");
-
-    let tenant_size = node.total_children();
-
-    let penwidth = if seg.needed { 6 } else { 3 };
-    let x = match method {
-        SnapshotAfter =>
-            format!("label=\"lsn: {lsn}\\nsize: {size}\\ntenant_size: {tenant_size}\" style=filled penwidth={penwidth}"),
-        Wal =>
-            format!("label=\"lsn: {lsn}\\nsize: {size}\\ntenant_size: {tenant_size}\" color=\"black\" penwidth={penwidth}"),
-        WalNeeded =>
-            format!("label=\"lsn: {lsn}\\nsize: {size}\\ntenant_size: {tenant_size}\" color=\"black\" penwidth={penwidth}"),
-        Skipped =>
-            format!("label=\"lsn: {lsn}\\nsize: {size}\\ntenant_size: {tenant_size}\" color=\"gray\" penwidth={penwidth}"),
-    };
-
-    println!("    \"seg{seg_id}\" [{x}]");
-    println!("  }}");
-
-    // Recurse. Much of the data is actually on the edge
-    for child in node.children.iter() {
-        let child_id = child.seg_id;
-        graphviz_recurse(segments, child);
-
-        let edge_color = match child.method {
-            SnapshotAfter => "gray",
-            Wal => "black",
-            WalNeeded => "black",
-            Skipped => "gray",
-        };
-
-        println!("  {{");
-        println!("    edge [] ");
-        print!("    \"seg{seg_id}\" -> \"seg{child_id}\" [");
-        print!("color={edge_color}");
-        if child.method == WalNeeded {
-            print!(" penwidth=6");
-        }
-        if child.method == Wal {
-            print!(" penwidth=3");
-        }
-
-        let next = segments.get(child_id).unwrap();
-
-        if next.op.is_empty() {
-            print!(
-                " label=\"{} / {}\"",
-                next.end_lsn - seg.end_lsn,
-                (next.end_size.unwrap_or(0) as i128 - seg.end_size.unwrap_or(0) as i128)
-            );
-        } else {
-            print!(" label=\"{}: {}\"", next.op, next.end_lsn - seg.end_lsn);
-        }
-        println!("]");
-        println!("  }}");
-    }
-}
-
-fn graphviz_tree(segments: &[Segment], tree: &SegmentSize) {
-    println!("digraph G {{");
-    println!("  fontname=\"Helvetica,Arial,sans-serif\"");
-    println!("  node [fontname=\"Helvetica,Arial,sans-serif\"]");
-    println!("  edge [fontname=\"Helvetica,Arial,sans-serif\"]");
-    println!("  graph [center=1 rankdir=LR]");
-    println!("  edge [dir=none]");
-
-    graphviz_recurse(segments, tree);
-
-    println!("}}");
-}
-
-#[test]
-fn scenarios_return_same_size() {
-    type ScenarioFn = fn() -> anyhow::Result<(Vec<Segment>, SegmentSize)>;
-    let truths: &[(u32, ScenarioFn, _)] = &[
-        (line!(), scenario_1, 8000),
-        (line!(), scenario_2, 9000),
-        (line!(), scenario_3, 13000),
-        (line!(), scenario_4, 16000),
-        (line!(), scenario_5, 17000),
-        (line!(), scenario_6, 333_792_000),
-    ];
-
-    for (line, scenario, expected) in truths {
-        let (_, size) = scenario().unwrap();
-        assert_eq!(*expected, size.total_children(), "scenario on line {line}");
-    }
-}
--- a/libs/tenant_size_model/src/svg.rs
+++ b/libs/tenant_size_model/src/svg.rs
@@ -0,0 +1,193 @@
+use crate::{SegmentMethod, SegmentSizeResult, SizeResult, StorageModel};
+use std::fmt::Write;
+
+const SVG_WIDTH: f32 = 500.0;
+
+struct SvgDraw<'a> {
+    storage: &'a StorageModel,
+    branches: &'a [String],
+    seg_to_branch: &'a [usize],
+    sizes: &'a [SegmentSizeResult],
+
+    // layout
+    xscale: f32,
+    min_lsn: u64,
+    seg_coordinates: Vec<(f32, f32)>,
+}
+
+fn draw_legend(result: &mut String) -> anyhow::Result<()> {
+    writeln!(
+        result,
+        "<circle cx=\"10\" cy=\"10\" r=\"5\" stroke=\"red\"/>"
+    )?;
+    writeln!(result, "<text x=\"20\" y=\"15\">logical snapshot</text>")?;
+    writeln!(
+        result,
+        "<line x1=\"5\" y1=\"30\" x2=\"15\" y2=\"30\" stroke-width=\"6\" stroke=\"black\" />"
+    )?;
+    writeln!(
+        result,
+        "<text x=\"20\" y=\"35\">WAL within retention period</text>"
+    )?;
+    writeln!(
+        result,
+        "<line x1=\"5\" y1=\"50\" x2=\"15\" y2=\"50\" stroke-width=\"3\" stroke=\"black\" />"
+    )?;
+    writeln!(
+        result,
+        "<text x=\"20\" y=\"55\">WAL retained to avoid copy</text>"
+    )?;
+    writeln!(
+        result,
+        "<line x1=\"5\" y1=\"70\" x2=\"15\" y2=\"70\" stroke-width=\"1\" stroke=\"gray\" />"
+    )?;
+    writeln!(result, "<text x=\"20\" y=\"75\">WAL not retained</text>")?;
+    Ok(())
+}
+
+pub fn draw_svg(
+    storage: &StorageModel,
+    branches: &[String],
+    seg_to_branch: &[usize],
+    sizes: &SizeResult,
+) -> anyhow::Result<String> {
+    let mut draw = SvgDraw {
+        storage,
+        branches,
+        seg_to_branch,
+        sizes: &sizes.segments,
+
+        xscale: 0.0,
+        min_lsn: 0,
+        seg_coordinates: Vec::new(),
+    };
+
+    let mut result = String::new();
+
+    writeln!(result, "<svg xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\" height=\"300\" width=\"500\">")?;
+
+    draw.calculate_svg_layout();
+
+    // Draw the tree
+    for (seg_id, _seg) in storage.segments.iter().enumerate() {
+        draw.draw_seg_phase1(seg_id, &mut result)?;
+    }
+
+    // Draw snapshots
+    for (seg_id, _seg) in storage.segments.iter().enumerate() {
+        draw.draw_seg_phase2(seg_id, &mut result)?;
+    }
+
+    draw_legend(&mut result)?;
+
+    write!(result, "</svg>")?;
+
+    Ok(result)
+}
+
+impl<'a> SvgDraw<'a> {
+    fn calculate_svg_layout(&mut self) {
+        // Find x scale
+        let segments = &self.storage.segments;
+        let min_lsn = segments.iter().map(|s| s.lsn).fold(u64::MAX, std::cmp::min);
+        let max_lsn = segments.iter().map(|s| s.lsn).fold(0, std::cmp::max);
+
+        // Start with 1 pixel = 1 byte. Double the scale until it fits into the image
+        let mut xscale = 1.0;
+        while (max_lsn - min_lsn) as f32 / xscale > SVG_WIDTH {
+            xscale *= 2.0;
+        }
+
+        // Layout the timelines on Y dimension.
+        // TODO
+        let mut y = 100.0;
+        let mut branch_y_coordinates = Vec::new();
+        for _branch in self.branches {
+            branch_y_coordinates.push(y);
+            y += 40.0;
+        }
+
+        // Calculate coordinates for each point
+        let seg_coordinates = std::iter::zip(segments, self.seg_to_branch)
+            .map(|(seg, branch_id)| {
+                let x = (seg.lsn - min_lsn) as f32 / xscale;
+                let y = branch_y_coordinates[*branch_id];
+                (x, y)
+            })
+            .collect();
+
+        self.xscale = xscale;
+        self.min_lsn = min_lsn;
+        self.seg_coordinates = seg_coordinates;
+    }
+
+    /// Draws lines between points
+    fn draw_seg_phase1(&self, seg_id: usize, result: &mut String) -> anyhow::Result<()> {
+        let seg = &self.storage.segments[seg_id];
+
+        let wal_bytes = if let Some(parent_id) = seg.parent {
+            seg.lsn - self.storage.segments[parent_id].lsn
+        } else {
+            0
+        };
+
+        let style = match self.sizes[seg_id].method {
+            SegmentMethod::SnapshotHere => "stroke-width=\"1\" stroke=\"gray\"",
+            SegmentMethod::Wal if seg.needed && wal_bytes > 0 => {
+                "stroke-width=\"6\" stroke=\"black\""
+            }
+            SegmentMethod::Wal => "stroke-width=\"3\" stroke=\"black\"",
+            SegmentMethod::Skipped => "stroke-width=\"1\" stroke=\"gray\"",
+        };
+        if let Some(parent_id) = seg.parent {
+            let (x1, y1) = self.seg_coordinates[parent_id];
+            let (x2, y2) = self.seg_coordinates[seg_id];
+
+            writeln!(
+                result,
+                "<line x1=\"{x1}\" y1=\"{y1}\" x2=\"{x2}\" y2=\"{y2}\" {style}>",
+            )?;
+            writeln!(
+                result,
+                "  <title>{wal_bytes} bytes of WAL (seg {seg_id})</title>"
+            )?;
+            writeln!(result, "</line>")?;
+        } else {
+            // draw a little dash to mark the starting point of this branch
+            let (x, y) = self.seg_coordinates[seg_id];
+            let (x1, y1) = (x, y - 5.0);
+            let (x2, y2) = (x, y + 5.0);
+
+            writeln!(
+                result,
+                "<line x1=\"{x1}\" y1=\"{y1}\" x2=\"{x2}\" y2=\"{y2}\" {style}>",
+            )?;
+            writeln!(result, "  <title>(seg {seg_id})</title>")?;
+            writeln!(result, "</line>")?;
+        }
+
+        Ok(())
+    }
+
+    /// Draw circles where snapshots are taken
+    fn draw_seg_phase2(&self, seg_id: usize, result: &mut String) -> anyhow::Result<()> {
+        let seg = &self.storage.segments[seg_id];
+
+        // draw a snapshot point if it's needed
+        let (coord_x, coord_y) = self.seg_coordinates[seg_id];
+        if self.sizes[seg_id].method == SegmentMethod::SnapshotHere {
+            writeln!(
+                result,
+                "<circle cx=\"{coord_x}\" cy=\"{coord_y}\" r=\"5\" stroke=\"red\">",
+            )?;
+            writeln!(
+                result,
+                "  <title>logical size {}</title>",
+                seg.size.unwrap()
+            )?;
+            write!(result, "</circle>")?;
+        }
+
+        Ok(())
+    }
+}
--- a/libs/tenant_size_model/tests/tests.rs
+++ b/libs/tenant_size_model/tests/tests.rs
@@ -0,0 +1,313 @@
+//! Tenant size model tests.
+
+use tenant_size_model::{Segment, SizeResult, StorageModel};
+
+use std::collections::HashMap;
+
+struct ScenarioBuilder {
+    segments: Vec<Segment>,
+
+    /// Mapping from the branch name to the index of a segment describing its latest state.
+    branches: HashMap<String, usize>,
+}
+
+impl ScenarioBuilder {
+    /// Creates a new storage with the given default branch name.
+    pub fn new(initial_branch: &str) -> ScenarioBuilder {
+        let init_segment = Segment {
+            parent: None,
+            lsn: 0,
+            size: Some(0),
+            needed: false, // determined later
+        };
+
+        ScenarioBuilder {
+            segments: vec![init_segment],
+            branches: HashMap::from([(initial_branch.into(), 0)]),
+        }
+    }
+
+    /// Advances the branch with the named operation, by the relative LSN and logical size bytes.
+    pub fn modify_branch(&mut self, branch: &str, lsn_bytes: u64, size_bytes: i64) {
+        let lastseg_id = *self.branches.get(branch).unwrap();
+        let newseg_id = self.segments.len();
+        let lastseg = &mut self.segments[lastseg_id];
+
+        let newseg = Segment {
+            parent: Some(lastseg_id),
+            lsn: lastseg.lsn + lsn_bytes,
+            size: Some((lastseg.size.unwrap() as i64 + size_bytes) as u64),
+            needed: false,
+        };
+
+        self.segments.push(newseg);
+        *self.branches.get_mut(branch).expect("read already") = newseg_id;
+    }
+
+    pub fn insert(&mut self, branch: &str, bytes: u64) {
+        self.modify_branch(branch, bytes, bytes as i64);
+    }
+
+    pub fn update(&mut self, branch: &str, bytes: u64) {
+        self.modify_branch(branch, bytes, 0i64);
+    }
+
+    pub fn _delete(&mut self, branch: &str, bytes: u64) {
+        self.modify_branch(branch, bytes, -(bytes as i64));
+    }
+
+    /// Panics if the parent branch cannot be found.
+    pub fn branch(&mut self, parent: &str, name: &str) {
+        // Find the right segment
+        let branchseg_id = *self
+            .branches
+            .get(parent)
+            .expect("should had found the parent by key");
+        let _branchseg = &mut self.segments[branchseg_id];
+
+        // Create branch name for it
+        self.branches.insert(name.to_string(), branchseg_id);
+    }
+
+    pub fn calculate(&mut self, retention_period: u64) -> (StorageModel, SizeResult) {
+        // Phase 1: Mark all the segments that need to be retained
+        for (_branch, &last_seg_id) in self.branches.iter() {
+            let last_seg = &self.segments[last_seg_id];
+            let cutoff_lsn = last_seg.lsn.saturating_sub(retention_period);
+            let mut seg_id = last_seg_id;
+            loop {
+                let seg = &mut self.segments[seg_id];
+                if seg.lsn <= cutoff_lsn {
+                    break;
+                }
+                seg.needed = true;
+                if let Some(prev_seg_id) = seg.parent {
+                    seg_id = prev_seg_id;
+                } else {
+                    break;
+                }
+            }
+        }
+
+        // Perform the calculation
+        let storage_model = StorageModel {
+            segments: self.segments.clone(),
+        };
+        let size_result = storage_model.calculate();
+        (storage_model, size_result)
+    }
+}
+
+// Main branch only. Some updates on it.
+#[test]
+fn scenario_1() {
+    // Create main branch
+    let mut scenario = ScenarioBuilder::new("main");
+
+    // Bulk load 5 GB of data to it
+    scenario.insert("main", 5_000);
+
+    // Stream of updates
+    for _ in 0..5 {
+        scenario.update("main", 1_000);
+    }
+
+    // Calculate the synthetic size with retention horizon 1000
+    let (_model, result) = scenario.calculate(1000);
+
+    // The end of the branch is at LSN 10000. Need to retain
+    // a logical snapshot at LSN 9000, plus the WAL between 9000-10000.
+    // The logical snapshot has size 5000.
+    assert_eq!(result.total_size, 5000 + 1000);
+}
+
+// Main branch only. Some updates on it.
+#[test]
+fn scenario_2() {
+    // Create main branch
+    let mut scenario = ScenarioBuilder::new("main");
+
+    // Bulk load 5 GB of data to it
+    scenario.insert("main", 5_000);
+
+    // Stream of updates
+    for _ in 0..5 {
+        scenario.update("main", 1_000);
+    }
+
+    // Branch
+    scenario.branch("main", "child");
+    scenario.update("child", 1_000);
+
+    // More updates on parent
+    scenario.update("main", 1_000);
+
+    //
+    // The history looks like this now:
+    //
+    //         10000          11000
+    // *----*----*--------------*    main
+    //           |
+    //           |            11000
+    //           +--------------     child
+    //
+    //
+    // With retention horizon 1000, we need to retain logical snapshot
+    // at the branch point, size 5000, and the WAL from 10000-11000 on
+    // both branches.
+    let (_model, result) = scenario.calculate(1000);
+
+    assert_eq!(result.total_size, 5000 + 1000 + 1000);
+}
+
+// Like 2, but more updates on main
+#[test]
+fn scenario_3() {
+    // Create main branch
+    let mut scenario = ScenarioBuilder::new("main");
+
+    // Bulk load 5 GB of data to it
+    scenario.insert("main", 5_000);
+
+    // Stream of updates
+    for _ in 0..5 {
+        scenario.update("main", 1_000);
+    }
+
+    // Branch
+    scenario.branch("main", "child");
+    scenario.update("child", 1_000);
+
+    // More updates on parent
+    for _ in 0..5 {
+        scenario.update("main", 1_000);
+    }
+
+    //
+    // The history looks like this now:
+    //
+    //         10000                                 15000
+    // *----*----*------------------------------------*    main
+    //           |
+    //           |            11000
+    //           +--------------     child
+    //
+    //
+    // With retention horizon 1000, it's still cheapest to retain
+    // - snapshot at branch point (size 5000)
+    // - WAL on child between 10000-11000
+    // - WAL on main between 10000-15000
+    //
+    // This is in total 5000 + 1000 + 5000
+    //
+    let (_model, result) = scenario.calculate(1000);
+
+    assert_eq!(result.total_size, 5000 + 1000 + 5000);
+}
+
+// Diverged branches
+#[test]
+fn scenario_4() {
+    // Create main branch
+    let mut scenario = ScenarioBuilder::new("main");
+
+    // Bulk load 5 GB of data to it
+    scenario.insert("main", 5_000);
+
+    // Stream of updates
+    for _ in 0..5 {
+        scenario.update("main", 1_000);
+    }
+
+    // Branch
+    scenario.branch("main", "child");
+    scenario.update("child", 1_000);
+
+    // More updates on parent
+    for _ in 0..8 {
+        scenario.update("main", 1_000);
+    }
+
+    //
+    // The history looks like this now:
+    //
+    //         10000                                 18000
+    // *----*----*------------------------------------*    main
+    //           |
+    //           |            11000
+    //           +--------------     child
+    //
+    //
+    // With retention horizon 1000, it's now cheapest to retain
+    // separate snapshots on both branches:
+    // - snapshot on main branch at LSN 17000 (size 5000)
+    // - WAL on main between 17000-18000
+    // - snapshot on child branch at LSN 10000 (size 5000)
+    // - WAL on child between 10000-11000
+    //
+    // This is in total 5000 + 1000 + 5000 + 1000 = 12000
+    //
+    // (If we used the the method from the previous scenario, and
+    // kept only snapshot at the branch point, we'd need to keep
+    // all the WAL between 10000-18000 on the main branch, so
+    // the total size would be 5000 + 1000 + 8000 = 14000. The
+    // calculation always picks the cheapest alternative)
+
+    let (_model, result) = scenario.calculate(1000);
+
+    assert_eq!(result.total_size, 5000 + 1000 + 5000 + 1000);
+}
+
+#[test]
+fn scenario_5() {
+    let mut scenario = ScenarioBuilder::new("a");
+    scenario.insert("a", 5000);
+    scenario.branch("a", "b");
+    scenario.update("b", 4000);
+    scenario.update("a", 2000);
+    scenario.branch("a", "c");
+    scenario.insert("c", 4000);
+    scenario.insert("a", 2000);
+
+    let (_model, result) = scenario.calculate(1000);
+
+    assert_eq!(result.total_size, 17000);
+}
+
+#[test]
+fn scenario_6() {
+    let branches = [
+        "7ff1edab8182025f15ae33482edb590a",
+        "b1719e044db05401a05a2ed588a3ad3f",
+        "0xb68d6691c895ad0a70809470020929ef",
+    ];
+
+    // compared to other scenarios, this one uses bytes instead of kB
+
+    let mut scenario = ScenarioBuilder::new("");
+
+    scenario.branch("", branches[0]); // at 0
+    scenario.modify_branch(branches[0], 108951064, 43696128); // at 108951064
+    scenario.branch(branches[0], branches[1]); // at 108951064
+    scenario.modify_branch(branches[1], 15560408, -1851392); // at 124511472
+    scenario.modify_branch(branches[0], 174464360, -1531904); // at 283415424
+    scenario.branch(branches[0], branches[2]); // at 283415424
+    scenario.modify_branch(branches[2], 15906192, 8192); // at 299321616
+    scenario.modify_branch(branches[0], 18909976, 32768); // at 302325400
+
+    let (model, result) = scenario.calculate(100_000);
+
+    // FIXME: We previously calculated 333_792_000. But with this PR, we get
+    // a much lower number. At a quick look at the model output and the
+    // calculations here, the new result seems correct to me.
+    eprintln!(
+        " MODEL: {}",
+        serde_json::to_string(&model.segments).unwrap()
+    );
+    eprintln!(
+        "RESULT: {}",
+        serde_json::to_string(&result.segments).unwrap()
+    );
+
+    assert_eq!(result.total_size, 136_236_928);
+}
--- a/libs/utils/src/logging.rs
+++ b/libs/utils/src/logging.rs
@@ -45,3 +45,115 @@ pub fn init(log_format: LogFormat) -> anyhow::Result<()> {

    Ok(())
 }
+
+/// Disable the default rust panic hook by using `set_hook`.
+///
+/// For neon binaries, the assumption is that tracing is configured before with [`init`], after
+/// that sentry is configured (if needed). sentry will install it's own on top of this, always
+/// processing the panic before we log it.
+///
+/// When the return value is dropped, the hook is reverted to std default hook (prints to stderr).
+/// If the assumptions about the initialization order are not held, use
+/// [`TracingPanicHookGuard::disarm`] but keep in mind, if tracing is stopped, then panics will be
+/// lost.
+#[must_use]
+pub fn replace_panic_hook_with_tracing_panic_hook() -> TracingPanicHookGuard {
+    std::panic::set_hook(Box::new(tracing_panic_hook));
+    TracingPanicHookGuard::new()
+}
+
+/// Drop guard which restores the std panic hook on drop.
+///
+/// Tracing should not be used when it's not configured, but we cannot really latch on to any
+/// imaginary lifetime of tracing.
+pub struct TracingPanicHookGuard {
+    act: bool,
+}
+
+impl TracingPanicHookGuard {
+    fn new() -> Self {
+        TracingPanicHookGuard { act: true }
+    }
+
+    /// Make this hook guard not do anything when dropped.
+    pub fn forget(&mut self) {
+        self.act = false;
+    }
+}
+
+impl Drop for TracingPanicHookGuard {
+    fn drop(&mut self) {
+        if self.act {
+            let _ = std::panic::take_hook();
+        }
+    }
+}
+
+/// Named symbol for our panic hook, which logs the panic.
+fn tracing_panic_hook(info: &std::panic::PanicInfo) {
+    // following rust 1.66.1 std implementation:
+    // https://github.com/rust-lang/rust/blob/90743e7298aca107ddaa0c202a4d3604e29bfeb6/library/std/src/panicking.rs#L235-L288
+    let location = info.location();
+
+    let msg = match info.payload().downcast_ref::<&'static str>() {
+        Some(s) => *s,
+        None => match info.payload().downcast_ref::<String>() {
+            Some(s) => &s[..],
+            None => "Box<dyn Any>",
+        },
+    };
+
+    let thread = std::thread::current();
+    let thread = thread.name().unwrap_or("<unnamed>");
+    let backtrace = std::backtrace::Backtrace::capture();
+
+    let _entered = if let Some(location) = location {
+        tracing::error_span!("panic", %thread, location = %PrettyLocation(location))
+    } else {
+        // very unlikely to hit here, but the guarantees of std could change
+        tracing::error_span!("panic", %thread)
+    }
+    .entered();
+
+    if backtrace.status() == std::backtrace::BacktraceStatus::Captured {
+        // this has an annoying extra '\n' in the end which anyhow doesn't do, but we cannot really
+        // get rid of it as we cannot get in between of std::fmt::Formatter<'_>; we could format to
+        // string, maybe even to a TLS one but tracing already does that.
+        tracing::error!("{msg}\n\nStack backtrace:\n{backtrace}");
+    } else {
+        tracing::error!("{msg}");
+    }
+
+    // ensure that we log something on the panic if this hook is left after tracing has been
+    // unconfigured. worst case when teardown is racing the panic is to log the panic twice.
+    tracing::dispatcher::get_default(|d| {
+        if let Some(_none) = d.downcast_ref::<tracing::subscriber::NoSubscriber>() {
+            let location = location.map(PrettyLocation);
+            log_panic_to_stderr(thread, msg, location, &backtrace);
+        }
+    });
+}
+
+#[cold]
+fn log_panic_to_stderr(
+    thread: &str,
+    msg: &str,
+    location: Option<PrettyLocation<'_, '_>>,
+    backtrace: &std::backtrace::Backtrace,
+) {
+    eprintln!("panic while tracing is unconfigured: thread '{thread}' panicked at '{msg}', {location:?}\nStack backtrace:\n{backtrace}");
+}
+
+struct PrettyLocation<'a, 'b>(&'a std::panic::Location<'b>);
+
+impl std::fmt::Display for PrettyLocation<'_, '_> {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}:{}:{}", self.0.file(), self.0.line(), self.0.column())
+    }
+}
+
+impl std::fmt::Debug for PrettyLocation<'_, '_> {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        <Self as std::fmt::Display>::fmt(self, f)
+    }
+}
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -88,6 +88,13 @@ fn main() -> anyhow::Result<()> {
        }
    };

+    // Initialize logging, which must be initialized before the custom panic hook is installed.
+    logging::init(conf.log_format)?;
+
+    // mind the order required here: 1. logging, 2. panic_hook, 3. sentry.
+    // disarming this hook on pageserver, because we never tear down tracing.
+    logging::replace_panic_hook_with_tracing_panic_hook().forget();
+
    // initialize sentry if SENTRY_DSN is provided
    let _sentry_guard = init_sentry(
        Some(GIT_VERSION.into()),
@@ -210,9 +217,6 @@ fn start_pageserver(
    launch_ts: &'static LaunchTimestamp,
    conf: &'static PageServerConf,
 ) -> anyhow::Result<()> {
-    // Initialize logging
-    logging::init(conf.log_format)?;
-
    // Print version and launch timestamp to the log,
    // and expose them as prometheus metrics.
    // A changed version string indicates changed software.
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -25,7 +25,7 @@ const REMOTE_STORAGE_SIZE: &str = "remote_storage_size";
 const TIMELINE_LOGICAL_SIZE: &str = "timeline_logical_size";

 #[serde_as]
-#[derive(Serialize)]
+#[derive(Serialize, Debug)]
 struct Ids {
    #[serde_as(as = "DisplayFromStr")]
    tenant_id: TenantId,
@@ -75,7 +75,7 @@ pub async fn collect_metrics(
    // define client here to reuse it for all requests
    let client = reqwest::Client::new();
    let mut cached_metrics: HashMap<PageserverConsumptionMetricsKey, u64> = HashMap::new();
-    let mut prev_iteration_time: Option<std::time::Instant> = None;
+    let mut prev_iteration_time: std::time::Instant = std::time::Instant::now();

    loop {
        tokio::select! {
@@ -86,11 +86,11 @@ pub async fn collect_metrics(
            _ = ticker.tick() => {

                // send cached metrics every cached_metric_collection_interval
-                let send_cached = prev_iteration_time
-                .map(|x| x.elapsed() >= cached_metric_collection_interval)
-                .unwrap_or(false);
+                let send_cached = prev_iteration_time.elapsed() >= cached_metric_collection_interval;

-                prev_iteration_time = Some(std::time::Instant::now());
+                if send_cached {
+                    prev_iteration_time = std::time::Instant::now();
+                }

                collect_metrics_iteration(&client, &mut cached_metrics, metric_collection_endpoint, node_id, &ctx, send_cached).await;
            }
@@ -287,6 +287,12 @@ pub async fn collect_metrics_iteration(
                    }
                } else {
                    error!("metrics endpoint refused the sent metrics: {:?}", res);
+                    for metric in chunk_to_send.iter() {
+                        // Report if the metric value is suspiciously large
+                        if metric.value > (1u64 << 40) {
+                            error!("potentially abnormal metric value: {:?}", metric);
+                        }
+                    }
                }
            }
            Err(err) => {
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -437,6 +437,13 @@ paths:
          type: boolean
        description: |
          When true, skip calculation and only provide the model inputs (for debugging). Defaults to false.
+      - name: retention_period
+        in: query
+        required: false
+        schema:
+          type: integer
+        description: |
+          Override the default retention period (in bytes) used for size calculation.
    get:
      description: |
        Calculate tenant's size, which is a mixture of WAL (bytes) and logical_size (bytes).
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -7,19 +7,21 @@ use hyper::{Body, Request, Response, Uri};
 use metrics::launch_timestamp::LaunchTimestamp;
 use pageserver_api::models::DownloadRemoteLayersTaskSpawnRequest;
 use remote_storage::GenericRemoteStorage;
+use tenant_size_model::{SizeResult, StorageModel};
 use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::http::request::{get_request_param, must_get_query_param, parse_query_param};

 use super::models::{
    StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TenantInfo,
-    TimelineCreateRequest, TimelineInfo,
+    TimelineCreateRequest, TimelineGcRequest, TimelineInfo,
 };
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::pgdatadir_mapping::LsnForTimestamp;
 use crate::task_mgr::TaskKind;
 use crate::tenant::config::TenantConfOpt;
 use crate::tenant::mgr::TenantMapInsertError;
+use crate::tenant::size::ModelInputs;
 use crate::tenant::storage_layer::LayerAccessStatsReset;
 use crate::tenant::{PageReconstructError, Timeline};
 use crate::{config::PageServerConf, tenant::mgr};
@@ -38,7 +40,7 @@ use utils::{

 // Imports only used for testing APIs
 #[cfg(feature = "testing")]
-use super::models::{ConfigureFailpointsRequest, TimelineGcRequest};
+use super::models::ConfigureFailpointsRequest;

 struct State {
    conf: &'static PageServerConf,
@@ -479,11 +481,19 @@ async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiErro
 /// to debug any of the calculations. Requires `tenant_id` request parameter, supports
 /// `inputs_only=true|false` (default false) which supports debugging failure to calculate model
 /// values.
+///
+/// 'retention_period' query parameter overrides the cutoff that is used to calculate the size
+/// (only if it is shorter than the real cutoff).
+///
+/// Note: we don't update the cached size and prometheus metric here.
+/// The retention period might be different, and it's nice to have a method to just calculate it
+/// without modifying anything anyway.
 async fn tenant_size_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    check_permission(&request, Some(tenant_id))?;
-
    let inputs_only: Option<bool> = parse_query_param(&request, "inputs_only")?;
+    let retention_period: Option<u64> = parse_query_param(&request, "retention_period")?;
+    let headers = request.headers();

    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
    let tenant = mgr::get_tenant(tenant_id, true)
@@ -492,24 +502,29 @@ async fn tenant_size_handler(request: Request<Body>) -> Result<Response<Body>, A

    // this can be long operation
    let inputs = tenant
-        .gather_size_inputs(&ctx)
+        .gather_size_inputs(retention_period, &ctx)
        .await
        .map_err(ApiError::InternalServerError)?;

-    let size = if !inputs_only.unwrap_or(false) {
-        Some(
-            tenant
-                .calc_and_update_cached_synthetic_size(&inputs)
-                .map_err(ApiError::InternalServerError)?,
-        )
-    } else {
-        None
-    };
+    let mut sizes = None;
+    if !inputs_only.unwrap_or(false) {
+        let storage_model = inputs
+            .calculate_model()
+            .map_err(ApiError::InternalServerError)?;
+        let size = storage_model.calculate();

-    /// Private response type with the additional "unstable" `inputs` field.
-    ///
-    /// The type is described with `id` and `size` in the openapi_spec file, but the `inputs` is
-    /// intentionally left out. The type resides in the pageserver not to expose `ModelInputs`.
+        // If request header expects html, return html
+        if headers["Accept"] == "text/html" {
+            return synthetic_size_html_response(inputs, storage_model, size);
+        }
+        sizes = Some(size);
+    } else if headers["Accept"] == "text/html" {
+        return Err(ApiError::BadRequest(anyhow!(
+            "inputs_only parameter is incompatible with html output request"
+        )));
+    }
+
+    /// The type resides in the pageserver not to expose `ModelInputs`.
    #[serde_with::serde_as]
    #[derive(serde::Serialize)]
    struct TenantHistorySize {
@@ -519,6 +534,9 @@ async fn tenant_size_handler(request: Request<Body>) -> Result<Response<Body>, A
        ///
        /// Will be none if `?inputs_only=true` was given.
        size: Option<u64>,
+        /// Size of each segment used in the model.
+        /// Will be null if `?inputs_only=true` was given.
+        segment_sizes: Option<Vec<tenant_size_model::SegmentSizeResult>>,
        inputs: crate::tenant::size::ModelInputs,
    }

@@ -526,7 +544,8 @@ async fn tenant_size_handler(request: Request<Body>) -> Result<Response<Body>, A
        StatusCode::OK,
        TenantHistorySize {
            id: tenant_id,
-            size,
+            size: sizes.as_ref().map(|x| x.total_size),
+            segment_sizes: sizes.map(|x| x.segments),
            inputs,
        },
    )
@@ -591,6 +610,62 @@ async fn evict_timeline_layer_handler(request: Request<Body>) -> Result<Response
    }
 }

+/// Get tenant_size SVG graph along with the JSON data.
+fn synthetic_size_html_response(
+    inputs: ModelInputs,
+    storage_model: StorageModel,
+    sizes: SizeResult,
+) -> Result<Response<Body>, ApiError> {
+    let mut timeline_ids: Vec<String> = Vec::new();
+    let mut timeline_map: HashMap<TimelineId, usize> = HashMap::new();
+    for (index, ti) in inputs.timeline_inputs.iter().enumerate() {
+        timeline_map.insert(ti.timeline_id, index);
+        timeline_ids.push(ti.timeline_id.to_string());
+    }
+    let seg_to_branch: Vec<usize> = inputs
+        .segments
+        .iter()
+        .map(|seg| *timeline_map.get(&seg.timeline_id).unwrap())
+        .collect();
+
+    let svg =
+        tenant_size_model::svg::draw_svg(&storage_model, &timeline_ids, &seg_to_branch, &sizes)
+            .map_err(ApiError::InternalServerError)?;
+
+    let mut response = String::new();
+
+    use std::fmt::Write;
+    write!(response, "<html>\n<body>\n").unwrap();
+    write!(response, "<div>\n{svg}\n</div>").unwrap();
+    writeln!(response, "Project size: {}", sizes.total_size).unwrap();
+    writeln!(response, "<pre>").unwrap();
+    writeln!(
+        response,
+        "{}",
+        serde_json::to_string_pretty(&inputs).unwrap()
+    )
+    .unwrap();
+    writeln!(
+        response,
+        "{}",
+        serde_json::to_string_pretty(&sizes.segments).unwrap()
+    )
+    .unwrap();
+    writeln!(response, "</pre>").unwrap();
+    write!(response, "</body>\n</html>\n").unwrap();
+
+    html_response(StatusCode::OK, response)
+}
+
+pub fn html_response(status: StatusCode, data: String) -> Result<Response<Body>, ApiError> {
+    let response = Response::builder()
+        .status(status)
+        .header(hyper::header::CONTENT_TYPE, "text/html")
+        .body(Body::from(data.as_bytes().to_vec()))
+        .map_err(|e| ApiError::InternalServerError(e.into()))?;
+    Ok(response)
+}
+
 // Helper function to standardize the error messages we produce on bad durations
 //
 // Intended to be used with anyhow's `with_context`, e.g.:
@@ -850,7 +925,6 @@ async fn failpoints_handler(mut request: Request<Body>) -> Result<Response<Body>
 }

 // Run GC immediately on given timeline.
-#[cfg(feature = "testing")]
 async fn timeline_gc_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
@@ -954,6 +1028,17 @@ async fn active_timeline_of_active_tenant(
        .map_err(ApiError::NotFound)
 }

+async fn always_panic_handler(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    // Deliberately cause a panic to exercise the panic hook registered via std::panic::set_hook().
+    // For pageserver, the relevant panic hook is `tracing_panic_hook` , and the `sentry` crate's wrapper around it.
+    // Use catch_unwind to ensure that tokio nor hyper are distracted by our panic.
+    let query = req.uri().query();
+    let _ = std::panic::catch_unwind(|| {
+        panic!("unconditional panic for testing panic hook integration; request query: {query:?}")
+    });
+    json_response(StatusCode::NO_CONTENT, ())
+}
+
 async fn handler_404(_: Request<Body>) -> Result<Response<Body>, ApiError> {
    json_response(
        StatusCode::NOT_FOUND,
@@ -1019,7 +1104,7 @@ pub fn make_router(
        .get("/v1/tenant", tenant_list_handler)
        .post("/v1/tenant", tenant_create_handler)
        .get("/v1/tenant/:tenant_id", tenant_status)
-        .get("/v1/tenant/:tenant_id/size", tenant_size_handler)
+        .get("/v1/tenant/:tenant_id/synthetic_size", tenant_size_handler)
        .put("/v1/tenant/config", update_tenant_config_handler)
        .get("/v1/tenant/:tenant_id/config", get_tenant_config_handler)
        .get("/v1/tenant/:tenant_id/timeline", timeline_list_handler)
@@ -1038,7 +1123,7 @@ pub fn make_router(
        )
        .put(
            "/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc",
-            testing_api!("run timeline GC", timeline_gc_handler),
+            timeline_gc_handler,
        )
        .put(
            "/v1/tenant/:tenant_id/timeline/:timeline_id/compact",
@@ -1072,5 +1157,6 @@ pub fn make_router(
            "/v1/tenant/:tenant_id/timeline/:timeline_id/layer/:layer_file_name",
            evict_timeline_layer_handler,
        )
+        .get("/v1/panic", always_panic_handler)
        .any(handler_404))
 }
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -2418,6 +2418,9 @@ impl Tenant {
    #[instrument(skip_all, fields(tenant_id=%self.tenant_id))]
    pub async fn gather_size_inputs(
        &self,
+        // `max_retention_period` overrides the cutoff that is used to calculate the size
+        // (only if it is shorter than the real cutoff).
+        max_retention_period: Option<u64>,
        ctx: &RequestContext,
    ) -> anyhow::Result<size::ModelInputs> {
        let logical_sizes_at_once = self
@@ -2425,32 +2428,41 @@ impl Tenant {
            .concurrent_tenant_size_logical_size_queries
            .inner();

-        // TODO: Having a single mutex block concurrent reads is unfortunate, but since the queries
-        // are for testing/experimenting, we tolerate this.
+        // TODO: Having a single mutex block concurrent reads is not great for performance.
+        //
+        // But the only case where we need to run multiple of these at once is when we
+        // request a size for a tenant manually via API, while another background calculation
+        // is in progress (which is not a common case).
        //
        // See more for on the issue #2748 condenced out of the initial PR review.
        let mut shared_cache = self.cached_logical_sizes.lock().await;

-        size::gather_inputs(self, logical_sizes_at_once, &mut shared_cache, ctx).await
+        size::gather_inputs(
+            self,
+            logical_sizes_at_once,
+            max_retention_period,
+            &mut shared_cache,
+            ctx,
+        )
+        .await
    }

-    /// Calculate synthetic tenant size
+    /// Calculate synthetic tenant size and cache the result.
    /// This is periodically called by background worker.
    /// result is cached in tenant struct
    #[instrument(skip_all, fields(tenant_id=%self.tenant_id))]
    pub async fn calculate_synthetic_size(&self, ctx: &RequestContext) -> anyhow::Result<u64> {
-        let inputs = self.gather_size_inputs(ctx).await?;
+        let inputs = self.gather_size_inputs(None, ctx).await?;

-        self.calc_and_update_cached_synthetic_size(&inputs)
-    }
-
-    /// Calculate synthetic size , cache it and set metric value
-    pub fn calc_and_update_cached_synthetic_size(
-        &self,
-        inputs: &size::ModelInputs,
-    ) -> anyhow::Result<u64> {
        let size = inputs.calculate()?;

+        self.set_cached_synthetic_size(size);
+
+        Ok(size)
+    }
+
+    /// Cache given synthetic size and update the metric value
+    pub fn set_cached_synthetic_size(&self, size: u64) {
        self.cached_synthetic_tenant_size
            .store(size, Ordering::Relaxed);

@@ -2458,8 +2470,6 @@ impl Tenant {
            .get_metric_with_label_values(&[&self.tenant_id.to_string()])
            .unwrap()
            .set(size);
-
-        Ok(size)
    }

    pub fn get_cached_synthetic_size(&self) -> u64 {
--- a/pageserver/src/tenant/layer_map.rs
+++ b/pageserver/src/tenant/layer_map.rs
@@ -154,6 +154,12 @@ where
        expected: &Arc<L>,
        new: Arc<L>,
    ) -> anyhow::Result<Replacement<Arc<L>>> {
+        fail::fail_point!("layermap-replace-notfound", |_| Ok(
+            // this is not what happens if an L0 layer was not found a anyhow error but perhaps
+            // that should be changed. this is good enough to show a replacement failure.
+            Replacement::NotFound
+        ));
+
        self.layer_map.replace_historic_noflush(expected, new)
    }

--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -540,13 +540,11 @@ where
    }
 }

-#[cfg(feature = "testing")]
 use {
    crate::repository::GcResult, pageserver_api::models::TimelineGcRequest,
    utils::http::error::ApiError,
 };

-#[cfg(feature = "testing")]
 pub async fn immediate_gc(
    tenant_id: TenantId,
    timeline_id: TimelineId,
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -571,14 +571,15 @@ impl RemoteTimelineClient {
        Ok(())
    }

-    ///
    /// Launch a delete operation in the background.
    ///
+    /// The operation does not modify local state but assumes the local files have already been
+    /// deleted, and is used to mirror those changes to remote.
+    ///
    /// Note: This schedules an index file upload before the deletions.  The
    /// deletion won't actually be performed, until any previously scheduled
    /// upload operations, and the index file upload, have completed
    /// succesfully.
-    ///
    pub fn schedule_layer_file_deletion(
        self: &Arc<Self>,
        names: &[LayerFileName],
--- a/pageserver/src/tenant/size.rs
+++ b/pageserver/src/tenant/size.rs
--- a/pageserver/src/tenant/storage_layer/remote_layer.rs
+++ b/pageserver/src/tenant/storage_layer/remote_layer.rs
@@ -49,6 +49,17 @@ pub struct RemoteLayer {
    access_stats: LayerAccessStats,

    pub(crate) ongoing_download: Arc<tokio::sync::Semaphore>,
+
+    /// Has `LayerMap::replace` failed for this (true) or not (false).
+    ///
+    /// Used together with [`ongoing_download`] semaphore in `Timeline::download_remote_layer`.
+    /// The field is used to mark a RemoteLayer permanently (until restart or ignore+load)
+    /// unprocessable, because a LayerMap::replace failed.
+    ///
+    /// It is very unlikely to accumulate these in the Timeline's LayerMap, but having this avoids
+    /// a possible fast loop between `Timeline::get_reconstruct_data` and
+    /// `Timeline::download_remote_layer`, which also logs.
+    pub(crate) download_replacement_failure: std::sync::atomic::AtomicBool,
 }

 impl std::fmt::Debug for RemoteLayer {
@@ -207,6 +218,7 @@ impl RemoteLayer {
            file_name: fname.to_owned().into(),
            layer_metadata: layer_metadata.clone(),
            ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)),
+            download_replacement_failure: std::sync::atomic::AtomicBool::default(),
            access_stats,
        }
    }
@@ -228,6 +240,7 @@ impl RemoteLayer {
            file_name: fname.to_owned().into(),
            layer_metadata: layer_metadata.clone(),
            ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)),
+            download_replacement_failure: std::sync::atomic::AtomicBool::default(),
            access_stats,
        }
    }
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -3,7 +3,7 @@

 use std::ops::ControlFlow;
 use std::sync::Arc;
-use std::time::Duration;
+use std::time::{Duration, Instant};

 use crate::context::{DownloadBehavior, RequestContext};
 use crate::metrics::TENANT_TASK_EVENTS;
@@ -11,6 +11,7 @@ use crate::task_mgr;
 use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME};
 use crate::tenant::mgr;
 use crate::tenant::{Tenant, TenantState};
+use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::id::TenantId;

@@ -53,37 +54,55 @@ async fn compaction_loop(tenant_id: TenantId) {
    info!("starting");
    TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
    async {
+        let cancel = task_mgr::shutdown_token();
        let ctx = RequestContext::todo_child(TaskKind::Compaction, DownloadBehavior::Download);
+        let mut first = true;
        loop {
            trace!("waking up");

            let tenant = tokio::select! {
-                _ = task_mgr::shutdown_watcher() => {
+                _ = cancel.cancelled() => {
                    info!("received cancellation request");
-                return;
+                    return;
                },
                tenant_wait_result = wait_for_active_tenant(tenant_id, wait_duration) => match tenant_wait_result {
                    ControlFlow::Break(()) => return,
                    ControlFlow::Continue(tenant) => tenant,
                },
-        };
+            };

-            let mut sleep_duration = tenant.get_compaction_period();
-            if sleep_duration == Duration::ZERO {
-                info!("automatic compaction is disabled");
-                // check again in 10 seconds, in case it's been enabled again.
-                sleep_duration = Duration::from_secs(10);
-            } else {
-                // Run compaction
-                if let Err(e) = tenant.compaction_iteration(&ctx).await {
-                    sleep_duration = wait_duration;
-                    error!("Compaction failed, retrying in {:?}: {e:?}", sleep_duration);
+            let period = tenant.get_compaction_period();
+
+            // TODO: we shouldn't need to await to find tenant and this could be moved outside of
+            // loop
+            if first {
+                first = false;
+                if random_init_delay(period, &cancel).await.is_err() {
+                    break;
                }
            }

+            let started_at = Instant::now();
+
+            let sleep_duration = if period == Duration::ZERO {
+                info!("automatic compaction is disabled");
+                // check again in 10 seconds, in case it's been enabled again.
+                Duration::from_secs(10)
+            } else {
+                // Run compaction
+                if let Err(e) = tenant.compaction_iteration(&ctx).await {
+                    error!("Compaction failed, retrying in {:?}: {e:?}", wait_duration);
+                    wait_duration
+                } else {
+                    period
+                }
+            };
+
+            warn_when_period_overrun(started_at.elapsed(), period, "compaction");
+
            // Sleep
            tokio::select! {
-                _ = task_mgr::shutdown_watcher() => {
+                _ = cancel.cancelled() => {
                    info!("received cancellation request during idling");
                    break;
                },
@@ -105,14 +124,16 @@ async fn gc_loop(tenant_id: TenantId) {
    info!("starting");
    TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
    async {
+        let cancel = task_mgr::shutdown_token();
        // GC might require downloading, to find the cutoff LSN that corresponds to the
        // cutoff specified as time.
        let ctx = RequestContext::todo_child(TaskKind::GarbageCollector, DownloadBehavior::Download);
+        let mut first = true;
        loop {
            trace!("waking up");

            let tenant = tokio::select! {
-                _ = task_mgr::shutdown_watcher() => {
+                _ = cancel.cancelled() => {
                    info!("received cancellation request");
                    return;
                },
@@ -122,27 +143,38 @@ async fn gc_loop(tenant_id: TenantId) {
                },
            };

-            let gc_period = tenant.get_gc_period();
-            let gc_horizon = tenant.get_gc_horizon();
-            let mut sleep_duration = gc_period;
-            if sleep_duration == Duration::ZERO {
-                info!("automatic GC is disabled");
-                // check again in 10 seconds, in case it's been enabled again.
-                sleep_duration = Duration::from_secs(10);
-            } else {
-                // Run gc
-                if gc_horizon > 0 {
-                    if let Err(e) = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &ctx).await
-                    {
-                        sleep_duration = wait_duration;
-                        error!("Gc failed, retrying in {:?}: {e:?}", sleep_duration);
-                    }
+            let period = tenant.get_gc_period();
+
+            if first {
+                first = false;
+                if random_init_delay(period, &cancel).await.is_err() {
+                    break;
                }
            }

+            let started_at = Instant::now();
+
+            let gc_horizon = tenant.get_gc_horizon();
+            let sleep_duration = if period == Duration::ZERO || gc_horizon == 0 {
+                info!("automatic GC is disabled");
+                // check again in 10 seconds, in case it's been enabled again.
+                Duration::from_secs(10)
+            } else {
+                // Run gc
+                let res = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &ctx).await;
+                if let Err(e) = res {
+                    error!("Gc failed, retrying in {:?}: {e:?}", wait_duration);
+                    wait_duration
+                } else {
+                    period
+                }
+            };
+
+            warn_when_period_overrun(started_at.elapsed(), period, "gc");
+
            // Sleep
            tokio::select! {
-                _ = task_mgr::shutdown_watcher() => {
+                _ = cancel.cancelled() => {
                    info!("received cancellation request during idling");
                    break;
                },
@@ -197,3 +229,49 @@ async fn wait_for_active_tenant(
        }
    }
 }
+
+#[derive(thiserror::Error, Debug)]
+#[error("cancelled")]
+pub(crate) struct Cancelled;
+
+/// Provide a random delay for background task initialization.
+///
+/// This delay prevents a thundering herd of background tasks and will likely keep them running on
+/// different periods for more stable load.
+pub(crate) async fn random_init_delay(
+    period: Duration,
+    cancel: &CancellationToken,
+) -> Result<(), Cancelled> {
+    use rand::Rng;
+
+    let d = {
+        let mut rng = rand::thread_rng();
+
+        // gen_range asserts that the range cannot be empty, which it could be because period can
+        // be set to zero to disable gc or compaction, so lets set it to be at least 10s.
+        let period = std::cmp::max(period, Duration::from_secs(10));
+
+        // semi-ok default as the source of jitter
+        rng.gen_range(Duration::ZERO..=period)
+    };
+
+    tokio::select! {
+        _ = cancel.cancelled() => Err(Cancelled),
+        _ = tokio::time::sleep(d) => Ok(()),
+    }
+}
+
+pub(crate) fn warn_when_period_overrun(elapsed: Duration, period: Duration, task: &str) {
+    // Duration::ZERO will happen because it's the "disable [bgtask]" value.
+    if elapsed >= period && period != Duration::ZERO {
+        // humantime does no significant digits clamping whereas Duration's debug is a bit more
+        // intelligent. however it makes sense to keep the "configuration format" for period, even
+        // though there's no way to output the actual config value.
+        warn!(
+            ?elapsed,
+            period = %humantime::format_duration(period),
+            task,
+            "task iteration took longer than the configured period"
+        );
+    }
+}
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -613,7 +613,10 @@ impl Timeline {
        self.flush_frozen_layers_and_wait().await
    }

+    /// Outermost timeline compaction operation; downloads needed layers.
    pub async fn compact(&self, ctx: &RequestContext) -> anyhow::Result<()> {
+        const ROUNDS: usize = 2;
+
        let last_record_lsn = self.get_last_record_lsn();

        // Last record Lsn could be zero in case the timeline was just created
@@ -622,6 +625,86 @@ impl Timeline {
            return Ok(());
        }

+        // retry two times to allow first round to find layers which need to be downloaded, then
+        // download them, then retry compaction
+        for round in 0..ROUNDS {
+            // should we error out with the most specific error?
+            let last_round = round == ROUNDS - 1;
+
+            let res = self.compact_inner(ctx).await;
+
+            // If `create_image_layers' or `compact_level0` scheduled any
+            // uploads or deletions, but didn't update the index file yet,
+            // do it now.
+            //
+            // This isn't necessary for correctness, the remote state is
+            // consistent without the uploads and deletions, and we would
+            // update the index file on next flush iteration too. But it
+            // could take a while until that happens.
+            //
+            // Additionally, only do this on the terminal round before sleeping.
+            if last_round {
+                if let Some(remote_client) = &self.remote_client {
+                    remote_client.schedule_index_upload_for_file_changes()?;
+                }
+            }
+
+            let rls = match res {
+                Ok(()) => return Ok(()),
+                Err(CompactionError::DownloadRequired(rls)) if !last_round => {
+                    // this can be done at most one time before exiting, waiting
+                    rls
+                }
+                Err(CompactionError::DownloadRequired(rls)) => {
+                    anyhow::bail!("Compaction requires downloading multiple times (last was {} layers), possibly battling against eviction", rls.len())
+                }
+                Err(CompactionError::Other(e)) => {
+                    return Err(e);
+                }
+            };
+
+            // this path can be visited in the second round of retrying, if first one found that we
+            // must first download some remote layers
+            let total = rls.len();
+
+            let mut downloads = rls
+                .into_iter()
+                .map(|rl| self.download_remote_layer(rl))
+                .collect::<futures::stream::FuturesUnordered<_>>();
+
+            let mut failed = 0;
+
+            let cancelled = task_mgr::shutdown_watcher();
+            tokio::pin!(cancelled);
+
+            loop {
+                tokio::select! {
+                    _ = &mut cancelled => anyhow::bail!("Cancelled while downloading remote layers"),
+                    res = downloads.next() => {
+                        match res {
+                            Some(Ok(())) => {},
+                            Some(Err(e)) => {
+                                warn!("Downloading remote layer for compaction failed: {e:#}");
+                                failed += 1;
+                            }
+                            None => break,
+                        }
+                    }
+                }
+            }
+
+            if failed != 0 {
+                anyhow::bail!("{failed} out of {total} layers failed to download, retrying later");
+            }
+
+            // if everything downloaded fine, lets try again
+        }
+
+        unreachable!("retry loop exits")
+    }
+
+    /// Compaction which might need to be retried after downloading remote layers.
+    async fn compact_inner(&self, ctx: &RequestContext) -> Result<(), CompactionError> {
        //
        // High level strategy for compaction / image creation:
        //
@@ -660,7 +743,7 @@ impl Timeline {
        // Is the timeline being deleted?
        let state = *self.state.borrow();
        if state == TimelineState::Stopping {
-            anyhow::bail!("timeline is Stopping");
+            return Err(anyhow::anyhow!("timeline is Stopping").into());
        }

        let target_file_size = self.get_checkpoint_distance();
@@ -680,7 +763,8 @@ impl Timeline {
                // "enough".
                let layer_paths_to_upload = self
                    .create_image_layers(&partitioning, lsn, false, ctx)
-                    .await?;
+                    .await
+                    .map_err(anyhow::Error::from)?;
                if let Some(remote_client) = &self.remote_client {
                    for (path, layer_metadata) in layer_paths_to_upload {
                        remote_client.schedule_layer_file_upload(&path, &layer_metadata)?;
@@ -692,18 +776,6 @@ impl Timeline {
                self.compact_level0(&layer_removal_cs, target_file_size, ctx)
                    .await?;
                timer.stop_and_record();
-
-                // If `create_image_layers' or `compact_level0` scheduled any
-                // uploads or deletions, but didn't update the index file yet,
-                // do it now.
-                //
-                // This isn't necessary for correctness, the remote state is
-                // consistent without the uploads and deletions, and we would
-                // update the index file on next flush iteration too. But it
-                // could take a while until that happens.
-                if let Some(remote_client) = &self.remote_client {
-                    remote_client.schedule_index_upload_for_file_changes()?;
-                }
            }
            Err(err) => {
                // no partitioning? This is normal, if the timeline was just created
@@ -2536,10 +2608,13 @@ impl Timeline {
    ) -> anyhow::Result<(KeyPartitioning, Lsn)> {
        {
            let partitioning_guard = self.partitioning.lock().unwrap();
-            if partitioning_guard.1 != Lsn(0)
-                && lsn.0 - partitioning_guard.1 .0 <= self.repartition_threshold
-            {
-                // no repartitioning needed
+            let distance = lsn.0 - partitioning_guard.1 .0;
+            if partitioning_guard.1 != Lsn(0) && distance <= self.repartition_threshold {
+                debug!(
+                    distance,
+                    threshold = self.repartition_threshold,
+                    "no repartitioning needed"
+                );
                return Ok((partitioning_guard.0.clone(), partitioning_guard.1));
            }
        }
@@ -2557,8 +2632,12 @@ impl Timeline {

    // Is it time to create a new image layer for the given partition?
    fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> anyhow::Result<bool> {
+        let threshold = self.get_image_creation_threshold();
+
        let layers = self.layers.read().unwrap();

+        let mut max_deltas = 0;
+
        for part_range in &partition.ranges {
            let image_coverage = layers.image_coverage(part_range, lsn)?;
            for (img_range, last_img) in image_coverage {
@@ -2580,21 +2659,25 @@ impl Timeline {
                // are some delta layers *later* than current 'lsn', if more WAL was processed and flushed
                // after we read last_record_lsn, which is passed here in the 'lsn' argument.
                if img_lsn < lsn {
-                    let threshold = self.get_image_creation_threshold();
                    let num_deltas =
                        layers.count_deltas(&img_range, &(img_lsn..lsn), Some(threshold))?;

-                    debug!(
-                        "key range {}-{}, has {} deltas on this timeline in LSN range {}..{}",
-                        img_range.start, img_range.end, num_deltas, img_lsn, lsn
-                    );
+                    max_deltas = max_deltas.max(num_deltas);
                    if num_deltas >= threshold {
+                        debug!(
+                            "key range {}-{}, has {} deltas on this timeline in LSN range {}..{}",
+                            img_range.start, img_range.end, num_deltas, img_lsn, lsn
+                        );
                        return Ok(true);
                    }
                }
            }
        }

+        debug!(
+            max_deltas,
+            "none of the partitioned ranges had >= {threshold} deltas"
+        );
        Ok(false)
    }

@@ -2707,25 +2790,55 @@ impl Timeline {
        Ok(layer_paths_to_upload)
    }
 }
+
 #[derive(Default)]
 struct CompactLevel0Phase1Result {
    new_layers: Vec<DeltaLayer>,
    deltas_to_compact: Vec<Arc<dyn PersistentLayer>>,
 }

+/// Top-level failure to compact.
+#[derive(Debug)]
+enum CompactionError {
+    /// L0 compaction requires layers to be downloaded.
+    ///
+    /// This should not happen repeatedly, but will be retried once by top-level
+    /// `Timeline::compact`.
+    DownloadRequired(Vec<Arc<RemoteLayer>>),
+    /// Compaction cannot be done right now; page reconstruction and so on.
+    Other(anyhow::Error),
+}
+
+impl From<anyhow::Error> for CompactionError {
+    fn from(value: anyhow::Error) -> Self {
+        CompactionError::Other(value)
+    }
+}
+
 impl Timeline {
+    /// Level0 files first phase of compaction, explained in the [`compact_inner`] comment.
+    ///
+    /// This method takes the `_layer_removal_cs` guard to highlight it required downloads are
+    /// returned as an error. If the `layer_removal_cs` boundary is changed not to be taken in the
+    /// start of level0 files compaction, the on-demand download should be revisited as well.
    async fn compact_level0_phase1(
        &self,
+        _layer_removal_cs: &tokio::sync::MutexGuard<'_, ()>,
        target_file_size: u64,
        ctx: &RequestContext,
-    ) -> anyhow::Result<CompactLevel0Phase1Result> {
+    ) -> Result<CompactLevel0Phase1Result, CompactionError> {
        let layers = self.layers.read().unwrap();
        let mut level0_deltas = layers.get_level0_deltas()?;
        drop(layers);

        // Only compact if enough layers have accumulated.
-        if level0_deltas.is_empty() || level0_deltas.len() < self.get_compaction_threshold() {
-            return Ok(Default::default());
+        let threshold = self.get_compaction_threshold();
+        if level0_deltas.is_empty() || level0_deltas.len() < threshold {
+            debug!(
+                level0_deltas = level0_deltas.len(),
+                threshold, "too few deltas to compact"
+            );
+            return Ok(CompactLevel0Phase1Result::default());
        }

        // Gather the files to compact in this iteration.
@@ -2761,6 +2874,24 @@ impl Timeline {
            end: deltas_to_compact.last().unwrap().get_lsn_range().end,
        };

+        let remotes = deltas_to_compact
+            .iter()
+            .filter(|l| l.is_remote_layer())
+            .inspect(|l| info!("compact requires download of {}", l.filename().file_name()))
+            .map(|l| {
+                l.clone()
+                    .downcast_remote_layer()
+                    .expect("just checked it is remote layer")
+            })
+            .collect::<Vec<_>>();
+
+        if !remotes.is_empty() {
+            // caller is holding the lock to layer_removal_cs, and we don't want to download while
+            // holding that; in future download_remote_layer might take it as well. this is
+            // regardless of earlier image creation downloading on-demand, while holding the lock.
+            return Err(CompactionError::DownloadRequired(remotes));
+        }
+
        info!(
            "Starting Level0 compaction in LSN range {}-{} for {} layers ({} deltas in total)",
            lsn_range.start,
@@ -2768,9 +2899,11 @@ impl Timeline {
            deltas_to_compact.len(),
            level0_deltas.len()
        );
+
        for l in deltas_to_compact.iter() {
            info!("compact includes {}", l.filename().file_name());
        }
+
        // We don't need the original list of layers anymore. Drop it so that
        // we don't accidentally use it later in the function.
        drop(level0_deltas);
@@ -2940,7 +3073,9 @@ impl Timeline {
            }

            fail_point!("delta-layer-writer-fail-before-finish", |_| {
-                anyhow::bail!("failpoint delta-layer-writer-fail-before-finish");
+                return Err(
+                    anyhow::anyhow!("failpoint delta-layer-writer-fail-before-finish").into(),
+                );
            });

            writer.as_mut().unwrap().put_value(key, lsn, value)?;
@@ -2959,7 +3094,7 @@ impl Timeline {

            // Fsync all the layer files and directory using multiple threads to
            // minimize latency.
-            par_fsync::par_fsync(&layer_paths)?;
+            par_fsync::par_fsync(&layer_paths).context("fsync all new layers")?;

            layer_paths.pop().unwrap();
        }
@@ -2981,11 +3116,13 @@ impl Timeline {
        layer_removal_cs: &tokio::sync::MutexGuard<'_, ()>,
        target_file_size: u64,
        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), CompactionError> {
        let CompactLevel0Phase1Result {
            new_layers,
            deltas_to_compact,
-        } = self.compact_level0_phase1(target_file_size, ctx).await?;
+        } = self
+            .compact_level0_phase1(layer_removal_cs, target_file_size, ctx)
+            .await?;

        if new_layers.is_empty() && deltas_to_compact.is_empty() {
            // nothing to do
@@ -3009,7 +3146,12 @@ impl Timeline {
        for l in new_layers {
            let new_delta_path = l.path();

-            let metadata = new_delta_path.metadata()?;
+            let metadata = new_delta_path.metadata().with_context(|| {
+                format!(
+                    "read file metadata for new created layer {}",
+                    new_delta_path.display()
+                )
+            })?;

            if let Some(remote_client) = &self.remote_client {
                remote_client.schedule_layer_file_upload(
@@ -3243,7 +3385,7 @@ impl Timeline {

        let mut layers_to_remove = Vec::new();

-        // Scan all on-disk layers in the timeline.
+        // Scan all layers in the timeline (remote or on-disk).
        //
        // Garbage collect the layer if all conditions are satisfied:
        // 1. it is older than cutoff LSN;
@@ -3482,14 +3624,26 @@ impl Timeline {
        &self,
        remote_layer: Arc<RemoteLayer>,
    ) -> anyhow::Result<()> {
+        use std::sync::atomic::Ordering::Relaxed;
+
        let permit = match Arc::clone(&remote_layer.ongoing_download)
            .acquire_owned()
            .await
        {
            Ok(permit) => permit,
            Err(_closed) => {
-                info!("download of layer has already finished");
-                return Ok(());
+                if remote_layer.download_replacement_failure.load(Relaxed) {
+                    // this path will be hit often, in case there are upper retries. however
+                    // hitting this error will prevent a busy loop between get_reconstruct_data and
+                    // download, so an error is prefered.
+                    //
+                    // TODO: we really should poison the timeline, but panicking is not yet
+                    // supported. Related: https://github.com/neondatabase/neon/issues/3621
+                    anyhow::bail!("an earlier download succeeded but LayerMap::replace failed")
+                } else {
+                    info!("download of layer has already finished");
+                    return Ok(());
+                }
            }
        };

@@ -3525,8 +3679,8 @@ impl Timeline {
                    {
                        use crate::tenant::layer_map::Replacement;
                        let l: Arc<dyn PersistentLayer> = remote_layer.clone();
-                        match updates.replace_historic(&l, new_layer) {
-                            Ok(Replacement::Replaced { .. }) => { /* expected */ }
+                        let failure = match updates.replace_historic(&l, new_layer) {
+                            Ok(Replacement::Replaced { .. }) => false,
                            Ok(Replacement::NotFound) => {
                                // TODO: the downloaded file should probably be removed, otherwise
                                // it will be added to the layermap on next load? we should
@@ -3534,6 +3688,7 @@ impl Timeline {
                                //
                                // See: https://github.com/neondatabase/neon/issues/3533
                                error!("replacing downloaded layer into layermap failed because layer was not found");
+                                true
                            }
                            Ok(Replacement::RemovalBuffered) => {
                                unreachable!("current implementation does not remove anything")
@@ -3552,12 +3707,32 @@ impl Timeline {
                                    ?other,
                                    "replacing downloaded layer into layermap failed because another layer was found instead of expected"
                                );
+                                true
                            }
                            Err(e) => {
                                // this is a precondition failure, the layer filename derived
                                // attributes didn't match up, which doesn't seem likely.
-                                error!("replacing downloaded layer into layermap failed: {e:#?}")
+                                error!("replacing downloaded layer into layermap failed: {e:#?}");
+                                true
                            }
+                        };
+
+                        if failure {
+                            // mark the remote layer permanently failed; the timeline is most
+                            // likely unusable after this. sadly we cannot just poison the layermap
+                            // lock with panic, because that would create an issue with shutdown.
+                            //
+                            // this does not change the retry semantics on failed downloads.
+                            //
+                            // use of Relaxed is valid because closing of the semaphore gives
+                            // happens-before and wakes up any waiters; we write this value before
+                            // and any waiters (or would be waiters) will load it after closing
+                            // semaphore.
+                            //
+                            // See: https://github.com/neondatabase/neon/issues/3533
+                            remote_layer
+                                .download_replacement_failure
+                                .store(true, Relaxed);
                        }
                    }
                    updates.flush();
@@ -3583,7 +3758,7 @@ impl Timeline {
                drop(permit);

                Ok(())
-            },
+            }.in_current_span(),
        );

        receiver.await.context("download task cancelled")?
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -41,9 +41,23 @@ impl Timeline {

    #[instrument(skip_all, fields(tenant_id = %self.tenant_id, timeline_id = %self.timeline_id))]
    async fn eviction_task(self: Arc<Self>, cancel: CancellationToken) {
+        use crate::tenant::tasks::random_init_delay;
+        {
+            let policy = self.get_eviction_policy();
+            let period = match policy {
+                EvictionPolicy::LayerAccessThreshold(lat) => lat.period,
+                EvictionPolicy::NoEviction => Duration::from_secs(10),
+            };
+            if random_init_delay(period, &cancel).await.is_err() {
+                info!("shutting down");
+                return;
+            }
+        }
+
        loop {
            let policy = self.get_eviction_policy();
            let cf = self.eviction_iteration(&policy, cancel.clone()).await;
+
            match cf {
                ControlFlow::Break(()) => break,
                ControlFlow::Continue(sleep_until) => {
@@ -78,13 +92,7 @@ impl Timeline {
                    ControlFlow::Continue(()) => (),
                }
                let elapsed = start.elapsed();
-                if elapsed > p.period {
-                    warn!(
-                        configured_period = %humantime::format_duration(p.period),
-                        last_period = %humantime::format_duration(elapsed),
-                        "this eviction period took longer than the configured period"
-                    );
-                }
+                crate::tenant::tasks::warn_when_period_overrun(elapsed, p.period, "eviction");
                ControlFlow::Continue(start + p.period)
            }
        }
@@ -100,7 +108,6 @@ impl Timeline {
        #[allow(dead_code)]
        #[derive(Debug, Default)]
        struct EvictionStats {
-            not_considered_due_to_clock_skew: usize,
            candidates: usize,
            evicted: usize,
            errors: usize,
@@ -129,9 +136,21 @@ impl Timeline {
                let no_activity_for = match now.duration_since(last_activity_ts) {
                    Ok(d) => d,
                    Err(_e) => {
-                        // NB: don't log the error. If there are many layers and the system clock
-                        // is skewed, we'd be flooding the log.
-                        stats.not_considered_due_to_clock_skew += 1;
+                        // We reach here if `now` < `last_activity_ts`, which can legitimately
+                        // happen if there is an access between us getting `now`, and us getting
+                        // the access stats from the layer.
+                        //
+                        // The other reason why it can happen is system clock skew because
+                        // SystemTime::now() is not monotonic, so, even if there is no access
+                        // to the layer after we get `now` at the beginning of this function,
+                        // it could be that `now`  < `last_activity_ts`.
+                        //
+                        // To distinguish the cases, we would need to record `Instant`s in the
+                        // access stats (i.e., monotonic timestamps), but then, the timestamps
+                        // values in the access stats would need to be `Instant`'s, and hence
+                        // they would be meaningless outside of the pageserver process.
+                        // At the time of writing, the trade-off is that access stats are more
+                        // valuable than detecting clock skew.
                        continue;
                    }
                };
@@ -188,8 +207,9 @@ impl Timeline {
                }
            }
        }
-        if stats.not_considered_due_to_clock_skew > 0 || stats.errors > 0 || stats.not_evictable > 0
-        {
+        if stats.candidates == stats.not_evictable {
+            debug!(stats=?stats, "eviction iteration complete");
+        } else if stats.errors > 0 || stats.not_evictable > 0 {
            warn!(stats=?stats, "eviction iteration complete");
        } else {
            info!(stats=?stats, "eviction iteration complete");
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -27,8 +27,7 @@ use std::fs::OpenOptions;
 use std::io::prelude::*;
 use std::io::{Error, ErrorKind};
 use std::ops::{Deref, DerefMut};
-use std::os::fd::RawFd;
-use std::os::unix::io::AsRawFd;
+use std::os::unix::io::{AsRawFd, RawFd};
 use std::os::unix::prelude::CommandExt;
 use std::path::PathBuf;
 use std::process::Stdio;
--- a/poetry.lock
+++ b/poetry.lock
@@ -1238,7 +1238,6 @@ category = "main"
 optional = false
 python-versions = "*"
 files = [
-    {file = "junit-xml-1.9.tar.gz", hash = "sha256:de16a051990d4e25a3982b2dd9e89d671067548718866416faec14d9de56db9f"},
    {file = "junit_xml-1.9-py2.py3-none-any.whl", hash = "sha256:ec5ca1a55aefdd76d28fcc0b135251d156c7106fa979686a4b48d62b761b4732"},
 ]

@@ -1309,66 +1308,63 @@ files = [

 [[package]]
 name = "moto"
-version = "3.1.18"
-description = "A library that allows your python tests to easily mock out the boto library"
+version = "4.1.2"
+description = ""
 category = "main"
 optional = false
-python-versions = ">=3.6"
+python-versions = ">=3.7"
 files = [
-    {file = "moto-3.1.18-py3-none-any.whl", hash = "sha256:b6eb096e7880c46ac44d6d90988c0043e31462115cfdc913a0ee8f470bd9555c"},
-    {file = "moto-3.1.18.tar.gz", hash = "sha256:1e05276a62aa5a4aa821b441647c2cbaa2ea175388980b10d5de88d41b327cf7"},
+    {file = "moto-4.1.2-py2.py3-none-any.whl", hash = "sha256:1b361ece638c74a657325378a259276f368aafce2f8be84f8143e69fa93ce8ec"},
+    {file = "moto-4.1.2.tar.gz", hash = "sha256:63431733d2a02c7bd652ad71ec1da442a0e0d580cbac5eeb50d440a2ce066eac"},
 ]

 [package.dependencies]
 aws-xray-sdk = {version = ">=0.93,<0.96 || >0.96", optional = true, markers = "extra == \"server\""}
 boto3 = ">=1.9.201"
 botocore = ">=1.12.201"
-cfn-lint = {version = ">=0.4.0", optional = true, markers = "extra == \"server\""}
+cfn-lint = {version = ">=0.40.0", optional = true, markers = "extra == \"server\""}
 cryptography = ">=3.3.1"
 docker = {version = ">=2.5.1", optional = true, markers = "extra == \"server\""}
 ecdsa = {version = "!=0.15", optional = true, markers = "extra == \"server\""}
-flask = {version = "<2.2.0", optional = true, markers = "extra == \"server\""}
+flask = {version = "<2.2.0 || >2.2.0,<2.2.1 || >2.2.1", optional = true, markers = "extra == \"server\""}
 flask-cors = {version = "*", optional = true, markers = "extra == \"server\""}
 graphql-core = {version = "*", optional = true, markers = "extra == \"server\""}
-idna = {version = ">=2.5,<4", optional = true, markers = "extra == \"server\""}
 Jinja2 = ">=2.10.1"
 jsondiff = {version = ">=1.1.2", optional = true, markers = "extra == \"server\""}
-MarkupSafe = "!=2.0.0a1"
 openapi-spec-validator = {version = ">=0.2.8", optional = true, markers = "extra == \"server\""}
 pyparsing = {version = ">=3.0.7", optional = true, markers = "extra == \"server\""}
 python-dateutil = ">=2.1,<3.0.0"
 python-jose = {version = ">=3.1.0,<4.0.0", extras = ["cryptography"], optional = true, markers = "extra == \"server\""}
-pytz = "*"
 PyYAML = {version = ">=5.1", optional = true, markers = "extra == \"server\""}
 requests = ">=2.5"
-responses = ">=0.9.0"
+responses = ">=0.13.0"
 setuptools = {version = "*", optional = true, markers = "extra == \"server\""}
 sshpubkeys = {version = ">=3.1.0", optional = true, markers = "extra == \"server\""}
-werkzeug = ">=0.5,<2.2.0"
+werkzeug = ">=0.5,<2.2.0 || >2.2.0,<2.2.1 || >2.2.1"
 xmltodict = "*"

 [package.extras]
-all = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.4.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "graphql-core", "idna (>=2.5,<4)", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"]
+all = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "graphql-core", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"]
 apigateway = ["PyYAML (>=5.1)", "ecdsa (!=0.15)", "openapi-spec-validator (>=0.2.8)", "python-jose[cryptography] (>=3.1.0,<4.0.0)"]
 apigatewayv2 = ["PyYAML (>=5.1)"]
 appsync = ["graphql-core"]
 awslambda = ["docker (>=2.5.1)"]
 batch = ["docker (>=2.5.1)"]
-cloudformation = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.4.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "graphql-core", "idna (>=2.5,<4)", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"]
+cloudformation = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "graphql-core", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"]
 cognitoidp = ["ecdsa (!=0.15)", "python-jose[cryptography] (>=3.1.0,<4.0.0)"]
 ds = ["sshpubkeys (>=3.1.0)"]
 dynamodb = ["docker (>=2.5.1)"]
-dynamodb2 = ["docker (>=2.5.1)"]
 dynamodbstreams = ["docker (>=2.5.1)"]
 ebs = ["sshpubkeys (>=3.1.0)"]
 ec2 = ["sshpubkeys (>=3.1.0)"]
 efs = ["sshpubkeys (>=3.1.0)"]
+eks = ["sshpubkeys (>=3.1.0)"]
 glue = ["pyparsing (>=3.0.7)"]
 iotdata = ["jsondiff (>=1.1.2)"]
 route53resolver = ["sshpubkeys (>=3.1.0)"]
 s3 = ["PyYAML (>=5.1)"]
-server = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.4.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "flask (<2.2.0)", "flask-cors", "graphql-core", "idna (>=2.5,<4)", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"]
-ssm = ["PyYAML (>=5.1)", "dataclasses"]
+server = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "flask (!=2.2.0,!=2.2.1)", "flask-cors", "graphql-core", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"]
+ssm = ["PyYAML (>=5.1)"]
 xray = ["aws-xray-sdk (>=0.93,!=0.96)", "setuptools"]

 [[package]]
@@ -1716,7 +1712,6 @@ python-versions = ">=3.6"
 files = [
    {file = "psycopg2-binary-2.9.3.tar.gz", hash = "sha256:761df5313dc15da1502b21453642d7599d26be88bff659382f8f9747c7ebea4e"},
    {file = "psycopg2_binary-2.9.3-cp310-cp310-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:539b28661b71da7c0e428692438efbcd048ca21ea81af618d845e06ebfd29478"},
-    {file = "psycopg2_binary-2.9.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2f2534ab7dc7e776a263b463a16e189eb30e85ec9bbe1bff9e78dae802608932"},
    {file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6e82d38390a03da28c7985b394ec3f56873174e2c88130e6966cb1c946508e65"},
    {file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:57804fc02ca3ce0dbfbef35c4b3a4a774da66d66ea20f4bda601294ad2ea6092"},
    {file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_24_aarch64.whl", hash = "sha256:083a55275f09a62b8ca4902dd11f4b33075b743cf0d360419e2051a8a5d5ff76"},
@@ -1750,7 +1745,6 @@ files = [
    {file = "psycopg2_binary-2.9.3-cp37-cp37m-win32.whl", hash = "sha256:adf20d9a67e0b6393eac162eb81fb10bc9130a80540f4df7e7355c2dd4af9fba"},
    {file = "psycopg2_binary-2.9.3-cp37-cp37m-win_amd64.whl", hash = "sha256:2f9ffd643bc7349eeb664eba8864d9e01f057880f510e4681ba40a6532f93c71"},
    {file = "psycopg2_binary-2.9.3-cp38-cp38-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:def68d7c21984b0f8218e8a15d514f714d96904265164f75f8d3a70f9c295667"},
-    {file = "psycopg2_binary-2.9.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:e6aa71ae45f952a2205377773e76f4e3f27951df38e69a4c95440c779e013560"},
    {file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dffc08ca91c9ac09008870c9eb77b00a46b3378719584059c034b8945e26b272"},
    {file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:280b0bb5cbfe8039205c7981cceb006156a675362a00fe29b16fbc264e242834"},
    {file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_24_aarch64.whl", hash = "sha256:af9813db73395fb1fc211bac696faea4ca9ef53f32dc0cfa27e4e7cf766dcf24"},
@@ -1762,7 +1756,6 @@ files = [
    {file = "psycopg2_binary-2.9.3-cp38-cp38-win32.whl", hash = "sha256:6472a178e291b59e7f16ab49ec8b4f3bdada0a879c68d3817ff0963e722a82ce"},
    {file = "psycopg2_binary-2.9.3-cp38-cp38-win_amd64.whl", hash = "sha256:35168209c9d51b145e459e05c31a9eaeffa9a6b0fd61689b48e07464ffd1a83e"},
    {file = "psycopg2_binary-2.9.3-cp39-cp39-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:47133f3f872faf28c1e87d4357220e809dfd3fa7c64295a4a148bcd1e6e34ec9"},
-    {file = "psycopg2_binary-2.9.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:b3a24a1982ae56461cc24f6680604fffa2c1b818e9dc55680da038792e004d18"},
    {file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:91920527dea30175cc02a1099f331aa8c1ba39bf8b7762b7b56cbf54bc5cce42"},
    {file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:887dd9aac71765ac0d0bac1d0d4b4f2c99d5f5c1382d8b770404f0f3d0ce8a39"},
    {file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_24_aarch64.whl", hash = "sha256:1f14c8b0942714eb3c74e1e71700cbbcb415acbc311c730370e70c578a44a25c"},
@@ -1795,7 +1788,18 @@ category = "main"
 optional = false
 python-versions = "*"
 files = [
+    {file = "pyasn1-0.4.8-py2.4.egg", hash = "sha256:fec3e9d8e36808a28efb59b489e4528c10ad0f480e57dcc32b4de5c9d8c9fdf3"},
+    {file = "pyasn1-0.4.8-py2.5.egg", hash = "sha256:0458773cfe65b153891ac249bcf1b5f8f320b7c2ce462151f8fa74de8934becf"},
+    {file = "pyasn1-0.4.8-py2.6.egg", hash = "sha256:5c9414dcfede6e441f7e8f81b43b34e834731003427e5b09e4e00e3172a10f00"},
+    {file = "pyasn1-0.4.8-py2.7.egg", hash = "sha256:6e7545f1a61025a4e58bb336952c5061697da694db1cae97b116e9c46abcf7c8"},
    {file = "pyasn1-0.4.8-py2.py3-none-any.whl", hash = "sha256:39c7e2ec30515947ff4e87fb6f456dfc6e84857d34be479c9d4a4ba4bf46aa5d"},
+    {file = "pyasn1-0.4.8-py3.1.egg", hash = "sha256:78fa6da68ed2727915c4767bb386ab32cdba863caa7dbe473eaae45f9959da86"},
+    {file = "pyasn1-0.4.8-py3.2.egg", hash = "sha256:08c3c53b75eaa48d71cf8c710312316392ed40899cb34710d092e96745a358b7"},
+    {file = "pyasn1-0.4.8-py3.3.egg", hash = "sha256:03840c999ba71680a131cfaee6fab142e1ed9bbd9c693e285cc6aca0d555e576"},
+    {file = "pyasn1-0.4.8-py3.4.egg", hash = "sha256:7ab8a544af125fb704feadb008c99a88805126fb525280b2270bb25cc1d78a12"},
+    {file = "pyasn1-0.4.8-py3.5.egg", hash = "sha256:e89bf84b5437b532b0803ba5c9a5e054d21fec423a89952a74f87fa2c9b7bce2"},
+    {file = "pyasn1-0.4.8-py3.6.egg", hash = "sha256:014c0e9976956a08139dc0712ae195324a75e142284d5f87f1a87ee1b068a359"},
+    {file = "pyasn1-0.4.8-py3.7.egg", hash = "sha256:99fcc3c8d804d1bc6d9a099921e39d827026409a58f2a720dcdb89374ea0c776"},
    {file = "pyasn1-0.4.8.tar.gz", hash = "sha256:aef77c9fb94a3ac588e87841208bdec464471d9871bd5050a287cc9a475cd0ba"},
 ]

@@ -2004,8 +2008,8 @@ files = [

 [package.dependencies]
 pytest = [
-    {version = ">=5.0", markers = "python_version < \"3.10\""},
    {version = ">=6.2.4", markers = "python_version >= \"3.10\""},
+    {version = ">=5.0", markers = "python_version < \"3.10\""},
 ]

 [[package]]
@@ -2082,18 +2086,6 @@ cryptography = ["cryptography (>=3.4.0)"]
 pycrypto = ["pyasn1", "pycrypto (>=2.6.0,<2.7.0)"]
 pycryptodome = ["pyasn1", "pycryptodome (>=3.3.1,<4.0.0)"]

-[[package]]
-name = "pytz"
-version = "2022.1"
-description = "World timezone definitions, modern and historical"
-category = "main"
-optional = false
-python-versions = "*"
-files = [
-    {file = "pytz-2022.1-py2.py3-none-any.whl", hash = "sha256:e68985985296d9a66a881eb3193b0906246245294a881e7c8afe623866ac6a5c"},
-    {file = "pytz-2022.1.tar.gz", hash = "sha256:1e760e2fe6a8163bc0b3d9a19c4f84342afa0a2affebfaa84b01b978a02ecaa7"},
-]
-
 [[package]]
 name = "pywin32"
 version = "301"
@@ -2129,13 +2121,6 @@ files = [
    {file = "PyYAML-6.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:f84fbc98b019fef2ee9a1cb3ce93e3187a6df0b2538a651bfb890254ba9f90b5"},
    {file = "PyYAML-6.0-cp310-cp310-win32.whl", hash = "sha256:2cd5df3de48857ed0544b34e2d40e9fac445930039f3cfe4bcc592a1f836d513"},
    {file = "PyYAML-6.0-cp310-cp310-win_amd64.whl", hash = "sha256:daf496c58a8c52083df09b80c860005194014c3698698d1a57cbcfa182142a3a"},
-    {file = "PyYAML-6.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:d4b0ba9512519522b118090257be113b9468d804b19d63c71dbcf4a48fa32358"},
-    {file = "PyYAML-6.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:81957921f441d50af23654aa6c5e5eaf9b06aba7f0a19c18a538dc7ef291c5a1"},
-    {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:afa17f5bc4d1b10afd4466fd3a44dc0e245382deca5b3c353d8b757f9e3ecb8d"},
-    {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dbad0e9d368bb989f4515da330b88a057617d16b6a8245084f1b05400f24609f"},
-    {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:432557aa2c09802be39460360ddffd48156e30721f5e8d917f01d31694216782"},
-    {file = "PyYAML-6.0-cp311-cp311-win32.whl", hash = "sha256:bfaef573a63ba8923503d27530362590ff4f576c626d86a9fed95822a8255fd7"},
-    {file = "PyYAML-6.0-cp311-cp311-win_amd64.whl", hash = "sha256:01b45c0191e6d66c470b6cf1b9531a771a83c1c4208272ead47a3ae4f2f603bf"},
    {file = "PyYAML-6.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:897b80890765f037df3403d22bab41627ca8811ae55e9a722fd0392850ec4d86"},
    {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:50602afada6d6cbfad699b0c7bb50d5ccffa7e46a3d738092afddc1f9758427f"},
    {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:48c346915c114f5fdb3ead70312bd042a953a8ce5c7106d5bfb1a5254e47da92"},
@@ -2449,16 +2434,19 @@ test = ["websockets"]

 [[package]]
 name = "werkzeug"
-version = "2.1.2"
+version = "2.2.3"
 description = "The comprehensive WSGI web application library."
 category = "main"
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "Werkzeug-2.1.2-py3-none-any.whl", hash = "sha256:72a4b735692dd3135217911cbeaa1be5fa3f62bffb8745c5215420a03dc55255"},
-    {file = "Werkzeug-2.1.2.tar.gz", hash = "sha256:1ce08e8093ed67d638d63879fd1ba3735817f7a80de3674d293f5984f25fb6e6"},
+    {file = "Werkzeug-2.2.3-py3-none-any.whl", hash = "sha256:56433961bc1f12533306c624f3be5e744389ac61d722175d543e1751285da612"},
+    {file = "Werkzeug-2.2.3.tar.gz", hash = "sha256:2e1ccc9417d4da358b9de6f174e3ac094391ea1d4fbef2d667865d819dfd0afe"},
 ]

+[package.dependencies]
+MarkupSafe = ">=2.1.1"
+
 [package.extras]
 watchdog = ["watchdog"]

@@ -2655,4 +2643,4 @@ testing = ["func-timeout", "jaraco.itertools", "pytest (>=6)", "pytest-black (>=
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "7563a38912963d8cf20c99acb06fe55623e65b799c4b88d37dc672e5384c96a3"
+content-hash = "3038940781ef59d1ed28cedf46120ad6623e21e602c38ad3c359428d79fa1efd"
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -28,6 +28,7 @@ itertools.workspace = true
 md5.workspace = true
 metrics.workspace = true
 once_cell.workspace = true
+opentelemetry.workspace = true
 parking_lot.workspace = true
 pin-project-lite.workspace = true
 pq_proto.workspace = true
@@ -35,6 +36,8 @@ prometheus.workspace = true
 rand.workspace = true
 regex.workspace = true
 reqwest = { workspace = true, features = ["json"] }
+reqwest-middleware.workspace = true
+reqwest-tracing.workspace = true
 routerify.workspace = true
 rustls-pemfile.workspace = true
 rustls.workspace = true
@@ -48,8 +51,10 @@ thiserror.workspace = true
 tls-listener.workspace = true
 tokio-postgres.workspace = true
 tokio-rustls.workspace = true
-tokio.workspace = true
+tokio = { workspace = true, features = ["signal"] }
+tracing-opentelemetry.workspace = true
 tracing-subscriber.workspace = true
+tracing-utils.workspace = true
 tracing.workspace = true
 url.workspace = true
 utils.workspace = true
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -1,11 +1,11 @@
 mod classic;
-
+mod hacks;
 mod link;
-use futures::TryFutureExt;
+
 pub use link::LinkAuthError;

 use crate::{
-    auth::{self, AuthFlow, ClientCredentials},
+    auth::{self, ClientCredentials},
    console::{
        self,
        provider::{CachedNodeInfo, ConsoleReqExtra},
@@ -13,9 +13,10 @@ use crate::{
    },
    stream, url,
 };
+use futures::TryFutureExt;
 use std::borrow::Cow;
 use tokio::io::{AsyncRead, AsyncWrite};
-use tracing::{info, warn};
+use tracing::info;

 /// A product of successful authentication.
 pub struct AuthSuccess<T> {
@@ -105,97 +106,49 @@ impl<'a, T, E> BackendType<'a, Result<T, E>> {
    }
 }

-// TODO: get rid of explicit lifetimes in this block (there's a bug in rustc).
-// Read more: https://github.com/rust-lang/rust/issues/99190
-// Alleged fix: https://github.com/rust-lang/rust/pull/89056
-impl<'l> BackendType<'l, ClientCredentials<'_>> {
-    /// Do something special if user didn't provide the `project` parameter.
-    async fn try_password_hack<'a>(
-        &'a mut self,
-        extra: &'a ConsoleReqExtra<'a>,
-        client: &'a mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
-    ) -> auth::Result<Option<AuthSuccess<CachedNodeInfo>>> {
-        use BackendType::*;
-
-        // If there's no project so far, that entails that client doesn't
-        // support SNI or other means of passing the project name.
-        // We now expect to see a very specific payload in the place of password.
-        let fetch_magic_payload = |client| async {
-            warn!("project name not specified, resorting to the password hack auth flow");
-            let payload = AuthFlow::new(client)
-                .begin(auth::PasswordHack)
-                .await?
-                .authenticate()
-                .await?;
-
-            info!(project = &payload.project, "received missing parameter");
-            auth::Result::Ok(payload)
-        };
-
-        // If we want to use cleartext password flow, we can read the password
-        // from the client and pretend that it's a magic payload (PasswordHack hack).
-        let fetch_plaintext_password = |client| async {
-            info!("using cleartext password flow");
-            let payload = AuthFlow::new(client)
-                .begin(auth::CleartextPassword)
-                .await?
-                .authenticate()
-                .await?;
-
-            auth::Result::Ok(auth::password_hack::PasswordHackPayload {
-                project: String::new(),
-                password: payload,
-            })
-        };
-
-        // TODO: find a proper way to merge those very similar blocks.
-        let (mut node, password) = match self {
-            Console(api, creds) if creds.project.is_none() => {
-                let payload = fetch_magic_payload(client).await?;
-                creds.project = Some(payload.project.into());
-                let node = api.wake_compute(extra, creds).await?;
-
-                (node, payload.password)
-            }
-            // This is a hack to allow cleartext password in secure connections (wss).
-            Console(api, creds) if creds.use_cleartext_password_flow => {
-                let payload = fetch_plaintext_password(client).await?;
-                let node = api.wake_compute(extra, creds).await?;
-
-                (node, payload.password)
-            }
-            Postgres(api, creds) if creds.project.is_none() => {
-                let payload = fetch_magic_payload(client).await?;
-                creds.project = Some(payload.project.into());
-                let node = api.wake_compute(extra, creds).await?;
-
-                (node, payload.password)
-            }
-            _ => return Ok(None),
-        };
-
-        node.config.password(password);
-        Ok(Some(AuthSuccess {
-            reported_auth_ok: false,
-            value: node,
-        }))
+/// True to its name, this function encapsulates our current auth trade-offs.
+/// Here, we choose the appropriate auth flow based on circumstances.
+async fn auth_quirks(
+    api: &impl console::Api,
+    extra: &ConsoleReqExtra<'_>,
+    creds: &mut ClientCredentials<'_>,
+    client: &mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
+    allow_cleartext: bool,
+) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
+    // If there's no project so far, that entails that client doesn't
+    // support SNI or other means of passing the endpoint (project) name.
+    // We now expect to see a very specific payload in the place of password.
+    if creds.project.is_none() {
+        // Password will be checked by the compute node later.
+        return hacks::password_hack(api, extra, creds, client).await;
    }

+    // Password hack should set the project name.
+    // TODO: make `creds.project` more type-safe.
+    assert!(creds.project.is_some());
+
+    // Perform cleartext auth if we're allowed to do that.
+    // Currently, we use it for websocket connections (latency).
+    if allow_cleartext {
+        // Password will be checked by the compute node later.
+        return hacks::cleartext_hack(api, extra, creds, client).await;
+    }
+
+    // Finally, proceed with the main auth flow (SCRAM-based).
+    classic::authenticate(api, extra, creds, client).await
+}
+
+impl BackendType<'_, ClientCredentials<'_>> {
    /// Authenticate the client via the requested backend, possibly using credentials.
-    pub async fn authenticate<'a>(
+    #[tracing::instrument(fields(allow_cleartext), skip_all)]
+    pub async fn authenticate(
        &mut self,
-        extra: &'a ConsoleReqExtra<'a>,
-        client: &'a mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
+        extra: &ConsoleReqExtra<'_>,
+        client: &mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
+        allow_cleartext: bool,
    ) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
        use BackendType::*;

-        // Handle cases when `project` is missing in `creds`.
-        // TODO: type safety: return `creds` with irrefutable `project`.
-        if let Some(res) = self.try_password_hack(extra, client).await? {
-            info!("user successfully authenticated (using the password hack)");
-            return Ok(res);
-        }
-
        let res = match self {
            Console(api, creds) => {
                info!(
@@ -204,20 +157,24 @@ impl<'l> BackendType<'l, ClientCredentials<'_>> {
                    "performing authentication using the console"
                );

-                assert!(creds.project.is_some());
-                classic::handle_user(api.as_ref(), extra, creds, client).await?
+                let api = api.as_ref();
+                auth_quirks(api, extra, creds, client, allow_cleartext).await?
            }
            Postgres(api, creds) => {
-                info!("performing mock authentication using a local postgres instance");
+                info!(
+                    user = creds.user,
+                    project = creds.project(),
+                    "performing authentication using a local postgres instance"
+                );

-                assert!(creds.project.is_some());
-                classic::handle_user(api.as_ref(), extra, creds, client).await?
+                let api = api.as_ref();
+                auth_quirks(api, extra, creds, client, allow_cleartext).await?
            }
            // NOTE: this auth backend doesn't use client credentials.
            Link(url) => {
                info!("performing link authentication");

-                link::handle_user(url, client)
+                link::authenticate(url, client)
                    .await?
                    .map(CachedNodeInfo::new_uncached)
            }
@@ -229,9 +186,9 @@ impl<'l> BackendType<'l, ClientCredentials<'_>> {

    /// When applicable, wake the compute node, gaining its connection info in the process.
    /// The link auth flow doesn't support this, so we return [`None`] in that case.
-    pub async fn wake_compute<'a>(
+    pub async fn wake_compute(
        &self,
-        extra: &'a ConsoleReqExtra<'a>,
+        extra: &ConsoleReqExtra<'_>,
    ) -> Result<Option<CachedNodeInfo>, console::errors::WakeComputeError> {
        use BackendType::*;

--- a/proxy/src/auth/backend/classic.rs
+++ b/proxy/src/auth/backend/classic.rs
@@ -9,7 +9,7 @@ use crate::{
 use tokio::io::{AsyncRead, AsyncWrite};
 use tracing::info;

-pub(super) async fn handle_user(
+pub(super) async fn authenticate(
    api: &impl console::Api,
    extra: &ConsoleReqExtra<'_>,
    creds: &ClientCredentials<'_>,
--- a/proxy/src/auth/backend/hacks.rs
+++ b/proxy/src/auth/backend/hacks.rs
@@ -0,0 +1,66 @@
+use super::AuthSuccess;
+use crate::{
+    auth::{self, AuthFlow, ClientCredentials},
+    console::{
+        self,
+        provider::{CachedNodeInfo, ConsoleReqExtra},
+    },
+    stream,
+};
+use tokio::io::{AsyncRead, AsyncWrite};
+use tracing::{info, warn};
+
+/// Compared to [SCRAM](crate::scram), cleartext password auth saves
+/// one round trip and *expensive* computations (>= 4096 HMAC iterations).
+/// These properties are benefical for serverless JS workers, so we
+/// use this mechanism for websocket connections.
+pub async fn cleartext_hack(
+    api: &impl console::Api,
+    extra: &ConsoleReqExtra<'_>,
+    creds: &mut ClientCredentials<'_>,
+    client: &mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
+) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
+    warn!("cleartext auth flow override is enabled, proceeding");
+    let password = AuthFlow::new(client)
+        .begin(auth::CleartextPassword)
+        .await?
+        .authenticate()
+        .await?;
+
+    let mut node = api.wake_compute(extra, creds).await?;
+    node.config.password(password);
+
+    // Report tentative success; compute node will check the password anyway.
+    Ok(AuthSuccess {
+        reported_auth_ok: false,
+        value: node,
+    })
+}
+
+/// Workaround for clients which don't provide an endpoint (project) name.
+/// Very similar to [`cleartext_hack`], but there's a specific password format.
+pub async fn password_hack(
+    api: &impl console::Api,
+    extra: &ConsoleReqExtra<'_>,
+    creds: &mut ClientCredentials<'_>,
+    client: &mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
+) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
+    warn!("project not specified, resorting to the password hack auth flow");
+    let payload = AuthFlow::new(client)
+        .begin(auth::PasswordHack)
+        .await?
+        .authenticate()
+        .await?;
+
+    info!(project = &payload.project, "received missing parameter");
+    creds.project = Some(payload.project.into());
+
+    let mut node = api.wake_compute(extra, creds).await?;
+    node.config.password(payload.password);
+
+    // Report tentative success; compute node will check the password anyway.
+    Ok(AuthSuccess {
+        reported_auth_ok: false,
+        value: node,
+    })
+}
--- a/proxy/src/auth/backend/link.rs
+++ b/proxy/src/auth/backend/link.rs
@@ -53,7 +53,7 @@ pub fn new_psql_session_id() -> String {
    hex::encode(rand::random::<[u8; 8]>())
 }

-pub(super) async fn handle_user(
+pub(super) async fn authenticate(
    link_uri: &reqwest::Url,
    client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
 ) -> auth::Result<AuthSuccess<NodeInfo>> {
--- a/proxy/src/auth/credentials.rs
+++ b/proxy/src/auth/credentials.rs
@@ -11,12 +11,16 @@ pub enum ClientCredsParseError {
    #[error("Parameter '{0}' is missing in startup packet.")]
    MissingKey(&'static str),

-    #[error("Inconsistent project name inferred from SNI ('{}') and project option ('{}').", .domain, .option)]
+    #[error(
+        "Inconsistent project name inferred from \
+         SNI ('{}') and project option ('{}').",
+        .domain, .option,
+    )]
    InconsistentProjectNames { domain: String, option: String },

    #[error(
        "SNI ('{}') inconsistently formatted with respect to common name ('{}'). \
-        SNI should be formatted as '<project-name>.{}'.",
+         SNI should be formatted as '<project-name>.{}'.",
        .sni, .cn, .cn,
    )]
    InconsistentSni { sni: String, cn: String },
@@ -34,9 +38,6 @@ pub struct ClientCredentials<'a> {
    pub user: &'a str,
    // TODO: this is a severe misnomer! We should think of a new name ASAP.
    pub project: Option<Cow<'a, str>>,
-    /// If `True`, we'll use the old cleartext password flow. This is used for
-    /// websocket connections, which want to minimize the number of round trips.
-    pub use_cleartext_password_flow: bool,
 }

 impl ClientCredentials<'_> {
@@ -51,7 +52,6 @@ impl<'a> ClientCredentials<'a> {
        params: &'a StartupMessageParams,
        sni: Option<&str>,
        common_name: Option<&str>,
-        use_cleartext_password_flow: bool,
    ) -> Result<Self, ClientCredsParseError> {
        use ClientCredsParseError::*;

@@ -96,18 +96,9 @@ impl<'a> ClientCredentials<'a> {
        }
        .transpose()?;

-        info!(
-            user = user,
-            project = project.as_deref(),
-            use_cleartext_password_flow = use_cleartext_password_flow,
-            "credentials"
-        );
+        info!(user, project = project.as_deref(), "credentials");

-        Ok(Self {
-            user,
-            project,
-            use_cleartext_password_flow,
-        })
+        Ok(Self { user, project })
    }
 }

@@ -131,7 +122,7 @@ mod tests {
        // According to postgresql, only `user` should be required.
        let options = StartupMessageParams::new([("user", "john_doe")]);

-        let creds = ClientCredentials::parse(&options, None, None, false)?;
+        let creds = ClientCredentials::parse(&options, None, None)?;
        assert_eq!(creds.user, "john_doe");
        assert_eq!(creds.project, None);

@@ -146,7 +137,7 @@ mod tests {
            ("foo", "bar"),        // should be ignored
        ]);

-        let creds = ClientCredentials::parse(&options, None, None, false)?;
+        let creds = ClientCredentials::parse(&options, None, None)?;
        assert_eq!(creds.user, "john_doe");
        assert_eq!(creds.project, None);

@@ -160,7 +151,7 @@ mod tests {
        let sni = Some("foo.localhost");
        let common_name = Some("localhost");

-        let creds = ClientCredentials::parse(&options, sni, common_name, false)?;
+        let creds = ClientCredentials::parse(&options, sni, common_name)?;
        assert_eq!(creds.user, "john_doe");
        assert_eq!(creds.project.as_deref(), Some("foo"));

@@ -174,7 +165,7 @@ mod tests {
            ("options", "-ckey=1 project=bar -c geqo=off"),
        ]);

-        let creds = ClientCredentials::parse(&options, None, None, false)?;
+        let creds = ClientCredentials::parse(&options, None, None)?;
        assert_eq!(creds.user, "john_doe");
        assert_eq!(creds.project.as_deref(), Some("bar"));

@@ -188,7 +179,7 @@ mod tests {
        let sni = Some("baz.localhost");
        let common_name = Some("localhost");

-        let creds = ClientCredentials::parse(&options, sni, common_name, false)?;
+        let creds = ClientCredentials::parse(&options, sni, common_name)?;
        assert_eq!(creds.user, "john_doe");
        assert_eq!(creds.project.as_deref(), Some("baz"));

@@ -203,8 +194,7 @@ mod tests {
        let sni = Some("second.localhost");
        let common_name = Some("localhost");

-        let err =
-            ClientCredentials::parse(&options, sni, common_name, false).expect_err("should fail");
+        let err = ClientCredentials::parse(&options, sni, common_name).expect_err("should fail");
        match err {
            InconsistentProjectNames { domain, option } => {
                assert_eq!(option, "first");
@@ -221,8 +211,7 @@ mod tests {
        let sni = Some("project.localhost");
        let common_name = Some("example.com");

-        let err =
-            ClientCredentials::parse(&options, sni, common_name, false).expect_err("should fail");
+        let err = ClientCredentials::parse(&options, sni, common_name).expect_err("should fail");
        match err {
            InconsistentSni { sni, cn } => {
                assert_eq!(sni, "project.localhost");
--- a/proxy/src/compute.rs
+++ b/proxy/src/compute.rs
@@ -77,14 +77,9 @@ impl ConnCfg {
            self.dbname(dbname);
        }

-        if let Some(options) = params.options_raw() {
-            // We must drop all proxy-specific parameters.
-            #[allow(unstable_name_collisions)]
-            let options: String = options
-                .filter(|opt| !opt.starts_with("project="))
-                .intersperse(" ") // TODO: use impl from std once it's stabilized
-                .collect();
-
+        // Don't add `options` if they were only used for specifying a project.
+        // Connection pools don't support `options`, because they affect backend startup.
+        if let Some(options) = filtered_options(params) {
            self.options(&options);
        }

@@ -225,3 +220,46 @@ impl ConnCfg {
        Ok(connection)
    }
 }
+
+/// Retrieve `options` from a startup message, dropping all proxy-secific flags.
+fn filtered_options(params: &StartupMessageParams) -> Option<String> {
+    #[allow(unstable_name_collisions)]
+    let options: String = params
+        .options_raw()?
+        .filter(|opt| !opt.starts_with("project="))
+        .intersperse(" ") // TODO: use impl from std once it's stabilized
+        .collect();
+
+    // Don't even bother with empty options.
+    if options.is_empty() {
+        return None;
+    }
+
+    Some(options)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_filtered_options() {
+        // Empty options is unlikely to be useful anyway.
+        let params = StartupMessageParams::new([("options", "")]);
+        assert_eq!(filtered_options(&params), None);
+
+        // It's likely that clients will only use options to specify endpoint/project.
+        let params = StartupMessageParams::new([("options", "project=foo")]);
+        assert_eq!(filtered_options(&params), None);
+
+        // Same, because unescaped whitespaces are no-op.
+        let params = StartupMessageParams::new([("options", " project=foo ")]);
+        assert_eq!(filtered_options(&params).as_deref(), None);
+
+        let params = StartupMessageParams::new([("options", r"\  project=foo \ ")]);
+        assert_eq!(filtered_options(&params).as_deref(), Some(r"\  \ "));
+
+        let params = StartupMessageParams::new([("options", "project = foo")]);
+        assert_eq!(filtered_options(&params).as_deref(), Some("project = foo"));
+    }
+}
--- a/proxy/src/config.rs
+++ b/proxy/src/config.rs
@@ -8,6 +8,7 @@ pub struct ProxyConfig {
    pub metric_collection: Option<MetricCollectionConfig>,
 }

+#[derive(Debug)]
 pub struct MetricCollectionConfig {
    pub endpoint: reqwest::Url,
    pub interval: Duration,
--- a/proxy/src/console/mgmt.rs
+++ b/proxy/src/console/mgmt.rs
@@ -5,10 +5,7 @@ use crate::{
 use anyhow::Context;
 use once_cell::sync::Lazy;
 use pq_proto::{BeMessage, SINGLE_COL_ROWDESC};
-use std::{
-    net::{TcpListener, TcpStream},
-    thread,
-};
+use std::{net::TcpStream, thread};
 use tracing::{error, info, info_span};
 use utils::{
    postgres_backend::{self, AuthType, PostgresBackend},
@@ -34,23 +31,24 @@ pub fn notify(psql_session_id: &str, msg: ComputeReady) -> Result<(), waiters::N
    CPLANE_WAITERS.notify(psql_session_id, msg)
 }

-/// Console management API listener thread.
+/// Console management API listener task.
 /// It spawns console response handlers needed for the link auth.
-pub fn thread_main(listener: TcpListener) -> anyhow::Result<()> {
+pub async fn task_main(listener: tokio::net::TcpListener) -> anyhow::Result<()> {
    scopeguard::defer! {
        info!("mgmt has shut down");
    }

-    listener
-        .set_nonblocking(false)
-        .context("failed to set listener to blocking")?;
-
    loop {
-        let (socket, peer_addr) = listener.accept().context("failed to accept a new client")?;
+        let (socket, peer_addr) = listener.accept().await?;
        info!("accepted connection from {peer_addr}");
+
+        let socket = socket.into_std()?;
        socket
            .set_nodelay(true)
            .context("failed to set client socket option")?;
+        socket
+            .set_nonblocking(false)
+            .context("failed to set client socket option")?;

        // TODO: replace with async tasks.
        thread::spawn(move || {
--- a/proxy/src/console/provider.rs
+++ b/proxy/src/console/provider.rs
@@ -11,8 +11,10 @@ use async_trait::async_trait;
 use std::sync::Arc;

 pub mod errors {
-    use crate::error::{io_error, UserFacingError};
-    use reqwest::StatusCode as HttpStatusCode;
+    use crate::{
+        error::{io_error, UserFacingError},
+        http,
+    };
    use thiserror::Error;

    /// A go-to error message which doesn't leak any detail.
@@ -24,7 +26,7 @@ pub mod errors {
        /// Error returned by the console itself.
        #[error("{REQUEST_FAILED} with {}: {}", .status, .text)]
        Console {
-            status: HttpStatusCode,
+            status: http::StatusCode,
            text: Box<str>,
        },

@@ -35,7 +37,7 @@ pub mod errors {

    impl ApiError {
        /// Returns HTTP status code if it's the reason for failure.
-        pub fn http_status_code(&self) -> Option<HttpStatusCode> {
+        pub fn http_status_code(&self) -> Option<http::StatusCode> {
            use ApiError::*;
            match self {
                Console { status, .. } => Some(*status),
@@ -51,15 +53,15 @@ pub mod errors {
                // To minimize risks, only select errors are forwarded to users.
                // Ask @neondatabase/control-plane for review before adding more.
                Console { status, .. } => match *status {
-                    HttpStatusCode::NOT_FOUND => {
+                    http::StatusCode::NOT_FOUND => {
                        // Status 404: failed to get a project-related resource.
                        format!("{REQUEST_FAILED}: endpoint cannot be found")
                    }
-                    HttpStatusCode::NOT_ACCEPTABLE => {
+                    http::StatusCode::NOT_ACCEPTABLE => {
                        // Status 406: endpoint is disabled (we don't allow connections).
                        format!("{REQUEST_FAILED}: endpoint is disabled")
                    }
-                    HttpStatusCode::LOCKED => {
+                    http::StatusCode::LOCKED => {
                        // Status 423: project might be in maintenance mode (or bad state).
                        format!("{REQUEST_FAILED}: endpoint is temporary unavailable")
                    }
@@ -70,13 +72,18 @@ pub mod errors {
        }
    }

-    // Helps eliminate graceless `.map_err` calls without introducing another ctor.
    impl From<reqwest::Error> for ApiError {
        fn from(e: reqwest::Error) -> Self {
            io_error(e).into()
        }
    }

+    impl From<reqwest_middleware::Error> for ApiError {
+        fn from(e: reqwest_middleware::Error) -> Self {
+            io_error(e).into()
+        }
+    }
+
    #[derive(Debug, Error)]
    pub enum GetAuthInfoError {
        // We shouldn't include the actual secret here.
--- a/proxy/src/console/provider/neon.rs
+++ b/proxy/src/console/provider/neon.rs
@@ -8,7 +8,6 @@ use super::{
 use crate::{auth::ClientCredentials, compute, http, scram};
 use async_trait::async_trait;
 use futures::TryFutureExt;
-use reqwest::StatusCode as HttpStatusCode;
 use tracing::{error, info, info_span, warn, Instrument};

 #[derive(Clone)]
@@ -52,7 +51,7 @@ impl Api {
                Ok(body) => body,
                // Error 404 is special: it's ok not to have a secret.
                Err(e) => match e.http_status_code() {
-                    Some(HttpStatusCode::NOT_FOUND) => return Ok(None),
+                    Some(http::StatusCode::NOT_FOUND) => return Ok(None),
                    _otherwise => return Err(e.into()),
                },
            };
@@ -154,7 +153,7 @@ impl super::Api for Api {

 /// Parse http response body, taking status code into account.
 async fn parse_body<T: for<'a> serde::Deserialize<'a>>(
-    response: reqwest::Response,
+    response: http::Response,
 ) -> Result<T, ApiError> {
    let status = response.status();
    if status.is_success() {
--- a/proxy/src/http.rs
+++ b/proxy/src/http.rs
@@ -1,7 +1,24 @@
+//! HTTP client and server impls.
+//! Other modules should use stuff from this module instead of
+//! directly relying on deps like `reqwest` (think loose coupling).
+
 pub mod server;
 pub mod websocket;

+pub use reqwest::{Request, Response, StatusCode};
+pub use reqwest_middleware::{ClientWithMiddleware, Error};
+
 use crate::url::ApiUrl;
+use reqwest_middleware::RequestBuilder;
+
+/// This is the preferred way to create new http clients,
+/// because it takes care of observability (OpenTelemetry).
+/// We deliberately don't want to replace this with a public static.
+pub fn new_client() -> ClientWithMiddleware {
+    reqwest_middleware::ClientBuilder::new(reqwest::Client::new())
+        .with(reqwest_tracing::TracingMiddleware::default())
+        .build()
+}

 /// Thin convenience wrapper for an API provided by an http endpoint.
 #[derive(Debug, Clone)]
@@ -9,13 +26,17 @@ pub struct Endpoint {
    /// API's base URL.
    endpoint: ApiUrl,
    /// Connection manager with built-in pooling.
-    client: reqwest::Client,
+    client: ClientWithMiddleware,
 }

 impl Endpoint {
    /// Construct a new HTTP endpoint wrapper.
-    pub fn new(endpoint: ApiUrl, client: reqwest::Client) -> Self {
-        Self { endpoint, client }
+    /// Http client is not constructed under the hood so that it can be shared.
+    pub fn new(endpoint: ApiUrl, client: impl Into<ClientWithMiddleware>) -> Self {
+        Self {
+            endpoint,
+            client: client.into(),
+        }
    }

    #[inline(always)]
@@ -23,19 +44,16 @@ impl Endpoint {
        &self.endpoint
    }

-    /// Return a [builder](reqwest::RequestBuilder) for a `GET` request,
+    /// Return a [builder](RequestBuilder) for a `GET` request,
    /// appending a single `path` segment to the base endpoint URL.
-    pub fn get(&self, path: &str) -> reqwest::RequestBuilder {
+    pub fn get(&self, path: &str) -> RequestBuilder {
        let mut url = self.endpoint.clone();
        url.path_segments_mut().push(path);
        self.client.get(url.into_inner())
    }

    /// Execute a [request](reqwest::Request).
-    pub async fn execute(
-        &self,
-        request: reqwest::Request,
-    ) -> Result<reqwest::Response, reqwest::Error> {
+    pub async fn execute(&self, request: Request) -> Result<Response, Error> {
        self.client.execute(request).await
    }
 }
@@ -43,11 +61,12 @@ impl Endpoint {
 #[cfg(test)]
 mod tests {
    use super::*;
+    use reqwest::Client;

    #[test]
    fn optional_query_params() -> anyhow::Result<()> {
        let url = "http://example.com".parse()?;
-        let endpoint = Endpoint::new(url, reqwest::Client::new());
+        let endpoint = Endpoint::new(url, Client::new());

        // Validate that this pattern makes sense.
        let req = endpoint
@@ -66,7 +85,7 @@ mod tests {
    #[test]
    fn uuid_params() -> anyhow::Result<()> {
        let url = "http://example.com".parse()?;
-        let endpoint = Endpoint::new(url, reqwest::Client::new());
+        let endpoint = Endpoint::new(url, Client::new());

        let req = endpoint
            .get("frobnicate")
--- a/proxy/src/http/websocket.rs
+++ b/proxy/src/http/websocket.rs
@@ -186,8 +186,8 @@ async fn ws_handler(
 }

 pub async fn task_main(
-    ws_listener: TcpListener,
    config: &'static ProxyConfig,
+    ws_listener: TcpListener,
 ) -> anyhow::Result<()> {
    scopeguard::defer! {
        info!("websocket server has shut down");
--- a/proxy/src/logging.rs
+++ b/proxy/src/logging.rs
@@ -0,0 +1,47 @@
+use tracing_opentelemetry::OpenTelemetryLayer;
+use tracing_subscriber::{
+    filter::{EnvFilter, LevelFilter},
+    prelude::*,
+};
+
+/// Initialize logging and OpenTelemetry tracing and exporter.
+///
+/// Logging can be configured using `RUST_LOG` environment variable.
+///
+/// OpenTelemetry is configured with OTLP/HTTP exporter. It picks up
+/// configuration from environment variables. For example, to change the
+/// destination, set `OTEL_EXPORTER_OTLP_ENDPOINT=http://jaeger:4318`.
+/// See <https://opentelemetry.io/docs/reference/specification/sdk-environment-variables>
+pub async fn init() -> anyhow::Result<LoggingGuard> {
+    let env_filter = EnvFilter::builder()
+        .with_default_directive(LevelFilter::INFO.into())
+        .from_env_lossy();
+
+    let fmt_layer = tracing_subscriber::fmt::layer()
+        .with_ansi(atty::is(atty::Stream::Stderr))
+        .with_writer(std::io::stderr)
+        .with_target(false);
+
+    let otlp_layer = tracing_utils::init_tracing("proxy")
+        .await
+        .map(OpenTelemetryLayer::new);
+
+    tracing_subscriber::registry()
+        .with(env_filter)
+        .with(otlp_layer)
+        .with(fmt_layer)
+        .try_init()?;
+
+    Ok(LoggingGuard)
+}
+
+pub struct LoggingGuard;
+
+impl Drop for LoggingGuard {
+    fn drop(&mut self) {
+        // Shutdown trace pipeline gracefully, so that it has a chance to send any
+        // pending traces before we exit.
+        tracing::info!("shutting down the tracing machinery");
+        tracing_utils::shutdown_tracing();
+    }
+}
--- a/proxy/src/main.rs
+++ b/proxy/src/main.rs
@@ -12,6 +12,7 @@ mod config;
 mod console;
 mod error;
 mod http;
+mod logging;
 mod metrics;
 mod parse;
 mod proxy;
@@ -27,7 +28,7 @@ use config::ProxyConfig;
 use futures::FutureExt;
 use std::{borrow::Cow, future::Future, net::SocketAddr};
 use tokio::{net::TcpListener, task::JoinError};
-use tracing::{info, info_span, Instrument};
+use tracing::{info, warn};
 use utils::{project_git_version, sentry_init::init_sentry};

 project_git_version!(GIT_VERSION);
@@ -41,8 +42,8 @@ async fn flatten_err(

 #[tokio::main]
 async fn main() -> anyhow::Result<()> {
-    // First, initialize logging and troubleshooting subsystems.
-    init_tracing();
+    let _logging_guard = logging::init().await?;
+    let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook();
    let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]);

    info!("Version: {GIT_VERSION}");
@@ -60,16 +61,17 @@ async fn main() -> anyhow::Result<()> {

    let mgmt_address: SocketAddr = args.get_one::<String>("mgmt").unwrap().parse()?;
    info!("Starting mgmt on {mgmt_address}");
-    let mgmt_listener = TcpListener::bind(mgmt_address).await?.into_std()?;
+    let mgmt_listener = TcpListener::bind(mgmt_address).await?;

    let proxy_address: SocketAddr = args.get_one::<String>("proxy").unwrap().parse()?;
    info!("Starting proxy on {proxy_address}");
    let proxy_listener = TcpListener::bind(proxy_address).await?;

    let mut tasks = vec![
+        tokio::spawn(handle_signals()),
        tokio::spawn(http::server::task_main(http_listener)),
        tokio::spawn(proxy::task_main(config, proxy_listener)),
-        tokio::task::spawn_blocking(move || console::mgmt::thread_main(mgmt_listener)),
+        tokio::spawn(console::mgmt::task_main(mgmt_listener)),
    ];

    if let Some(wss_address) = args.get_one::<String>("wss") {
@@ -78,48 +80,50 @@ async fn main() -> anyhow::Result<()> {
        let wss_listener = TcpListener::bind(wss_address).await?;

        tasks.push(tokio::spawn(http::websocket::task_main(
-            wss_listener,
            config,
+            wss_listener,
        )));
    }

-    // TODO: refactor.
-    if let Some(metric_collection) = &config.metric_collection {
-        let hostname = hostname::get()?
-            .into_string()
-            .map_err(|e| anyhow::anyhow!("failed to get hostname {e:?}"))?;
-
-        tasks.push(tokio::spawn(
-            metrics::collect_metrics(
-                &metric_collection.endpoint,
-                metric_collection.interval,
-                hostname,
-            )
-            .instrument(info_span!("collect_metrics")),
-        ));
+    if let Some(metrics_config) = &config.metric_collection {
+        tasks.push(tokio::spawn(metrics::task_main(metrics_config)));
    }

-    // This will block until all tasks have completed.
-    // Furthermore, the first one to fail will cancel the rest.
+    // This combinator will block until either all tasks complete or
+    // one of them finishes with an error (others will be cancelled).
    let tasks = tasks.into_iter().map(flatten_err);
    let _: Vec<()> = futures::future::try_join_all(tasks).await?;

    Ok(())
 }

-/// Tracing is used for logging and telemetry.
-fn init_tracing() {
-    tracing_subscriber::fmt()
-        .with_env_filter({
-            // This filter will examine the `RUST_LOG` env variable.
-            use tracing_subscriber::filter::{EnvFilter, LevelFilter};
-            EnvFilter::builder()
-                .with_default_directive(LevelFilter::INFO.into())
-                .from_env_lossy()
-        })
-        .with_ansi(atty::is(atty::Stream::Stdout))
-        .with_target(false)
-        .init();
+/// Handle unix signals appropriately.
+async fn handle_signals() -> anyhow::Result<()> {
+    use tokio::signal::unix::{signal, SignalKind};
+
+    let mut hangup = signal(SignalKind::hangup())?;
+    let mut interrupt = signal(SignalKind::interrupt())?;
+    let mut terminate = signal(SignalKind::terminate())?;
+
+    loop {
+        tokio::select! {
+            // Hangup is commonly used for config reload.
+            _ = hangup.recv() => {
+                warn!("received SIGHUP; config reload is not supported");
+            }
+            // Shut down the whole application.
+            _ = interrupt.recv() => {
+                warn!("received SIGINT, exiting immediately");
+                bail!("interrupted");
+            }
+            // TODO: Don't accept new proxy connections.
+            // TODO: Shut down once all exisiting connections have been closed.
+            _ = terminate.recv() => {
+                warn!("received SIGTERM, exiting immediately");
+                bail!("terminated");
+            }
+        }
+    }
 }

 /// ProxyConfig is created at proxy startup, and lives forever.
@@ -161,7 +165,7 @@ fn build_config(args: &clap::ArgMatches) -> anyhow::Result<&'static ProxyConfig>
            }));

            let url = args.get_one::<String>("auth-endpoint").unwrap().parse()?;
-            let endpoint = http::Endpoint::new(url, reqwest::Client::new());
+            let endpoint = http::Endpoint::new(url, http::new_client());

            let api = console::provider::neon::Api::new(endpoint, caches);
            auth::BackendType::Console(Cow::Owned(api), ())
--- a/proxy/src/metrics.rs
+++ b/proxy/src/metrics.rs
@@ -1,12 +1,11 @@
-//!
 //! Periodically collect proxy consumption metrics
 //! and push them to a HTTP endpoint.
-//!
+use crate::{config::MetricCollectionConfig, http};
 use chrono::{DateTime, Utc};
 use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE};
 use serde::Serialize;
-use std::{collections::HashMap, time::Duration};
-use tracing::{debug, error, log::info, trace};
+use std::collections::HashMap;
+use tracing::{debug, error, info, instrument, trace};

 const PROXY_IO_BYTES_PER_CLIENT: &str = "proxy_io_bytes_per_client";

@@ -19,48 +18,41 @@ const PROXY_IO_BYTES_PER_CLIENT: &str = "proxy_io_bytes_per_client";
 /// so while the project-id is unique across regions the whole pipeline will work correctly
 /// because we enrich the event with project_id in the control-plane endpoint.
 ///
-#[derive(Eq, Hash, PartialEq, Serialize)]
+#[derive(Eq, Hash, PartialEq, Serialize, Debug)]
 pub struct Ids {
    pub endpoint_id: String,
 }

-pub async fn collect_metrics(
-    metric_collection_endpoint: &reqwest::Url,
-    metric_collection_interval: Duration,
-    hostname: String,
-) -> anyhow::Result<()> {
+pub async fn task_main(config: &MetricCollectionConfig) -> anyhow::Result<()> {
+    info!("metrics collector config: {config:?}");
    scopeguard::defer! {
-        info!("collect_metrics has shut down");
+        info!("metrics collector has shut down");
    }

-    let mut ticker = tokio::time::interval(metric_collection_interval);
-
-    info!(
-        "starting collect_metrics. metric_collection_endpoint: {}",
-        metric_collection_endpoint
-    );
-
-    // define client here to reuse it for all requests
-    let client = reqwest::Client::new();
+    let http_client = http::new_client();
    let mut cached_metrics: HashMap<Ids, (u64, DateTime<Utc>)> = HashMap::new();
+    let hostname = hostname::get()?.as_os_str().to_string_lossy().into_owned();

+    let mut ticker = tokio::time::interval(config.interval);
    loop {
-        tokio::select! {
-            _ = ticker.tick() => {
+        ticker.tick().await;

-                match collect_metrics_iteration(&client, &mut cached_metrics, metric_collection_endpoint, hostname.clone()).await
-                {
-                    Err(e) => {
-                        error!("Failed to send consumption metrics: {} ", e);
-                    },
-                    Ok(_) => { trace!("collect_metrics_iteration completed successfully") },
-                }
-            }
+        let res = collect_metrics_iteration(
+            &http_client,
+            &mut cached_metrics,
+            &config.endpoint,
+            &hostname,
+        )
+        .await;
+
+        match res {
+            Err(e) => error!("failed to send consumption metrics: {e} "),
+            Ok(_) => trace!("periodic metrics collection completed successfully"),
        }
    }
 }

-pub fn gather_proxy_io_bytes_per_client() -> Vec<(Ids, (u64, DateTime<Utc>))> {
+fn gather_proxy_io_bytes_per_client() -> Vec<(Ids, (u64, DateTime<Utc>))> {
    let mut current_metrics: Vec<(Ids, (u64, DateTime<Utc>))> = Vec::new();
    let metrics = prometheus::default_registry().gather();

@@ -99,11 +91,12 @@ pub fn gather_proxy_io_bytes_per_client() -> Vec<(Ids, (u64, DateTime<Utc>))> {
    current_metrics
 }

-pub async fn collect_metrics_iteration(
-    client: &reqwest::Client,
+#[instrument(skip_all)]
+async fn collect_metrics_iteration(
+    client: &http::ClientWithMiddleware,
    cached_metrics: &mut HashMap<Ids, (u64, DateTime<Utc>)>,
    metric_collection_endpoint: &reqwest::Url,
-    hostname: String,
+    hostname: &str,
 ) -> anyhow::Result<()> {
    info!(
        "starting collect_metrics_iteration. metric_collection_endpoint: {}",
@@ -134,7 +127,7 @@ pub async fn collect_metrics_iteration(
                    stop_time: *curr_time,
                },
                metric: PROXY_IO_BYTES_PER_CLIENT,
-                idempotency_key: idempotency_key(hostname.clone()),
+                idempotency_key: idempotency_key(hostname.to_owned()),
                value,
                extra: Ids {
                    endpoint_id: curr_key.endpoint_id.clone(),
@@ -190,6 +183,12 @@ pub async fn collect_metrics_iteration(
            }
        } else {
            error!("metrics endpoint refused the sent metrics: {:?}", res);
+            for metric in chunk.iter() {
+                // Report if the metric value is suspiciously large
+                if metric.value > (1u64 << 40) {
+                    error!("potentially abnormal metric value: {:?}", metric);
+                }
+            }
        }
    }
    Ok(())
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -17,7 +17,7 @@ use once_cell::sync::Lazy;
 use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams};
 use std::sync::Arc;
 use tokio::io::{AsyncRead, AsyncWrite};
-use tracing::{error, info, info_span, warn, Instrument};
+use tracing::{error, info, warn};

 /// Number of times we should retry the `/proxy_wake_compute` http request.
 const NUM_RETRIES_WAKE_COMPUTE: usize = 1;
@@ -91,13 +91,13 @@ pub async fn task_main(
            .unwrap_or_else(|e| {
                // Acknowledge that the task has finished with an error.
                error!("per-client task finished with an error: {e:#}");
-            })
-            .instrument(info_span!("client", session = format_args!("{session_id}"))),
+            }),
        );
    }
 }

 // TODO(tech debt): unite this with its twin below.
+#[tracing::instrument(fields(session_id), skip_all)]
 pub async fn handle_ws_client(
    config: &'static ProxyConfig,
    cancel_map: &CancelMap,
@@ -127,7 +127,7 @@ pub async fn handle_ws_client(
        let result = config
            .auth_backend
            .as_ref()
-            .map(|_| auth::ClientCredentials::parse(&params, hostname, common_name, true))
+            .map(|_| auth::ClientCredentials::parse(&params, hostname, common_name))
            .transpose();

        async { result }.or_else(|e| stream.throw_error(e)).await?
@@ -135,10 +135,11 @@ pub async fn handle_ws_client(

    let client = Client::new(stream, creds, &params, session_id);
    cancel_map
-        .with_session(|session| client.connect_to_db(session))
+        .with_session(|session| client.connect_to_db(session, true))
        .await
 }

+#[tracing::instrument(fields(session_id), skip_all)]
 async fn handle_client(
    config: &'static ProxyConfig,
    cancel_map: &CancelMap,
@@ -165,7 +166,7 @@ async fn handle_client(
        let result = config
            .auth_backend
            .as_ref()
-            .map(|_| auth::ClientCredentials::parse(&params, sni, common_name, false))
+            .map(|_| auth::ClientCredentials::parse(&params, sni, common_name))
            .transpose();

        async { result }.or_else(|e| stream.throw_error(e)).await?
@@ -173,7 +174,7 @@ async fn handle_client(

    let client = Client::new(stream, creds, &params, session_id);
    cancel_map
-        .with_session(|session| client.connect_to_db(session))
+        .with_session(|session| client.connect_to_db(session, false))
        .await
 }

@@ -401,7 +402,11 @@ impl<'a, S> Client<'a, S> {

 impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
    /// Let the client authenticate and connect to the designated compute node.
-    async fn connect_to_db(self, session: cancellation::Session<'_>) -> anyhow::Result<()> {
+    async fn connect_to_db(
+        self,
+        session: cancellation::Session<'_>,
+        allow_cleartext: bool,
+    ) -> anyhow::Result<()> {
        let Self {
            mut stream,
            mut creds,
@@ -416,10 +421,12 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {

        let auth_result = async {
            // `&mut stream` doesn't let us merge those 2 lines.
-            let res = creds.authenticate(&extra, &mut stream).await;
+            let res = creds
+                .authenticate(&extra, &mut stream, allow_cleartext)
+                .await;
+
            async { res }.or_else(|e| stream.throw_error(e)).await
        }
-        .instrument(info_span!("auth"))
        .await?;

        let AuthSuccess {
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -19,12 +19,12 @@ types-requests = "^2.28.5"
 types-psycopg2 = "^2.9.18"
 boto3 = "^1.26.16"
 boto3-stubs = {extras = ["s3"], version = "^1.26.16"}
-moto = {version = "^3.0.0", extras = ["server"]}
+moto = {extras = ["server"], version = "^4.1.2"}
 backoff = "^1.11.1"
 pytest-lazy-fixture = "^0.6.3"
 prometheus-client = "^0.14.1"
 pytest-timeout = "^2.1.0"
-Werkzeug = "2.1.2"
+Werkzeug = "^2.2.3"
 pytest-order = "^1.0.1"
 allure-pytest = "^2.10.0"
 pytest-asyncio = "^0.19.0"
--- a/safekeeper/src/bin/safekeeper.rs
+++ b/safekeeper/src/bin/safekeeper.rs
@@ -126,7 +126,12 @@ fn main() -> anyhow::Result<()> {
        return Ok(());
    }

+    // important to keep the order of:
+    // 1. init logging
+    // 2. tracing panic hook
+    // 3. sentry
    logging::init(LogFormat::from_config(&args.log_format)?)?;
+    logging::replace_panic_hook_with_tracing_panic_hook().forget();
    info!("version: {GIT_VERSION}");

    let args_workdir = &args.datadir;
--- a/storage_broker/src/bin/storage_broker.rs
+++ b/storage_broker/src/bin/storage_broker.rs
@@ -424,12 +424,16 @@ async fn http1_handler(

 #[tokio::main]
 async fn main() -> Result<(), Box<dyn std::error::Error>> {
-    // initialize sentry if SENTRY_DSN is provided
-    let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]);
-
    let args = Args::parse();

+    // important to keep the order of:
+    // 1. init logging
+    // 2. tracing panic hook
+    // 3. sentry
    logging::init(LogFormat::from_config(&args.log_format)?)?;
+    logging::replace_panic_hook_with_tracing_panic_hook().forget();
+    // initialize sentry if SENTRY_DSN is provided
+    let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]);
    info!("version: {GIT_VERSION}");
    ::metrics::set_build_info_metric(GIT_VERSION);

--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1217,7 +1217,7 @@ class PageserverHttpClient(requests.Session):
        """
        Returns the tenant size, together with the model inputs as the second tuple item.
        """
-        res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/size")
+        res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/synthetic_size")
        self.verbose_error(res)
        res = res.json()
        assert isinstance(res, dict)
@@ -1228,6 +1228,16 @@ class PageserverHttpClient(requests.Session):
        assert type(inputs) is dict
        return (size, inputs)

+    def tenant_size_debug(self, tenant_id: TenantId) -> str:
+        """
+        Returns the tenant size debug info, as an HTML string
+        """
+        res = self.get(
+            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/synthetic_size",
+            headers={"Accept": "text/html"},
+        )
+        return res.text
+
    def timeline_list(
        self,
        tenant_id: TenantId,
@@ -1278,6 +1288,7 @@ class PageserverHttpClient(requests.Session):
        timeline_id: TimelineId,
        include_non_incremental_logical_size: bool = False,
        include_timeline_dir_layer_file_size_sum: bool = False,
+        **kwargs,
    ) -> Dict[Any, Any]:
        params = {}
        if include_non_incremental_logical_size:
@@ -1288,6 +1299,7 @@ class PageserverHttpClient(requests.Session):
        res = self.get(
            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}",
            params=params,
+            **kwargs,
        )
        self.verbose_error(res)
        res_json = res.json()
@@ -2067,6 +2079,7 @@ class NeonPageserver(PgProtocol):
            ".*compaction_loop.*Compaction failed, retrying in.*timeline is Stopping",  # When compaction checks timeline state after acquiring layer_removal_cs
            ".*query handler for 'pagestream.*failed: Timeline .* was not found",  # postgres reconnects while timeline_delete doesn't hold the tenant's timelines.lock()
            ".*query handler for 'pagestream.*failed: Timeline .* is not active",  # timeline delete in progress
+            ".*task iteration took longer than the configured period.*",
        ]

    def start(
@@ -2517,15 +2530,17 @@ class NeonProxy(PgProtocol):
        tb: Optional[TracebackType],
    ):
        if self._popen is not None:
-            # NOTE the process will die when we're done with tests anyway, because
-            # it's a child process. This is mostly to clean up in between different tests.
-            self._popen.kill()
+            self._popen.terminate()
+            try:
+                self._popen.wait(timeout=5)
+            except subprocess.TimeoutExpired:
+                log.warn("failed to gracefully terminate proxy; killing")
+                self._popen.kill()

    @staticmethod
    async def activate_link_auth(
        local_vanilla_pg, proxy_with_metric_collector, psql_session_id, create_user=True
    ):
-
        pg_user = "proxy"

        if create_user:
--- a/test_runner/regress/test_ondemand_download.py
+++ b/test_runner/regress/test_ondemand_download.py
@@ -1,19 +1,25 @@
 # It's possible to run any regular test with the local fs remote storage via
 # env ZENITH_PAGESERVER_OVERRIDES="remote_storage={local_path='/tmp/neon_zzz/'}" poetry ......

+import time
+from collections import defaultdict
 from pathlib import Path
+from typing import Any, DefaultDict, Dict

 import pytest
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
    NeonEnvBuilder,
+    PageserverApiException,
    RemoteStorageKind,
    assert_tenant_status,
    available_remote_storages,
+    wait_for_last_flush_lsn,
    wait_for_last_record_lsn,
    wait_for_sk_commit_lsn_to_reach_remote_storage,
    wait_for_upload,
    wait_until,
+    wait_until_tenant_state,
 )
 from fixtures.types import Lsn
 from fixtures.utils import query_scalar
@@ -212,6 +218,9 @@ def test_ondemand_download_timetravel(
    log.info(filled_size)
    assert filled_current_physical == filled_size, "we don't yet do layer eviction"

+    # Wait until generated image layers are uploaded to S3
+    time.sleep(3)
+
    env.pageserver.stop()

    # remove all the layer files
@@ -449,3 +458,233 @@ def test_download_remote_layers_api(
    pg_old = env.postgres.create_start(branch_name="main")
    with pg_old.cursor() as cur:
        assert query_scalar(cur, "select count(*) from testtab") == table_len
+
+
+@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.MOCK_S3])
+def test_compaction_downloads_on_demand_without_image_creation(
+    neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
+):
+    """
+    Create a few layers, then evict, then make sure compaction runs successfully.
+    """
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=remote_storage_kind,
+        test_name="test_compaction_downloads_on_demand_without_image_creation",
+    )
+
+    env = neon_env_builder.init_start()
+
+    conf = {
+        # Disable background GC & compaction
+        "gc_period": "0s",
+        "compaction_period": "0s",
+        # unused, because manual will be called after each table
+        "checkpoint_distance": 100 * 1024**2,
+        # this will be updated later on to allow manual compaction outside of checkpoints
+        "compaction_threshold": 100,
+        # repartitioning parameter, not required here
+        "image_creation_threshold": 100,
+        # repartitioning parameter, not required here
+        "compaction_target_size": 128 * 1024**2,
+        # pitr_interval and gc_horizon are not interesting because we dont run gc
+    }
+
+    # Override defaults, to create more layers
+    tenant_id, timeline_id = env.neon_cli.create_tenant(conf=stringify(conf))
+    env.initial_tenant = tenant_id
+    pageserver_http = env.pageserver.http_client()
+
+    with env.postgres.create_start("main") as pg:
+        # no particular reason to create the layers like this, but we are sure
+        # not to hit the image_creation_threshold here.
+        with pg.cursor() as cur:
+            cur.execute("create table a as select id::bigint from generate_series(1, 204800) s(id)")
+        wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id)
+        pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
+
+        with pg.cursor() as cur:
+            cur.execute("update a set id = -id")
+        wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id)
+        pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
+
+    layers = pageserver_http.layer_map_info(tenant_id, timeline_id)
+    assert not layers.in_memory_layers, "no inmemory layers expected after post-commit checkpoint"
+    assert len(layers.historic_layers) == 1 + 2, "should have inidb layer and 2 deltas"
+
+    for layer in layers.historic_layers:
+        log.info(f"pre-compact:  {layer}")
+        pageserver_http.evict_layer(tenant_id, timeline_id, layer.layer_file_name)
+
+    env.neon_cli.config_tenant(tenant_id, {"compaction_threshold": "3"})
+
+    pageserver_http.timeline_compact(tenant_id, timeline_id)
+    layers = pageserver_http.layer_map_info(tenant_id, timeline_id)
+    for layer in layers.historic_layers:
+        log.info(f"post compact: {layer}")
+    assert len(layers.historic_layers) == 1, "should have compacted to single layer"
+
+
+@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.MOCK_S3])
+def test_compaction_downloads_on_demand_with_image_creation(
+    neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
+):
+    """
+    Create layers, compact with high image_creation_threshold, then run final compaction with all layers evicted.
+
+    Due to current implementation, this will make image creation on-demand download layers, but we cannot really
+    directly test for it.
+    """
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=remote_storage_kind,
+        test_name="test_compaction_downloads_on_demand",
+    )
+
+    env = neon_env_builder.init_start()
+
+    conf = {
+        # Disable background GC & compaction
+        "gc_period": "0s",
+        "compaction_period": "0s",
+        # repartitioning threshold is this / 10, but it doesn't really seem to matter
+        "checkpoint_distance": 50 * 1024**2,
+        "compaction_threshold": 3,
+        # important: keep this high for the data ingestion
+        "image_creation_threshold": 100,
+        # repartitioning parameter, unused
+        "compaction_target_size": 128 * 1024**2,
+        # pitr_interval and gc_horizon are not interesting because we dont run gc
+    }
+
+    # Override defaults, to create more layers
+    tenant_id, timeline_id = env.neon_cli.create_tenant(conf=stringify(conf))
+    env.initial_tenant = tenant_id
+    pageserver_http = env.pageserver.http_client()
+
+    pg = env.postgres.create_start("main")
+
+    # no particular reason to create the layers like this, but we are sure
+    # not to hit the image_creation_threshold here.
+    with pg.cursor() as cur:
+        cur.execute("create table a (id bigserial primary key, some_value bigint not null)")
+        cur.execute("insert into a(some_value) select i from generate_series(1, 10000) s(i)")
+    wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id)
+    pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
+
+    for i in range(0, 2):
+        for j in range(0, 3):
+            # create a minimal amount of "delta difficulty" for this table
+            with pg.cursor() as cur:
+                cur.execute("update a set some_value = -some_value + %s", (j,))
+
+            with pg.cursor() as cur:
+                # vacuuming should aid to reuse keys, though it's not really important
+                # with image_creation_threshold=1 which we will use on the last compaction
+                cur.execute("vacuum")
+
+            wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id)
+
+            if i == 1 and j == 2:
+                # last iteration; stop before checkpoint to avoid leaving an inmemory layer
+                pg.stop_and_destroy()
+
+            pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
+
+        # images should not yet be created, because threshold is too high,
+        # but these will be reshuffled to L1 layers
+        pageserver_http.timeline_compact(tenant_id, timeline_id)
+
+    for _ in range(0, 20):
+        # loop in case flushing is still in progress
+        layers = pageserver_http.layer_map_info(tenant_id, timeline_id)
+        if not layers.in_memory_layers:
+            break
+        time.sleep(0.2)
+
+    layers = pageserver_http.layer_map_info(tenant_id, timeline_id)
+    assert not layers.in_memory_layers, "no inmemory layers expected after post-commit checkpoint"
+
+    kinds_before: DefaultDict[str, int] = defaultdict(int)
+
+    for layer in layers.historic_layers:
+        kinds_before[layer.kind] += 1
+        pageserver_http.evict_layer(tenant_id, timeline_id, layer.layer_file_name)
+
+    assert dict(kinds_before) == {"Delta": 4}
+
+    # now having evicted all layers, reconfigure to have lower image creation
+    # threshold to expose image creation to downloading all of the needed
+    # layers -- threshold of 2 would sound more reasonable, but keeping it as 1
+    # to be less flaky
+    env.neon_cli.config_tenant(tenant_id, {"image_creation_threshold": "1"})
+
+    pageserver_http.timeline_compact(tenant_id, timeline_id)
+    layers = pageserver_http.layer_map_info(tenant_id, timeline_id)
+    kinds_after: DefaultDict[str, int] = defaultdict(int)
+    for layer in layers.historic_layers:
+        kinds_after[layer.kind] += 1
+
+    assert dict(kinds_after) == {"Delta": 4, "Image": 1}
+
+
+def stringify(conf: Dict[str, Any]) -> Dict[str, str]:
+    return dict(map(lambda x: (x[0], str(x[1])), conf.items()))
+
+
+@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
+def test_ondemand_download_failure_to_replace(
+    neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
+):
+    """
+    Make sure that we fail on being unable to replace a RemoteLayer instead of for example livelocking.
+
+    See: https://github.com/neondatabase/neon/issues/3533
+    """
+
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=remote_storage_kind,
+        test_name="test_ondemand_download_failure_to_replace",
+    )
+
+    # disable gc and compaction via default tenant config because config is lost while detaching
+    # so that compaction will not be the one to download the layer but the http handler is
+    neon_env_builder.pageserver_config_override = (
+        """tenant_config={gc_period = "0s", compaction_period = "0s"}"""
+    )
+
+    env = neon_env_builder.init_start()
+
+    tenant_id, timeline_id = env.neon_cli.create_tenant()
+
+    env.initial_tenant = tenant_id
+    pageserver_http = env.pageserver.http_client()
+
+    lsn = Lsn(pageserver_http.timeline_detail(tenant_id, timeline_id)["last_record_lsn"])
+
+    wait_for_upload(pageserver_http, tenant_id, timeline_id, lsn)
+
+    # remove layers so that they will be redownloaded
+    pageserver_http.tenant_detach(tenant_id)
+    pageserver_http.tenant_attach(tenant_id)
+
+    wait_until_tenant_state(pageserver_http, tenant_id, "Active", 5)
+    pageserver_http.configure_failpoints(("layermap-replace-notfound", "return"))
+
+    # requesting details with non-incremental size should trigger a download of the only layer
+    # this will need to be adjusted if an index for logical sizes is ever implemented
+    with pytest.raises(PageserverApiException):
+        # error message is not useful
+        pageserver_http.timeline_detail(tenant_id, timeline_id, True, timeout=2)
+
+    actual_message = (
+        ".* ERROR .*replacing downloaded layer into layermap failed because layer was not found"
+    )
+    assert env.pageserver.log_contains(actual_message) is not None
+    env.pageserver.allowed_errors.append(actual_message)
+
+    env.pageserver.allowed_errors.append(
+        ".* ERROR .*Error processing HTTP request: InternalServerError\\(get local timeline info"
+    )
+    # this might get to run and attempt on-demand, but not always
+    env.pageserver.allowed_errors.append(".* ERROR .*Task 'initial size calculation'")
+
+    # if the above returned, then we didn't have a livelock, and all is well
--- a/test_runner/regress/test_tenant_size.py
+++ b/test_runner/regress/test_tenant_size.py
@@ -1,13 +1,13 @@
-from typing import Any, List, Tuple
+from pathlib import Path
+from typing import List, Tuple

 import pytest
 from fixtures.log_helper import log
-from fixtures.metrics import parse_metrics
 from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, wait_for_last_flush_lsn
 from fixtures.types import Lsn


-def test_empty_tenant_size(neon_simple_env: NeonEnv):
+def test_empty_tenant_size(neon_simple_env: NeonEnv, test_output_dir: Path):
    env = neon_simple_env
    (tenant_id, _) = env.neon_cli.create_tenant()
    http_client = env.pageserver.http_client()
@@ -18,6 +18,9 @@ def test_empty_tenant_size(neon_simple_env: NeonEnv):

    main_branch_name = "main"

+    branch_name, main_timeline_id = env.neon_cli.list_timelines(tenant_id)[0]
+    assert branch_name == main_branch_name
+
    with env.postgres.create_start(
        main_branch_name,
        tenant_id=tenant_id,
@@ -39,12 +42,44 @@ def test_empty_tenant_size(neon_simple_env: NeonEnv):
    size, inputs = http_client.tenant_size_and_modelinputs(tenant_id)
    assert size == initial_size, "tenant_size should not be affected by shutdown of compute"

-    expected_commands: List[Any] = [{"branch_from": None}, "end_of_branch"]
-    actual_commands: List[Any] = list(map(lambda x: x["command"], inputs["updates"]))  # type: ignore
-    assert actual_commands == expected_commands
+    expected_inputs = {
+        "segments": [
+            {
+                "segment": {"parent": None, "lsn": 23694408, "size": 25362432, "needed": True},
+                "timeline_id": f"{main_timeline_id}",
+                "kind": "BranchStart",
+            },
+            {
+                "segment": {"parent": 0, "lsn": 23694528, "size": None, "needed": True},
+                "timeline_id": f"{main_timeline_id}",
+                "kind": "BranchEnd",
+            },
+        ],
+        "timeline_inputs": [
+            {
+                "timeline_id": f"{main_timeline_id}",
+                "ancestor_id": None,
+                "ancestor_lsn": "0/0",
+                "last_record": "0/1698CC0",
+                "latest_gc_cutoff": "0/1698C48",
+                "horizon_cutoff": "0/0",
+                "pitr_cutoff": "0/0",
+                "next_gc_cutoff": "0/0",
+                "retention_param_cutoff": None,
+            }
+        ],
+    }
+    expected_inputs = mask_model_inputs(expected_inputs)
+    actual_inputs = mask_model_inputs(inputs)
+
+    assert expected_inputs == actual_inputs
+
+    size_debug_file = open(test_output_dir / "size_debug.html", "w")
+    size_debug = http_client.tenant_size_debug(tenant_id)
+    size_debug_file.write(size_debug)


-def test_branched_empty_timeline_size(neon_simple_env: NeonEnv):
+def test_branched_empty_timeline_size(neon_simple_env: NeonEnv, test_output_dir: Path):
    """
    Issue found in production. Because the ancestor branch was under
    gc_horizon, the branchpoint was "dangling" and the computation could not be
@@ -75,8 +110,12 @@ def test_branched_empty_timeline_size(neon_simple_env: NeonEnv):

    assert size_after_branching > initial_size

+    size_debug_file = open(test_output_dir / "size_debug.html", "w")
+    size_debug = http_client.tenant_size_debug(tenant_id)
+    size_debug_file.write(size_debug)

-def test_branched_from_many_empty_parents_size(neon_simple_env: NeonEnv):
+
+def test_branched_from_many_empty_parents_size(neon_simple_env: NeonEnv, test_output_dir: Path):
    """
    More general version of test_branched_empty_timeline_size

@@ -128,9 +167,13 @@ def test_branched_from_many_empty_parents_size(neon_simple_env: NeonEnv):
    size_after_writes = http_client.tenant_size(tenant_id)
    assert size_after_writes > initial_size

+    size_debug_file = open(test_output_dir / "size_debug.html", "w")
+    size_debug = http_client.tenant_size_debug(tenant_id)
+    size_debug_file.write(size_debug)
+

@pytest.mark.skip("This should work, but is left out because assumed covered by other tests")
-def test_branch_point_within_horizon(neon_simple_env: NeonEnv):
+def test_branch_point_within_horizon(neon_simple_env: NeonEnv, test_output_dir: Path):
    """
    gc_horizon = 15

@@ -167,9 +210,13 @@ def test_branch_point_within_horizon(neon_simple_env: NeonEnv):

    assert size_before_branching < size_after

+    size_debug_file = open(test_output_dir / "size_debug.html", "w")
+    size_debug = http_client.tenant_size_debug(tenant_id)
+    size_debug_file.write(size_debug)
+

@pytest.mark.skip("This should work, but is left out because assumed covered by other tests")
-def test_parent_within_horizon(neon_simple_env: NeonEnv):
+def test_parent_within_horizon(neon_simple_env: NeonEnv, test_output_dir: Path):
    """
    gc_horizon = 5

@@ -179,7 +226,7 @@ def test_parent_within_horizon(neon_simple_env: NeonEnv):
    """

    env = neon_simple_env
-    gc_horizon = 200_000
+    gc_horizon = 5_000
    (tenant_id, main_id) = env.neon_cli.create_tenant(conf={"gc_horizon": str(gc_horizon)})
    http_client = env.pageserver.http_client()

@@ -212,9 +259,13 @@ def test_parent_within_horizon(neon_simple_env: NeonEnv):

    assert size_before_branching < size_after

+    size_debug_file = open(test_output_dir / "size_debug.html", "w")
+    size_debug = http_client.tenant_size_debug(tenant_id)
+    size_debug_file.write(size_debug)
+

@pytest.mark.skip("This should work, but is left out because assumed covered by other tests")
-def test_only_heads_within_horizon(neon_simple_env: NeonEnv):
+def test_only_heads_within_horizon(neon_simple_env: NeonEnv, test_output_dir: Path):
    """
    gc_horizon = small

@@ -253,8 +304,14 @@ def test_only_heads_within_horizon(neon_simple_env: NeonEnv):

            latest_size = size_now

+    size_debug_file = open(test_output_dir / "size_debug.html", "w")
+    size_debug = http_client.tenant_size_debug(tenant_id)
+    size_debug_file.write(size_debug)

-def test_single_branch_get_tenant_size_grows(neon_env_builder: NeonEnvBuilder):
+
+def test_single_branch_get_tenant_size_grows(
+    neon_env_builder: NeonEnvBuilder, test_output_dir: Path
+):
    """
    Operate on single branch reading the tenants size after each transaction.
    """
@@ -279,7 +336,20 @@ def test_single_branch_get_tenant_size_grows(neon_env_builder: NeonEnvBuilder):

    collected_responses: List[Tuple[Lsn, int]] = []

+    size_debug_file = open(test_output_dir / "size_debug.html", "w")
+
+    def check_size_change(current_lsn: Lsn, initdb_lsn: Lsn, gc_horizon: int, size: int, prev: int):
+        if current_lsn - initdb_lsn > gc_horizon:
+            assert (
+                size >= prev
+            ), "tenant_size may grow or not grow, because we only add gc_horizon amount of WAL to initial snapshot size"
+        else:
+            assert (
+                size > prev
+            ), "tenant_size should grow, because we continue to add WAL to initial snapshot size"
+
    with env.postgres.create_start(branch_name, tenant_id=tenant_id) as pg:
+        initdb_lsn = wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id)
        with pg.cursor() as cur:
            cur.execute("CREATE TABLE t0 (i BIGINT NOT NULL)")

@@ -297,13 +367,19 @@ def test_single_branch_get_tenant_size_grows(neon_env_builder: NeonEnvBuilder):

            current_lsn = wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id)

-            size = http_client.tenant_size(tenant_id)
+            size, sizes = http_client.tenant_size_and_modelinputs(tenant_id)
+
+            size_debug = http_client.tenant_size_debug(tenant_id)
+            size_debug_file.write(size_debug)

            if len(collected_responses) > 0:
                prev = collected_responses[-1][1]
                if size == 0:
                    assert prev == 0
                else:
+                    # branch start shouldn't be past gc_horizon yet
+                    # thus the size should grow as we insert more data
+                    assert current_lsn - initdb_lsn <= gc_horizon
                    assert size > prev

            collected_responses.append((current_lsn, size))
@@ -323,9 +399,15 @@ def test_single_branch_get_tenant_size_grows(neon_env_builder: NeonEnvBuilder):

            current_lsn = wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id)

-            size = http_client.tenant_size(tenant_id)
+            size, sizes = http_client.tenant_size_and_modelinputs(tenant_id)
+
+            size_debug = http_client.tenant_size_debug(tenant_id)
+            size_debug_file.write(size_debug)
+
            prev = collected_responses[-1][1]
-            assert size > prev, "tenant_size should grow with updates"
+
+            check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev)
+
            collected_responses.append((current_lsn, size))

        while True:
@@ -340,9 +422,9 @@ def test_single_branch_get_tenant_size_grows(neon_env_builder: NeonEnvBuilder):

            size = http_client.tenant_size(tenant_id)
            prev = collected_responses[-1][1]
-            assert (
-                size > prev
-            ), "even though rows have been deleted, the tenant_size should increase"
+
+            check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev)
+
            collected_responses.append((current_lsn, size))

        with pg.cursor() as cur:
@@ -352,7 +434,9 @@ def test_single_branch_get_tenant_size_grows(neon_env_builder: NeonEnvBuilder):

        size = http_client.tenant_size(tenant_id)
        prev = collected_responses[-1][1]
-        assert size > prev, "dropping table grows tenant_size"
+
+        check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev)
+
        collected_responses.append((current_lsn, size))

    # this isn't too many lines to forget for a while. observed while
@@ -364,24 +448,17 @@ def test_single_branch_get_tenant_size_grows(neon_env_builder: NeonEnvBuilder):
    env.pageserver.stop()
    env.pageserver.start()

+    size_debug_file.close()
+
    size_after = http_client.tenant_size(tenant_id)
    prev = collected_responses[-1][1]

    assert size_after == prev, "size after restarting pageserver should not have changed"

-    ps_metrics = parse_metrics(http_client.get_metrics(), "pageserver")
-    tenant_metric_filter = {
-        "tenant_id": str(tenant_id),
-    }

-    tenant_size_metric = int(
-        ps_metrics.query_one("pageserver_tenant_synthetic_size", filter=tenant_metric_filter).value
-    )
-
-    assert tenant_size_metric == size_after, "API size value should be equal to metric size value"
-
-
-def test_get_tenant_size_with_multiple_branches(neon_env_builder: NeonEnvBuilder):
+def test_get_tenant_size_with_multiple_branches(
+    neon_env_builder: NeonEnvBuilder, test_output_dir: Path
+):
    """
    Reported size goes up while branches or rows are being added, goes down after removing branches.
    """
@@ -481,6 +558,10 @@ def test_get_tenant_size_with_multiple_branches(neon_env_builder: NeonEnvBuilder
    size_after = http_client.tenant_size(tenant_id)
    assert size_after == size_after_thinning_branch

+    size_debug_file_before = open(test_output_dir / "size_debug_before.html", "w")
+    size_debug = http_client.tenant_size_debug(tenant_id)
+    size_debug_file_before.write(size_debug)
+
    # teardown, delete branches, and the size should be going down
    http_client.timeline_delete(tenant_id, first_branch_timeline_id)

@@ -493,3 +574,38 @@ def test_get_tenant_size_with_multiple_branches(neon_env_builder: NeonEnvBuilder

    assert size_after_deleting_second < size_after_continuing_on_main
    assert size_after_deleting_second > size_after_first_branch
+
+    size_debug_file = open(test_output_dir / "size_debug.html", "w")
+    size_debug = http_client.tenant_size_debug(tenant_id)
+    size_debug_file.write(size_debug)
+
+
+# Helper for tests that compare timeline_inputs
+# We don't want to compare the exact values, because they can be unstable
+# and cause flaky tests. So replace the values with useful invariants.
+def mask_model_inputs(x):
+    if isinstance(x, dict):
+        newx = {}
+        for k, v in x.items():
+            if k == "size":
+                if v is None or v == 0:
+                    # no change
+                    newx[k] = v
+                elif v < 0:
+                    newx[k] = "<0"
+                else:
+                    newx[k] = ">0"
+            elif k.endswith("lsn") or k.endswith("cutoff") or k == "last_record":
+                if v is None or v == 0 or v == "0/0":
+                    # no change
+                    newx[k] = v
+                else:
+                    newx[k] = "masked"
+            else:
+                newx[k] = mask_model_inputs(v)
+        return newx
+    elif isinstance(x, list):
+        newlist = [mask_model_inputs(v) for v in x]
+        return newlist
+    else:
+        return x
--- a/test_runner/regress/test_tenants_with_remote_storage.py
+++ b/test_runner/regress/test_tenants_with_remote_storage.py
@@ -280,6 +280,7 @@ def test_tenant_upgrades_index_json_from_v0(

        timeline_file.seek(0)
        json.dump(v0_index_part, timeline_file)
+        timeline_file.truncate(timeline_file.tell())

    env.pageserver.start()
    pageserver_http = env.pageserver.http_client()
--- a/vm-cgconfig.conf
+++ b/vm-cgconfig.conf
@@ -0,0 +1,12 @@
+# Configuration for cgroups in VM compute nodes
+group neon-postgres {
+    perm {
+        admin {
+            uid = vm-informant;
+        }
+        task {
+            gid = users;
+        }
+    }
+    memory {}
+}
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -38,14 +38,14 @@ prost = { version = "0.11" }
 rand = { version = "0.8", features = ["small_rng"] }
 regex = { version = "1" }
 regex-syntax = { version = "0.6" }
-reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] }
+reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "multipart", "rustls-tls"] }
 ring = { version = "0.16", features = ["std"] }
 rustls = { version = "0.20", features = ["dangerous_configuration"] }
 scopeguard = { version = "1" }
 serde = { version = "1", features = ["alloc", "derive"] }
 serde_json = { version = "1", features = ["raw_value"] }
 socket2 = { version = "0.4", default-features = false, features = ["all"] }
-tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "sync", "time"] }
+tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "signal", "sync", "time"] }
 tokio-util = { version = "0.7", features = ["codec", "io"] }
 tonic = { version = "0.8", features = ["tls-roots"] }
 tower = { version = "0.4", features = ["balance", "buffer", "limit", "retry", "timeout", "util"] }