empty to retrigger ci

Fix #3136: test_tenant_detach_smoke no longer runs compaction before GC
The test started failing after 6dec85b19d because its single call to the gc HTTP endpoint was preceded by a call to the compaction HTTP endpoint. The gc endpoint creates a task which detachment waits for; the compaction endpoint does not. This test does not seem to care about the gc outcome at all, so removing the compaction call seems safe. The underlying issue is still there: the compaction HTTP endpoint can race with tenant detachment. We only use that endpoint in testing, so it may be worth checking the other tests altered in the aforementioned commit.
2026-05-19 06:00:38 +00:00 · 2023-03-10 14:21:08 +02:00 · 2022-12-29 01:19:57 +02:00 · 2022-12-28 15:12:06 +02:00 · 2022-12-28 09:20:01 +02:00 · 2022-12-27 20:19:12 +03:00
162 changed files with 6584 additions and 2988 deletions
--- a/.github/PULL_REQUEST_TEMPLATE/release-pr.md
+++ b/.github/PULL_REQUEST_TEMPLATE/release-pr.md
@@ -14,7 +14,7 @@
 - [ ] Check [#dev-production-stream](https://neondb.slack.com/archives/C03F5SM1N02) Slack channel
 - [ ] Check [stuck projects page](https://console.neon.tech/admin/projects?sort=last_active&order=desc&stuck=true)
 - [ ] Check [recent operation failures](https://console.neon.tech/admin/operations?action=create_timeline%2Cstart_compute%2Cstop_compute%2Csuspend_compute%2Capply_config%2Cdelete_timeline%2Cdelete_tenant%2Ccreate_branch%2Ccheck_availability&sort=updated_at&order=desc&had_retries=some)
- [ ] Check [cloud SLO dashboard](https://observer.zenith.tech/d/_oWcBMJ7k/cloud-slos?orgId=1)
- [ ] Check [compute startup metrics dashboard](https://observer.zenith.tech/d/5OkYJEmVz/compute-startup-time)
+- [ ] Check [cloud SLO dashboard](https://neonprod.grafana.net/d/_oWcBMJ7k/cloud-slos?orgId=1)
+- [ ] Check [compute startup metrics dashboard](https://neonprod.grafana.net/d/5OkYJEmVz/compute-startup-time)

 <!-- List everything that should be done **after** release, any admin UI configuration / Grafana dashboard / alert changes / setting changes / etc -->
--- a/.github/ansible/neon-stress.hosts.yaml
+++ b/.github/ansible/neon-stress.hosts.yaml
@@ -1,32 +0,0 @@
-storage:
-  vars:
-    bucket_name: neon-storage-ireland
-    bucket_region: eu-west-1
-    console_mgmt_base_url: http://neon-stress-console.local
-    broker_endpoint: http://storage-broker.neon-stress.local:50051
-    safekeeper_enable_s3_offload: 'false'
-    pageserver_config_stub:
-      pg_distrib_dir: /usr/local
-      remote_storage:
-        bucket_name: "{{ bucket_name }}"
-        bucket_region: "{{ bucket_region }}"
-        prefix_in_bucket: "{{ inventory_hostname }}"
-    safekeeper_s3_prefix: neon-stress/wal
-    hostname_suffix: ".local"
-    remote_user: admin
-    sentry_environment: development
-  children:
-    pageservers:
-      hosts:
-        neon-stress-ps-1:
-          console_region_id: aws-eu-west-1
-        neon-stress-ps-2:
-          console_region_id: aws-eu-west-1
-    safekeepers:
-      hosts:
-        neon-stress-sk-1:
-          console_region_id: aws-eu-west-1
-        neon-stress-sk-2:
-          console_region_id: aws-eu-west-1
-        neon-stress-sk-3:
-          console_region_id: aws-eu-west-1
--- a/.github/ansible/prod.ap-southeast-1.hosts.yaml
+++ b/.github/ansible/prod.ap-southeast-1.hosts.yaml
@@ -3,7 +3,7 @@ storage:
    bucket_name: neon-prod-storage-ap-southeast-1
    bucket_region: ap-southeast-1
    console_mgmt_base_url: http://console-release.local
-    broker_endpoint: https://storage-broker.epsilon.ap-southeast-1.internal.aws.neon.tech:443
+    broker_endpoint: http://storage-broker-lb.epsilon.ap-southeast-1.internal.aws.neon.tech:50051
    pageserver_config_stub:
      pg_distrib_dir: /usr/local
      remote_storage:
--- a/.github/ansible/prod.eu-central-1.hosts.yaml
+++ b/.github/ansible/prod.eu-central-1.hosts.yaml
@@ -3,7 +3,7 @@ storage:
    bucket_name: neon-prod-storage-eu-central-1
    bucket_region: eu-central-1
    console_mgmt_base_url: http://console-release.local
-    broker_endpoint: https://storage-broker.gamma.eu-central-1.internal.aws.neon.tech:443
+    broker_endpoint: http://storage-broker-lb.gamma.eu-central-1.internal.aws.neon.tech:50051
    pageserver_config_stub:
      pg_distrib_dir: /usr/local
      remote_storage:
--- a/.github/ansible/prod.us-east-2.hosts.yaml
+++ b/.github/ansible/prod.us-east-2.hosts.yaml
@@ -3,7 +3,7 @@ storage:
    bucket_name: neon-prod-storage-us-east-2
    bucket_region: us-east-2
    console_mgmt_base_url: http://console-release.local
-    broker_endpoint: https://storage-broker.delta.us-east-2.internal.aws.neon.tech:443
+    broker_endpoint: http://storage-broker-lb.delta.us-east-2.internal.aws.neon.tech:50051
    pageserver_config_stub:
      pg_distrib_dir: /usr/local
      remote_storage:
--- a/.github/ansible/prod.us-west-2.hosts.yaml
+++ b/.github/ansible/prod.us-west-2.hosts.yaml
@@ -3,7 +3,7 @@ storage:
    bucket_name: neon-prod-storage-us-west-2
    bucket_region: us-west-2
    console_mgmt_base_url: http://console-release.local
-    broker_endpoint: https://storage-broker.eta.us-west-2.internal.aws.neon.tech:443
+    broker_endpoint: http://storage-broker-lb.eta.us-west-2.internal.aws.neon.tech:50051
    pageserver_config_stub:
      pg_distrib_dir: /usr/local
      remote_storage:
--- a/.github/ansible/production.hosts.yaml
+++ b/.github/ansible/production.hosts.yaml
@@ -34,5 +34,5 @@ storage:
          console_region_id: aws-us-west-2
        zenith-1-sk-2:
          console_region_id: aws-us-west-2
-        zenith-1-sk-3:
+        zenith-1-sk-4:
          console_region_id: aws-us-west-2
--- a/.github/ansible/staging.eu-west-1.hosts.yaml
+++ b/.github/ansible/staging.eu-west-1.hosts.yaml
@@ -3,9 +3,11 @@ storage:
    bucket_name: neon-dev-storage-eu-west-1
    bucket_region: eu-west-1
    console_mgmt_base_url: http://console-staging.local
-    broker_endpoint: https://storage-broker.zeta.eu-west-1.internal.aws.neon.build:443
+    broker_endpoint: http://storage-broker-lb.zeta.eu-west-1.internal.aws.neon.build:50051
    pageserver_config_stub:
      pg_distrib_dir: /usr/local
+      metric_collection_endpoint: http://console-staging.local/billing/api/v1/usage_events
+      metric_collection_interval: 10min
      remote_storage:
        bucket_name: "{{ bucket_name }}"
        bucket_region: "{{ bucket_region }}"
--- a/.github/ansible/staging.hosts.yaml
+++ b/.github/ansible/staging.hosts.yaml
@@ -1,35 +0,0 @@
-storage:
-  vars:
-    bucket_name: zenith-staging-storage-us-east-1
-    bucket_region: us-east-1
-    console_mgmt_base_url: http://console-staging.local
-    broker_endpoint: http://storage-broker.staging.local:50051
-    pageserver_config_stub:
-      pg_distrib_dir: /usr/local
-      remote_storage:
-        bucket_name: "{{ bucket_name }}"
-        bucket_region: "{{ bucket_region }}"
-        prefix_in_bucket: "{{ inventory_hostname }}"
-    safekeeper_s3_prefix: us-stage/wal
-    hostname_suffix: ".local"
-    remote_user: admin
-    sentry_environment: development
-
-  children:
-    pageservers:
-      hosts:
-        zenith-us-stage-ps-2:
-          console_region_id: aws-us-east-1
-        zenith-us-stage-ps-3:
-          console_region_id: aws-us-east-1
-        zenith-us-stage-ps-4:
-          console_region_id: aws-us-east-1
-
-    safekeepers:
-      hosts:
-        zenith-us-stage-sk-4:
-          console_region_id: aws-us-east-1
-        zenith-us-stage-sk-5:
-          console_region_id: aws-us-east-1
-        zenith-us-stage-sk-6:
-          console_region_id: aws-us-east-1
--- a/.github/ansible/staging.us-east-2.hosts.yaml
+++ b/.github/ansible/staging.us-east-2.hosts.yaml
@@ -3,9 +3,11 @@ storage:
    bucket_name: neon-staging-storage-us-east-2
    bucket_region: us-east-2
    console_mgmt_base_url: http://console-staging.local
-    broker_endpoint: https://storage-broker.beta.us-east-2.internal.aws.neon.build:443
+    broker_endpoint: http://storage-broker-lb.beta.us-east-2.internal.aws.neon.build:50051
    pageserver_config_stub:
      pg_distrib_dir: /usr/local
+      metric_collection_endpoint: http://console-staging.local/billing/api/v1/usage_events
+      metric_collection_interval: 10min
      remote_storage:
        bucket_name: "{{ bucket_name }}"
        bucket_region: "{{ bucket_region }}"
--- a/.github/helm-values/dev-eu-west-1-zeta.neon-storage-broker.yaml
+++ b/.github/helm-values/dev-eu-west-1-zeta.neon-storage-broker.yaml
@@ -3,27 +3,22 @@ podLabels:
  neon_env: staging
  neon_service: storage-broker

-ingress:
-  enabled: true
+# Use L4 LB
+service:
+  # service.annotations -- Annotations to add to the service
  annotations:
-    kubernetes.io/ingress.class: nginx-internal
-    nginx.ingress.kubernetes.io/backend-protocol: "GRPC"
-    nginx.ingress.kubernetes.io/ssl-redirect: "true"
-    nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
-    # we have basically infinite streams, disable body size limit
-    nginx.ingress.kubernetes.io/proxy-body-size: "0"
-    cert-manager.io/cluster-issuer: "cert-manager-clusterissuer"
-
-  hosts:
-    - host: storage-broker.zeta.eu-west-1.internal.aws.neon.build
-      paths:
-        - path: /
-          pathType: Prefix
-  tls:
-    - hosts:
-        - storage-broker.zeta.eu-west-1.internal.aws.neon.build
-      secretName: storage-broker-tls
+    service.beta.kubernetes.io/aws-load-balancer-type: external  # use newer AWS Load Balancer Controller
+    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
+    service.beta.kubernetes.io/aws-load-balancer-scheme: internal  # deploy LB to private subnet
+    # assign service to this name at external-dns
+    external-dns.alpha.kubernetes.io/hostname: storage-broker-lb.zeta.eu-west-1.internal.aws.neon.build
+  # service.type -- Service type
+  type: LoadBalancer
+  # service.port -- broker listen port
+  port: 50051

+ingress:
+  enabled: false

 metrics:
  enabled: false
--- a/.github/helm-values/dev-us-east-2-beta.neon-storage-broker.yaml
+++ b/.github/helm-values/dev-us-east-2-beta.neon-storage-broker.yaml
@@ -3,27 +3,22 @@ podLabels:
  neon_env: staging
  neon_service: storage-broker

-ingress:
-  enabled: true
+# Use L4 LB
+service:
+  # service.annotations -- Annotations to add to the service
  annotations:
-    kubernetes.io/ingress.class: nginx-internal
-    nginx.ingress.kubernetes.io/backend-protocol: "GRPC"
-    nginx.ingress.kubernetes.io/ssl-redirect: "true"
-    nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
-    # we have basically infinite streams, disable body size limit
-    nginx.ingress.kubernetes.io/proxy-body-size: "0"
-    cert-manager.io/cluster-issuer: "cert-manager-clusterissuer"
-
-  hosts:
-    - host: storage-broker.beta.us-east-2.internal.aws.neon.build
-      paths:
-        - path: /
-          pathType: Prefix
-  tls:
-    - hosts:
-        - storage-broker.beta.us-east-2.internal.aws.neon.build
-      secretName: storage-broker-tls
+    service.beta.kubernetes.io/aws-load-balancer-type: external  # use newer AWS Load Balancer Controller
+    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
+    service.beta.kubernetes.io/aws-load-balancer-scheme: internal  # deploy LB to private subnet
+    # assign service to this name at external-dns
+    external-dns.alpha.kubernetes.io/hostname: storage-broker-lb.beta.us-east-2.internal.aws.neon.build
+  # service.type -- Service type
+  type: LoadBalancer
+  # service.port -- broker listen port
+  port: 50051

+ingress:
+  enabled: false

 metrics:
  enabled: false
--- a/.github/helm-values/neon-stress.neon-storage-broker.yaml
+++ b/.github/helm-values/neon-stress.neon-storage-broker.yaml
@@ -1,56 +0,0 @@
-# Helm chart values for neon-storage-broker
-podLabels:
-  neon_env: neon-stress
-  neon_service: storage-broker
-
-# Use L4 LB
-service:
-  # service.annotations -- Annotations to add to the service
-  annotations:
-    service.beta.kubernetes.io/aws-load-balancer-type: external  # use newer AWS Load Balancer Controller
-    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
-    service.beta.kubernetes.io/aws-load-balancer-scheme: internal  # deploy LB to private subnet
-    # assign service to this name at external-dns
-    external-dns.alpha.kubernetes.io/hostname: storage-broker.neon-stress.local
-  # service.type -- Service type
-  type: LoadBalancer
-  # service.port -- broker listen port
-  port: 50051
-
-ingress:
-  enabled: false
-
-metrics:
-  enabled: true
-  serviceMonitor:
-    enabled: true
-    selector:
-      release: kube-prometheus-stack
-
-extraManifests:
-  - apiVersion: operator.victoriametrics.com/v1beta1
-    kind: VMServiceScrape
-    metadata:
-      name: "{{ include \"neon-storage-broker.fullname\" . }}"
-      labels:
-        helm.sh/chart: neon-storage-broker-{{ .Chart.Version }}
-        app.kubernetes.io/name: neon-storage-broker
-        app.kubernetes.io/instance: neon-storage-broker
-        app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
-        app.kubernetes.io/managed-by: Helm
-      namespace: "{{ .Release.Namespace }}"
-    spec:
-      selector:
-        matchLabels:
-          app.kubernetes.io/name: "neon-storage-broker"
-      endpoints:
-        - port: broker
-          path: /metrics
-          interval: 10s
-          scrapeTimeout: 10s
-      namespaceSelector:
-        matchNames:
-          - "{{ .Release.Namespace }}"
-
-settings:
-  sentryEnvironment: "development"
--- a/.github/helm-values/neon-stress.proxy-scram.yaml
+++ b/.github/helm-values/neon-stress.proxy-scram.yaml
@@ -1,52 +0,0 @@
-fullnameOverride: "neon-stress-proxy-scram"
-
-settings:
-  authBackend: "console"
-  authEndpoint: "http://neon-stress-console.local/management/api/v2"
-  domain: "*.stress.neon.tech"
-  sentryEnvironment: "development"
-
-podLabels:
-  zenith_service: proxy-scram
-  zenith_env: staging
-  zenith_region: eu-west-1
-  zenith_region_slug: ireland
-
-exposedService:
-  annotations:
-    service.beta.kubernetes.io/aws-load-balancer-type: external
-    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
-    service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
-    external-dns.alpha.kubernetes.io/hostname: '*.stress.neon.tech'
-
-metrics:
-  enabled: true
-  serviceMonitor:
-    enabled: true
-    selector:
-      release: kube-prometheus-stack
-
-extraManifests:
-  - apiVersion: operator.victoriametrics.com/v1beta1
-    kind: VMServiceScrape
-    metadata:
-      name: "{{ include \"neon-proxy.fullname\" . }}"
-      labels:
-        helm.sh/chart: neon-proxy-{{ .Chart.Version }}
-        app.kubernetes.io/name: neon-proxy
-        app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}"
-        app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
-        app.kubernetes.io/managed-by: Helm
-      namespace: "{{ .Release.Namespace }}"
-    spec:
-      selector:
-        matchLabels:
-          app.kubernetes.io/name: "neon-proxy"
-      endpoints:
-        - port: http
-          path: /metrics
-          interval: 10s
-          scrapeTimeout: 10s
-      namespaceSelector:
-        matchNames:
-          - "{{ .Release.Namespace }}"
--- a/.github/helm-values/neon-stress.proxy.yaml
+++ b/.github/helm-values/neon-stress.proxy.yaml
@@ -1,61 +0,0 @@
-fullnameOverride: "neon-stress-proxy"
-
-settings:
-  authBackend: "link"
-  authEndpoint: "https://console.dev.neon.tech/authenticate_proxy_request/"
-  uri: "https://console.dev.neon.tech/psql_session/"
-  sentryEnvironment: "development"
-
-# -- Additional labels for zenith-proxy pods
-podLabels:
-  zenith_service: proxy
-  zenith_env: staging
-  zenith_region: eu-west-1
-  zenith_region_slug: ireland
-
-service:
-  annotations:
-    service.beta.kubernetes.io/aws-load-balancer-type: external
-    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
-    service.beta.kubernetes.io/aws-load-balancer-scheme: internal
-    external-dns.alpha.kubernetes.io/hostname: neon-stress-proxy.local
-  type: LoadBalancer
-
-exposedService:
-  annotations:
-    service.beta.kubernetes.io/aws-load-balancer-type: external
-    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
-    service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
-    external-dns.alpha.kubernetes.io/hostname: connect.dev.neon.tech
-
-metrics:
-  enabled: true
-  serviceMonitor:
-    enabled: true
-    selector:
-      release: kube-prometheus-stack
-
-extraManifests:
-  - apiVersion: operator.victoriametrics.com/v1beta1
-    kind: VMServiceScrape
-    metadata:
-      name: "{{ include \"neon-proxy.fullname\" . }}"
-      labels:
-        helm.sh/chart: neon-proxy-{{ .Chart.Version }}
-        app.kubernetes.io/name: neon-proxy
-        app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}"
-        app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
-        app.kubernetes.io/managed-by: Helm
-      namespace: "{{ .Release.Namespace }}"
-    spec:
-      selector:
-        matchLabels:
-          app.kubernetes.io/name: "neon-proxy"
-      endpoints:
-        - port: http
-          path: /metrics
-          interval: 10s
-          scrapeTimeout: 10s
-      namespaceSelector:
-        matchNames:
-          - "{{ .Release.Namespace }}"
--- a/.github/helm-values/prod-ap-southeast-1-epsilon.neon-storage-broker.yaml
+++ b/.github/helm-values/prod-ap-southeast-1-epsilon.neon-storage-broker.yaml
@@ -3,27 +3,22 @@ podLabels:
  neon_env: production
  neon_service: storage-broker

-ingress:
-  enabled: true
+# Use L4 LB
+service:
+  # service.annotations -- Annotations to add to the service
  annotations:
-    kubernetes.io/ingress.class: nginx-internal
-    nginx.ingress.kubernetes.io/backend-protocol: "GRPC"
-    nginx.ingress.kubernetes.io/ssl-redirect: "true"
-    nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
-    # we have basically infinite streams, disable body size limit
-    nginx.ingress.kubernetes.io/proxy-body-size: "0"
-    cert-manager.io/cluster-issuer: "cert-manager-clusterissuer"
-
-  hosts:
-    - host: storage-broker.epsilon.ap-southeast-1.internal.aws.neon.tech
-      paths:
-        - path: /
-          pathType: Prefix
-  tls:
-    - hosts:
-        - storage-broker.epsilon.ap-southeast-1.internal.aws.neon.tech
-      secretName: storage-broker-tls
+    service.beta.kubernetes.io/aws-load-balancer-type: external  # use newer AWS Load Balancer Controller
+    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
+    service.beta.kubernetes.io/aws-load-balancer-scheme: internal  # deploy LB to private subnet
+    # assign service to this name at external-dns
+    external-dns.alpha.kubernetes.io/hostname: storage-broker-lb.epsilon.ap-southeast-1.internal.aws.neon.tech
+  # service.type -- Service type
+  type: LoadBalancer
+  # service.port -- broker listen port
+  port: 50051

+ingress:
+  enabled: false

 metrics:
  enabled: false
--- a/.github/helm-values/prod-eu-central-1-gamma.neon-storage-broker.yaml
+++ b/.github/helm-values/prod-eu-central-1-gamma.neon-storage-broker.yaml
@@ -3,27 +3,22 @@ podLabels:
  neon_env: production
  neon_service: storage-broker

-ingress:
-  enabled: true
+# Use L4 LB
+service:
+  # service.annotations -- Annotations to add to the service
  annotations:
-    kubernetes.io/ingress.class: nginx-internal
-    nginx.ingress.kubernetes.io/backend-protocol: "GRPC"
-    nginx.ingress.kubernetes.io/ssl-redirect: "true"
-    nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
-    # we have basically infinite streams, disable body size limit
-    nginx.ingress.kubernetes.io/proxy-body-size: "0"
-    cert-manager.io/cluster-issuer: "cert-manager-clusterissuer"
-
-  hosts:
-    - host: storage-broker.gamma.eu-central-1.internal.aws.neon.tech
-      paths:
-        - path: /
-          pathType: Prefix
-  tls:
-    - hosts:
-        - storage-broker.gamma.eu-central-1.internal.aws.neon.tech
-      secretName: storage-broker-tls
+    service.beta.kubernetes.io/aws-load-balancer-type: external  # use newer AWS Load Balancer Controller
+    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
+    service.beta.kubernetes.io/aws-load-balancer-scheme: internal  # deploy LB to private subnet
+    # assign service to this name at external-dns
+    external-dns.alpha.kubernetes.io/hostname: storage-broker-lb.gamma.eu-central-1.internal.aws.neon.tech
+  # service.type -- Service type
+  type: LoadBalancer
+  # service.port -- broker listen port
+  port: 50051

+ingress:
+  enabled: false

 metrics:
  enabled: false
--- a/.github/helm-values/prod-us-east-2-delta.neon-storage-broker.yaml
+++ b/.github/helm-values/prod-us-east-2-delta.neon-storage-broker.yaml
@@ -3,27 +3,22 @@ podLabels:
  neon_env: production
  neon_service: storage-broker

-ingress:
-  enabled: true
+# Use L4 LB
+service:
+  # service.annotations -- Annotations to add to the service
  annotations:
-    kubernetes.io/ingress.class: nginx-internal
-    nginx.ingress.kubernetes.io/backend-protocol: "GRPC"
-    nginx.ingress.kubernetes.io/ssl-redirect: "true"
-    nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
-    # we have basically infinite streams, disable body size limit
-    nginx.ingress.kubernetes.io/proxy-body-size: "0"
-    cert-manager.io/cluster-issuer: "cert-manager-clusterissuer"
-
-  hosts:
-    - host: storage-broker.delta.us-east-2.internal.aws.neon.tech
-      paths:
-        - path: /
-          pathType: Prefix
-  tls:
-    - hosts:
-        - storage-broker.delta.us-east-2.internal.aws.neon.tech
-      secretName: storage-broker-tls
+    service.beta.kubernetes.io/aws-load-balancer-type: external  # use newer AWS Load Balancer Controller
+    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
+    service.beta.kubernetes.io/aws-load-balancer-scheme: internal  # deploy LB to private subnet
+    # assign service to this name at external-dns
+    external-dns.alpha.kubernetes.io/hostname: storage-broker-lb.delta.us-east-2.internal.aws.neon.tech
+  # service.type -- Service type
+  type: LoadBalancer
+  # service.port -- broker listen port
+  port: 50051

+ingress:
+  enabled: false

 metrics:
  enabled: false
--- a/.github/helm-values/prod-us-west-2-eta.neon-storage-broker.yaml
+++ b/.github/helm-values/prod-us-west-2-eta.neon-storage-broker.yaml
@@ -3,27 +3,22 @@ podLabels:
  neon_env: production
  neon_service: storage-broker

-ingress:
-  enabled: true
+# Use L4 LB
+service:
+  # service.annotations -- Annotations to add to the service
  annotations:
-    kubernetes.io/ingress.class: nginx-internal
-    nginx.ingress.kubernetes.io/backend-protocol: "GRPC"
-    nginx.ingress.kubernetes.io/ssl-redirect: "true"
-    nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
-    # we have basically infinite streams, disable body size limit
-    nginx.ingress.kubernetes.io/proxy-body-size: "0"
-    cert-manager.io/cluster-issuer: "cert-manager-clusterissuer"
-
-  hosts:
-    - host: storage-broker.eta.us-west-2.internal.aws.neon.tech
-      paths:
-        - path: /
-          pathType: Prefix
-  tls:
-    - hosts:
-        - storage-broker.eta.us-west-2.internal.aws.neon.tech
-      secretName: storage-broker-tls
+    service.beta.kubernetes.io/aws-load-balancer-type: external  # use newer AWS Load Balancer Controller
+    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
+    service.beta.kubernetes.io/aws-load-balancer-scheme: internal  # deploy LB to private subnet
+    # assign service to this name at external-dns
+    external-dns.alpha.kubernetes.io/hostname: storage-broker-lb.eta.us-west-2.internal.aws.neon.tech
+  # service.type -- Service type
+  type: LoadBalancer
+  # service.port -- broker listen port
+  port: 50051

+ingress:
+  enabled: false

 metrics:
  enabled: false
--- a/.github/helm-values/staging.neon-storage-broker.yaml
+++ b/.github/helm-values/staging.neon-storage-broker.yaml
@@ -1,56 +0,0 @@
-# Helm chart values for neon-storage-broker
-podLabels:
-  neon_env: staging
-  neon_service: storage-broker
-
-# Use L4 LB
-service:
-  # service.annotations -- Annotations to add to the service
-  annotations:
-    service.beta.kubernetes.io/aws-load-balancer-type: external  # use newer AWS Load Balancer Controller
-    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
-    service.beta.kubernetes.io/aws-load-balancer-scheme: internal  # deploy LB to private subnet
-    # assign service to this name at external-dns
-    external-dns.alpha.kubernetes.io/hostname: storage-broker.staging.local
-  # service.type -- Service type
-  type: LoadBalancer
-  # service.port -- broker listen port
-  port: 50051
-
-ingress:
-  enabled: false
-
-metrics:
-  enabled: true
-  serviceMonitor:
-    enabled: true
-    selector:
-      release: kube-prometheus-stack
-
-extraManifests:
-  - apiVersion: operator.victoriametrics.com/v1beta1
-    kind: VMServiceScrape
-    metadata:
-      name: "{{ include \"neon-storage-broker.fullname\" . }}"
-      labels:
-        helm.sh/chart: neon-storage-broker-{{ .Chart.Version }}
-        app.kubernetes.io/name: neon-storage-broker
-        app.kubernetes.io/instance: neon-storage-broker
-        app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
-        app.kubernetes.io/managed-by: Helm
-      namespace: "{{ .Release.Namespace }}"
-    spec:
-      selector:
-        matchLabels:
-          app.kubernetes.io/name: "neon-storage-broker"
-      endpoints:
-        - port: broker
-          path: /metrics
-          interval: 10s
-          scrapeTimeout: 10s
-      namespaceSelector:
-        matchNames:
-          - "{{ .Release.Namespace }}"
-
-settings:
-  sentryEnvironment: "development"
--- a/.github/helm-values/staging.proxy-scram.yaml
+++ b/.github/helm-values/staging.proxy-scram.yaml
@@ -1,57 +0,0 @@
-# Helm chart values for zenith-proxy.
-# This is a YAML-formatted file.
-
-image:
-  repository: neondatabase/neon
-
-settings:
-  authBackend: "console"
-  authEndpoint: "http://console-staging.local/management/api/v2"
-  domain: "*.cloud.stage.neon.tech"
-  sentryEnvironment: "development"
-
-# -- Additional labels for zenith-proxy pods
-podLabels:
-  zenith_service: proxy-scram
-  zenith_env: staging
-  zenith_region: us-east-1
-  zenith_region_slug: virginia
-
-exposedService:
-  annotations:
-    service.beta.kubernetes.io/aws-load-balancer-type: external
-    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
-    service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
-    external-dns.alpha.kubernetes.io/hostname: cloud.stage.neon.tech
-
-metrics:
-  enabled: true
-  serviceMonitor:
-    enabled: true
-    selector:
-      release: kube-prometheus-stack
-
-extraManifests:
-  - apiVersion: operator.victoriametrics.com/v1beta1
-    kind: VMServiceScrape
-    metadata:
-      name: "{{ include \"neon-proxy.fullname\" . }}"
-      labels:
-        helm.sh/chart: neon-proxy-{{ .Chart.Version }}
-        app.kubernetes.io/name: neon-proxy
-        app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}"
-        app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
-        app.kubernetes.io/managed-by: Helm
-      namespace: "{{ .Release.Namespace }}"
-    spec:
-      selector:
-        matchLabels:
-          app.kubernetes.io/name: "neon-proxy"
-      endpoints:
-        - port: http
-          path: /metrics
-          interval: 10s
-          scrapeTimeout: 10s
-      namespaceSelector:
-        matchNames:
-          - "{{ .Release.Namespace }}"
--- a/.github/helm-values/staging.proxy.yaml
+++ b/.github/helm-values/staging.proxy.yaml
@@ -1,57 +0,0 @@
-# Helm chart values for zenith-proxy.
-# This is a YAML-formatted file.
-
-image:
-  repository: neondatabase/neon
-
-settings:
-  authBackend: "link"
-  authEndpoint: "https://console.stage.neon.tech/authenticate_proxy_request/"
-  uri: "https://console.stage.neon.tech/psql_session/"
-  sentryEnvironment: "development"
-
-# -- Additional labels for zenith-proxy pods
-podLabels:
-  zenith_service: proxy
-  zenith_env: staging
-  zenith_region: us-east-1
-  zenith_region_slug: virginia
-
-exposedService:
-  annotations:
-    service.beta.kubernetes.io/aws-load-balancer-type: external
-    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
-    service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
-    external-dns.alpha.kubernetes.io/hostname: connect.stage.neon.tech
-
-metrics:
-  enabled: true
-  serviceMonitor:
-    enabled: true
-    selector:
-      release: kube-prometheus-stack
-
-extraManifests:
-  - apiVersion: operator.victoriametrics.com/v1beta1
-    kind: VMServiceScrape
-    metadata:
-      name: "{{ include \"neon-proxy.fullname\" . }}"
-      labels:
-        helm.sh/chart: neon-proxy-{{ .Chart.Version }}
-        app.kubernetes.io/name: neon-proxy
-        app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}"
-        app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
-        app.kubernetes.io/managed-by: Helm
-      namespace: "{{ .Release.Namespace }}"
-    spec:
-      selector:
-        matchLabels:
-          app.kubernetes.io/name: "neon-proxy"
-      endpoints:
-        - port: http
-          path: /metrics
-          interval: 10s
-          scrapeTimeout: 10s
-      namespaceSelector:
-        matchNames:
-          - "{{ .Release.Namespace }}"
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -18,6 +18,7 @@ on:
      region_id:
        description: 'Use a particular region. If not set the default region will be used'
        required: false
+        default: 'aws-us-east-2'
      save_perf_report:
        type: boolean
        description: 'Publish perf report or not. If not set, the report is published only for the main branch'
@@ -115,13 +116,10 @@ jobs:
        # neon-captest-prefetch: Same, with prefetching enabled (new project)
        # rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs
        # rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage
-        platform: [ neon-captest-new, neon-captest-reuse, neon-captest-prefetch, rds-postgres ]
+        platform: [ neon-captest-reuse, neon-captest-prefetch, rds-postgres ]
        db_size: [ 10gb ]
        runner: [ us-east-2 ]
        include:
-          - platform: neon-captest-new
-            db_size: 50gb
-            runner: us-east-2
          - platform: neon-captest-prefetch
            db_size: 50gb
            runner: us-east-2
@@ -409,7 +407,7 @@ jobs:

    runs-on: [ self-hosted, us-east-2, x64 ]
    container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rustlegacy:pinned
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      options: --init

    timeout-minutes: 360 # 6h
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -555,10 +555,14 @@ jobs:
      - name: Kaniko build compute tools
        run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-tools --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}}

-  compute-node-image-v14:
+  compute-node-image:
    runs-on: [ self-hosted, dev, x64 ]
    container: gcr.io/kaniko-project/executor:v1.9.0-debug
    needs: [ tag ]
+    strategy:
+      fail-fast: false
+      matrix:
+        version: [ v14, v15 ]
    defaults:
      run:
        shell: sh -eu {0}
@@ -573,32 +577,40 @@ jobs:
      - name: Configure ECR login
        run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json

-      - name: Kaniko build compute node with extensions v14
-        run: /kaniko/executor --skip-unused-stages  --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache  --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v14 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}}
+      - name: Kaniko build compute node with extensions
+        run: /kaniko/executor --skip-unused-stages  --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache  --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-${{ matrix.version }} --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}

-  compute-node-image-v15:
+  vm-compute-node-image:
    runs-on: [ self-hosted, dev, x64 ]
-    container: gcr.io/kaniko-project/executor:v1.9.0-debug
-    needs: [ tag ]
+    needs: [ tag, compute-node-image ]
+    strategy:
+      fail-fast: false
+      matrix:
+        version: [ v14, v15 ]
    defaults:
      run:
        shell: sh -eu {0}

    steps:
-      - name: Checkout
-        uses: actions/checkout@v1 # v3 won't work with kaniko
-        with:
-          submodules: true
-          fetch-depth: 0
+      - name: Downloading latest vm-builder # --fail: abort on HTTP errors instead of saving the error page as the binary
+        run: |
+          curl --fail --location --retry 3 https://github.com/neondatabase/neonvm/releases/latest/download/vm-builder -o vm-builder
+          chmod +x vm-builder

-      - name: Configure ECR login
-        run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
+      - name: Pulling compute-node image
+        run: |
+          docker pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}

-      - name: Kaniko build compute node with extensions v15
-        run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v15 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}}
+      - name: Build vm image
+        run: |
+          ./vm-builder -src=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
+
+      - name: Pushing vm-compute-node image
+        run: |
+          docker push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}

  test-images:
-    needs: [ tag, neon-image, compute-node-image-v14, compute-node-image-v15, compute-tools-image ]
+    needs: [ tag, neon-image, compute-node-image, compute-tools-image ]
    runs-on: [ self-hosted, dev, x64 ]

    steps:
@@ -642,13 +654,13 @@ jobs:

  promote-images:
    runs-on: [ self-hosted, dev, x64 ]
-    needs: [ tag, test-images ]
+    needs: [ tag, test-images, vm-compute-node-image ]
    if: github.event_name != 'workflow_dispatch'
    container: amazon/aws-cli
    strategy:
      fail-fast: false
      matrix:
-        name: [ neon, compute-node-v14, compute-node-v15, compute-tools ]
+        name: [ neon, compute-node-v14, vm-compute-node-v14, compute-node-v15, vm-compute-node-v15, compute-tools]

    steps:
      - name: Promote image to latest
@@ -681,9 +693,15 @@ jobs:
      - name: Pull compute node v14 image from ECR
        run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} compute-node-v14

+      - name: Pull vm compute node v14 image from ECR
+        run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} vm-compute-node-v14
+
      - name: Pull compute node v15 image from ECR
        run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} compute-node-v15

+      - name: Pull vm compute node v15 image from ECR
+        run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} vm-compute-node-v15
+
      - name: Pull rust image from ECR
        run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned rust

@@ -695,7 +713,9 @@ jobs:
          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/neon:latest
          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest
          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:latest
+          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:latest
          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:latest
+          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:latest

      - name: Configure Docker Hub login
        run: |
@@ -712,9 +732,15 @@ jobs:
      - name: Push compute node v14 image to Docker Hub
        run: crane push compute-node-v14 neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}}

+      - name: Push vm compute node v14 image to Docker Hub
+        run: crane push vm-compute-node-v14 neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}}
+
      - name: Push compute node v15 image to Docker Hub
        run: crane push compute-node-v15 neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}}

+      - name: Push vm compute node v15 image to Docker Hub
+        run: crane push vm-compute-node-v15 neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}}
+
      - name: Push rust image to Docker Hub
        run: crane push rust neondatabase/rust:pinned

@@ -726,26 +752,25 @@ jobs:
          crane tag neondatabase/neon:${{needs.tag.outputs.build-tag}} latest
          crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest
          crane tag neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}} latest
+          crane tag neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest
          crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
+          crane tag neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest

  calculate-deploy-targets:
    runs-on: [ self-hosted, dev, x64 ]
    if: |
-      (github.ref_name == 'main' || github.ref_name == 'release') &&
+      github.ref_name == 'release' &&
      github.event_name != 'workflow_dispatch'
    outputs:
      matrix-include: ${{ steps.set-matrix.outputs.include }}
    steps:
      - id: set-matrix
        run: |
-          if [[ "$GITHUB_REF_NAME" == "main" ]]; then
-            STAGING='{"env_name": "staging", "proxy_job": "neon-proxy", "proxy_config": "staging.proxy", "storage_broker_ns": "neon-storage-broker", "storage_broker_config": "staging.neon-storage-broker", "kubeconfig_secret": "STAGING_KUBECONFIG_DATA", "console_api_key_secret": "NEON_STAGING_API_KEY"}'
-            echo "include=[$STAGING]" >> $GITHUB_OUTPUT
-          elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
+          if [[ "$GITHUB_REF_NAME" == "release" ]]; then
            PRODUCTION='{"env_name": "production", "proxy_job": "neon-proxy", "proxy_config": "production.proxy", "storage_broker_ns": "neon-storage-broker", "storage_broker_config": "production.neon-storage-broker", "kubeconfig_secret": "PRODUCTION_KUBECONFIG_DATA", "console_api_key_secret": "NEON_PRODUCTION_API_KEY"}'
            echo "include=[$PRODUCTION]" >> $GITHUB_OUTPUT
          else
-            echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
+            echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to 'release'"
            exit 1
          fi

@@ -756,7 +781,7 @@ jobs:
    # If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly
    needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
    if: |
-      (github.ref_name == 'main' || github.ref_name == 'release') &&
+      github.ref_name == 'release' &&
      github.event_name != 'workflow_dispatch'
    defaults:
      run:
@@ -800,7 +825,7 @@ jobs:
    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
    # We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version.
    # If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly
-    needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
+    needs: [ push-docker-hub, tag, regress-tests ]
    if: |
      (github.ref_name == 'main') &&
      github.event_name != 'workflow_dispatch'
@@ -912,7 +937,7 @@ jobs:
    # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently.
    needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
    if: |
-      (github.ref_name == 'main' || github.ref_name == 'release') &&
+      github.ref_name == 'release' &&
      github.event_name != 'workflow_dispatch'
    defaults:
      run:
@@ -955,7 +980,7 @@ jobs:
    # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently.
    needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
    if: |
-      (github.ref_name == 'main' || github.ref_name == 'release') &&
+      github.ref_name == 'release' &&
      github.event_name != 'workflow_dispatch'
    defaults:
      run:
@@ -1072,7 +1097,7 @@ jobs:

      - name: Deploy storage-broker
        run:
-          helm upgrade neon-storage-broker neondatabase/neon-storage-broker --namespace neon-storage-broker --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${{ needs.tag.outputs.build-tag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s
+          helm upgrade neon-storage-broker-lb neondatabase/neon-storage-broker --namespace neon-storage-broker-lb --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${{ needs.tag.outputs.build-tag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s

  deploy-proxy-prod-new:
    runs-on: prod
@@ -1149,7 +1174,7 @@ jobs:

      - name: Deploy storage-broker
        run:
-          helm upgrade neon-storage-broker neondatabase/neon-storage-broker --namespace neon-storage-broker --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${{ needs.tag.outputs.build-tag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s
+          helm upgrade neon-storage-broker-lb neondatabase/neon-storage-broker --namespace neon-storage-broker-lb --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${{ needs.tag.outputs.build-tag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s

  promote-compatibility-data:
    runs-on: [ self-hosted, dev, x64 ]
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -563,6 +563,12 @@ version = "0.13.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8"

+[[package]]
+name = "base64"
+version = "0.20.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0ea22880d78093b0cbe17c89f64a7d457941e65759157ec6cb31a31d652b05e5"
+
 [[package]]
 name = "bincode"
 version = "1.3.3"
@@ -1920,7 +1926,7 @@ version = "8.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "09f4f04699947111ec1733e71778d763555737579e44b85844cae8e1940a1828"
 dependencies = [
- "base64",
+ "base64 0.13.1",
 "pem",
 "ring",
 "serde",
@@ -2409,6 +2415,7 @@ dependencies = [
 "rand",
 "regex",
 "remote_storage",
+ "reqwest",
 "rstar",
 "scopeguard",
 "serde",
@@ -2507,7 +2514,7 @@ version = "1.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "03c64931a1a212348ec4f3b4362585eca7159d0d09cbdf4a7f74f02173596fd4"
 dependencies = [
- "base64",
+ "base64 0.13.1",
 ]

 [[package]]
@@ -2528,18 +2535,18 @@ dependencies = [

 [[package]]
 name = "phf"
-version = "0.10.1"
+version = "0.11.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fabbf1ead8a5bcbc20f5f8b939ee3f5b0f6f281b6ad3468b84656b658b455259"
+checksum = "928c6535de93548188ef63bb7c4036bd415cd8f36ad25af44b9789b2ee72a48c"
 dependencies = [
 "phf_shared",
 ]

 [[package]]
 name = "phf_shared"
-version = "0.10.0"
+version = "0.11.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b6796ad771acdc0123d2a88dc428b5e38ef24456743ddb1744ed628f9815c096"
+checksum = "e1fb5f6f826b772a8d4c0394209441e7d37cbbb967ae9c7e0e8134365c9ee676"
 dependencies = [
 "siphasher",
 ]
@@ -2612,12 +2619,12 @@ dependencies = [

 [[package]]
 name = "postgres"
-version = "0.19.2"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=d052ee8b86fff9897c77b0fe89ea9daba0e1fa38#d052ee8b86fff9897c77b0fe89ea9daba0e1fa38"
+version = "0.19.4"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=43e6db254a97fdecbce33d8bc0890accfd74495e#43e6db254a97fdecbce33d8bc0890accfd74495e"
 dependencies = [
 "bytes",
 "fallible-iterator",
- "futures",
+ "futures-util",
 "log",
 "tokio",
 "tokio-postgres",
@@ -2626,9 +2633,9 @@ dependencies = [
 [[package]]
 name = "postgres-protocol"
 version = "0.6.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=d052ee8b86fff9897c77b0fe89ea9daba0e1fa38#d052ee8b86fff9897c77b0fe89ea9daba0e1fa38"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=43e6db254a97fdecbce33d8bc0890accfd74495e#43e6db254a97fdecbce33d8bc0890accfd74495e"
 dependencies = [
- "base64",
+ "base64 0.20.0",
 "byteorder",
 "bytes",
 "fallible-iterator",
@@ -2643,8 +2650,8 @@ dependencies = [

 [[package]]
 name = "postgres-types"
-version = "0.2.3"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=d052ee8b86fff9897c77b0fe89ea9daba0e1fa38#d052ee8b86fff9897c77b0fe89ea9daba0e1fa38"
+version = "0.2.4"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=43e6db254a97fdecbce33d8bc0890accfd74495e#43e6db254a97fdecbce33d8bc0890accfd74495e"
 dependencies = [
 "bytes",
 "fallible-iterator",
@@ -2868,7 +2875,7 @@ dependencies = [
 "anyhow",
 "async-trait",
 "atty",
- "base64",
+ "base64 0.13.1",
 "bstr",
 "bytes",
 "clap 4.0.29",
@@ -3078,7 +3085,7 @@ version = "0.11.13"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "68cc60575865c7831548863cc02356512e3f1dc2f3f82cb837d7fc4cc8f3c97c"
 dependencies = [
- "base64",
+ "base64 0.13.1",
 "bytes",
 "encoding_rs",
 "futures-core",
@@ -3261,7 +3268,7 @@ version = "1.0.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0864aeff53f8c05aa08d86e5ef839d3dfcf07aeba2db32f12db0ef716e87bd55"
 dependencies = [
- "base64",
+ "base64 0.13.1",
 ]

 [[package]]
@@ -3542,7 +3549,7 @@ version = "2.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "25bf4a5a814902cd1014dbccfa4d4560fb8432c779471e96e035602519f82eef"
 dependencies = [
- "base64",
+ "base64 0.13.1",
 "chrono",
 "hex",
 "indexmap",
@@ -4009,14 +4016,15 @@ dependencies = [

 [[package]]
 name = "tokio-postgres"
-version = "0.7.6"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=d052ee8b86fff9897c77b0fe89ea9daba0e1fa38#d052ee8b86fff9897c77b0fe89ea9daba0e1fa38"
+version = "0.7.7"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=43e6db254a97fdecbce33d8bc0890accfd74495e#43e6db254a97fdecbce33d8bc0890accfd74495e"
 dependencies = [
 "async-trait",
 "byteorder",
 "bytes",
 "fallible-iterator",
- "futures",
+ "futures-channel",
+ "futures-util",
 "log",
 "parking_lot 0.12.1",
 "percent-encoding",
@@ -4109,7 +4117,7 @@ dependencies = [
 "async-stream",
 "async-trait",
 "axum",
- "base64",
+ "base64 0.13.1",
 "bytes",
 "futures-core",
 "futures-util",
@@ -4351,7 +4359,7 @@ version = "2.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b97acb4c28a254fd7a4aeec976c46a7fa404eac4d7c134b30c75144846d7cb8f"
 dependencies = [
- "base64",
+ "base64 0.13.1",
 "chunked_transfer",
 "log",
 "native-tls",
@@ -4746,6 +4754,7 @@ dependencies = [
 "ahash",
 "anyhow",
 "bytes",
+ "chrono",
 "clap 4.0.29",
 "crossbeam-utils",
 "either",
@@ -4769,6 +4778,7 @@ dependencies = [
 "reqwest",
 "scopeguard",
 "serde",
+ "serde_json",
 "socket2",
 "stable_deref_trait",
 "syn",
@@ -4787,7 +4797,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e0ecbeb7b67ce215e40e3cc7f2ff902f94a223acf44995934763467e7b1febc8"
 dependencies = [
 "asn1-rs",
- "base64",
+ "base64 0.13.1",
 "data-encoding",
 "der-parser",
 "lazy_static",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -86,4 +86,4 @@ lto = true
 # This is only needed for proxy's tests.
 # TODO: we should probably fork `tokio-postgres-rustls` instead.
 [patch.crates-io]
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
--- a/Dockerfile.compute-node-v14
+++ b/Dockerfile.compute-node-v14
@@ -170,9 +170,6 @@ RUN cd /usr/local/pgsql/bin && rm ecpg raster2pgsql shp2pgsql pgtopo_export pgto
 # Remove headers that we won't need anymore - we've completed installation of all extensions
 RUN rm -r /usr/local/pgsql/include

-# Remove now-useless PGXS src infrastructure
-RUN rm -r /usr/local/pgsql/lib/pgxs/src
-
 # Remove static postgresql libraries - all compilation is finished, so we
 # can now remove these files - they must be included in other binaries by now
 # if they were to be used by other libraries.
@@ -207,7 +204,8 @@ RUN apt update &&  \
        libgeos-c1v5 \
        libgdal28 \
        libproj19 \
-        libprotobuf-c1 && \
+        libprotobuf-c1 \
+        gdb && \
    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*

 USER postgres
--- a/Dockerfile.compute-node-v15
+++ b/Dockerfile.compute-node-v15
@@ -170,9 +170,6 @@ RUN cd /usr/local/pgsql/bin && rm ecpg raster2pgsql shp2pgsql pgtopo_export pgto
 # Remove headers that we won't need anymore - we've completed installation of all extensions
 RUN rm -r /usr/local/pgsql/include

-# Remove now-useless PGXS src infrastructure
-RUN rm -r /usr/local/pgsql/lib/pgxs/src
-
 # Remove static postgresql libraries - all compilation is finished, so we
 # can now remove these files - they must be included in other binaries by now
 # if they were to be used by other libraries.
@@ -207,7 +204,8 @@ RUN apt update &&  \
        libgeos-c1v5 \
        libgdal28 \
        libproj19 \
-        libprotobuf-c1 && \
+        libprotobuf-c1 \
+        gdb && \
    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*

 USER postgres
--- a/199
+++ b/199
@@ -61,146 +61,115 @@ all: neon postgres neon-pg-ext
 #
 # The 'postgres_ffi' depends on the Postgres headers.
 .PHONY: neon
-neon: postgres-v14-headers postgres-v15-headers
+neon: postgres-headers
 	+@echo "Compiling Neon"
 	$(CARGO_CMD_PREFIX) cargo build $(CARGO_BUILD_FLAGS)

 ### PostgreSQL parts
-# The rules are duplicated for Postgres v14 and 15. We may want to refactor
+# Some rules are duplicated for Postgres v14 and 15. We may want to refactor
 # to avoid the duplication in the future, but it's tolerable for now.
 #
-$(POSTGRES_INSTALL_DIR)/build/v14/config.status:
-	+@echo "Configuring Postgres v14 build"
-	mkdir -p $(POSTGRES_INSTALL_DIR)/build/v14
-	(cd $(POSTGRES_INSTALL_DIR)/build/v14 && \
-	env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-v14/configure \
+$(POSTGRES_INSTALL_DIR)/build/%/config.status:
+	+@echo "Configuring Postgres $* build"
+	mkdir -p $(POSTGRES_INSTALL_DIR)/build/$*
+	(cd $(POSTGRES_INSTALL_DIR)/build/$* && \
+	env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-$*/configure \
 		CFLAGS='$(PG_CFLAGS)' \
 		$(PG_CONFIGURE_OPTS) \
-		--prefix=$(abspath $(POSTGRES_INSTALL_DIR))/v14 > configure.log)
-
-$(POSTGRES_INSTALL_DIR)/build/v15/config.status:
-	+@echo "Configuring Postgres v15 build"
-	mkdir -p $(POSTGRES_INSTALL_DIR)/build/v15
-	(cd $(POSTGRES_INSTALL_DIR)/build/v15 && \
-	env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-v15/configure \
-		CFLAGS='$(PG_CFLAGS)' \
-		$(PG_CONFIGURE_OPTS) \
-		--prefix=$(abspath $(POSTGRES_INSTALL_DIR))/v15 > configure.log)
+		--prefix=$(abspath $(POSTGRES_INSTALL_DIR))/$* > configure.log)

 # nicer alias to run 'configure'
-.PHONY: postgres-v14-configure
-postgres-v14-configure: $(POSTGRES_INSTALL_DIR)/build/v14/config.status
-
-.PHONY: postgres-v15-configure
-postgres-v15-configure: $(POSTGRES_INSTALL_DIR)/build/v15/config.status
+# Note: these aliases cannot be written as a single pattern rule. GNU make
+# treats a pattern rule that has no recipe as a cancellation of an implicit
+# rule, not as a definition, so each version is listed explicitly. Apart from
+# the "build-all-versions" entry points, this is the only place that names versions.
+.PHONY: postgres-configure-v15
+postgres-configure-v15: $(POSTGRES_INSTALL_DIR)/build/v15/config.status
+.PHONY: postgres-configure-v14
+postgres-configure-v14: $(POSTGRES_INSTALL_DIR)/build/v14/config.status

 # Install the PostgreSQL header files into $(POSTGRES_INSTALL_DIR)/<version>/include
-.PHONY: postgres-v14-headers
-postgres-v14-headers: postgres-v14-configure
-	+@echo "Installing PostgreSQL v14 headers"
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/src/include MAKELEVEL=0 install
-
-.PHONY: postgres-v15-headers
-postgres-v15-headers: postgres-v15-configure
-	+@echo "Installing PostgreSQL v15 headers"
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/src/include MAKELEVEL=0 install
+.PHONY: postgres-headers-%
+postgres-headers-%: postgres-configure-%
+	+@echo "Installing PostgreSQL $* headers"
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/src/include MAKELEVEL=0 install

 # Compile and install PostgreSQL
-.PHONY: postgres-v14
-postgres-v14: postgres-v14-configure \
-		  postgres-v14-headers # to prevent `make install` conflicts with neon's `postgres-headers`
-	+@echo "Compiling PostgreSQL v14"
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14 MAKELEVEL=0 install
-	+@echo "Compiling libpq v14"
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/src/interfaces/libpq install
-	+@echo "Compiling pg_prewarm v14"
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/contrib/pg_prewarm install
-	+@echo "Compiling pg_buffercache v14"
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/contrib/pg_buffercache install
-	+@echo "Compiling pageinspect v14"
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/contrib/pageinspect install
+.PHONY: postgres-%
+postgres-%: postgres-configure-% \
+		  postgres-headers-% # to prevent `make install` conflicts with neon's `postgres-headers`
+	+@echo "Compiling PostgreSQL $*"
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$* MAKELEVEL=0 install
+	+@echo "Compiling libpq $*"
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/src/interfaces/libpq install
+	+@echo "Compiling pg_prewarm $*"
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_prewarm install
+	+@echo "Compiling pg_buffercache $*"
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_buffercache install
+	+@echo "Compiling pageinspect $*"
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pageinspect install

-.PHONY: postgres-v15
-postgres-v15: postgres-v15-configure \
-		  postgres-v15-headers # to prevent `make install` conflicts with neon's `postgres-headers`
-	+@echo "Compiling PostgreSQL v15"
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15 MAKELEVEL=0 install
-	+@echo "Compiling libpq v15"
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/src/interfaces/libpq install
-	+@echo "Compiling pg_prewarm v15"
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/contrib/pg_prewarm install
-	+@echo "Compiling pg_buffercache v15"
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/contrib/pg_buffercache install
-	+@echo "Compiling pageinspect v15"
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/contrib/pageinspect install
+.PHONY: postgres-clean-%
+postgres-clean-%:
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$* MAKELEVEL=0 clean
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_buffercache clean
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pageinspect clean
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/src/interfaces/libpq clean

-# shorthand to build all Postgres versions
-postgres: postgres-v14 postgres-v15
+.PHONY: neon-pg-ext-%
+neon-pg-ext-%: postgres-%
+	+@echo "Compiling neon $*"
+	mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-$*
+	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
+		-C $(POSTGRES_INSTALL_DIR)/build/neon-$* \
+		-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile install
+	+@echo "Compiling neon_walredo $*"
+	mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-walredo-$*
+	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
+		-C $(POSTGRES_INSTALL_DIR)/build/neon-walredo-$* \
+		-f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile install
+	+@echo "Compiling neon_test_utils $*"
+	mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-$*
+	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
+		-C $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-$* \
+		-f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile install

-.PHONY: postgres-v14-clean
-postgres-v14-clean:
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14 MAKELEVEL=0 clean
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/contrib/pg_buffercache clean
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/contrib/pageinspect clean
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/src/interfaces/libpq clean
+.PHONY: neon-pg-ext-clean-%
+neon-pg-ext-clean-%: # must clean the same build/ directories that neon-pg-ext-% builds in
+	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config -C $(POSTGRES_INSTALL_DIR)/build/neon-$* -f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile clean
+	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config -C $(POSTGRES_INSTALL_DIR)/build/neon-walredo-$* -f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile clean
+	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config -C $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-$* -f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile clean

-.PHONY: postgres-v15-clean
-postgres-v15-clean:
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15 MAKELEVEL=0 clean
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/contrib/pg_buffercache clean
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/contrib/pageinspect clean
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/src/interfaces/libpq clean
-
-neon-pg-ext-v14: postgres-v14
-	+@echo "Compiling neon v14"
-	mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-v14
-	(cd $(POSTGRES_INSTALL_DIR)/build/neon-v14 && \
-	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v14/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
-		-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile install)
-	+@echo "Compiling neon_walredo v14"
-	mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-walredo-v14
-	(cd $(POSTGRES_INSTALL_DIR)/build/neon-walredo-v14 && \
-	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v14/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
-		-f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile install)
-	+@echo "Compiling neon_test_utils" v14
-	mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v14
-	(cd $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v14 && \
-	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v14/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
-		-f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile install)
-
-neon-pg-ext-v15: postgres-v15
-	+@echo "Compiling neon v15"
-	mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-v15
-	(cd $(POSTGRES_INSTALL_DIR)/build/neon-v15 && \
-	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v15/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
-		-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile install)
-	+@echo "Compiling neon_walredo v15"
-	mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-walredo-v15
-	(cd $(POSTGRES_INSTALL_DIR)/build/neon-walredo-v15 && \
-	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v15/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
-		-f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile install)
-	+@echo "Compiling neon_test_utils" v15
-	mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v15
-	(cd $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v15 && \
-	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v15/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
-		-f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile install)
+.PHONY: neon-pg-ext
+neon-pg-ext: \
+	neon-pg-ext-v14 \
+	neon-pg-ext-v15

 .PHONY: neon-pg-ext-clean
-	$(MAKE) -C $(ROOT_PROJECT_DIR)/pgxn/neon clean
-	$(MAKE) -C $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils clean
+neon-pg-ext-clean: \
+	neon-pg-ext-clean-v14 \
+	neon-pg-ext-clean-v15

-neon-pg-ext: neon-pg-ext-v14 neon-pg-ext-v15
-postgres-headers: postgres-v14-headers postgres-v15-headers
-postgres-clean: postgres-v14-clean postgres-v15-clean
+# shorthand to build all Postgres versions
+.PHONY: postgres
+postgres: \
+	postgres-v14 \
+	postgres-v15
+
+.PHONY: postgres-headers
+postgres-headers: \
+	postgres-headers-v14 \
+	postgres-headers-v15
+
+.PHONY: postgres-clean
+postgres-clean: \
+	postgres-clean-v14 \
+	postgres-clean-v15

 # This doesn't remove the effects of 'configure'.
 .PHONY: clean
-clean:
-	cd $(POSTGRES_INSTALL_DIR)/build/v14 && $(MAKE) clean
-	cd $(POSTGRES_INSTALL_DIR)/build/v15 && $(MAKE) clean
+clean: postgres-clean neon-pg-ext-clean
 	$(CARGO_CMD_PREFIX) cargo clean
-	cd pgxn/neon && $(MAKE) clean
-	cd pgxn/neon_test_utils && $(MAKE) clean

 # This removes everything
 .PHONY: distclean
--- a/compute_tools/Cargo.toml
+++ b/compute_tools/Cargo.toml
@@ -12,12 +12,12 @@ futures = "0.3.13"
 hyper = { version = "0.14", features = ["full"] }
 log = { version = "0.4", features = ["std", "serde"] }
 notify = "5.0.0"
-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
+postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
 regex = "1"
 serde = { version = "1.0", features = ["derive"] }
 serde_json = "1"
 tar = "0.4"
 tokio = { version = "1.17", features = ["macros", "rt", "rt-multi-thread"] }
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
 url = "2.2.2"
 workspace_hack = { version = "0.1", path = "../workspace_hack" }
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -105,7 +105,7 @@ fn main() -> Result<()> {
        tenant,
        timeline,
        pageserver_connstr,
-        metrics: ComputeMetrics::new(),
+        metrics: ComputeMetrics::default(),
        state: RwLock::new(ComputeState::new()),
    };
    let compute = Arc::new(compute_state);
--- a/compute_tools/src/checker.rs
+++ b/compute_tools/src/checker.rs
@@ -5,7 +5,7 @@ use tokio_postgres::NoTls;

 use crate::compute::ComputeNode;

-pub fn create_writablity_check_data(client: &mut Client) -> Result<()> {
+pub fn create_writability_check_data(client: &mut Client) -> Result<()> {
    let query = "
    CREATE TABLE IF NOT EXISTS health_check (
        id serial primary key,
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -23,11 +23,11 @@ use std::sync::RwLock;

 use anyhow::{Context, Result};
 use chrono::{DateTime, Utc};
-use log::info;
+use log::{info, warn};
 use postgres::{Client, NoTls};
 use serde::{Serialize, Serializer};

-use crate::checker::create_writablity_check_data;
+use crate::checker::create_writability_check_data;
 use crate::config;
 use crate::pg_helpers::*;
 use crate::spec::*;
@@ -91,7 +91,7 @@ pub enum ComputeStatus {
    Failed,
 }

-#[derive(Serialize)]
+#[derive(Default, Serialize)]
 pub struct ComputeMetrics {
    pub sync_safekeepers_ms: AtomicU64,
    pub basebackup_ms: AtomicU64,
@@ -99,23 +99,6 @@ pub struct ComputeMetrics {
    pub total_startup_ms: AtomicU64,
 }

-impl ComputeMetrics {
-    pub fn new() -> Self {
-        Self {
-            sync_safekeepers_ms: AtomicU64::new(0),
-            basebackup_ms: AtomicU64::new(0),
-            config_ms: AtomicU64::new(0),
-            total_startup_ms: AtomicU64::new(0),
-        }
-    }
-}
-
-impl Default for ComputeMetrics {
-    fn default() -> Self {
-        Self::new()
-    }
-}
-
 impl ComputeNode {
    pub fn set_status(&self, status: ComputeStatus) {
        self.state.write().unwrap().status = status;
@@ -175,7 +158,7 @@ impl ComputeNode {
        let start_time = Utc::now();

        let sync_handle = Command::new(&self.pgbin)
-            .args(&["--sync-safekeepers"])
+            .args(["--sync-safekeepers"])
            .env("PGDATA", &self.pgdata) // we cannot use -D in this mode
            .stdout(Stdio::piped())
            .spawn()
@@ -253,7 +236,7 @@ impl ComputeNode {

        // Run postgres as a child process.
        let mut pg = Command::new(&self.pgbin)
-            .args(&["-D", &self.pgdata])
+            .args(["-D", &self.pgdata])
            .spawn()
            .expect("cannot start postgres process");

@@ -292,7 +275,7 @@ impl ComputeNode {
        handle_databases(&self.spec, &mut client)?;
        handle_role_deletions(self, &mut client)?;
        handle_grants(self, &mut client)?;
-        create_writablity_check_data(&mut client)?;
+        create_writability_check_data(&mut client)?;

        // 'Close' connection
        drop(client);
@@ -328,6 +311,9 @@ impl ComputeNode {
            .wait()
            .expect("failed to start waiting on Postgres process");

+        self.check_for_core_dumps()
+            .expect("failed to check for core dumps");
+
        Ok(ecode)
    }

@@ -343,4 +329,68 @@ impl ComputeNode {
        self.prepare_pgdata()?;
        self.run()
    }
+
+    // Look for core dumps and collect backtraces.
+    //
+    // EKS worker nodes have the following core dump settings:
+    //   /proc/sys/kernel/core_pattern -> core
+    //   /proc/sys/kernel/core_uses_pid -> 1
+    //   ulimit -c -> unlimited
+    // which results in core dumps being written to the postgres data directory as core.<pid>.
+    //
+    // Use that as the default location and pattern, except on macOS, where core dumps are
+    // written to the /cores/ directory by default.
+    fn check_for_core_dumps(&self) -> Result<()> {
+        let core_dump_dir = match std::env::consts::OS {
+            "macos" => Path::new("/cores/"),
+            _ => Path::new(&self.pgdata),
+        };
+
+        // Collect core dump paths if any
+        info!("checking for core dumps in {}", core_dump_dir.display());
+        let files = fs::read_dir(core_dump_dir)?;
+        let cores = files.filter_map(|entry| {
+            let entry = entry.ok()?;
+            let _ = entry.file_name().to_str()?.strip_prefix("core.")?;
+            Some(entry.path())
+        });
+
+        // Print backtrace for each core dump
+        for core_path in cores {
+            warn!(
+                "core dump found: {}, collecting backtrace",
+                core_path.display()
+            );
+
+            // Try first with gdb
+            let backtrace = Command::new("gdb")
+                .args(["--batch", "-q", "-ex", "bt", &self.pgbin])
+                .arg(&core_path)
+                .output();
+
+            // Try lldb if no gdb is found -- that is handy for local testing on macOS
+            let backtrace = match backtrace {
+                Err(ref e) if e.kind() == std::io::ErrorKind::NotFound => {
+                    warn!("cannot find gdb, trying lldb");
+                    Command::new("lldb")
+                        .arg("-c")
+                        .arg(&core_path)
+                        .args(["--batch", "-o", "bt all", "-o", "quit"])
+                        .output()
+                }
+                _ => backtrace,
+            }?;
+
+            warn!(
+                "core dump backtrace: {}",
+                String::from_utf8_lossy(&backtrace.stdout)
+            );
+            warn!(
+                "debugger stderr: {}",
+                String::from_utf8_lossy(&backtrace.stderr)
+            );
+        }
+
+        Ok(())
+    }
 }
--- a/compute_tools/src/monitor.rs
+++ b/compute_tools/src/monitor.rs
@@ -74,10 +74,8 @@ fn watch_compute_activity(compute: &ComputeNode) {
                        }
                    }

-                    // Sort idle backend `state_change` timestamps. The last one corresponds
-                    // to the last activity.
-                    idle_backs.sort();
-                    if let Some(last) = idle_backs.last() {
+                    // Get idle backend `state_change` with the max timestamp.
+                    if let Some(last) = idle_backs.iter().max() {
                        last_active = *last;
                    }
                }
--- a/compute_tools/src/pg_helpers.rs
+++ b/compute_tools/src/pg_helpers.rs
@@ -119,16 +119,9 @@ pub trait GenericOptionsSearch {
 impl GenericOptionsSearch for GenericOptions {
    /// Lookup option by name
    fn find(&self, name: &str) -> Option<String> {
-        match &self {
-            Some(ops) => {
-                let op = ops.iter().find(|s| s.name == name);
-                match op {
-                    Some(op) => op.value.clone(),
-                    None => None,
-                }
-            }
-            None => None,
-        }
+        let ops = self.as_ref()?;
+        let op = ops.iter().find(|s| s.name == name)?;
+        op.value.clone()
    }
 }

@@ -161,6 +154,14 @@ impl Role {
 }

 impl Database {
+    pub fn new(name: PgIdent, owner: PgIdent) -> Self {
+        Self {
+            name,
+            owner,
+            options: None,
+        }
+    }
+
    /// Serialize a list of database parameters into a Postgres-acceptable
    /// string of arguments.
    /// NB: `TEMPLATE` is actually also an identifier, but so far we only need
@@ -219,11 +220,7 @@ pub fn get_existing_dbs(client: &mut Client) -> Result<Vec<Database>> {
            &[],
        )?
        .iter()
-        .map(|row| Database {
-            name: row.get("datname"),
-            owner: row.get("owner"),
-            options: None,
-        })
+        .map(|row| Database::new(row.get("datname"), row.get("owner")))
        .collect();

    Ok(postgres_dbs)
--- a/compute_tools/tests/pg_helpers_tests.rs
+++ b/compute_tools/tests/pg_helpers_tests.rs
@@ -38,4 +38,33 @@ mod pg_helpers_tests {

        assert_eq!(ident.pg_quote(), "\"\"\"name\"\";\\n select 1;\"");
    }
+
+    #[test]
+    fn generic_options_search() {
+        let generic_options: GenericOptions = Some(vec![
+            GenericOption {
+                name: "present_value".into(),
+                value: Some("value".into()),
+                vartype: "string".into(),
+            },
+            GenericOption {
+                name: "missed_value".into(),
+                value: None,
+                vartype: "int".into(),
+            },
+        ]);
+        assert_eq!(generic_options.find("present_value"), Some("value".into()));
+        assert_eq!(generic_options.find("missed_value"), None);
+        assert_eq!(generic_options.find("invalid_value"), None);
+
+        let empty_generic_options: GenericOptions = Some(vec![]);
+        assert_eq!(empty_generic_options.find("present_value"), None);
+        assert_eq!(empty_generic_options.find("missed_value"), None);
+        assert_eq!(empty_generic_options.find("invalid_value"), None);
+
+        let none_generic_options: GenericOptions = None;
+        assert_eq!(none_generic_options.find("present_value"), None);
+        assert_eq!(none_generic_options.find("missed_value"), None);
+        assert_eq!(none_generic_options.find("invalid_value"), None);
+    }
 }
--- a/control_plane/Cargo.toml
+++ b/control_plane/Cargo.toml
@@ -10,7 +10,7 @@ comfy-table = "6.1"
 git-version = "0.3.5"
 nix = "0.25"
 once_cell = "1.13.0"
-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
+postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "43e6db254a97fdecbce33d8bc0890accfd74495e" }
 regex = "1"
 reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] }
 serde = { version = "1.0", features = ["derive"] }
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -549,7 +549,7 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {

            table.load_preset(comfy_table::presets::NOTHING);

-            table.set_header(&[
+            table.set_header([
                "NODE",
                "ADDRESS",
                "TIMELINE",
@@ -584,7 +584,7 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
                    .map(|name| name.as_str())
                    .unwrap_or("?");

-                table.add_row(&[
+                table.add_row([
                    node_name.as_str(),
                    &node.address.to_string(),
                    &node.timeline_id.to_string(),
@@ -747,7 +747,7 @@ fn get_safekeeper(env: &local_env::LocalEnv, id: NodeId) -> Result<SafekeeperNod
    if let Some(node) = env.safekeepers.iter().find(|node| node.id == id) {
        Ok(SafekeeperNode::from_env(env, node))
    } else {
-        bail!("could not find safekeeper '{}'", id)
+        bail!("could not find safekeeper {id}")
    }
 }

@@ -806,22 +806,22 @@ fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul
 }

 fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::Result<()> {
-    broker::start_broker_process(env)?;
-    let pageserver = PageServerNode::from_env(env);
-
    // Postgres nodes are not started automatically

+    broker::start_broker_process(env)?;
+
+    let pageserver = PageServerNode::from_env(env);
    if let Err(e) = pageserver.start(&pageserver_config_overrides(sub_match)) {
-        eprintln!("pageserver start failed: {e}");
-        try_stop_storage_broker_process(env);
+        eprintln!("pageserver {} start failed: {:#}", env.pageserver.id, e);
+        try_stop_all(env, true);
        exit(1);
    }

    for node in env.safekeepers.iter() {
        let safekeeper = SafekeeperNode::from_env(env, node);
        if let Err(e) = safekeeper.start() {
-            eprintln!("safekeeper '{}' start failed: {e}", safekeeper.id);
-            try_stop_storage_broker_process(env);
+            eprintln!("safekeeper {} start failed: {:#}", safekeeper.id, e);
+            try_stop_all(env, false);
            exit(1);
        }
    }
@@ -832,35 +832,41 @@ fn handle_stop_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<
    let immediate =
        sub_match.get_one::<String>("stop-mode").map(|s| s.as_str()) == Some("immediate");

+    try_stop_all(env, immediate);
+
+    Ok(())
+}
+
+fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
    let pageserver = PageServerNode::from_env(env);

    // Stop all compute nodes
-    let cplane = ComputeControlPlane::load(env.clone())?;
-    for (_k, node) in cplane.nodes {
-        if let Err(e) = node.stop(false) {
-            eprintln!("postgres stop failed: {}", e);
+    match ComputeControlPlane::load(env.clone()) {
+        Ok(cplane) => {
+            for (_k, node) in cplane.nodes {
+                if let Err(e) = node.stop(false) {
+                    eprintln!("postgres stop failed: {e:#}");
+                }
+            }
+        }
+        Err(e) => {
+            eprintln!("postgres stop failed, could not restore control plane data from env: {e:#}")
        }
    }

    if let Err(e) = pageserver.stop(immediate) {
-        eprintln!("pageserver stop failed: {}", e);
+        eprintln!("pageserver {} stop failed: {:#}", env.pageserver.id, e);
    }

    for node in env.safekeepers.iter() {
        let safekeeper = SafekeeperNode::from_env(env, node);
        if let Err(e) = safekeeper.stop(immediate) {
-            eprintln!("safekeeper '{}' stop failed: {}", safekeeper.id, e);
+            eprintln!("safekeeper {} stop failed: {:#}", safekeeper.id, e);
        }
    }

-    try_stop_storage_broker_process(env);
-
-    Ok(())
-}
-
-fn try_stop_storage_broker_process(env: &local_env::LocalEnv) {
    if let Err(e) = broker::stop_broker_process(env) {
-        eprintln!("neon broker stop failed: {e}");
+        eprintln!("neon broker stop failed: {e:#}");
    }
 }

@@ -900,6 +906,7 @@ fn cli() -> Command {
    let stop_mode_arg = Arg::new("stop-mode")
        .short('m')
        .value_parser(["fast", "immediate"])
+        .default_value("fast")
        .help("If 'immediate', don't flush repository data at shutdown")
        .required(false)
        .value_name("stop-mode");
--- a/control_plane/src/broker.rs
+++ b/control_plane/src/broker.rs
@@ -17,7 +17,7 @@ pub fn start_broker_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
        "storage_broker",
        &env.base_data_dir,
        &env.storage_broker_bin(),
-        &args,
+        args,
        [],
        background_process::InitialPidFile::Create(&storage_broker_pid_file_path(env)),
        || {
--- a/control_plane/src/compute.rs
+++ b/control_plane/src/compute.rs
@@ -44,7 +44,7 @@ impl ComputeControlPlane {
        let mut nodes = BTreeMap::default();
        let pgdatadirspath = &env.pg_data_dirs_path();

-        for tenant_dir in fs::read_dir(&pgdatadirspath)
+        for tenant_dir in fs::read_dir(pgdatadirspath)
            .with_context(|| format!("failed to list {}", pgdatadirspath.display()))?
        {
            let tenant_dir = tenant_dir?;
@@ -67,8 +67,8 @@ impl ComputeControlPlane {
    fn get_port(&mut self) -> u16 {
        1 + self
            .nodes
-            .iter()
-            .map(|(_name, node)| node.address.port())
+            .values()
+            .map(|node| node.address.port())
            .max()
            .unwrap_or(self.base_port)
    }
@@ -183,7 +183,7 @@ impl PostgresNode {

    fn sync_safekeepers(&self, auth_token: &Option<String>, pg_version: u32) -> Result<Lsn> {
        let pg_path = self.env.pg_bin_dir(pg_version)?.join("postgres");
-        let mut cmd = Command::new(&pg_path);
+        let mut cmd = Command::new(pg_path);

        cmd.arg("--sync-safekeepers")
            .env_clear()
@@ -261,7 +261,7 @@ impl PostgresNode {
    }

    fn create_pgdata(&self) -> Result<()> {
-        fs::create_dir_all(&self.pgdata()).with_context(|| {
+        fs::create_dir_all(self.pgdata()).with_context(|| {
            format!(
                "could not create data directory {}",
                self.pgdata().display()
@@ -478,7 +478,7 @@ impl PostgresNode {
                postgresql_conf_path.to_str().unwrap()
            )
        })?;
-        fs::remove_dir_all(&self.pgdata())?;
+        fs::remove_dir_all(self.pgdata())?;
        self.create_pgdata()?;

        // 2. Bring back config files
@@ -514,7 +514,7 @@ impl PostgresNode {
                "Destroying postgres data directory '{}'",
                self.pgdata().to_str().unwrap()
            );
-            fs::remove_dir_all(&self.pgdata())?;
+            fs::remove_dir_all(self.pgdata())?;
        } else {
            self.pg_ctl(&["stop"], &None)?;
        }
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -404,7 +404,7 @@ impl LocalEnv {
            }
        }

-        fs::create_dir(&base_path)?;
+        fs::create_dir(base_path)?;

        // generate keys for jwt
        // openssl genrsa -out private_key.pem 2048
@@ -413,7 +413,7 @@ impl LocalEnv {
            private_key_path = base_path.join("auth_private_key.pem");
            let keygen_output = Command::new("openssl")
                .arg("genrsa")
-                .args(&["-out", private_key_path.to_str().unwrap()])
+                .args(["-out", private_key_path.to_str().unwrap()])
                .arg("2048")
                .stdout(Stdio::null())
                .output()
@@ -430,10 +430,10 @@ impl LocalEnv {
            // openssl rsa -in private_key.pem -pubout -outform PEM -out public_key.pem
            let keygen_output = Command::new("openssl")
                .arg("rsa")
-                .args(&["-in", private_key_path.to_str().unwrap()])
+                .args(["-in", private_key_path.to_str().unwrap()])
                .arg("-pubout")
-                .args(&["-outform", "PEM"])
-                .args(&["-out", public_key_path.to_str().unwrap()])
+                .args(["-outform", "PEM"])
+                .args(["-out", public_key_path.to_str().unwrap()])
                .stdout(Stdio::null())
                .output()
                .context("failed to generate auth private key")?;
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -241,7 +241,7 @@ impl PageServerNode {
        let mut args = self.pageserver_basic_args(config_overrides, datadir_path_str);
        args.push(Cow::Borrowed("--init"));

-        let init_output = Command::new(&self.env.pageserver_bin())
+        let init_output = Command::new(self.env.pageserver_bin())
            .args(args.iter().map(Cow::as_ref))
            .envs(self.pageserver_env_variables()?)
            .output()
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -163,6 +163,8 @@ pub struct TenantInfo {
    #[serde_as(as = "DisplayFromStr")]
    pub id: TenantId,
    pub state: TenantState,
+    /// Sum of the size of all layer files.
+    /// If a layer is present in both local FS and S3, it counts only once.
    pub current_physical_size: Option<u64>, // physical size is only included in `tenant_status` endpoint
    pub has_in_progress_downloads: Option<bool>,
 }
@@ -191,9 +193,12 @@ pub struct TimelineInfo {
    #[serde_as(as = "DisplayFromStr")]
    pub remote_consistent_lsn: Lsn,
    pub current_logical_size: Option<u64>, // is None when timeline is Unloaded
+    /// Sum of the size of all layer files.
+    /// If a layer is present in both local FS and S3, it counts only once.
    pub current_physical_size: Option<u64>, // is None when timeline is Unloaded
    pub current_logical_size_non_incremental: Option<u64>,
-    pub current_physical_size_non_incremental: Option<u64>,
+
+    pub timeline_dir_layer_file_size_sum: Option<u64>,

    pub wal_source_connstr: Option<String>,
    #[serde_as(as = "Option<DisplayFromStr>")]
@@ -203,29 +208,22 @@ pub struct TimelineInfo {
    pub pg_version: u32,

    pub state: TimelineState,
-
-    // Some of the above fields are duplicated in 'local' and 'remote', for backwards-
-    // compatility with older clients.
-    pub local: LocalTimelineInfo,
-    pub remote: RemoteTimelineInfo,
 }

-#[serde_as]
 #[derive(Debug, Serialize, Deserialize, Clone)]
-pub struct LocalTimelineInfo {
-    #[serde_as(as = "Option<DisplayFromStr>")]
-    pub ancestor_timeline_id: Option<TimelineId>,
-    #[serde_as(as = "Option<DisplayFromStr>")]
-    pub ancestor_lsn: Option<Lsn>,
-    pub current_logical_size: Option<u64>, // is None when timeline is Unloaded
-    pub current_physical_size: Option<u64>, // is None when timeline is Unloaded
+pub struct DownloadRemoteLayersTaskInfo {
+    pub task_id: String,
+    pub state: DownloadRemoteLayersTaskState,
+    pub total_layer_count: u64,         // stable once `completed`
+    pub successful_download_count: u64, // stable once `completed`
+    pub failed_download_count: u64,     // stable once `completed`
 }

-#[serde_as]
 #[derive(Debug, Serialize, Deserialize, Clone)]
-pub struct RemoteTimelineInfo {
-    #[serde_as(as = "Option<DisplayFromStr>")]
-    pub remote_consistent_lsn: Option<Lsn>,
+pub enum DownloadRemoteLayersTaskState {
+    Running,
+    Completed,
+    ShutDown,
 }

 pub type ConfigureFailpointsRequest = Vec<FailpointConfig>;
@@ -325,7 +323,7 @@ impl PagestreamFeMessage {
        match self {
            Self::Exists(req) => {
                bytes.put_u8(0);
-                bytes.put_u8(if req.latest { 1 } else { 0 });
+                bytes.put_u8(u8::from(req.latest));
                bytes.put_u64(req.lsn.0);
                bytes.put_u32(req.rel.spcnode);
                bytes.put_u32(req.rel.dbnode);
@@ -335,7 +333,7 @@ impl PagestreamFeMessage {

            Self::Nblocks(req) => {
                bytes.put_u8(1);
-                bytes.put_u8(if req.latest { 1 } else { 0 });
+                bytes.put_u8(u8::from(req.latest));
                bytes.put_u64(req.lsn.0);
                bytes.put_u32(req.rel.spcnode);
                bytes.put_u32(req.rel.dbnode);
@@ -345,7 +343,7 @@ impl PagestreamFeMessage {

            Self::GetPage(req) => {
                bytes.put_u8(2);
-                bytes.put_u8(if req.latest { 1 } else { 0 });
+                bytes.put_u8(u8::from(req.latest));
                bytes.put_u64(req.lsn.0);
                bytes.put_u32(req.rel.spcnode);
                bytes.put_u32(req.rel.dbnode);
@@ -356,7 +354,7 @@ impl PagestreamFeMessage {

            Self::DbSize(req) => {
                bytes.put_u8(3);
-                bytes.put_u8(if req.latest { 1 } else { 0 });
+                bytes.put_u8(u8::from(req.latest));
                bytes.put_u64(req.lsn.0);
                bytes.put_u32(req.dbnode);
            }
--- a/libs/postgres_connection/Cargo.toml
+++ b/libs/postgres_connection/Cargo.toml
@@ -8,8 +8,8 @@ edition = "2021"
 [dependencies]
 anyhow = "1.0"
 itertools = "0.10.3"
-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
+postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "43e6db254a97fdecbce33d8bc0890accfd74495e" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
 url = "2.2.2"
 workspace_hack = { version = "0.1", path = "../../workspace_hack" }

--- a/libs/postgres_ffi/Cargo.toml
+++ b/libs/postgres_ffi/Cargo.toml
@@ -21,7 +21,7 @@ workspace_hack = { version = "0.1", path = "../../workspace_hack" }

 [dev-dependencies]
 env_logger = "0.9"
-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
+postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
 wal_craft = { path = "wal_craft" }

 [build-dependencies]
--- a/libs/postgres_ffi/src/nonrelfile_utils.rs
+++ b/libs/postgres_ffi/src/nonrelfile_utils.rs
@@ -14,8 +14,8 @@ pub fn transaction_id_set_status(xid: u32, status: u8, page: &mut BytesMut) {
        status
    );

-    let byteno: usize = ((xid as u32 % pg_constants::CLOG_XACTS_PER_PAGE as u32)
-        / pg_constants::CLOG_XACTS_PER_BYTE) as usize;
+    let byteno: usize =
+        ((xid % pg_constants::CLOG_XACTS_PER_PAGE) / pg_constants::CLOG_XACTS_PER_BYTE) as usize;

    let bshift: u8 =
        ((xid % pg_constants::CLOG_XACTS_PER_BYTE) * pg_constants::CLOG_BITS_PER_XACT as u32) as u8;
@@ -25,13 +25,13 @@ pub fn transaction_id_set_status(xid: u32, status: u8, page: &mut BytesMut) {
 }

 pub fn transaction_id_get_status(xid: u32, page: &[u8]) -> u8 {
-    let byteno: usize = ((xid as u32 % pg_constants::CLOG_XACTS_PER_PAGE as u32)
-        / pg_constants::CLOG_XACTS_PER_BYTE) as usize;
+    let byteno: usize =
+        ((xid % pg_constants::CLOG_XACTS_PER_PAGE) / pg_constants::CLOG_XACTS_PER_BYTE) as usize;

    let bshift: u8 =
        ((xid % pg_constants::CLOG_XACTS_PER_BYTE) * pg_constants::CLOG_BITS_PER_XACT as u32) as u8;

-    ((page[byteno] >> bshift) & pg_constants::CLOG_XACT_BITMASK) as u8
+    (page[byteno] >> bshift) & pg_constants::CLOG_XACT_BITMASK
 }

 // See CLOGPagePrecedes in clog.c
--- a/libs/postgres_ffi/src/xlog_utils.rs
+++ b/libs/postgres_ffi/src/xlog_utils.rs
@@ -333,7 +333,7 @@ impl CheckPoint {
 // We need this segment to start compute node.
 //
 pub fn generate_wal_segment(segno: u64, system_id: u64) -> Result<Bytes, SerializeError> {
-    let mut seg_buf = BytesMut::with_capacity(WAL_SEGMENT_SIZE as usize);
+    let mut seg_buf = BytesMut::with_capacity(WAL_SEGMENT_SIZE);

    let pageaddr = XLogSegNoOffsetToRecPtr(segno, 0, WAL_SEGMENT_SIZE);
    let hdr = XLogLongPageHeaderData {
@@ -574,7 +574,7 @@ mod tests {

        // Rename file to partial to actually find last valid lsn, then rename it back.
        fs::rename(
-            cfg.wal_dir().join(&last_segment),
+            cfg.wal_dir().join(last_segment),
            cfg.wal_dir().join(format!("{}.partial", last_segment)),
        )
        .unwrap();
--- a/libs/postgres_ffi/wal_craft/Cargo.toml
+++ b/libs/postgres_ffi/wal_craft/Cargo.toml
@@ -11,7 +11,7 @@ clap = "4.0"
 env_logger = "0.9"
 log = "0.4"
 once_cell = "1.13.0"
-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
+postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
 postgres_ffi = { path = "../" }
 tempfile = "3.2"
 workspace_hack = { version = "0.1", path = "../../../workspace_hack" }
--- a/libs/postgres_ffi/wal_craft/src/lib.rs
+++ b/libs/postgres_ffi/wal_craft/src/lib.rs
@@ -81,7 +81,7 @@ impl Conf {
            .new_pg_command("initdb")?
            .arg("-D")
            .arg(self.datadir.as_os_str())
-            .args(&["-U", "postgres", "--no-instructions", "--no-sync"])
+            .args(["-U", "postgres", "--no-instructions", "--no-sync"])
            .output()?;
        debug!("initdb output: {:?}", output);
        ensure!(
@@ -105,12 +105,12 @@ impl Conf {
        let unix_socket_dir_path = unix_socket_dir.path().to_owned();
        let server_process = self
            .new_pg_command("postgres")?
-            .args(&["-c", "listen_addresses="])
+            .args(["-c", "listen_addresses="])
            .arg("-k")
            .arg(unix_socket_dir_path.as_os_str())
            .arg("-D")
            .arg(self.datadir.as_os_str())
-            .args(&["-c", "logging_collector=on"]) // stderr will mess up with tests output
+            .args(["-c", "logging_collector=on"]) // stderr will mess up with tests output
            .args(REQUIRED_POSTGRES_CONFIG.iter().flat_map(|cfg| ["-c", cfg]))
            .stderr(Stdio::from(log_file))
            .spawn()?;
@@ -142,7 +142,7 @@ impl Conf {
        );
        let output = self
            .new_pg_command("pg_waldump")?
-            .args(&[
+            .args([
                &first_segment_file.as_os_str(),
                &last_segment_file.as_os_str(),
            ])
--- a/libs/pq_proto/Cargo.toml
+++ b/libs/pq_proto/Cargo.toml
@@ -7,7 +7,7 @@ edition = "2021"
 anyhow = "1.0"
 bytes = "1.0.1"
 pin-project-lite = "0.2.7"
-postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
+postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
 rand = "0.8.3"
 serde = { version = "1.0", features = ["derive"] }
 tokio = { version = "1.17", features = ["macros"] }
--- a/libs/pq_proto/src/lib.rs
+++ b/libs/pq_proto/src/lib.rs
@@ -444,7 +444,7 @@ impl FeCloseMessage {
 pub enum BeMessage<'a> {
    AuthenticationOk,
    AuthenticationMD5Password([u8; 4]),
-    AuthenticationSasl(SaslMessage<'a>),
+    AuthenticationSasl(BeAuthenticationSaslMessage<'a>),
    AuthenticationCleartextPassword,
    BackendKeyData(CancelKeyData),
    BindComplete,
@@ -463,7 +463,10 @@ pub enum BeMessage<'a> {
    EncryptionResponse(bool),
    NoData,
    ParameterDescription,
-    ParameterStatus(ParameterStatusMessage<'a>),
+    ParameterStatus {
+        name: &'a [u8],
+        value: &'a [u8],
+    },
    ParseComplete,
    ReadyForQuery,
    RowDescription(&'a [RowDescriptor<'a>]),
@@ -472,25 +475,41 @@ pub enum BeMessage<'a> {
    KeepAlive(WalSndKeepAlive),
 }

+/// Common shorthands.
+impl<'a> BeMessage<'a> {
+    /// A [`BeMessage::ParameterStatus`] holding the client encoding, i.e. UTF-8.
+    /// This is a sensible default, given that:
+    ///  * rust strings only support this encoding out of the box.
+    ///  * tokio-postgres, postgres-jdbc (and probably more) mandate it.
+    ///
+    /// TODO: do we need to report `server_encoding` as well?
+    pub const CLIENT_ENCODING: Self = Self::ParameterStatus {
+        name: b"client_encoding",
+        value: b"UTF8",
+    };
+
+    /// Build a [`BeMessage::ParameterStatus`] holding the server version.
+    pub fn server_version(version: &'a str) -> Self {
+        Self::ParameterStatus {
+            name: b"server_version",
+            value: version.as_bytes(),
+        }
+    }
+}
+
 #[derive(Debug)]
-pub enum SaslMessage<'a> {
+pub enum BeAuthenticationSaslMessage<'a> {
    Methods(&'a [&'a str]),
    Continue(&'a [u8]),
    Final(&'a [u8]),
 }

 #[derive(Debug)]
-pub enum ParameterStatusMessage<'a> {
+pub enum BeParameterStatusMessage<'a> {
    Encoding(&'a str),
    ServerVersion(&'a str),
 }

-impl ParameterStatusMessage<'static> {
-    pub fn encoding() -> BeMessage<'static> {
-        BeMessage::ParameterStatus(Self::Encoding("UTF8"))
-    }
-}
-
 // One row description in RowDescription packet.
 #[derive(Debug)]
 pub struct RowDescriptor<'a> {
@@ -587,14 +606,15 @@ fn write_body<R>(buf: &mut BytesMut, f: impl FnOnce(&mut BytesMut) -> R) -> R {
 }

 /// Safe write of s into buf as cstring (String in the protocol).
-fn write_cstr(s: &[u8], buf: &mut BytesMut) -> Result<(), io::Error> {
-    if s.contains(&0) {
+fn write_cstr(s: impl AsRef<[u8]>, buf: &mut BytesMut) -> Result<(), io::Error> {
+    let bytes = s.as_ref();
+    if bytes.contains(&0) {
        return Err(io::Error::new(
            io::ErrorKind::InvalidInput,
            "string contains embedded null",
        ));
    }
-    buf.put_slice(s);
+    buf.put_slice(bytes);
    buf.put_u8(0);
    Ok(())
 }
@@ -639,12 +659,12 @@ impl<'a> BeMessage<'a> {
            BeMessage::AuthenticationSasl(msg) => {
                buf.put_u8(b'R');
                write_body(buf, |buf| {
-                    use SaslMessage::*;
+                    use BeAuthenticationSaslMessage::*;
                    match msg {
                        Methods(methods) => {
                            buf.put_i32(10); // Specifies that SASL auth method is used.
                            for method in methods.iter() {
-                                write_cstr(method.as_bytes(), buf)?;
+                                write_cstr(method, buf)?;
                            }
                            buf.put_u8(0); // zero terminator for the list
                        }
@@ -759,7 +779,7 @@ impl<'a> BeMessage<'a> {
                    buf.put_slice(b"CXX000\0");

                    buf.put_u8(b'M'); // the message
-                    write_cstr(error_msg.as_bytes(), buf)?;
+                    write_cstr(error_msg, buf)?;

                    buf.put_u8(0); // terminator
                    Ok::<_, io::Error>(())
@@ -799,24 +819,12 @@ impl<'a> BeMessage<'a> {
                buf.put_u8(response);
            }

-            BeMessage::ParameterStatus(param) => {
-                use std::io::{IoSlice, Write};
-                use ParameterStatusMessage::*;
-
-                let [name, value] = match param {
-                    Encoding(name) => [b"client_encoding", name.as_bytes()],
-                    ServerVersion(version) => [b"server_version", version.as_bytes()],
-                };
-
-                // Parameter names and values are passed as null-terminated strings
-                let iov = &mut [name, b"\0", value, b"\0"].map(IoSlice::new);
-                let mut buffer = [0u8; 64]; // this should be enough
-                let cnt = buffer.as_mut().write_vectored(iov).unwrap();
-
+            BeMessage::ParameterStatus { name, value } => {
                buf.put_u8(b'S');
                write_body(buf, |buf| {
-                    buf.put_slice(&buffer[..cnt]);
-                });
+                    write_cstr(name, buf)?;
+                    write_cstr(value, buf)
+                })?;
            }

            BeMessage::ParameterDescription => {
@@ -873,7 +881,7 @@ impl<'a> BeMessage<'a> {
                    buf.put_u8(b'k');
                    buf.put_u64(req.sent_ptr);
                    buf.put_i64(req.timestamp);
-                    buf.put_u8(if req.request_reply { 1 } else { 0 });
+                    buf.put_u8(u8::from(req.request_reply));
                });
            }
        }
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -7,6 +7,7 @@
 //!
 mod local_fs;
 mod s3_bucket;
+mod simulate_failures;

 use std::{
    collections::HashMap,
@@ -24,7 +25,7 @@ use tokio::io;
 use toml_edit::Item;
 use tracing::info;

-pub use self::{local_fs::LocalFs, s3_bucket::S3Bucket};
+pub use self::{local_fs::LocalFs, s3_bucket::S3Bucket, simulate_failures::UnreliableWrapper};

 /// How many different timelines can be processed simultaneously when synchronizing layers with the remote storage.
 /// During regular work, pageserver produces one layer file per timeline checkpoint, with bursts of concurrency
@@ -77,7 +78,10 @@ pub trait RemoteStorage: Send + Sync + 'static {
    /// Note: here we assume that if the prefix is passed it was obtained via remote_object_id
    /// which already takes into account any kind of global prefix (prefix_in_bucket for S3 or storage_root for LocalFS)
    /// so this method doesnt need to.
-    async fn list_prefixes(&self, prefix: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>>;
+    async fn list_prefixes(
+        &self,
+        prefix: Option<&RemotePath>,
+    ) -> Result<Vec<RemotePath>, DownloadError>;

    /// Streams the local file contents into remote into the remote storage entry.
    async fn upload(
@@ -150,6 +154,7 @@ impl std::error::Error for DownloadError {}
 pub enum GenericRemoteStorage {
    LocalFs(LocalFs),
    AwsS3(Arc<S3Bucket>),
+    Unreliable(Arc<UnreliableWrapper>),
 }

 impl Deref for GenericRemoteStorage {
@@ -159,27 +164,30 @@ impl Deref for GenericRemoteStorage {
        match self {
            GenericRemoteStorage::LocalFs(local_fs) => local_fs,
            GenericRemoteStorage::AwsS3(s3_bucket) => s3_bucket.as_ref(),
+            GenericRemoteStorage::Unreliable(s) => s.as_ref(),
        }
    }
 }

 impl GenericRemoteStorage {
-    pub fn from_config(
-        storage_config: &RemoteStorageConfig,
-    ) -> anyhow::Result<GenericRemoteStorage> {
+    pub fn from_config(storage_config: &RemoteStorageConfig) -> anyhow::Result<Self> {
        Ok(match &storage_config.storage {
            RemoteStorageKind::LocalFs(root) => {
                info!("Using fs root '{}' as a remote storage", root.display());
-                GenericRemoteStorage::LocalFs(LocalFs::new(root.clone())?)
+                Self::LocalFs(LocalFs::new(root.clone())?)
            }
            RemoteStorageKind::AwsS3(s3_config) => {
                info!("Using s3 bucket '{}' in region '{}' as a remote storage, prefix in bucket: '{:?}', bucket endpoint: '{:?}'",
                      s3_config.bucket_name, s3_config.bucket_region, s3_config.prefix_in_bucket, s3_config.endpoint);
-                GenericRemoteStorage::AwsS3(Arc::new(S3Bucket::new(s3_config)?))
+                Self::AwsS3(Arc::new(S3Bucket::new(s3_config)?))
            }
        })
    }

+    pub fn unreliable_wrapper(s: Self, fail_first: u64) -> Self {
+        Self::Unreliable(Arc::new(UnreliableWrapper::new(s, fail_first)))
+    }
+
    /// Takes storage object contents and its size and uploads to remote storage,
    /// mapping `from_path` to the corresponding remote object id in the storage.
    ///
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -92,13 +92,17 @@ impl RemoteStorage for LocalFs {
            .collect())
    }

-    async fn list_prefixes(&self, prefix: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
+    async fn list_prefixes(
+        &self,
+        prefix: Option<&RemotePath>,
+    ) -> Result<Vec<RemotePath>, DownloadError> {
        let path = match prefix {
            Some(prefix) => Cow::Owned(prefix.with_base(&self.storage_root)),
            None => Cow::Borrowed(&self.storage_root),
        };
        Ok(get_all_files(path.as_ref(), false)
-            .await?
+            .await
+            .map_err(DownloadError::Other)?
            .into_iter()
            .map(|path| {
                path.strip_prefix(&self.storage_root)
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -4,14 +4,13 @@
 //! allowing multiple api users to independently work with the same S3 bucket, if
 //! their bucket prefixes are both specified and different.

-use std::env::var;
 use std::sync::Arc;
-use std::time::Duration;

 use anyhow::Context;
 use aws_config::{
-    environment::credentials::EnvironmentVariableCredentialsProvider, imds,
-    imds::credentials::ImdsCredentialsProvider, meta::credentials::provide_credentials_fn,
+    environment::credentials::EnvironmentVariableCredentialsProvider,
+    imds::credentials::ImdsCredentialsProvider,
+    meta::credentials::{CredentialsProviderChain, LazyCachingCredentialsProvider},
 };
 use aws_sdk_s3::{
    config::Config,
@@ -20,7 +19,6 @@ use aws_sdk_s3::{
    Client, Endpoint, Region,
 };
 use aws_smithy_http::body::SdkBody;
-use aws_types::credentials::{CredentialsError, ProvideCredentials};
 use hyper::Body;
 use tokio::{io, sync::Semaphore};
 use tokio_util::io::ReaderStream;
@@ -31,8 +29,6 @@ use crate::{
    Download, DownloadError, RemotePath, RemoteStorage, S3Config, REMOTE_STORAGE_PREFIX_SEPARATOR,
 };

-const DEFAULT_IMDS_TIMEOUT: Duration = Duration::from_secs(10);
-
 pub(super) mod metrics {
    use metrics::{register_int_counter_vec, IntCounterVec};
    use once_cell::sync::Lazy;
@@ -122,30 +118,23 @@ impl S3Bucket {
            "Creating s3 remote storage for S3 bucket {}",
            aws_config.bucket_name
        );
+
+        let credentials_provider = {
+            // uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"
+            let env_creds = EnvironmentVariableCredentialsProvider::new();
+            // uses imds v2
+            let imds = ImdsCredentialsProvider::builder().build();
+
+            // finally add caching.
+            // this might change in future, see https://github.com/awslabs/aws-sdk-rust/issues/629
+            LazyCachingCredentialsProvider::builder()
+                .load(CredentialsProviderChain::first_try("env", env_creds).or_else("imds", imds))
+                .build()
+        };
+
        let mut config_builder = Config::builder()
            .region(Region::new(aws_config.bucket_region.clone()))
-            .credentials_provider(provide_credentials_fn(|| async {
-                match var("AWS_ACCESS_KEY_ID").is_ok() && var("AWS_SECRET_ACCESS_KEY").is_ok() {
-                    true => {
-                        EnvironmentVariableCredentialsProvider::new()
-                            .provide_credentials()
-                            .await
-                    }
-                    false => {
-                        let imds_client = imds::Client::builder()
-                            .connect_timeout(DEFAULT_IMDS_TIMEOUT)
-                            .read_timeout(DEFAULT_IMDS_TIMEOUT)
-                            .build()
-                            .await
-                            .map_err(CredentialsError::unhandled)?;
-                        ImdsCredentialsProvider::builder()
-                            .imds_client(imds_client)
-                            .build()
-                            .provide_credentials()
-                            .await
-                    }
-                }
-            }));
+            .credentials_provider(credentials_provider);

        if let Some(custom_endpoint) = aws_config.endpoint.clone() {
            let endpoint = Endpoint::immutable(
@@ -297,7 +286,10 @@ impl RemoteStorage for S3Bucket {

    /// See the doc for `RemoteStorage::list_prefixes`
    /// Note: it wont include empty "directories"
-    async fn list_prefixes(&self, prefix: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
+    async fn list_prefixes(
+        &self,
+        prefix: Option<&RemotePath>,
+    ) -> Result<Vec<RemotePath>, DownloadError> {
        // get the passed prefix or if it is not set use prefix_in_bucket value
        let list_prefix = prefix
            .map(|p| self.relative_path_to_s3_object(p))
@@ -319,7 +311,8 @@ impl RemoteStorage for S3Bucket {
                .concurrency_limiter
                .acquire()
                .await
-                .context("Concurrency limiter semaphore got closed during S3 list")?;
+                .context("Concurrency limiter semaphore got closed during S3 list")
+                .map_err(DownloadError::Other)?;

            metrics::inc_list_objects();

@@ -335,7 +328,9 @@ impl RemoteStorage for S3Bucket {
                .map_err(|e| {
                    metrics::inc_list_objects_fail();
                    e
-                })?;
+                })
+                .context("Failed to list S3 prefixes")
+                .map_err(DownloadError::Other)?;

            document_keys.extend(
                fetch_response
--- a/libs/remote_storage/src/simulate_failures.rs
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -0,0 +1,129 @@
+//! This module provides a wrapper around a real RemoteStorage implementation that
+//! causes the first N attempts at each upload or download operation to fail. For
+//! testing purposes.
+use std::collections::hash_map::Entry;
+use std::collections::HashMap;
+use std::sync::Mutex;
+
+use crate::{Download, DownloadError, RemotePath, RemoteStorage, StorageMetadata};
+
+pub struct UnreliableWrapper {
+    inner: crate::GenericRemoteStorage,
+
+    // This many attempts of each operation will fail, then we let it succeed.
+    attempts_to_fail: u64,
+
+    // Tracks how many failed attempts of each operation have been made.
+    attempts: Mutex<HashMap<RemoteOp, u64>>,
+}
+
+/// Used to identify retries of different unique operations.
+#[derive(Debug, Hash, Eq, PartialEq)]
+enum RemoteOp {
+    List,
+    ListPrefixes(Option<RemotePath>),
+    Upload(RemotePath),
+    Download(RemotePath),
+    Delete(RemotePath),
+}
+
+impl UnreliableWrapper {
+    pub fn new(inner: crate::GenericRemoteStorage, attempts_to_fail: u64) -> Self {
+        assert!(attempts_to_fail > 0);
+        UnreliableWrapper {
+            inner,
+            attempts_to_fail,
+            attempts: Mutex::new(HashMap::new()),
+        }
+    }
+
+    ///
+    /// Common functionality for all operations.
+    ///
+    /// Fails the first 'attempts_to_fail' attempts of each distinct operation. The next
+    /// attempt is let through, returning the number of failures, and the counter is cleared.
+    ///
+    fn attempt(&self, op: RemoteOp) -> Result<u64, DownloadError> {
+        let mut attempts = self.attempts.lock().unwrap();
+
+        match attempts.entry(op) {
+            Entry::Occupied(mut e) => {
+                // The stored value is the number of failures already simulated for this
+                // operation. Read it *before* bumping it, so that exactly
+                // 'attempts_to_fail' attempts fail (incrementing first failed one too few).
+                let attempts_before_this = *e.get();
+
+                if attempts_before_this >= self.attempts_to_fail {
+                    // let it succeed, and forget the operation so a later retry cycle starts over
+                    e.remove();
+                    Ok(attempts_before_this)
+                } else {
+                    *e.get_mut() += 1;
+                    let error =
+                        anyhow::anyhow!("simulated failure of remote operation {:?}", e.key());
+                    Err(DownloadError::Other(error))
+                }
+            }
+            Entry::Vacant(e) => {
+                let error = anyhow::anyhow!("simulated failure of remote operation {:?}", e.key());
+                e.insert(1);
+                Err(DownloadError::Other(error))
+            }
+        }
+    }
+}
+
+#[async_trait::async_trait]
+impl RemoteStorage for UnreliableWrapper {
+    /// Lists all items the storage has right now.
+    async fn list(&self) -> anyhow::Result<Vec<RemotePath>> {
+        self.attempt(RemoteOp::List)?;
+        self.inner.list().await
+    }
+
+    async fn list_prefixes(
+        &self,
+        prefix: Option<&RemotePath>,
+    ) -> Result<Vec<RemotePath>, DownloadError> {
+        self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))?;
+        self.inner.list_prefixes(prefix).await
+    }
+
+    async fn upload(
+        &self,
+        data: Box<(dyn tokio::io::AsyncRead + Unpin + Send + Sync + 'static)>,
+        // S3 PUT request requires the content length to be specified,
+        // otherwise it starts to fail with the concurrent connection count increasing.
+        data_size_bytes: usize,
+        to: &RemotePath,
+        metadata: Option<StorageMetadata>,
+    ) -> anyhow::Result<()> {
+        self.attempt(RemoteOp::Upload(to.clone()))?;
+        self.inner.upload(data, data_size_bytes, to, metadata).await
+    }
+
+    async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
+        self.attempt(RemoteOp::Download(from.clone()))?;
+        self.inner.download(from).await
+    }
+
+    async fn download_byte_range(
+        &self,
+        from: &RemotePath,
+        start_inclusive: u64,
+        end_exclusive: Option<u64>,
+    ) -> Result<Download, DownloadError> {
+        // Note: We treat any download_byte_range as an "attempt" of the same
+        // operation. We don't pay attention to the ranges. That's good enough
+        // for now.
+        self.attempt(RemoteOp::Download(from.clone()))?;
+        self.inner
+            .download_byte_range(from, start_inclusive, end_exclusive)
+            .await
+    }
+
+    async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
+        self.attempt(RemoteOp::Delete(path.clone()))?;
+        self.inner.delete(path).await
+    }
+}
--- a/libs/utils/src/crashsafe.rs
+++ b/libs/utils/src/crashsafe.rs
@@ -157,34 +157,34 @@ mod tests {
        assert_eq!(err.kind(), io::ErrorKind::AlreadyExists);

        let invalid_dir_path = file_path.join("folder");
-        create_dir_all(&invalid_dir_path).unwrap_err();
+        create_dir_all(invalid_dir_path).unwrap_err();
    }

    #[test]
    fn test_path_with_suffix_extension() {
        let p = PathBuf::from("/foo/bar");
        assert_eq!(
-            &path_with_suffix_extension(&p, "temp").to_string_lossy(),
+            &path_with_suffix_extension(p, "temp").to_string_lossy(),
            "/foo/bar.temp"
        );
        let p = PathBuf::from("/foo/bar");
        assert_eq!(
-            &path_with_suffix_extension(&p, "temp.temp").to_string_lossy(),
+            &path_with_suffix_extension(p, "temp.temp").to_string_lossy(),
            "/foo/bar.temp.temp"
        );
        let p = PathBuf::from("/foo/bar.baz");
        assert_eq!(
-            &path_with_suffix_extension(&p, "temp.temp").to_string_lossy(),
+            &path_with_suffix_extension(p, "temp.temp").to_string_lossy(),
            "/foo/bar.baz.temp.temp"
        );
        let p = PathBuf::from("/foo/bar.baz");
        assert_eq!(
-            &path_with_suffix_extension(&p, ".temp").to_string_lossy(),
+            &path_with_suffix_extension(p, ".temp").to_string_lossy(),
            "/foo/bar.baz..temp"
        );
        let p = PathBuf::from("/foo/bar/dir/");
        assert_eq!(
-            &path_with_suffix_extension(&p, ".temp").to_string_lossy(),
+            &path_with_suffix_extension(p, ".temp").to_string_lossy(),
            "/foo/bar/dir..temp"
        );
    }
--- a/libs/utils/src/postgres_backend.rs
+++ b/libs/utils/src/postgres_backend.rs
@@ -6,7 +6,7 @@
 use crate::sock_split::{BidiStream, ReadStream, WriteStream};
 use anyhow::{bail, ensure, Context, Result};
 use bytes::{Bytes, BytesMut};
-use pq_proto::{BeMessage, FeMessage, FeStartupPacket, ParameterStatusMessage};
+use pq_proto::{BeMessage, FeMessage, FeStartupPacket};
 use rand::Rng;
 use serde::{Deserialize, Serialize};
 use std::fmt;
@@ -361,11 +361,9 @@ impl PostgresBackend {
                        match self.auth_type {
                            AuthType::Trust => {
                                self.write_message_noflush(&BeMessage::AuthenticationOk)?
-                                    .write_message_noflush(&ParameterStatusMessage::encoding())?
+                                    .write_message_noflush(&BeMessage::CLIENT_ENCODING)?
                                    // The async python driver requires a valid server_version
-                                    .write_message_noflush(&BeMessage::ParameterStatus(
-                                        ParameterStatusMessage::ServerVersion("14.1"),
-                                    ))?
+                                    .write_message_noflush(&BeMessage::server_version("14.1"))?
                                    .write_message(&BeMessage::ReadyForQuery)?;
                                self.state = ProtoState::Established;
                            }
@@ -413,7 +411,7 @@ impl PostgresBackend {
                    }
                }
                self.write_message_noflush(&BeMessage::AuthenticationOk)?
-                    .write_message_noflush(&ParameterStatusMessage::encoding())?
+                    .write_message_noflush(&BeMessage::CLIENT_ENCODING)?
                    .write_message(&BeMessage::ReadyForQuery)?;
                self.state = ProtoState::Established;
            }
--- a/libs/utils/src/postgres_backend_async.rs
+++ b/libs/utils/src/postgres_backend_async.rs
@@ -6,7 +6,7 @@
 use crate::postgres_backend::AuthType;
 use anyhow::{bail, Context, Result};
 use bytes::{Bytes, BytesMut};
-use pq_proto::{BeMessage, FeMessage, FeStartupPacket, ParameterStatusMessage};
+use pq_proto::{BeMessage, FeMessage, FeStartupPacket};
 use rand::Rng;
 use std::future::Future;
 use std::net::SocketAddr;
@@ -331,11 +331,9 @@ impl PostgresBackend {
                        match self.auth_type {
                            AuthType::Trust => {
                                self.write_message(&BeMessage::AuthenticationOk)?
-                                    .write_message(&ParameterStatusMessage::encoding())?
+                                    .write_message(&BeMessage::CLIENT_ENCODING)?
                                    // The async python driver requires a valid server_version
-                                    .write_message(&BeMessage::ParameterStatus(
-                                        ParameterStatusMessage::ServerVersion("14.1"),
-                                    ))?
+                                    .write_message(&BeMessage::server_version("14.1"))?
                                    .write_message(&BeMessage::ReadyForQuery)?;
                                self.state = ProtoState::Established;
                            }
@@ -384,7 +382,7 @@ impl PostgresBackend {
                    }
                }
                self.write_message(&BeMessage::AuthenticationOk)?
-                    .write_message(&ParameterStatusMessage::encoding())?
+                    .write_message(&BeMessage::CLIENT_ENCODING)?
                    .write_message(&BeMessage::ReadyForQuery)?;
                self.state = ProtoState::Established;
            }
--- a/libs/utils/src/seqwait.rs
+++ b/libs/utils/src/seqwait.rs
@@ -11,11 +11,13 @@ use tokio::time::timeout;

 /// An error happened while waiting for a number
 #[derive(Debug, PartialEq, Eq, thiserror::Error)]
-#[error("SeqWaitError")]
 pub enum SeqWaitError {
    /// The wait timeout was reached
+    #[error("seqwait timeout was reached")]
    Timeout,
+
    /// [`SeqWait::shutdown`] was called
+    #[error("SeqWait::shutdown was called")]
    Shutdown,
 }

--- a/libs/utils/src/sock_split.rs
+++ b/libs/utils/src/sock_split.rs
@@ -50,7 +50,7 @@ impl BufStream {

    /// Returns a reference to the underlying TcpStream.
    fn get_ref(&self) -> &TcpStream {
-        &*self.0.get_ref().0
+        &self.0.get_ref().0
    }
 }

--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -18,7 +18,7 @@ async-stream = "0.3"
 async-trait = "0.1"
 byteorder = "1.4.3"
 bytes = "1.0.1"
-chrono = { version = "0.4.23", default-features = false, features = ["clock"] }
+chrono = { version = "0.4.23", default-features = false, features = ["clock", "serde"] }
 clap = { version = "4.0", features = ["string"] }
 close_fds = "0.3.2"
 const_format = "0.2.21"
@@ -36,23 +36,23 @@ nix = "0.25"
 num-traits = "0.2.15"
 once_cell = "1.13.0"
 pin-project-lite = "0.2.7"
-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
-postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
-postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
+postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
+postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
+postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
 pprof = { git = "https://github.com/neondatabase/pprof-rs.git", branch = "wallclock-profiling", features = ["flamegraph"], optional = true }
 rand = "0.8.3"
 regex = "1.4.5"
 rstar = "0.9.3"
 scopeguard = "1.1.0"
 serde = { version = "1.0", features = ["derive"] }
-serde_json = "1"
+serde_json = { version = "1.0", features = ["raw_value"] }
 serde_with = "2.0"
 signal-hook = "0.3.10"
 svg_fmt = "0.4.1"
 tar = "0.4.33"
 thiserror = "1.0"
 tokio = { version = "1.17", features = ["process", "sync", "macros", "fs", "rt", "io-util", "time"] }
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
 tokio-util = { version = "0.7.3", features = ["io", "io-util"] }
 toml_edit = { version = "0.14", features = ["easy"] }
 tracing = "0.1.36"
@@ -69,6 +69,7 @@ storage_broker = { version = "0.1", path = "../storage_broker" }
 tenant_size_model = { path = "../libs/tenant_size_model" }
 utils = { path = "../libs/utils" }
 workspace_hack = { version = "0.1", path = "../workspace_hack" }
+reqwest = "0.11.13"

 [dev-dependencies]
 criterion = "0.4"
--- a/pageserver/benches/bench_layer_map.rs
+++ b/pageserver/benches/bench_layer_map.rs
@@ -1,8 +1,7 @@
 use anyhow::Result;
 use pageserver::repository::Key;
-use pageserver::tenant::filename::{DeltaFileName, ImageFileName};
 use pageserver::tenant::layer_map::LayerMap;
-use pageserver::tenant::storage_layer::ValueReconstructState;
+use pageserver::tenant::storage_layer::{DeltaFileName, ImageFileName, ValueReconstructState};
 use pageserver::tenant::storage_layer::{Layer, ValueReconstructResult};
 use rand::prelude::{SeedableRng, SliceRandom, StdRng};
 use std::cmp::{max, min};
@@ -163,7 +162,7 @@ fn bench_from_captest_env(c: &mut Criterion) {
    c.bench_function("captest_uniform_queries", |b| {
        b.iter(|| {
            for q in queries.clone().into_iter() {
-                layer_map.search(q.0, q.1).unwrap();
+                layer_map.search(q.0, q.1);
            }
        });
    });
@@ -192,7 +191,7 @@ fn bench_from_real_project(c: &mut Criterion) {
    c.bench_function("real_map_uniform_queries", |b| {
        b.iter(|| {
            for q in queries.clone().into_iter() {
-                layer_map.search(q.0, q.1).unwrap();
+                layer_map.search(q.0, q.1);
            }
        });
    });
@@ -238,7 +237,7 @@ fn bench_sequential(c: &mut Criterion) {
        // Run the search queries
        b.iter(|| {
            for q in queries.clone().into_iter() {
-                layer_map.search(q.0, q.1).unwrap();
+                layer_map.search(q.0, q.1);
            }
        });
    });
--- a/pageserver/benches/bench_walredo.rs
+++ b/pageserver/benches/bench_walredo.rs
@@ -84,7 +84,7 @@ fn add_multithreaded_walredo_requesters(

                            barrier.wait();

-                            execute_all(input, &*manager).unwrap();
+                            execute_all(input, &manager).unwrap();

                            barrier.wait();
                        }
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -22,7 +22,8 @@ use std::time::SystemTime;
 use tar::{Builder, EntryType, Header};
 use tracing::*;

-use crate::tenant::Timeline;
+use crate::task_mgr;
+use crate::tenant::{with_ondemand_download, PageReconstructResult, Timeline};
 use pageserver_api::reltag::{RelTag, SlruKind};

 use postgres_ffi::pg_constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID};
@@ -130,7 +131,7 @@ where

        // Create pgdata subdirs structure
        for dir in PGDATA_SUBDIRS.iter() {
-            let header = new_tar_header_dir(*dir)?;
+            let header = new_tar_header_dir(dir)?;
            self.ar.append(&header, &mut io::empty())?;
        }

@@ -152,23 +153,29 @@ where
            SlruKind::MultiXactOffsets,
            SlruKind::MultiXactMembers,
        ] {
-            for segno in self.timeline.list_slru_segments(kind, self.lsn)? {
+            for segno in
+                with_ondemand_download_sync(|| self.timeline.list_slru_segments(kind, self.lsn))?
+            {
                self.add_slru_segment(kind, segno)?;
            }
        }

        // Create tablespace directories
-        for ((spcnode, dbnode), has_relmap_file) in self.timeline.list_dbdirs(self.lsn)? {
+        for ((spcnode, dbnode), has_relmap_file) in
+            with_ondemand_download_sync(|| self.timeline.list_dbdirs(self.lsn))?
+        {
            self.add_dbdir(spcnode, dbnode, has_relmap_file)?;

            // Gather and send relational files in each database if full backup is requested.
            if self.full_backup {
-                for rel in self.timeline.list_rels(spcnode, dbnode, self.lsn)? {
+                for rel in with_ondemand_download_sync(|| {
+                    self.timeline.list_rels(spcnode, dbnode, self.lsn)
+                })? {
                    self.add_rel(rel)?;
                }
            }
        }
-        for xid in self.timeline.list_twophase_files(self.lsn)? {
+        for xid in with_ondemand_download_sync(|| self.timeline.list_twophase_files(self.lsn))? {
            self.add_twophase_file(xid)?;
        }

@@ -185,7 +192,8 @@ where
    }

    fn add_rel(&mut self, tag: RelTag) -> anyhow::Result<()> {
-        let nblocks = self.timeline.get_rel_size(tag, self.lsn, false)?;
+        let nblocks =
+            with_ondemand_download_sync(|| self.timeline.get_rel_size(tag, self.lsn, false))?;

        // Function that adds relation segment data to archive
        let mut add_file = |segment_index, data: &Vec<u8>| -> anyhow::Result<()> {
@@ -208,7 +216,8 @@ where
            for blknum in blocks {
                let img = self
                    .timeline
-                    .get_rel_page_at_lsn(tag, blknum, self.lsn, false)?;
+                    .get_rel_page_at_lsn(tag, blknum, self.lsn, false)
+                    .no_ondemand_download()?;
                segment_data.extend_from_slice(&img[..]);
            }

@@ -222,13 +231,16 @@ where
    // Generate SLRU segment files from repository.
    //
    fn add_slru_segment(&mut self, slru: SlruKind, segno: u32) -> anyhow::Result<()> {
-        let nblocks = self.timeline.get_slru_segment_size(slru, segno, self.lsn)?;
+        let nblocks = with_ondemand_download_sync(|| {
+            self.timeline.get_slru_segment_size(slru, segno, self.lsn)
+        })?;

        let mut slru_buf: Vec<u8> = Vec::with_capacity(nblocks as usize * BLCKSZ as usize);
        for blknum in 0..nblocks {
-            let img = self
-                .timeline
-                .get_slru_page_at_lsn(slru, segno, blknum, self.lsn)?;
+            let img = with_ondemand_download_sync(|| {
+                self.timeline
+                    .get_slru_page_at_lsn(slru, segno, blknum, self.lsn)
+            })?;

            if slru == SlruKind::Clog {
                ensure!(img.len() == BLCKSZ as usize || img.len() == BLCKSZ as usize + 8);
@@ -260,7 +272,9 @@ where
        has_relmap_file: bool,
    ) -> anyhow::Result<()> {
        let relmap_img = if has_relmap_file {
-            let img = self.timeline.get_relmap_file(spcnode, dbnode, self.lsn)?;
+            let img = with_ondemand_download_sync(|| {
+                self.timeline.get_relmap_file(spcnode, dbnode, self.lsn)
+            })?;
            ensure!(img.len() == 512);
            Some(img)
        } else {
@@ -295,7 +309,8 @@ where
            if !has_relmap_file
                && self
                    .timeline
-                    .list_rels(spcnode, dbnode, self.lsn)?
+                    .list_rels(spcnode, dbnode, self.lsn)
+                    .no_ondemand_download()?
                    .is_empty()
            {
                return Ok(());
@@ -327,7 +342,7 @@ where
    // Extract twophase state files
    //
    fn add_twophase_file(&mut self, xid: TransactionId) -> anyhow::Result<()> {
-        let img = self.timeline.get_twophase_file(xid, self.lsn)?;
+        let img = with_ondemand_download_sync(|| self.timeline.get_twophase_file(xid, self.lsn))?;

        let mut buf = BytesMut::new();
        buf.extend_from_slice(&img[..]);
@@ -361,14 +376,12 @@ where
            zenith_signal.as_bytes(),
        )?;

-        let checkpoint_bytes = self
-            .timeline
-            .get_checkpoint(self.lsn)
-            .context("failed to get checkpoint bytes")?;
-        let pg_control_bytes = self
-            .timeline
-            .get_control_file(self.lsn)
-            .context("failed get control bytes")?;
+        let checkpoint_bytes =
+            with_ondemand_download_sync(|| self.timeline.get_checkpoint(self.lsn))
+                .context("failed to get checkpoint bytes")?;
+        let pg_control_bytes =
+            with_ondemand_download_sync(|| self.timeline.get_control_file(self.lsn))
+                .context("failed get control bytes")?;

        let (pg_control_bytes, system_identifier) = postgres_ffi::generate_pg_control(
            &pg_control_bytes,
@@ -490,3 +503,11 @@ where
        }
    }
 }
+
+/// Synchronous wrapper around `with_ondemand_download`.
+///
+/// Hands the closure `f` to `with_ondemand_download` and blocks the calling
+/// thread on `COMPUTE_REQUEST_RUNTIME` until the resulting future completes,
+/// returning that future's output.
+///
+/// NOTE(review): tokio's `block_on` panics when it is invoked from inside an
+/// asynchronous execution context; presumably every caller of this helper runs
+/// on a plain (blocking) thread — confirm.
+fn with_ondemand_download_sync<F, T>(f: F) -> anyhow::Result<T>
+where
+    T: Send,
+    F: Fn() -> PageReconstructResult<T> + Send,
+{
+    let download = with_ondemand_download(f);
+    task_mgr::COMPUTE_REQUEST_RUNTIME.block_on(download)
+}
--- a/pageserver/src/billing_metrics.rs
+++ b/pageserver/src/billing_metrics.rs
@@ -0,0 +1,283 @@
+//!
+//! Periodically collect consumption metrics for all active tenants
+//! and push them to an HTTP endpoint.
+//! Metrics are cached so that only updated values are sent.
+//!
+
+use anyhow;
+use tracing::*;
+use utils::id::TimelineId;
+
+use crate::task_mgr;
+use crate::tenant::mgr;
+use pageserver_api::models::TenantState;
+use utils::id::TenantId;
+
+use serde::{Deserialize, Serialize};
+use std::collections::HashMap;
+use std::fmt;
+use std::str::FromStr;
+use std::time::Duration;
+
+use chrono::{DateTime, Utc};
+use reqwest::Url;
+
+/// A single consumption-metric event, in the shape it is sent to the metrics
+/// endpoint. Illustrative example (the exact encoding of the ids and of the
+/// timestamp depends on the respective `Serialize` impls):
+///
+/// ```json
+/// {
+/// "metric": "remote_storage_size",
+/// "type": "absolute",
+/// "tenant_id": "5d07d9ce9237c4cd845ea7918c0afa7d",
+/// "timeline_id": "00000000000000000000000000000000",
+/// "time": ...,
+/// "value": 12345454,
+/// }
+/// ```
+#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]
+pub struct BillingMetric {
+    /// Which quantity `value` measures.
+    pub metric: BillingMetricKind,
+    /// How `value` is to be interpreted; `new_absolute` sets it to `"absolute"`.
+    ///
+    /// `type` is a Rust keyword, so the field cannot carry that name directly.
+    /// It is renamed during (de)serialization so that the JSON key is `"type"`,
+    /// as shown in the example above, instead of `"metric_type"`.
+    #[serde(rename = "type")]
+    pub metric_type: &'static str,
+    /// Tenant the event belongs to.
+    pub tenant_id: TenantId,
+    /// `Some` for per-timeline metrics, `None` for per-tenant ones.
+    pub timeline_id: Option<TimelineId>,
+    /// Moment the event was created.
+    pub time: DateTime<Utc>,
+    /// The measured value.
+    pub value: u64,
+}
+
+impl BillingMetric {
+    /// Creates an event whose `metric_type` is `"absolute"` and whose `time`
+    /// is the current UTC time; all other fields are taken from the arguments.
+    pub fn new_absolute(
+        metric: BillingMetricKind,
+        tenant_id: TenantId,
+        timeline_id: Option<TimelineId>,
+        value: u64,
+    ) -> Self {
+        let time = Utc::now();
+        BillingMetric {
+            metric,
+            metric_type: "absolute",
+            tenant_id,
+            timeline_id,
+            time,
+            value,
+        }
+    }
+}
+
+/// The kinds of consumption metrics reported by this module.
+///
+/// Serialized in `snake_case`, the same spellings that the `FromStr` and
+/// `Display` impls use.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Ord, PartialOrd, Serialize, Deserialize)]
+#[serde(rename_all = "snake_case")]
+pub enum BillingMetricKind {
+    /// Amount of WAL produced by a timeline, i.e. last_record_lsn.
+    /// This is an absolute, per-timeline metric.
+    WrittenSize,
+    /// Size of all tenant branches including WAL.
+    /// This is an absolute, per-tenant metric.
+    /// This is the same metric that the tenant/tenant_id/size endpoint returns.
+    /// Not collected yet: see the TODO in `collect_metrics_task`.
+    SyntheticStorageSize,
+    /// Size of all the layer files in the tenant's directory on disk on the pageserver.
+    /// This is an absolute, per-tenant metric.
+    /// See also the prometheus metric RESIDENT_PHYSICAL_SIZE.
+    ResidentSize,
+    /// Size of the remote storage (S3) directory.
+    /// This is an absolute, per-tenant metric.
+    RemoteStorageSize,
+}
+
+impl FromStr for BillingMetricKind {
+    type Err = anyhow::Error;
+
+    /// Parses the snake_case metric name (the spelling `Display` prints).
+    /// Any other input is rejected with an error that quotes the input.
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        let kind = match s {
+            "written_size" => BillingMetricKind::WrittenSize,
+            "synthetic_storage_size" => BillingMetricKind::SyntheticStorageSize,
+            "resident_size" => BillingMetricKind::ResidentSize,
+            "remote_storage_size" => BillingMetricKind::RemoteStorageSize,
+            _ => anyhow::bail!("invalid value \"{s}\" for metric type"),
+        };
+        Ok(kind)
+    }
+}
+
+impl fmt::Display for BillingMetricKind {
+    /// Writes the snake_case metric name, i.e. the spelling `FromStr` accepts.
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let name = match *self {
+            Self::WrittenSize => "written_size",
+            Self::SyntheticStorageSize => "synthetic_storage_size",
+            Self::ResidentSize => "resident_size",
+            Self::RemoteStorageSize => "remote_storage_size",
+        };
+        f.write_str(name)
+    }
+}
+
+/// Key of the cache of already-sent values: one metric kind for one tenant
+/// and, for per-timeline metrics, one timeline.
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+pub struct BillingMetricsKey {
+    tenant_id: TenantId,
+    // `None` for per-tenant metrics.
+    timeline_id: Option<TimelineId>,
+    metric: BillingMetricKind,
+}
+
+/// Body of one request to the metrics endpoint; serializes as an object with
+/// a single `events` array. Borrows the events so a chunk is not copied.
+#[derive(serde::Serialize)]
+struct EventChunk<'a> {
+    events: &'a [BillingMetric],
+}
+
+/// Long-running task that drives consumption metrics collection.
+///
+/// On every tick of a `metric_collection_interval` timer it runs one
+/// [`collect_metrics_task`] iteration against `metric_collection_endpoint`,
+/// reusing a single HTTP client and the cache of already-sent values across
+/// iterations. tokio's `interval` completes its first tick immediately, so the
+/// first iteration starts right away.
+///
+/// Returns `Ok(())` once `task_mgr::shutdown_watcher()` resolves.
+///
+/// A failed iteration is logged and retried on the next tick instead of being
+/// propagated: previously the first error of any iteration ended this task,
+/// which stopped metrics collection until the process was restarted.
+pub async fn collect_metrics(
+    metric_collection_endpoint: &Url,
+    metric_collection_interval: Duration,
+) -> anyhow::Result<()> {
+    let mut ticker = tokio::time::interval(metric_collection_interval);
+
+    info!("starting collect_metrics");
+
+    // define client here to reuse it for all requests
+    let client = reqwest::Client::new();
+    let mut cached_metrics: HashMap<BillingMetricsKey, u64> = HashMap::new();
+
+    loop {
+        tokio::select! {
+            _ = task_mgr::shutdown_watcher() => {
+                info!("collect_metrics received cancellation request");
+                return Ok(());
+            },
+            _ = ticker.tick() => {
+                let iteration =
+                    collect_metrics_task(&client, &mut cached_metrics, metric_collection_endpoint);
+                if let Err(err) = iteration.await {
+                    // Keep the task alive; the next tick will try again.
+                    error!("metrics collection iteration failed: {err:?}");
+                }
+            }
+        }
+    }
+}
+
+/// One iteration of metrics collection
+///
+/// Gathers the per-timeline `WrittenSize` and the per-tenant `ResidentSize` /
+/// `RemoteStorageSize` values of every tenant in the `Active` state, drops the
+/// values that equal what `cached_metrics` already holds, and POSTs the rest to
+/// `metric_collection_endpoint` as JSON, in chunks of at most `CHUNK_SIZE` events.
+///
+/// `cached_metrics` is updated only for chunks that the endpoint answered with
+/// a success status, so values that could not be delivered are sent again by
+/// the next call.
+///
+/// A tenant that cannot be looked up, or whose remote size cannot be fetched,
+/// is logged and skipped so that the remaining tenants are still reported
+/// (previously one such failure aborted the iteration for every tenant).
+/// Delivery failures are logged as well, so this function currently always
+/// returns `Ok(())`.
+pub async fn collect_metrics_task(
+    client: &reqwest::Client,
+    cached_metrics: &mut HashMap<BillingMetricsKey, u64>,
+    metric_collection_endpoint: &reqwest::Url,
+) -> anyhow::Result<()> {
+    let mut current_metrics: Vec<(BillingMetricsKey, u64)> = Vec::new();
+    trace!(
+        "starting collect_metrics_task. metric_collection_endpoint: {}",
+        metric_collection_endpoint
+    );
+
+    // get list of tenants
+    let tenants = mgr::list_tenants().await;
+
+    // iterate through list of Active tenants and collect metrics
+    for (tenant_id, tenant_state) in tenants {
+        if tenant_state != TenantState::Active {
+            continue;
+        }
+
+        // The list above is a snapshot; presumably the tenant can be detached
+        // or deactivated before we get here. Skip it rather than fail the
+        // collection for everyone else.
+        let tenant = match mgr::get_tenant(tenant_id, true).await {
+            Ok(tenant) => tenant,
+            Err(err) => {
+                error!("failed to get tenant {tenant_id}, skipping its metrics: {err:?}");
+                continue;
+            }
+        };
+
+        let mut tenant_resident_size = 0;
+
+        // iterate through list of timelines in tenant
+        for timeline in tenant.list_timelines().iter() {
+            let timeline_written_size = u64::from(timeline.get_last_record_lsn());
+
+            current_metrics.push((
+                BillingMetricsKey {
+                    tenant_id,
+                    timeline_id: Some(timeline.timeline_id),
+                    metric: BillingMetricKind::WrittenSize,
+                },
+                timeline_written_size,
+            ));
+
+            let timeline_resident_size = timeline.get_resident_physical_size();
+            tenant_resident_size += timeline_resident_size;
+
+            debug!(
+                "per-timeline current metrics for tenant: {}: timeline {} resident_size={} last_record_lsn {} (as bytes)",
+                tenant_id, timeline.timeline_id, timeline_resident_size, timeline_written_size);
+        }
+
+        current_metrics.push((
+            BillingMetricsKey {
+                tenant_id,
+                timeline_id: None,
+                metric: BillingMetricKind::ResidentSize,
+            },
+            tenant_resident_size,
+        ));
+
+        // The remote size is the only value here that needs a fallible call;
+        // if it fails, only that one metric is left out for this tenant.
+        match tenant.get_remote_size().await {
+            Ok(tenant_remote_size) => {
+                debug!(
+                    "collected current metrics for tenant: {}: state={:?} resident_size={} remote_size={}",
+                    tenant_id, tenant_state, tenant_resident_size, tenant_remote_size
+                );
+
+                current_metrics.push((
+                    BillingMetricsKey {
+                        tenant_id,
+                        timeline_id: None,
+                        metric: BillingMetricKind::RemoteStorageSize,
+                    },
+                    tenant_remote_size,
+                ));
+            }
+            Err(err) => {
+                error!(
+                    "failed to get remote size of tenant {tenant_id}, skipping that metric: {err:?}"
+                );
+            }
+        }
+
+        // TODO add SyntheticStorageSize metric
+    }
+
+    // Filter metrics: keep only values that differ from what was last sent
+    current_metrics.retain(|(curr_key, curr_val)| match cached_metrics.get(curr_key) {
+        Some(val) => val != curr_val,
+        None => true,
+    });
+
+    if current_metrics.is_empty() {
+        trace!("no new metrics to send");
+        return Ok(());
+    }
+
+    // Send metrics.
+    // Split into chunks of CHUNK_SIZE metrics to avoid exceeding the max request size
+    const CHUNK_SIZE: usize = 1000;
+
+    // One buffer, reused for every chunk.
+    let mut chunk_to_send: Vec<BillingMetric> =
+        Vec::with_capacity(CHUNK_SIZE.min(current_metrics.len()));
+
+    for chunk in current_metrics.chunks(CHUNK_SIZE) {
+        chunk_to_send.clear();
+        // enrich metrics with timestamp and metric_kind before sending
+        chunk_to_send.extend(chunk.iter().map(|(curr_key, curr_val)| {
+            BillingMetric::new_absolute(
+                curr_key.metric,
+                curr_key.tenant_id,
+                curr_key.timeline_id,
+                *curr_val,
+            )
+        }));
+
+        let chunk_json = serde_json::value::to_raw_value(&EventChunk {
+            events: &chunk_to_send,
+        })
+        .expect("BillingMetric should not fail serialization");
+
+        let res = client
+            .post(metric_collection_endpoint.clone())
+            .json(&chunk_json)
+            .send()
+            .await;
+
+        match res {
+            Ok(res) => {
+                if res.status().is_success() {
+                    // update cached metrics after they were sent successfully
+                    for (curr_key, curr_val) in chunk.iter() {
+                        cached_metrics.insert(curr_key.clone(), *curr_val);
+                    }
+                } else {
+                    error!("metrics endpoint refused the sent metrics: {:?}", res);
+                }
+            }
+            Err(err) => {
+                error!("failed to send metrics: {:?}", err);
+            }
+        }
+    }
+
+    Ok(())
+}
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -7,6 +7,7 @@ use std::{env, ops::ControlFlow, path::Path, str::FromStr};
 use anyhow::{anyhow, Context};
 use clap::{Arg, ArgAction, Command};
 use fail::FailScenario;
+use remote_storage::GenericRemoteStorage;
 use tracing::*;

 use metrics::set_build_info_metric;
@@ -17,9 +18,9 @@ use pageserver::{
    task_mgr::{
        BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME, WALRECEIVER_RUNTIME,
    },
-    tenant_mgr, virtual_file,
+    tenant::mgr,
+    virtual_file,
 };
-use remote_storage::GenericRemoteStorage;
 use utils::{
    auth::JwtAuth,
    logging,
@@ -127,7 +128,7 @@ fn initialize_config(
            );
        }
        // Supplement the CLI arguments with the config file
-        let cfg_file_contents = std::fs::read_to_string(&cfg_file_path).with_context(|| {
+        let cfg_file_contents = std::fs::read_to_string(cfg_file_path).with_context(|| {
            format!(
                "Failed to read pageserver config at '{}'",
                cfg_file_path.display()
@@ -181,7 +182,7 @@ fn initialize_config(
    if update_config {
        info!("Writing pageserver config to '{}'", cfg_file_path.display());

-        std::fs::write(&cfg_file_path, toml.to_string()).with_context(|| {
+        std::fs::write(cfg_file_path, toml.to_string()).with_context(|| {
            format!(
                "Failed to write pageserver config to '{}'",
                cfg_file_path.display()
@@ -201,8 +202,12 @@ fn initialize_config(
 }

 fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
+    // Initialize logging
    logging::init(conf.log_format)?;
+
+    // Print version to the log, and expose it as a prometheus metric too.
    info!("version: {}", version());
+    set_build_info_metric(GIT_VERSION);

    // If any failpoints were set from FAILPOINTS environment variable,
    // print them to the log for debugging purposes
@@ -218,38 +223,37 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
        )
    }

+    // Create and lock PID file. This ensures that there cannot be more than one
+    // pageserver process running at the same time.
    let lock_file_path = conf.workdir.join(PID_FILE_NAME);
    let lock_file =
        utils::pid_file::claim_for_current_process(&lock_file_path).context("claim pid file")?;
    info!("Claimed pid file at {lock_file_path:?}");

-    // ensure that the lock file is held even if the main thread of the process is panics
-    // we need to release the lock file only when the current process is gone
+    // Ensure that the lock file is held even if the main thread of the process panics.
+    // We need to release the lock file only when the process exits.
    std::mem::forget(lock_file);

-    // TODO: Check that it looks like a valid repository before going further
+    // Bind the HTTP and libpq ports early, so that if they are in use by some other
+    // process, we error out early.
+    let http_addr = &conf.listen_http_addr;
+    info!("Starting pageserver http handler on {http_addr}");
+    let http_listener = tcp_listener::bind(http_addr)?;

-    // bind sockets before daemonizing so we report errors early and do not return until we are listening
-    info!(
-        "Starting pageserver http handler on {}",
-        conf.listen_http_addr
-    );
-    let http_listener = tcp_listener::bind(conf.listen_http_addr.clone())?;
-
-    info!(
-        "Starting pageserver pg protocol handler on {}",
-        conf.listen_pg_addr
-    );
-    let pageserver_listener = tcp_listener::bind(conf.listen_pg_addr.clone())?;
+    let pg_addr = &conf.listen_pg_addr;
+    info!("Starting pageserver pg protocol handler on {pg_addr}");
+    let pageserver_listener = tcp_listener::bind(pg_addr)?;

+    // Install signal handlers
    let signals = signals::install_shutdown_handlers()?;

-    // start profiler (if enabled)
+    // Start profiler (if enabled)
    let profiler_guard = profiling::init_profiler(conf);

+    // Launch broker client
    WALRECEIVER_RUNTIME.block_on(pageserver::walreceiver::init_broker_client(conf))?;

-    // initialize authentication for incoming connections
+    // Initialize authentication for incoming connections
    let auth = match &conf.auth_type {
        AuthType::Trust | AuthType::MD5 => None,
        AuthType::NeonJWT => {
@@ -277,37 +281,21 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
        }
    };

-    let remote_storage = conf
-        .remote_storage_config
-        .as_ref()
-        .map(GenericRemoteStorage::from_config)
-        .transpose()
-        .context("Failed to init generic remote storage")?;
+    // Set up remote storage client
+    let remote_storage = create_remote_storage_client(conf)?;

-    let (init_result_sender, init_result_receiver) =
-        std::sync::mpsc::channel::<anyhow::Result<()>>();
-    let storage_for_spawn = remote_storage.clone();
-    let _handler = BACKGROUND_RUNTIME.spawn(async move {
-        let result = tenant_mgr::init_tenant_mgr(conf, storage_for_spawn).await;
-        init_result_sender.send(result)
-    });
-    match init_result_receiver.recv() {
-        Ok(init_result) => init_result.context("Failed to init tenant_mgr")?,
-        Err(_sender_dropped_err) => {
-            anyhow::bail!("Failed to init tenant_mgr: no init status was returned");
-        }
-    }
+    // Scan the local 'tenants/' directory and start loading the tenants
+    BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(conf, remote_storage.clone()))?;

-    // Spawn all HTTP related tasks in the MGMT_REQUEST_RUNTIME.
-    // bind before launching separate thread so the error reported before startup exits
-
-    // Create a Service from the router above to handle incoming requests.
+    // Start up the service to handle HTTP mgmt API request. We created the
+    // listener earlier already.
    {
        let _rt_guard = MGMT_REQUEST_RUNTIME.enter();

-        let router = http::make_router(conf, auth.clone(), remote_storage)?;
-        let service =
-            utils::http::RouterService::new(router.build().map_err(|err| anyhow!(err))?).unwrap();
+        let router = http::make_router(conf, auth.clone(), remote_storage)?
+            .build()
+            .map_err(|err| anyhow!(err))?;
+        let service = utils::http::RouterService::new(router).unwrap();
        let server = hyper::Server::from_tcp(http_listener)?
            .serve(service)
            .with_graceful_shutdown(task_mgr::shutdown_watcher());
@@ -324,10 +312,30 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
                Ok(())
            },
        );
+
+        if let Some(metric_collection_endpoint) = &conf.metric_collection_endpoint {
+            task_mgr::spawn(
+                MGMT_REQUEST_RUNTIME.handle(),
+                TaskKind::MetricsCollection,
+                None,
+                None,
+                "consumption metrics collection",
+                true,
+                async move {
+                    pageserver::billing_metrics::collect_metrics(
+                        metric_collection_endpoint,
+                        conf.metric_collection_interval,
+                    )
+                    .instrument(info_span!("metrics_collection"))
+                    .await?;
+                    Ok(())
+                },
+            );
+        }
    }

    // Spawn a task to listen for libpq connections. It will spawn further tasks
-    // for each connection.
+    // for each connection. We created the listener earlier already.
    task_mgr::spawn(
        COMPUTE_REQUEST_RUNTIME.handle(),
        TaskKind::LibpqEndpointListener,
@@ -340,8 +348,6 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
        },
    );

-    set_build_info_metric(GIT_VERSION);
-
    // All started up! Now just sit and wait for shutdown signal.
    signals.handle(|signal| match signal {
        Signal::Quit => {
@@ -365,6 +371,36 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
    })
 }

+/// Builds the remote storage client described by `conf.remote_storage_config`.
+///
+/// Returns `Ok(None)` when no remote storage is configured.
+///
+/// When `conf.test_remote_failures` is non-zero the client is wrapped with
+/// `GenericRemoteStorage::unreliable_wrapper`. That is only permitted in
+/// binaries compiled with the `testing` feature; otherwise an error is
+/// returned.
+fn create_remote_storage_client(
+    conf: &'static PageServerConf,
+) -> anyhow::Result<Option<GenericRemoteStorage>> {
+    let config = match conf.remote_storage_config.as_ref() {
+        Some(config) => config,
+        // No remote storage configured.
+        None => return Ok(None),
+    };
+
+    // Create the client
+    let client = GenericRemoteStorage::from_config(config)?;
+
+    // Common case: no failure injection requested, hand out the plain client.
+    let fail_first = conf.test_remote_failures;
+    if fail_first == 0 {
+        return Ok(Some(client));
+    }
+
+    // Failure injection was requested: refuse unless this is a testing build.
+    if !cfg!(feature = "testing") {
+        anyhow::bail!("test_remote_failures option is not available because pageserver was compiled without the 'testing' feature");
+    }
+    info!(
+        "Simulating remote failures for first {} attempts of each op",
+        fail_first
+    );
+
+    Ok(Some(GenericRemoteStorage::unreliable_wrapper(
+        client, fail_first,
+    )))
+}
+
 fn cli() -> Command {
    Command::new("Neon page server")
        .about("Materializes WAL stream to pages and serves them to the postgres")
--- a/pageserver/src/bin/pageserver_binutils.rs
+++ b/pageserver/src/bin/pageserver_binutils.rs
@@ -60,7 +60,7 @@ fn main() -> anyhow::Result<()> {
 }

 fn read_pg_control_file(control_file_path: &Path) -> anyhow::Result<()> {
-    let control_file = ControlFileData::decode(&std::fs::read(&control_file_path)?)?;
+    let control_file = ControlFileData::decode(&std::fs::read(control_file_path)?)?;
    println!("{control_file:?}");
    let control_file_initdb = Lsn(control_file.checkPoint);
    println!(
@@ -79,7 +79,7 @@ fn print_layerfile(path: &Path) -> anyhow::Result<()> {
 }

 fn handle_metadata(path: &Path, arg_matches: &clap::ArgMatches) -> Result<(), anyhow::Error> {
-    let metadata_bytes = std::fs::read(&path)?;
+    let metadata_bytes = std::fs::read(path)?;
    let mut meta = TimelineMetadata::from_bytes(&metadata_bytes)?;
    println!("Current metadata:\n{meta:?}");
    let mut update_meta = false;
@@ -110,7 +110,7 @@ fn handle_metadata(path: &Path, arg_matches: &clap::ArgMatches) -> Result<(), an

    if update_meta {
        let metadata_bytes = meta.to_bytes()?;
-        std::fs::write(&path, &metadata_bytes)?;
+        std::fs::write(path, metadata_bytes)?;
    }

    Ok(())
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -12,6 +12,7 @@ use utils::crashsafe::path_with_suffix_extension;
 use utils::id::ConnectionId;

 use once_cell::sync::OnceCell;
+use reqwest::Url;
 use std::num::NonZeroUsize;
 use std::path::{Path, PathBuf};
 use std::str::FromStr;
@@ -26,14 +27,15 @@ use utils::{
    postgres_backend::AuthType,
 };

+use crate::tenant::config::TenantConf;
+use crate::tenant::config::TenantConfOpt;
 use crate::tenant::{TENANT_ATTACHING_MARKER_FILENAME, TIMELINES_SEGMENT_NAME};
-use crate::tenant_config::{TenantConf, TenantConfOpt};
 use crate::{
    IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TIMELINE_UNINIT_MARK_SUFFIX,
 };

 pub mod defaults {
-    use crate::tenant_config::defaults::*;
+    use crate::tenant::config::defaults::*;
    use const_format::formatcp;

    pub use pageserver_api::{
@@ -55,6 +57,8 @@ pub mod defaults {
    pub const DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES: usize =
        super::ConfigurableSemaphore::DEFAULT_INITIAL.get();

+    pub const DEFAULT_METRIC_COLLECTION_INTERVAL: &str = "10 min";
+    pub const DEFAULT_METRIC_COLLECTION_ENDPOINT: Option<reqwest::Url> = None;
    ///
    /// Default built-in configuration file.
    ///
@@ -78,6 +82,8 @@ pub mod defaults {

 #concurrent_tenant_size_logical_size_queries = '{DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES}'

+#metric_collection_interval = '{DEFAULT_METRIC_COLLECTION_INTERVAL}'
+
 # [tenant_config]
 #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
 #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
@@ -137,11 +143,18 @@ pub struct PageServerConf {

    /// Storage broker endpoints to connect to.
    pub broker_endpoint: Uri,
+    pub broker_keepalive_interval: Duration,

    pub log_format: LogFormat,

    /// Number of concurrent [`Tenant::gather_size_inputs`] allowed.
    pub concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore,
+
+    // How often to collect metrics and send them to the metrics endpoint.
+    pub metric_collection_interval: Duration,
+    pub metric_collection_endpoint: Option<Url>,
+
+    pub test_remote_failures: u64,
 }

 /// We do not want to store this in a PageServerConf because the latter may be logged
@@ -215,10 +228,16 @@ struct PageServerConfigBuilder {

    profiling: BuilderValue<ProfilingConfig>,
    broker_endpoint: BuilderValue<Uri>,
+    broker_keepalive_interval: BuilderValue<Duration>,

    log_format: BuilderValue<LogFormat>,

    concurrent_tenant_size_logical_size_queries: BuilderValue<ConfigurableSemaphore>,
+
+    metric_collection_interval: BuilderValue<Duration>,
+    metric_collection_endpoint: BuilderValue<Option<Url>>,
+
+    test_remote_failures: BuilderValue<u64>,
 }

 impl Default for PageServerConfigBuilder {
@@ -247,9 +266,20 @@ impl Default for PageServerConfigBuilder {
            broker_endpoint: Set(storage_broker::DEFAULT_ENDPOINT
                .parse()
                .expect("failed to parse default broker endpoint")),
+            broker_keepalive_interval: Set(humantime::parse_duration(
+                storage_broker::DEFAULT_KEEPALIVE_INTERVAL,
+            )
+            .expect("cannot parse default keepalive interval")),
            log_format: Set(LogFormat::from_str(DEFAULT_LOG_FORMAT).unwrap()),

            concurrent_tenant_size_logical_size_queries: Set(ConfigurableSemaphore::default()),
+            metric_collection_interval: Set(humantime::parse_duration(
+                DEFAULT_METRIC_COLLECTION_INTERVAL,
+            )
+            .expect("cannot parse default metric collection interval")),
+            metric_collection_endpoint: Set(DEFAULT_METRIC_COLLECTION_ENDPOINT),
+
+            test_remote_failures: Set(0),
        }
    }
 }
@@ -310,6 +340,10 @@ impl PageServerConfigBuilder {
        self.broker_endpoint = BuilderValue::Set(broker_endpoint)
    }

+    /// Stores `broker_keepalive_interval` in the builder, replacing any
+    /// previously set value.
+    pub fn broker_keepalive_interval(&mut self, broker_keepalive_interval: Duration) {
+        self.broker_keepalive_interval = BuilderValue::Set(broker_keepalive_interval);
+    }
+
    pub fn id(&mut self, node_id: NodeId) {
        self.id = BuilderValue::Set(node_id)
    }
@@ -326,6 +360,18 @@ impl PageServerConfigBuilder {
        self.concurrent_tenant_size_logical_size_queries = BuilderValue::Set(u);
    }

+    /// Stores `metric_collection_interval` in the builder, replacing any
+    /// previously set value.
+    pub fn metric_collection_interval(&mut self, metric_collection_interval: Duration) {
+        self.metric_collection_interval = BuilderValue::Set(metric_collection_interval);
+    }
+
+    /// Stores `metric_collection_endpoint` in the builder, replacing any
+    /// previously set value. The value is stored as given, including `None`.
+    pub fn metric_collection_endpoint(&mut self, metric_collection_endpoint: Option<Url>) {
+        self.metric_collection_endpoint = BuilderValue::Set(metric_collection_endpoint);
+    }
+
+    /// Stores `fail_first` as the builder's `test_remote_failures` value,
+    /// replacing any previously set value.
+    pub fn test_remote_failures(&mut self, fail_first: u64) {
+        self.test_remote_failures = BuilderValue::Set(fail_first);
+    }
+
    pub fn build(self) -> anyhow::Result<PageServerConf> {
        Ok(PageServerConf {
            listen_pg_addr: self
@@ -365,12 +411,24 @@ impl PageServerConfigBuilder {
            broker_endpoint: self
                .broker_endpoint
                .ok_or(anyhow!("No broker endpoints provided"))?,
+            broker_keepalive_interval: self
+                .broker_keepalive_interval
+                .ok_or(anyhow!("No broker keepalive interval provided"))?,
            log_format: self.log_format.ok_or(anyhow!("missing log_format"))?,
            concurrent_tenant_size_logical_size_queries: self
                .concurrent_tenant_size_logical_size_queries
                .ok_or(anyhow!(
                    "missing concurrent_tenant_size_logical_size_queries"
                ))?,
+            metric_collection_interval: self
+                .metric_collection_interval
+                .ok_or(anyhow!("missing metric_collection_interval"))?,
+            metric_collection_endpoint: self
+                .metric_collection_endpoint
+                .ok_or(anyhow!("missing metric_collection_endpoint"))?,
+            test_remote_failures: self
+                .test_remote_failures
+                .ok_or(anyhow!("missing test_remote_failures"))?,
        })
    }
 }
@@ -532,6 +590,7 @@ impl PageServerConf {
                "id" => builder.id(NodeId(parse_toml_u64(key, item)?)),
                "profiling" => builder.profiling(parse_toml_from_str(key, item)?),
                "broker_endpoint" => builder.broker_endpoint(parse_toml_string(key, item)?.parse().context("failed to parse broker endpoint")?),
+                "broker_keepalive_interval" => builder.broker_keepalive_interval(parse_toml_duration(key, item)?),
                "log_format" => builder.log_format(
                    LogFormat::from_config(&parse_toml_string(key, item)?)?
                ),
@@ -541,6 +600,13 @@ impl PageServerConf {
                    let permits = NonZeroUsize::new(permits).context("initial semaphore permits out of range: 0, use other configuration to disable a feature")?;
                    ConfigurableSemaphore::new(permits)
                }),
+                "metric_collection_interval" => builder.metric_collection_interval(parse_toml_duration(key, item)?),
+                "metric_collection_endpoint" => {
+                    let endpoint = parse_toml_string(key, item)?.parse().context("failed to parse metric_collection_endpoint")?;
+                    builder.metric_collection_endpoint(Some(endpoint));
+                },
+
+                "test_remote_failures" => builder.test_remote_failures(parse_toml_u64(key, item)?),
                _ => bail!("unrecognized pageserver option '{key}'"),
            }
        }
@@ -657,10 +723,14 @@ impl PageServerConf {
            auth_validation_public_key_path: None,
            remote_storage_config: None,
            profiling: ProfilingConfig::Disabled,
-            default_tenant_conf: TenantConf::dummy_conf(),
+            default_tenant_conf: TenantConf::default(),
            broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(),
+            broker_keepalive_interval: Duration::from_secs(5000),
            log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(),
            concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
+            metric_collection_interval: Duration::from_secs(60),
+            metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT,
+            test_remote_failures: 0,
        }
    }
 }
@@ -791,6 +861,8 @@ max_file_descriptors = 333
 initial_superuser_name = 'zzzz'
 id = 10

+metric_collection_interval = '222 s'
+metric_collection_endpoint = 'http://localhost:80/metrics'
 log_format = 'json'

 "#;
@@ -829,8 +901,16 @@ log_format = 'json'
                profiling: ProfilingConfig::Disabled,
                default_tenant_conf: TenantConf::default(),
                broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(),
+                broker_keepalive_interval: humantime::parse_duration(
+                    storage_broker::DEFAULT_KEEPALIVE_INTERVAL
+                )?,
                log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(),
                concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
+                metric_collection_interval: humantime::parse_duration(
+                    defaults::DEFAULT_METRIC_COLLECTION_INTERVAL
+                )?,
+                metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT,
+                test_remote_failures: 0,
            },
            "Correct defaults should be used when no config values are provided"
        );
@@ -872,8 +952,12 @@ log_format = 'json'
                profiling: ProfilingConfig::Disabled,
                default_tenant_conf: TenantConf::default(),
                broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(),
+                broker_keepalive_interval: Duration::from_secs(5),
                log_format: LogFormat::Json,
                concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
+                metric_collection_interval: Duration::from_secs(222),
+                metric_collection_endpoint: Some(Url::parse("http://localhost:80/metrics")?),
+                test_remote_failures: 0,
            },
            "Should be able to parse all basic config values correctly"
        );
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -77,16 +77,6 @@ paths:
        schema:
          type: string
          format: hex
-      - name: include-non-incremental-logical-size
-        in: query
-        schema:
-          type: string
-          description: Controls calculation of current_logical_size_non_incremental
-      - name: include-non-incremental-physical-size
-        in: query
-        schema:
-          type: string
-          description: Controls calculation of current_physical_size_non_incremental
    get:
      description: Get timelines for tenant
      responses:
@@ -139,17 +129,6 @@ paths:
          format: hex
    get:
      description: Get info about the timeline
-      parameters:
-        - name: include-non-incremental-logical-size
-          in: query
-          schema:
-            type: string
-          description: Controls calculation of current_logical_size_non_incremental
-        - name: include-non-incremental-physical-size
-          in: query
-          schema:
-            type: string
-            description: Controls calculation of current_physical_size_non_incremental
      responses:
        "200":
          description: TimelineInfo
@@ -779,10 +758,6 @@ components:
          type: integer
        current_physical_size:
          type: integer
-        current_logical_size_non_incremental:
-          type: integer
-        current_physical_size_non_incremental:
-          type: integer
        wal_source_connstr:
          type: string
        last_received_msg_lsn:
@@ -795,37 +770,6 @@ components:
        latest_gc_cutoff_lsn:
          type: string
          format: hex
-
-        # These 'local' and 'remote' fields just duplicate some of the fields
-        # above. They are kept for backwards-compatibility. They can be removed,
-        # when the control plane has been updated to look at the above fields
-        # directly.
-        local:
-          $ref: "#/components/schemas/LocalTimelineInfo"
-        remote:
-          $ref: "#/components/schemas/RemoteTimelineInfo"
-
-    LocalTimelineInfo:
-      type: object
-      properties:
-        ancestor_timeline_id:
-          type: string
-          format: hex
-        ancestor_lsn:
-          type: string
-          format: hex
-        current_logical_size:
-          type: integer
-        current_physical_size:
-          type: integer
-    RemoteTimelineInfo:
-      type: object
-      required:
-        - remote_consistent_lsn
-      properties:
-        remote_consistent_lsn:
-          type: string
-          format: hex
    Error:
      type: object
      required:
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -4,16 +4,17 @@ use anyhow::{anyhow, Context, Result};
 use hyper::StatusCode;
 use hyper::{Body, Request, Response, Uri};
 use remote_storage::GenericRemoteStorage;
+use tokio_util::sync::CancellationToken;
 use tracing::*;

 use super::models::{
-    LocalTimelineInfo, RemoteTimelineInfo, StatusResponse, TenantConfigRequest,
-    TenantCreateRequest, TenantCreateResponse, TenantInfo, TimelineCreateRequest, TimelineInfo,
+    StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TenantInfo,
+    TimelineCreateRequest, TimelineInfo,
 };
 use crate::pgdatadir_mapping::LsnForTimestamp;
-use crate::tenant::Timeline;
-use crate::tenant_config::TenantConfOpt;
-use crate::{config::PageServerConf, tenant_mgr};
+use crate::tenant::config::TenantConfOpt;
+use crate::tenant::{with_ondemand_download, Timeline};
+use crate::{config::PageServerConf, tenant::mgr};
 use utils::{
    auth::JwtAuth,
    http::{
@@ -30,8 +31,6 @@ use utils::{
 // Imports only used for testing APIs
 #[cfg(feature = "testing")]
 use super::models::{ConfigureFailpointsRequest, TimelineGcRequest};
-#[cfg(feature = "testing")]
-use crate::CheckpointConfig;

 struct State {
    conf: &'static PageServerConf,
@@ -79,19 +78,23 @@ fn check_permission(request: &Request<Body>, tenant_id: Option<TenantId>) -> Res
 }

 // Helper function to construct a TimelineInfo struct for a timeline
-fn build_timeline_info(
+async fn build_timeline_info(
    timeline: &Arc<Timeline>,
    include_non_incremental_logical_size: bool,
-    include_non_incremental_physical_size: bool,
 ) -> anyhow::Result<TimelineInfo> {
    let mut info = build_timeline_info_common(timeline)?;
    if include_non_incremental_logical_size {
-        info.current_logical_size_non_incremental =
-            Some(timeline.get_current_logical_size_non_incremental(info.last_record_lsn)?);
-    }
-    if include_non_incremental_physical_size {
-        info.current_physical_size_non_incremental =
-            Some(timeline.get_physical_size_non_incremental()?)
+        // XXX we should be using spawn_ondemand_logical_size_calculation here.
+        // Otherwise, if someone deletes the timeline / detaches the tenant while
+        // we're executing this function, we will outlive the timeline on-disk state.
+        info.current_logical_size_non_incremental = Some(
+            timeline
+                .get_current_logical_size_non_incremental(
+                    info.last_record_lsn,
+                    CancellationToken::new(),
+                )
+                .await?,
+        );
    }
    Ok(info)
 }
@@ -123,7 +126,7 @@ fn build_timeline_info_common(timeline: &Arc<Timeline>) -> anyhow::Result<Timeli
            None
        }
    };
-    let current_physical_size = Some(timeline.get_physical_size());
+    let current_physical_size = Some(timeline.layer_size_sum().approximate_is_ok());
    let state = timeline.current_state();
    let remote_consistent_lsn = timeline.get_remote_consistent_lsn().unwrap_or(Lsn(0));

@@ -140,25 +143,13 @@ fn build_timeline_info_common(timeline: &Arc<Timeline>) -> anyhow::Result<Timeli
        current_logical_size,
        current_physical_size,
        current_logical_size_non_incremental: None,
-        current_physical_size_non_incremental: None,
+        timeline_dir_layer_file_size_sum: None,
        wal_source_connstr,
        last_received_msg_lsn,
        last_received_msg_ts,
        pg_version: timeline.pg_version,

        state,
-
-        // Duplicate some fields in 'local' and 'remote' fields, for backwards-compatility
-        // with the control plane.
-        local: LocalTimelineInfo {
-            ancestor_timeline_id,
-            ancestor_lsn,
-            current_logical_size,
-            current_physical_size,
-        },
-        remote: RemoteTimelineInfo {
-            remote_consistent_lsn: Some(remote_consistent_lsn),
-        },
    };
    Ok(info)
 }
@@ -179,7 +170,7 @@ async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<
        .new_timeline_id
        .unwrap_or_else(TimelineId::generate);

-    let tenant = tenant_mgr::get_tenant(tenant_id, true)
+    let tenant = mgr::get_tenant(tenant_id, true)
        .await
        .map_err(ApiError::NotFound)?;
    match tenant.create_timeline(
@@ -205,29 +196,26 @@ async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>,
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    let include_non_incremental_logical_size =
        query_param_present(&request, "include-non-incremental-logical-size");
-    let include_non_incremental_physical_size =
-        query_param_present(&request, "include-non-incremental-physical-size");
    check_permission(&request, Some(tenant_id))?;

    let response_data = async {
-        let tenant = tenant_mgr::get_tenant(tenant_id, true)
+        let tenant = mgr::get_tenant(tenant_id, true)
            .await
            .map_err(ApiError::NotFound)?;
        let timelines = tenant.list_timelines();

        let mut response_data = Vec::with_capacity(timelines.len());
        for timeline in timelines {
-            let timeline_info = build_timeline_info(
-                &timeline,
-                include_non_incremental_logical_size,
-                include_non_incremental_physical_size,
-            )
-            .context("Failed to convert tenant timeline {timeline_id} into the local one: {e:?}")
-            .map_err(ApiError::InternalServerError)?;
+            let timeline_info =
+                build_timeline_info(&timeline, include_non_incremental_logical_size)
+                    .await
+                    .context(
+                        "Failed to convert tenant timeline {timeline_id} into the local one: {e:?}",
+                    )
+                    .map_err(ApiError::InternalServerError)?;

            response_data.push(timeline_info);
        }
-
        Ok(response_data)
    }
    .instrument(info_span!("timeline_list", tenant = %tenant_id))
@@ -271,12 +259,10 @@ async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    let include_non_incremental_logical_size =
        query_param_present(&request, "include-non-incremental-logical-size");
-    let include_non_incremental_physical_size =
-        query_param_present(&request, "include-non-incremental-physical-size");
    check_permission(&request, Some(tenant_id))?;

    let timeline_info = async {
-        let tenant = tenant_mgr::get_tenant(tenant_id, true)
+        let tenant = mgr::get_tenant(tenant_id, true)
            .await
            .map_err(ApiError::NotFound)?;

@@ -284,13 +270,10 @@ async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body
            .get_timeline(timeline_id, false)
            .map_err(ApiError::NotFound)?;

-        let timeline_info = build_timeline_info(
-            &timeline,
-            include_non_incremental_logical_size,
-            include_non_incremental_physical_size,
-        )
-        .context("Failed to get local timeline info: {e:#}")
-        .map_err(ApiError::InternalServerError)?;
+        let timeline_info = build_timeline_info(&timeline, include_non_incremental_logical_size)
+            .await
+            .context("Failed to get local timeline info: {e:#}")
+            .map_err(ApiError::InternalServerError)?;

        Ok::<_, ApiError>(timeline_info)
    }
@@ -311,14 +294,15 @@ async fn get_lsn_by_timestamp_handler(request: Request<Body>) -> Result<Response
        .map_err(ApiError::BadRequest)?;
    let timestamp_pg = postgres_ffi::to_pg_timestamp(timestamp);

-    let timeline = tenant_mgr::get_tenant(tenant_id, true)
+    let timeline = mgr::get_tenant(tenant_id, true)
        .await
        .and_then(|tenant| tenant.get_timeline(timeline_id, true))
        .map_err(ApiError::NotFound)?;
-    let result = match timeline
-        .find_lsn_for_timestamp(timestamp_pg)
-        .map_err(ApiError::InternalServerError)?
-    {
+    let result = with_ondemand_download(|| timeline.find_lsn_for_timestamp(timestamp_pg))
+        .await
+        .map_err(ApiError::InternalServerError)?;
+
+    let result = match result {
        LsnForTimestamp::Present(lsn) => format!("{lsn}"),
        LsnForTimestamp::Future(_lsn) => "future".into(),
        LsnForTimestamp::Past(_lsn) => "past".into(),
@@ -338,7 +322,7 @@ async fn tenant_attach_handler(request: Request<Body>) -> Result<Response<Body>,

    if let Some(remote_storage) = &state.remote_storage {
        // FIXME: distinguish between "Tenant already exists" and other errors
-        tenant_mgr::attach_tenant(state.conf, tenant_id, remote_storage.clone())
+        mgr::attach_tenant(state.conf, tenant_id, remote_storage.clone())
            .instrument(info_span!("tenant_attach", tenant = %tenant_id))
            .await
            .map_err(ApiError::InternalServerError)?;
@@ -356,7 +340,7 @@ async fn timeline_delete_handler(request: Request<Body>) -> Result<Response<Body
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    check_permission(&request, Some(tenant_id))?;

-    tenant_mgr::delete_timeline(tenant_id, timeline_id)
+    mgr::delete_timeline(tenant_id, timeline_id)
        .instrument(info_span!("timeline_delete", tenant = %tenant_id, timeline = %timeline_id))
        .await
        // FIXME: Errors from `delete_timeline` can occur for a number of reasons, incuding both
@@ -373,7 +357,7 @@ async fn tenant_detach_handler(request: Request<Body>) -> Result<Response<Body>,

    let state = get_state(&request);
    let conf = state.conf;
-    tenant_mgr::detach_tenant(conf, tenant_id)
+    mgr::detach_tenant(conf, tenant_id)
        .instrument(info_span!("tenant_detach", tenant = %tenant_id))
        .await
        // FIXME: Errors from `detach_tenant` can be caused by both both user and internal errors.
@@ -388,7 +372,7 @@ async fn tenant_load_handler(request: Request<Body>) -> Result<Response<Body>, A
    check_permission(&request, Some(tenant_id))?;

    let state = get_state(&request);
-    tenant_mgr::load_tenant(state.conf, tenant_id, state.remote_storage.clone())
+    mgr::load_tenant(state.conf, tenant_id, state.remote_storage.clone())
        .instrument(info_span!("load", tenant = %tenant_id))
        .await
        .map_err(ApiError::InternalServerError)?;
@@ -402,7 +386,7 @@ async fn tenant_ignore_handler(request: Request<Body>) -> Result<Response<Body>,

    let state = get_state(&request);
    let conf = state.conf;
-    tenant_mgr::ignore_tenant(conf, tenant_id)
+    mgr::ignore_tenant(conf, tenant_id)
        .instrument(info_span!("ignore_tenant", tenant = %tenant_id))
        .await
        // FIXME: Errors from `ignore_tenant` can be caused by both both user and internal errors.
@@ -415,7 +399,7 @@ async fn tenant_ignore_handler(request: Request<Body>) -> Result<Response<Body>,
 async fn tenant_list_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
    check_permission(&request, None)?;

-    let response_data = tenant_mgr::list_tenants()
+    let response_data = mgr::list_tenants()
        .instrument(info_span!("tenant_list"))
        .await
        .iter()
@@ -435,12 +419,12 @@ async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiErro
    check_permission(&request, Some(tenant_id))?;

    let tenant_info = async {
-        let tenant = tenant_mgr::get_tenant(tenant_id, false).await?;
+        let tenant = mgr::get_tenant(tenant_id, false).await?;

        // Calculate total physical size of all timelines
        let mut current_physical_size = 0;
        for timeline in tenant.list_timelines().iter() {
-            current_physical_size += timeline.get_physical_size();
+            current_physical_size += timeline.layer_size_sum().approximate_is_ok();
        }

        let state = tenant.current_state();
@@ -462,7 +446,7 @@ async fn tenant_size_handler(request: Request<Body>) -> Result<Response<Body>, A
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    check_permission(&request, Some(tenant_id))?;

-    let tenant = tenant_mgr::get_tenant(tenant_id, true)
+    let tenant = mgr::get_tenant(tenant_id, true)
        .await
        .map_err(ApiError::InternalServerError)?;

@@ -583,7 +567,7 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo

    let state = get_state(&request);

-    let new_tenant = tenant_mgr::create_tenant(
+    let new_tenant = mgr::create_tenant(
        state.conf,
        tenant_conf,
        target_tenant_id,
@@ -685,7 +669,7 @@ async fn tenant_config_handler(mut request: Request<Body>) -> Result<Response<Bo
    }

    let state = get_state(&request);
-    tenant_mgr::update_tenant_config(state.conf, tenant_conf, tenant_id)
+    mgr::update_tenant_config(state.conf, tenant_conf, tenant_id)
        .instrument(info_span!("tenant_config", tenant = ?tenant_id))
        .await
        // FIXME: `update_tenant_config` can fail because of both user and internal errors.
@@ -737,7 +721,7 @@ async fn timeline_gc_handler(mut request: Request<Body>) -> Result<Response<Body

    let gc_req: TimelineGcRequest = json_request(&mut request).await?;

-    let wait_task_done = tenant_mgr::immediate_gc(tenant_id, timeline_id, gc_req).await?;
+    let wait_task_done = mgr::immediate_gc(tenant_id, timeline_id, gc_req).await?;
    let gc_result = wait_task_done
        .await
        .context("wait for gc task")
@@ -754,7 +738,7 @@ async fn timeline_compact_handler(request: Request<Body>) -> Result<Response<Bod
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    check_permission(&request, Some(tenant_id))?;

-    let tenant = tenant_mgr::get_tenant(tenant_id, true)
+    let tenant = mgr::get_tenant(tenant_id, true)
        .await
        .map_err(ApiError::NotFound)?;
    let timeline = tenant
@@ -775,20 +759,63 @@ async fn timeline_checkpoint_handler(request: Request<Body>) -> Result<Response<
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    check_permission(&request, Some(tenant_id))?;

-    let tenant = tenant_mgr::get_tenant(tenant_id, true)
+    let tenant = mgr::get_tenant(tenant_id, true)
        .await
        .map_err(ApiError::NotFound)?;
    let timeline = tenant
        .get_timeline(timeline_id, true)
        .map_err(ApiError::NotFound)?;
    timeline
-        .checkpoint(CheckpointConfig::Forced)
+        .freeze_and_flush()
+        .await
+        .map_err(ApiError::InternalServerError)?;
+    timeline
+        .compact()
        .await
        .map_err(ApiError::InternalServerError)?;

    json_response(StatusCode::OK, ())
 }

+async fn timeline_download_remote_layers_handler_post(
+    request: Request<Body>,
+) -> Result<Response<Body>, ApiError> { // POST handler: calls spawn_download_all_remote_layers on the timeline
+    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; // "tenant_id" request parameter
+    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; // "timeline_id" request parameter
+    check_permission(&request, Some(tenant_id))?; // return early if the permission check fails
+
+    let tenant = mgr::get_tenant(tenant_id, true) // NOTE(review): `true` presumably restricts to active tenants; confirm
+        .await
+        .map_err(ApiError::NotFound)?; // a failed tenant lookup is reported as ApiError::NotFound
+    let timeline = tenant
+        .get_timeline(timeline_id, true)
+        .map_err(ApiError::NotFound)?; // a failed timeline lookup is reported the same way
+    match timeline.spawn_download_all_remote_layers().await {
+        Ok(st) => json_response(StatusCode::ACCEPTED, st), // 202 with the Ok value as the JSON body
+        Err(st) => json_response(StatusCode::CONFLICT, st), // 409; presumably a task already exists; confirm
+    }
+}
+
+async fn timeline_download_remote_layers_handler_get(
+    request: Request<Body>,
+) -> Result<Response<Body>, ApiError> { // GET handler: returns get_download_all_remote_layers_task_info as JSON
+    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; // "tenant_id" request parameter
+    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; // "timeline_id" request parameter
+    check_permission(&request, Some(tenant_id))?; // return early if the permission check fails
+
+    let tenant = mgr::get_tenant(tenant_id, true) // NOTE(review): `true` presumably restricts to active tenants; confirm
+        .await
+        .map_err(ApiError::NotFound)?; // a failed tenant lookup is reported as ApiError::NotFound
+    let timeline = tenant
+        .get_timeline(timeline_id, true)
+        .map_err(ApiError::NotFound)?; // a failed timeline lookup is reported the same way
+    let info = timeline
+        .get_download_all_remote_layers_task_info()
+        .context("task never started since last pageserver process start") // message used when no task info is available
+        .map_err(ApiError::NotFound)?; // missing task info is also reported as ApiError::NotFound
+    json_response(StatusCode::OK, info) // 200 with the task info as the JSON body
+}
+
 async fn handler_404(_: Request<Body>) -> Result<Response<Body>, ApiError> {
    json_response(
        StatusCode::NOT_FOUND,
@@ -873,6 +900,14 @@ pub fn make_router(
            "/v1/tenant/:tenant_id/timeline/:timeline_id/checkpoint",
            testing_api!("run timeline checkpoint", timeline_checkpoint_handler),
        )
+        .post(
+            "/v1/tenant/:tenant_id/timeline/:timeline_id/download_remote_layers",
+            timeline_download_remote_layers_handler_post,
+        )
+        .get(
+            "/v1/tenant/:tenant_id/timeline/:timeline_id/download_remote_layers",
+            timeline_download_remote_layers_handler_get,
+        )
        .delete(
            "/v1/tenant/:tenant_id/timeline/:timeline_id",
            timeline_delete_handler,
--- a/pageserver/src/import_datadir.rs
+++ b/pageserver/src/import_datadir.rs
@@ -187,13 +187,13 @@ fn import_slru<Reader: Read>(
    path: &Path,
    mut reader: Reader,
    len: usize,
-) -> Result<()> {
-    trace!("importing slru file {}", path.display());
+) -> anyhow::Result<()> {
+    info!("importing slru file {path:?}");

    let mut buf: [u8; 8192] = [0u8; 8192];
    let filename = &path
        .file_name()
-        .expect("missing slru filename")
+        .with_context(|| format!("missing slru filename for path {path:?}"))?
        .to_string_lossy();
    let segno = u32::from_str_radix(filename, 16)?;

@@ -237,14 +237,19 @@ fn import_slru<Reader: Read>(

 /// Scan PostgreSQL WAL files in given directory and load all records between
 /// 'startpoint' and 'endpoint' into the repository.
-fn import_wal(walpath: &Path, tline: &Timeline, startpoint: Lsn, endpoint: Lsn) -> Result<()> {
+fn import_wal(
+    walpath: &Path,
+    tline: &Timeline,
+    startpoint: Lsn,
+    endpoint: Lsn,
+) -> anyhow::Result<()> {
    let mut waldecoder = WalStreamDecoder::new(startpoint, tline.pg_version);

    let mut segno = startpoint.segment_number(WAL_SEGMENT_SIZE);
    let mut offset = startpoint.segment_offset(WAL_SEGMENT_SIZE);
    let mut last_lsn = startpoint;

-    let mut walingest = WalIngest::new(tline, startpoint)?;
+    let mut walingest = WalIngest::new(tline, startpoint).no_ondemand_download()?;

    while last_lsn <= endpoint {
        // FIXME: assume postgresql tli 1 for now
@@ -267,7 +272,7 @@ fn import_wal(walpath: &Path, tline: &Timeline, startpoint: Lsn, endpoint: Lsn)
        }

        let nread = file.read_to_end(&mut buf)?;
-        if nread != WAL_SEGMENT_SIZE - offset as usize {
+        if nread != WAL_SEGMENT_SIZE - offset {
            // Maybe allow this for .partial files?
            error!("read only {} bytes from WAL file", nread);
        }
@@ -279,7 +284,9 @@ fn import_wal(walpath: &Path, tline: &Timeline, startpoint: Lsn, endpoint: Lsn)
        let mut decoded = DecodedWALRecord::default();
        while last_lsn <= endpoint {
            if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
-                walingest.ingest_record(recdata, lsn, &mut modification, &mut decoded)?;
+                walingest
+                    .ingest_record(recdata, lsn, &mut modification, &mut decoded)
+                    .no_ondemand_download()?;
                last_lsn = lsn;

                nrecords += 1;
@@ -360,7 +367,7 @@ pub fn import_wal_from_tar<Reader: Read>(
    let mut segno = start_lsn.segment_number(WAL_SEGMENT_SIZE);
    let mut offset = start_lsn.segment_offset(WAL_SEGMENT_SIZE);
    let mut last_lsn = start_lsn;
-    let mut walingest = WalIngest::new(tline, start_lsn)?;
+    let mut walingest = WalIngest::new(tline, start_lsn).no_ondemand_download()?;

    // Ingest wal until end_lsn
    info!("importing wal until {}", end_lsn);
@@ -405,7 +412,9 @@ pub fn import_wal_from_tar<Reader: Read>(
        let mut decoded = DecodedWALRecord::default();
        while last_lsn <= end_lsn {
            if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
-                walingest.ingest_record(recdata, lsn, &mut modification, &mut decoded)?;
+                walingest
+                    .ingest_record(recdata, lsn, &mut modification, &mut decoded)
+                    .no_ondemand_download()?;
                last_lsn = lsn;

                debug!("imported record at {} (end {})", lsn, end_lsn);
@@ -440,16 +449,22 @@ fn import_file<Reader: Read>(
    reader: Reader,
    len: usize,
 ) -> Result<Option<ControlFileData>> {
+    let file_name = match file_path.file_name() {
+        Some(name) => name.to_string_lossy(),
+        None => return Ok(None),
+    };
+
+    if file_name.starts_with('.') {
+        // tar archives created on macOS without the COPYFILE_DISABLE=1 env var
+        // will contain "fork files"; skip them.
+        return Ok(None);
+    }
+
    if file_path.starts_with("global") {
        let spcnode = postgres_ffi::pg_constants::GLOBALTABLESPACE_OID;
        let dbnode = 0;

-        match file_path
-            .file_name()
-            .expect("missing filename")
-            .to_string_lossy()
-            .as_ref()
-        {
+        match file_name.as_ref() {
            "pg_control" => {
                let bytes = read_all_bytes(reader)?;

@@ -485,12 +500,7 @@ fn import_file<Reader: Read>(
            .to_string_lossy()
            .parse()?;

-        match file_path
-            .file_name()
-            .expect("missing base filename")
-            .to_string_lossy()
-            .as_ref()
-        {
+        match file_name.as_ref() {
            "pg_filenode.map" => {
                let bytes = read_all_bytes(reader)?;
                modification.put_relmap_file(spcnode, dbnode, bytes)?;
@@ -520,11 +530,7 @@ fn import_file<Reader: Read>(
        import_slru(modification, slru, file_path, reader, len)?;
        debug!("imported multixact members slru");
    } else if file_path.starts_with("pg_twophase") {
-        let file_name = &file_path
-            .file_name()
-            .expect("missing twophase filename")
-            .to_string_lossy();
-        let xid = u32::from_str_radix(file_name, 16)?;
+        let xid = u32::from_str_radix(file_name.as_ref(), 16)?;

        let bytes = read_all_bytes(reader)?;
        modification.put_twophase_file(xid, Bytes::copy_from_slice(&bytes[..]))?;
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -1,5 +1,6 @@
 mod auth;
 pub mod basebackup;
+pub mod billing_metrics;
 pub mod config;
 pub mod http;
 pub mod import_datadir;
@@ -10,13 +11,8 @@ pub mod page_service;
 pub mod pgdatadir_mapping;
 pub mod profiling;
 pub mod repository;
-pub mod storage_sync2;
-pub use storage_sync2 as storage_sync;
 pub mod task_mgr;
 pub mod tenant;
-pub mod tenant_config;
-pub mod tenant_mgr;
-pub mod tenant_tasks;
 pub mod trace;
 pub mod virtual_file;
 pub mod walingest;
@@ -26,9 +22,8 @@ pub mod walredo;

 use std::path::Path;

-use tracing::info;
-
 use crate::task_mgr::TaskKind;
+use tracing::info;

 /// Current storage format version
 ///
@@ -47,15 +42,6 @@ pub const DELTA_FILE_MAGIC: u16 = 0x5A61;

 static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]);

-/// Config for the Repository checkpointer
-#[derive(Debug, Clone, Copy)]
-pub enum CheckpointConfig {
-    // Flush all in-memory data
-    Flush,
-    // Flush all in-memory data and reconstruct all page images
-    Forced,
-}
-
 pub async fn shutdown_pageserver(exit_code: i32) {
    // Shut down the libpq endpoint task. This prevents new connections from
    // being accepted.
@@ -66,7 +52,7 @@ pub async fn shutdown_pageserver(exit_code: i32) {

    // Shut down all the tenants. This flushes everything to disk and kills
    // the checkpoint and GC tasks.
-    tenant_mgr::shutdown_all_tenants().await;
+    tenant::mgr::shutdown_all_tenants().await;

    // Stop syncing with remote storage.
    //
@@ -99,7 +85,7 @@ async fn exponential_backoff(n: u32, base_increment: f64, max_seconds: f64) {
    }
 }

-fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_seconds: f64) -> f64 {
+pub fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_seconds: f64) -> f64 {
    if n == 0 {
        0.0
    } else {
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -84,13 +84,10 @@ static LAST_RECORD_LSN: Lazy<IntGaugeVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-// Metrics for determining timeline's physical size.
-// A layered timeline's physical is defined as the total size of
-// (delta/image) layer files on disk.
-static CURRENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
+static RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
    register_uint_gauge_vec!(
-        "pageserver_current_physical_size",
-        "Current physical size grouped by timeline",
+        "pageserver_resident_physical_size",
+        "The size of the layer files present in the pageserver's filesystem.",
        &["tenant_id", "timeline_id"]
    )
    .expect("failed to define a metric")
@@ -146,8 +143,9 @@ const STORAGE_IO_TIME_BUCKETS: &[f64] = &[
    1.0,      // 1 sec
 ];

-const STORAGE_IO_TIME_OPERATIONS: &[&str] =
-    &["open", "close", "read", "write", "seek", "fsync", "gc"];
+const STORAGE_IO_TIME_OPERATIONS: &[&str] = &[
+    "open", "close", "read", "write", "seek", "fsync", "gc", "metadata",
+];

 const STORAGE_IO_SIZE_OPERATIONS: &[&str] = &["read", "write"];

@@ -375,7 +373,7 @@ pub struct TimelineMetrics {
    pub load_layer_map_histo: Histogram,
    pub last_record_gauge: IntGauge,
    pub wait_lsn_time_histo: Histogram,
-    pub current_physical_size_gauge: UIntGauge,
+    pub resident_physical_size_gauge: UIntGauge,
    /// copy of LayeredTimeline.current_logical_size
    pub current_logical_size_gauge: UIntGauge,
    pub num_persistent_files_created: IntCounter,
@@ -416,7 +414,7 @@ impl TimelineMetrics {
        let wait_lsn_time_histo = WAIT_LSN_TIME
            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
            .unwrap();
-        let current_physical_size_gauge = CURRENT_PHYSICAL_SIZE
+        let resident_physical_size_gauge = RESIDENT_PHYSICAL_SIZE
            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
            .unwrap();
        let current_logical_size_gauge = CURRENT_LOGICAL_SIZE
@@ -442,7 +440,7 @@ impl TimelineMetrics {
            load_layer_map_histo,
            last_record_gauge,
            wait_lsn_time_histo,
-            current_physical_size_gauge,
+            resident_physical_size_gauge,
            current_logical_size_gauge,
            num_persistent_files_created,
            persistent_bytes_written,
@@ -458,7 +456,7 @@ impl Drop for TimelineMetrics {
        let _ = MATERIALIZED_PAGE_CACHE_HIT.remove_label_values(&[tenant_id, timeline_id]);
        let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, timeline_id]);
        let _ = WAIT_LSN_TIME.remove_label_values(&[tenant_id, timeline_id]);
-        let _ = CURRENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
+        let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
        let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
        let _ = NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, timeline_id]);
        let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, timeline_id]);
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -48,10 +48,9 @@ use crate::metrics::{LIVE_CONNECTIONS_COUNT, SMGR_QUERY_TIME};
 use crate::profiling::profpoint_start;
 use crate::task_mgr;
 use crate::task_mgr::TaskKind;
+use crate::tenant::mgr;
 use crate::tenant::{Tenant, Timeline};
-use crate::tenant_mgr;
 use crate::trace::Tracer;
-use crate::CheckpointConfig;

 use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
 use postgres_ffi::BLCKSZ;
@@ -445,9 +444,7 @@ impl PageServerHandler {
        pgb.flush().await?;
        let mut copyin_stream = Box::pin(copyin_stream(pgb));
        let reader = SyncIoBridge::new(StreamReader::new(&mut copyin_stream));
-        tokio::task::block_in_place(|| {
-            import_wal_from_tar(&*timeline, reader, start_lsn, end_lsn)
-        })?;
+        tokio::task::block_in_place(|| import_wal_from_tar(&timeline, reader, start_lsn, end_lsn))?;
        info!("wal import complete");

        // Drain the rest of the Copy data
@@ -466,7 +463,7 @@ impl PageServerHandler {
        // We only want to persist the data, and it doesn't matter if it's in the
        // shape of deltas or images.
        info!("flushing layers");
-        timeline.checkpoint(CheckpointConfig::Flush).await?;
+        timeline.freeze_and_flush().await?;

        info!("done");
        Ok(())
@@ -542,7 +539,10 @@ impl PageServerHandler {
        let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)
            .await?;

-        let exists = timeline.get_rel_exists(req.rel, lsn, req.latest)?;
+        let exists = crate::tenant::with_ondemand_download(|| {
+            timeline.get_rel_exists(req.rel, lsn, req.latest)
+        })
+        .await?;

        Ok(PagestreamBeMessage::Exists(PagestreamExistsResponse {
            exists,
@@ -559,7 +559,10 @@ impl PageServerHandler {
        let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)
            .await?;

-        let n_blocks = timeline.get_rel_size(req.rel, lsn, req.latest)?;
+        let n_blocks = crate::tenant::with_ondemand_download(|| {
+            timeline.get_rel_size(req.rel, lsn, req.latest)
+        })
+        .await?;

        Ok(PagestreamBeMessage::Nblocks(PagestreamNblocksResponse {
            n_blocks,
@@ -576,9 +579,10 @@ impl PageServerHandler {
        let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)
            .await?;

-        let total_blocks =
-            timeline.get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, lsn, req.latest)?;
-
+        let total_blocks = crate::tenant::with_ondemand_download(|| {
+            timeline.get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, lsn, req.latest)
+        })
+        .await?;
        let db_size = total_blocks as i64 * BLCKSZ as i64;

        Ok(PagestreamBeMessage::DbSize(PagestreamDbSizeResponse {
@@ -604,11 +608,14 @@ impl PageServerHandler {
        }
        */

-        // FIXME: this profiling now happens at different place than it used to. The
-        // current profiling is based on a thread-local variable, so it doesn't work
-        // across awaits
-        let _profiling_guard = profpoint_start(self.conf, ProfilingConfig::PageRequests);
-        let page = timeline.get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest)?;
+        let page = crate::tenant::with_ondemand_download(|| {
+            // FIXME: this profiling now happens at a different place than it used to. The
+            // current profiling is based on a thread-local variable, so it doesn't work
+            // across awaits
+            let _profiling_guard = profpoint_start(self.conf, ProfilingConfig::PageRequests);
+            timeline.get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest)
+        })
+        .await?;

        Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse {
            page,
@@ -649,7 +656,7 @@ impl PageServerHandler {
        tokio::task::block_in_place(|| {
            let basebackup =
                basebackup::Basebackup::new(&mut writer, &timeline, lsn, prev_lsn, full_backup)?;
-            tracing::Span::current().record("lsn", &basebackup.lsn.to_string().as_str());
+            tracing::Span::current().record("lsn", basebackup.lsn.to_string().as_str());
            basebackup.send_tarball()
        })?;
        pgb.write_message(&BeMessage::CopyDone)?;
@@ -941,7 +948,7 @@ impl postgres_backend_async::Handler for PageServerHandler {
 /// ensures that queries don't fail immediately after pageserver startup, because
 /// all tenants are still loading.
 async fn get_active_tenant_with_timeout(tenant_id: TenantId) -> Result<Arc<Tenant>> {
-    let tenant = tenant_mgr::get_tenant(tenant_id, false).await?;
+    let tenant = mgr::get_tenant(tenant_id, false).await?;
    match tokio::time::timeout(Duration::from_secs(30), tenant.wait_to_become_active()).await {
        Ok(wait_result) => wait_result
            // no .context(), the error message is good enough and some tests depend on it
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -6,11 +6,12 @@
 //! walingest.rs handles a few things like implicit relation creation and extension.
 //! Clarify that)
 //!
+use super::tenant::PageReconstructResult;
 use crate::keyspace::{KeySpace, KeySpaceAccum};
-use crate::repository::*;
-use crate::tenant::Timeline;
+use crate::tenant::{with_ondemand_download, Timeline};
 use crate::walrecord::NeonWalRecord;
-use anyhow::{bail, ensure, Result};
+use crate::{repository::*, try_no_ondemand_download};
+use anyhow::Context;
 use bytes::{Buf, Bytes};
 use pageserver_api::reltag::{RelTag, SlruKind};
 use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
@@ -19,6 +20,7 @@ use postgres_ffi::{Oid, TimestampTz, TransactionId};
 use serde::{Deserialize, Serialize};
 use std::collections::{hash_map, HashMap, HashSet};
 use std::ops::Range;
+use tokio_util::sync::CancellationToken;
 use tracing::{debug, trace, warn};
 use utils::{bin_ser::BeSer, lsn::Lsn};

@@ -33,6 +35,14 @@ pub enum LsnForTimestamp {
    NoData(Lsn),
 }

+#[derive(Debug, thiserror::Error)]
+pub enum CalculateLogicalSizeError {
+    #[error("cancelled")]
+    Cancelled,
+    #[error(transparent)]
+    Other(#[from] anyhow::Error),
+}
+
 ///
 /// This impl provides all the functionality to store PostgreSQL relations, SLRUs,
 /// and other special kinds of files, in a versioned key-value store. The
@@ -88,16 +98,18 @@ impl Timeline {
        blknum: BlockNumber,
        lsn: Lsn,
        latest: bool,
-    ) -> Result<Bytes> {
-        ensure!(tag.relnode != 0, "invalid relnode");
+    ) -> PageReconstructResult<Bytes> {
+        if tag.relnode == 0 {
+            return PageReconstructResult::from(anyhow::anyhow!("invalid relnode"));
+        }

-        let nblocks = self.get_rel_size(tag, lsn, latest)?;
+        let nblocks = try_no_ondemand_download!(self.get_rel_size(tag, lsn, latest));
        if blknum >= nblocks {
            debug!(
                "read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page",
                tag, blknum, lsn, nblocks
            );
-            return Ok(ZERO_PAGE.clone());
+            return PageReconstructResult::Success(ZERO_PAGE.clone());
        }

        let key = rel_block_to_key(tag, blknum);
@@ -105,38 +117,51 @@ impl Timeline {
    }

    // Get size of a database in blocks
-    pub fn get_db_size(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn, latest: bool) -> Result<usize> {
+    pub fn get_db_size(
+        &self,
+        spcnode: Oid,
+        dbnode: Oid,
+        lsn: Lsn,
+        latest: bool,
+    ) -> PageReconstructResult<usize> {
        let mut total_blocks = 0;

-        let rels = self.list_rels(spcnode, dbnode, lsn)?;
+        let rels = try_no_ondemand_download!(self.list_rels(spcnode, dbnode, lsn));

        for rel in rels {
-            let n_blocks = self.get_rel_size(rel, lsn, latest)?;
+            let n_blocks = try_no_ondemand_download!(self.get_rel_size(rel, lsn, latest));
            total_blocks += n_blocks as usize;
        }
-        Ok(total_blocks)
+        PageReconstructResult::Success(total_blocks)
    }

    /// Get size of a relation file
-    pub fn get_rel_size(&self, tag: RelTag, lsn: Lsn, latest: bool) -> Result<BlockNumber> {
-        ensure!(tag.relnode != 0, "invalid relnode");
+    pub fn get_rel_size(
+        &self,
+        tag: RelTag,
+        lsn: Lsn,
+        latest: bool,
+    ) -> PageReconstructResult<BlockNumber> {
+        if tag.relnode == 0 {
+            return PageReconstructResult::from(anyhow::anyhow!("invalid relnode"));
+        }

        if let Some(nblocks) = self.get_cached_rel_size(&tag, lsn) {
-            return Ok(nblocks);
+            return PageReconstructResult::Success(nblocks);
        }

        if (tag.forknum == FSM_FORKNUM || tag.forknum == VISIBILITYMAP_FORKNUM)
-            && !self.get_rel_exists(tag, lsn, latest)?
+            && !try_no_ondemand_download!(self.get_rel_exists(tag, lsn, latest))
        {
            // FIXME: Postgres sometimes calls smgrcreate() to create
            // FSM, and smgrnblocks() on it immediately afterwards,
            // without extending it.  Tolerate that by claiming that
            // any non-existent FSM fork has size 0.
-            return Ok(0);
+            return PageReconstructResult::Success(0);
        }

        let key = rel_size_to_key(tag);
-        let mut buf = self.get(key, lsn)?;
+        let mut buf = try_no_ondemand_download!(self.get(key, lsn));
        let nblocks = buf.get_u32_le();

        if latest {
@@ -149,43 +174,62 @@ impl Timeline {
            // associated with most recent value of LSN.
            self.update_cached_rel_size(tag, lsn, nblocks);
        }
-        Ok(nblocks)
+        PageReconstructResult::Success(nblocks)
    }

    /// Does relation exist?
-    pub fn get_rel_exists(&self, tag: RelTag, lsn: Lsn, _latest: bool) -> Result<bool> {
-        ensure!(tag.relnode != 0, "invalid relnode");
+    pub fn get_rel_exists(
+        &self,
+        tag: RelTag,
+        lsn: Lsn,
+        _latest: bool,
+    ) -> PageReconstructResult<bool> {
+        if tag.relnode == 0 {
+            return PageReconstructResult::from(anyhow::anyhow!("invalid relnode"));
+        }

        // first try to lookup relation in cache
        if let Some(_nblocks) = self.get_cached_rel_size(&tag, lsn) {
-            return Ok(true);
+            return PageReconstructResult::Success(true);
        }
        // fetch directory listing
        let key = rel_dir_to_key(tag.spcnode, tag.dbnode);
-        let buf = self.get(key, lsn)?;
-        let dir = RelDirectory::des(&buf)?;
+        let buf = try_no_ondemand_download!(self.get(key, lsn));

-        let exists = dir.rels.get(&(tag.relnode, tag.forknum)).is_some();
-
-        Ok(exists)
+        match RelDirectory::des(&buf).context("deserialization failure") {
+            Ok(dir) => {
+                let exists = dir.rels.get(&(tag.relnode, tag.forknum)).is_some();
+                PageReconstructResult::Success(exists)
+            }
+            Err(e) => PageReconstructResult::from(e),
+        }
    }

    /// Get a list of all existing relations in given tablespace and database.
-    pub fn list_rels(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> Result<HashSet<RelTag>> {
+    pub fn list_rels(
+        &self,
+        spcnode: Oid,
+        dbnode: Oid,
+        lsn: Lsn,
+    ) -> PageReconstructResult<HashSet<RelTag>> {
        // fetch directory listing
        let key = rel_dir_to_key(spcnode, dbnode);
-        let buf = self.get(key, lsn)?;
-        let dir = RelDirectory::des(&buf)?;
+        let buf = try_no_ondemand_download!(self.get(key, lsn));

-        let rels: HashSet<RelTag> =
-            HashSet::from_iter(dir.rels.iter().map(|(relnode, forknum)| RelTag {
-                spcnode,
-                dbnode,
-                relnode: *relnode,
-                forknum: *forknum,
-            }));
+        match RelDirectory::des(&buf).context("deserialization failure") {
+            Ok(dir) => {
+                let rels: HashSet<RelTag> =
+                    HashSet::from_iter(dir.rels.iter().map(|(relnode, forknum)| RelTag {
+                        spcnode,
+                        dbnode,
+                        relnode: *relnode,
+                        forknum: *forknum,
+                    }));

-        Ok(rels)
+                PageReconstructResult::Success(rels)
+            }
+            Err(e) => PageReconstructResult::from(e),
+        }
    }

    /// Look up given SLRU page version.
@@ -195,7 +239,7 @@ impl Timeline {
        segno: u32,
        blknum: BlockNumber,
        lsn: Lsn,
-    ) -> Result<Bytes> {
+    ) -> PageReconstructResult<Bytes> {
        let key = slru_block_to_key(kind, segno, blknum);
        self.get(key, lsn)
    }
@@ -206,21 +250,30 @@ impl Timeline {
        kind: SlruKind,
        segno: u32,
        lsn: Lsn,
-    ) -> Result<BlockNumber> {
+    ) -> PageReconstructResult<BlockNumber> {
        let key = slru_segment_size_to_key(kind, segno);
-        let mut buf = self.get(key, lsn)?;
-        Ok(buf.get_u32_le())
+        let mut buf = try_no_ondemand_download!(self.get(key, lsn));
+        PageReconstructResult::Success(buf.get_u32_le())
    }

    /// Get size of an SLRU segment
-    pub fn get_slru_segment_exists(&self, kind: SlruKind, segno: u32, lsn: Lsn) -> Result<bool> {
+    pub fn get_slru_segment_exists(
+        &self,
+        kind: SlruKind,
+        segno: u32,
+        lsn: Lsn,
+    ) -> PageReconstructResult<bool> {
        // fetch directory listing
        let key = slru_dir_to_key(kind);
-        let buf = self.get(key, lsn)?;
-        let dir = SlruSegmentDirectory::des(&buf)?;
+        let buf = try_no_ondemand_download!(self.get(key, lsn));

-        let exists = dir.segments.get(&segno).is_some();
-        Ok(exists)
+        match SlruSegmentDirectory::des(&buf).context("deserialization failure") {
+            Ok(dir) => {
+                let exists = dir.segments.get(&segno).is_some();
+                PageReconstructResult::Success(exists)
+            }
+            Err(e) => PageReconstructResult::from(e),
+        }
    }

    /// Locate LSN, such that all transactions that committed before
@@ -230,7 +283,10 @@ impl Timeline {
    /// so it's not well defined which LSN you get if there were multiple commits
    /// "in flight" at that point in time.
    ///
-    pub fn find_lsn_for_timestamp(&self, search_timestamp: TimestampTz) -> Result<LsnForTimestamp> {
+    pub fn find_lsn_for_timestamp(
+        &self,
+        search_timestamp: TimestampTz,
+    ) -> PageReconstructResult<LsnForTimestamp> {
        let gc_cutoff_lsn_guard = self.get_latest_gc_cutoff_lsn();
        let min_lsn = *gc_cutoff_lsn_guard;
        let max_lsn = self.get_last_record_lsn();
@@ -246,12 +302,12 @@ impl Timeline {
            // cannot overflow, high and low are both smaller than u64::MAX / 2
            let mid = (high + low) / 2;

-            let cmp = self.is_latest_commit_timestamp_ge_than(
+            let cmp = try_no_ondemand_download!(self.is_latest_commit_timestamp_ge_than(
                search_timestamp,
                Lsn(mid * 8),
                &mut found_smaller,
                &mut found_larger,
-            )?;
+            ));

            if cmp {
                high = mid;
@@ -263,15 +319,15 @@ impl Timeline {
            (false, false) => {
                // This can happen if no commit records have been processed yet, e.g.
                // just after importing a cluster.
-                Ok(LsnForTimestamp::NoData(max_lsn))
+                PageReconstructResult::Success(LsnForTimestamp::NoData(max_lsn))
            }
            (true, false) => {
                // Didn't find any commit timestamps larger than the request
-                Ok(LsnForTimestamp::Future(max_lsn))
+                PageReconstructResult::Success(LsnForTimestamp::Future(max_lsn))
            }
            (false, true) => {
                // Didn't find any commit timestamps smaller than the request
-                Ok(LsnForTimestamp::Past(max_lsn))
+                PageReconstructResult::Success(LsnForTimestamp::Past(max_lsn))
            }
            (true, true) => {
                // low is the LSN of the first commit record *after* the search_timestamp,
@@ -281,7 +337,7 @@ impl Timeline {
                // Otherwise, if you restore to the returned LSN, the database will
                // include physical changes from later commits that will be marked
                // as aborted, and will need to be vacuumed away.
-                Ok(LsnForTimestamp::Present(Lsn((low - 1) * 8)))
+                PageReconstructResult::Success(LsnForTimestamp::Present(Lsn((low - 1) * 8)))
            }
        }
    }
@@ -299,12 +355,20 @@ impl Timeline {
        probe_lsn: Lsn,
        found_smaller: &mut bool,
        found_larger: &mut bool,
-    ) -> Result<bool> {
-        for segno in self.list_slru_segments(SlruKind::Clog, probe_lsn)? {
-            let nblocks = self.get_slru_segment_size(SlruKind::Clog, segno, probe_lsn)?;
+    ) -> PageReconstructResult<bool> {
+        for segno in try_no_ondemand_download!(self.list_slru_segments(SlruKind::Clog, probe_lsn)) {
+            let nblocks = try_no_ondemand_download!(self.get_slru_segment_size(
+                SlruKind::Clog,
+                segno,
+                probe_lsn
+            ));
            for blknum in (0..nblocks).rev() {
-                let clog_page =
-                    self.get_slru_page_at_lsn(SlruKind::Clog, segno, blknum, probe_lsn)?;
+                let clog_page = try_no_ondemand_download!(self.get_slru_page_at_lsn(
+                    SlruKind::Clog,
+                    segno,
+                    blknum,
+                    probe_lsn
+                ));

                if clog_page.len() == BLCKSZ as usize + 8 {
                    let mut timestamp_bytes = [0u8; 8];
@@ -313,61 +377,75 @@ impl Timeline {

                    if timestamp >= search_timestamp {
                        *found_larger = true;
-                        return Ok(true);
+                        return PageReconstructResult::Success(true);
                    } else {
                        *found_smaller = true;
                    }
                }
            }
        }
-        Ok(false)
+        PageReconstructResult::Success(false)
    }

    /// Get a list of SLRU segments
-    pub fn list_slru_segments(&self, kind: SlruKind, lsn: Lsn) -> Result<HashSet<u32>> {
+    pub fn list_slru_segments(
+        &self,
+        kind: SlruKind,
+        lsn: Lsn,
+    ) -> PageReconstructResult<HashSet<u32>> {
        // fetch directory entry
        let key = slru_dir_to_key(kind);

-        let buf = self.get(key, lsn)?;
-        let dir = SlruSegmentDirectory::des(&buf)?;
-
-        Ok(dir.segments)
+        let buf = try_no_ondemand_download!(self.get(key, lsn));
+        match SlruSegmentDirectory::des(&buf).context("deserialization failure") {
+            Ok(dir) => PageReconstructResult::Success(dir.segments),
+            Err(e) => PageReconstructResult::from(e),
+        }
    }

-    pub fn get_relmap_file(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> Result<Bytes> {
+    pub fn get_relmap_file(
+        &self,
+        spcnode: Oid,
+        dbnode: Oid,
+        lsn: Lsn,
+    ) -> PageReconstructResult<Bytes> {
        let key = relmap_file_key(spcnode, dbnode);

-        let buf = self.get(key, lsn)?;
-        Ok(buf)
+        let buf = try_no_ondemand_download!(self.get(key, lsn));
+        PageReconstructResult::Success(buf)
    }

-    pub fn list_dbdirs(&self, lsn: Lsn) -> Result<HashMap<(Oid, Oid), bool>> {
+    pub fn list_dbdirs(&self, lsn: Lsn) -> PageReconstructResult<HashMap<(Oid, Oid), bool>> {
        // fetch directory entry
-        let buf = self.get(DBDIR_KEY, lsn)?;
-        let dir = DbDirectory::des(&buf)?;
+        let buf = try_no_ondemand_download!(self.get(DBDIR_KEY, lsn));

-        Ok(dir.dbdirs)
+        match DbDirectory::des(&buf).context("deserialization failure") {
+            Ok(dir) => PageReconstructResult::Success(dir.dbdirs),
+            Err(e) => PageReconstructResult::from(e),
+        }
    }

-    pub fn get_twophase_file(&self, xid: TransactionId, lsn: Lsn) -> Result<Bytes> {
+    pub fn get_twophase_file(&self, xid: TransactionId, lsn: Lsn) -> PageReconstructResult<Bytes> {
        let key = twophase_file_key(xid);
-        let buf = self.get(key, lsn)?;
-        Ok(buf)
+        let buf = try_no_ondemand_download!(self.get(key, lsn));
+        PageReconstructResult::Success(buf)
    }

-    pub fn list_twophase_files(&self, lsn: Lsn) -> Result<HashSet<TransactionId>> {
+    pub fn list_twophase_files(&self, lsn: Lsn) -> PageReconstructResult<HashSet<TransactionId>> {
        // fetch directory entry
-        let buf = self.get(TWOPHASEDIR_KEY, lsn)?;
-        let dir = TwoPhaseDirectory::des(&buf)?;
+        let buf = try_no_ondemand_download!(self.get(TWOPHASEDIR_KEY, lsn));

-        Ok(dir.xids)
+        match TwoPhaseDirectory::des(&buf).context("deserialization failure") {
+            Ok(dir) => PageReconstructResult::Success(dir.xids),
+            Err(e) => PageReconstructResult::from(e),
+        }
    }

-    pub fn get_control_file(&self, lsn: Lsn) -> Result<Bytes> {
+    pub fn get_control_file(&self, lsn: Lsn) -> PageReconstructResult<Bytes> {
        self.get(CONTROLFILE_KEY, lsn)
    }

-    pub fn get_checkpoint(&self, lsn: Lsn) -> Result<Bytes> {
+    pub fn get_checkpoint(&self, lsn: Lsn) -> PageReconstructResult<Bytes> {
        self.get(CHECKPOINT_KEY, lsn)
    }

@@ -376,16 +454,26 @@ impl Timeline {
    ///
    /// Only relation blocks are counted currently. That excludes metadata,
    /// SLRUs, twophase files etc.
-    pub fn get_current_logical_size_non_incremental(&self, lsn: Lsn) -> Result<u64> {
+    pub async fn get_current_logical_size_non_incremental(
+        &self,
+        lsn: Lsn,
+        cancel: CancellationToken,
+    ) -> Result<u64, CalculateLogicalSizeError> {
        // Fetch list of database dirs and iterate them
-        let buf = self.get(DBDIR_KEY, lsn)?;
-        let dbdir = DbDirectory::des(&buf)?;
+        let buf = self.get_download(DBDIR_KEY, lsn).await?;
+        let dbdir = DbDirectory::des(&buf).context("deserialize db directory")?;

        let mut total_size: u64 = 0;
        for (spcnode, dbnode) in dbdir.dbdirs.keys() {
-            for rel in self.list_rels(*spcnode, *dbnode, lsn)? {
+            for rel in
+                crate::tenant::with_ondemand_download(|| self.list_rels(*spcnode, *dbnode, lsn))
+                    .await?
+            {
+                if cancel.is_cancelled() {
+                    return Err(CalculateLogicalSizeError::Cancelled);
+                }
                let relsize_key = rel_size_to_key(rel);
-                let mut buf = self.get(relsize_key, lsn)?;
+                let mut buf = self.get_download(relsize_key, lsn).await?;
                let relsize = buf.get_u32_le();

                total_size += relsize as u64;
@@ -398,7 +486,7 @@ impl Timeline {
    /// Get a KeySpace that covers all the Keys that are in use at the given LSN.
    /// Anything that's not listed maybe removed from the underlying storage (from
    /// that LSN forwards).
-    pub fn collect_keyspace(&self, lsn: Lsn) -> Result<KeySpace> {
+    pub async fn collect_keyspace(&self, lsn: Lsn) -> anyhow::Result<KeySpace> {
        // Iterate through key ranges, greedily packing them into partitions
        let mut result = KeySpaceAccum::new();

@@ -406,8 +494,8 @@ impl Timeline {
        result.add_key(DBDIR_KEY);

        // Fetch list of database dirs and iterate them
-        let buf = self.get(DBDIR_KEY, lsn)?;
-        let dbdir = DbDirectory::des(&buf)?;
+        let buf = self.get_download(DBDIR_KEY, lsn).await?;
+        let dbdir = DbDirectory::des(&buf).context("deserialization failure")?;

        let mut dbs: Vec<(Oid, Oid)> = dbdir.dbdirs.keys().cloned().collect();
        dbs.sort_unstable();
@@ -415,15 +503,15 @@ impl Timeline {
            result.add_key(relmap_file_key(spcnode, dbnode));
            result.add_key(rel_dir_to_key(spcnode, dbnode));

-            let mut rels: Vec<RelTag> = self
-                .list_rels(spcnode, dbnode, lsn)?
-                .iter()
-                .cloned()
-                .collect();
+            let mut rels: Vec<RelTag> =
+                with_ondemand_download(|| self.list_rels(spcnode, dbnode, lsn))
+                    .await?
+                    .into_iter()
+                    .collect();
            rels.sort_unstable();
            for rel in rels {
                let relsize_key = rel_size_to_key(rel);
-                let mut buf = self.get(relsize_key, lsn)?;
+                let mut buf = self.get_download(relsize_key, lsn).await?;
                let relsize = buf.get_u32_le();

                result.add_range(rel_block_to_key(rel, 0)..rel_block_to_key(rel, relsize));
@@ -439,13 +527,13 @@ impl Timeline {
        ] {
            let slrudir_key = slru_dir_to_key(kind);
            result.add_key(slrudir_key);
-            let buf = self.get(slrudir_key, lsn)?;
-            let dir = SlruSegmentDirectory::des(&buf)?;
+            let buf = self.get_download(slrudir_key, lsn).await?;
+            let dir = SlruSegmentDirectory::des(&buf).context("deserialization failure")?;
            let mut segments: Vec<u32> = dir.segments.iter().cloned().collect();
            segments.sort_unstable();
            for segno in segments {
                let segsize_key = slru_segment_size_to_key(kind, segno);
-                let mut buf = self.get(segsize_key, lsn)?;
+                let mut buf = self.get_download(segsize_key, lsn).await?;
                let segsize = buf.get_u32_le();

                result.add_range(
@@ -457,8 +545,8 @@ impl Timeline {

        // Then pg_twophase
        result.add_key(TWOPHASEDIR_KEY);
-        let buf = self.get(TWOPHASEDIR_KEY, lsn)?;
-        let twophase_dir = TwoPhaseDirectory::des(&buf)?;
+        let buf = self.get_download(TWOPHASEDIR_KEY, lsn).await?;
+        let twophase_dir = TwoPhaseDirectory::des(&buf).context("deserialization failure")?;
        let mut xids: Vec<TransactionId> = twophase_dir.xids.iter().cloned().collect();
        xids.sort_unstable();
        for xid in xids {
@@ -537,7 +625,7 @@ impl<'a> DatadirModification<'a> {
    ///
    /// This inserts the directory metadata entries that are assumed to
    /// always exist.
-    pub fn init_empty(&mut self) -> Result<()> {
+    pub fn init_empty(&mut self) -> anyhow::Result<()> {
        let buf = DbDirectory::ser(&DbDirectory {
            dbdirs: HashMap::new(),
        })?;
@@ -570,8 +658,8 @@ impl<'a> DatadirModification<'a> {
        rel: RelTag,
        blknum: BlockNumber,
        rec: NeonWalRecord,
-    ) -> Result<()> {
-        ensure!(rel.relnode != 0, "invalid relnode");
+    ) -> anyhow::Result<()> {
+        anyhow::ensure!(rel.relnode != 0, "invalid relnode");
        self.put(rel_block_to_key(rel, blknum), Value::WalRecord(rec));
        Ok(())
    }
@@ -583,7 +671,7 @@ impl<'a> DatadirModification<'a> {
        segno: u32,
        blknum: BlockNumber,
        rec: NeonWalRecord,
-    ) -> Result<()> {
+    ) -> anyhow::Result<()> {
        self.put(
            slru_block_to_key(kind, segno, blknum),
            Value::WalRecord(rec),
@@ -597,8 +685,8 @@ impl<'a> DatadirModification<'a> {
        rel: RelTag,
        blknum: BlockNumber,
        img: Bytes,
-    ) -> Result<()> {
-        ensure!(rel.relnode != 0, "invalid relnode");
+    ) -> anyhow::Result<()> {
+        anyhow::ensure!(rel.relnode != 0, "invalid relnode");
        self.put(rel_block_to_key(rel, blknum), Value::Image(img));
        Ok(())
    }
@@ -609,26 +697,26 @@ impl<'a> DatadirModification<'a> {
        segno: u32,
        blknum: BlockNumber,
        img: Bytes,
-    ) -> Result<()> {
+    ) -> anyhow::Result<()> {
        self.put(slru_block_to_key(kind, segno, blknum), Value::Image(img));
        Ok(())
    }

    /// Store a relmapper file (pg_filenode.map) in the repository
-    pub fn put_relmap_file(&mut self, spcnode: Oid, dbnode: Oid, img: Bytes) -> Result<()> {
+    pub fn put_relmap_file(&mut self, spcnode: Oid, dbnode: Oid, img: Bytes) -> anyhow::Result<()> {
        // Add it to the directory (if it doesn't exist already)
-        let buf = self.get(DBDIR_KEY)?;
+        let buf = self.get(DBDIR_KEY).no_ondemand_download()?;
        let mut dbdir = DbDirectory::des(&buf)?;

        let r = dbdir.dbdirs.insert((spcnode, dbnode), true);
-        if r == None || r == Some(false) {
+        if r.is_none() || r == Some(false) {
            // The dbdir entry didn't exist, or it contained a
            // 'false'. The 'insert' call already updated it with
            // 'true', now write the updated 'dbdirs' map back.
            let buf = DbDirectory::ser(&dbdir)?;
            self.put(DBDIR_KEY, Value::Image(buf.into()));
        }
-        if r == None {
+        if r.is_none() {
            // Create RelDirectory
            let buf = RelDirectory::ser(&RelDirectory {
                rels: HashSet::new(),
@@ -643,12 +731,12 @@ impl<'a> DatadirModification<'a> {
        Ok(())
    }

-    pub fn put_twophase_file(&mut self, xid: TransactionId, img: Bytes) -> Result<()> {
+    pub fn put_twophase_file(&mut self, xid: TransactionId, img: Bytes) -> anyhow::Result<()> {
        // Add it to the directory entry
-        let buf = self.get(TWOPHASEDIR_KEY)?;
+        let buf = self.get(TWOPHASEDIR_KEY).no_ondemand_download()?;
        let mut dir = TwoPhaseDirectory::des(&buf)?;
        if !dir.xids.insert(xid) {
-            bail!("twophase file for xid {} already exists", xid);
+            anyhow::bail!("twophase file for xid {} already exists", xid);
        }
        self.put(
            TWOPHASEDIR_KEY,
@@ -659,23 +747,26 @@ impl<'a> DatadirModification<'a> {
        Ok(())
    }

-    pub fn put_control_file(&mut self, img: Bytes) -> Result<()> {
+    pub fn put_control_file(&mut self, img: Bytes) -> anyhow::Result<()> {
        self.put(CONTROLFILE_KEY, Value::Image(img));
        Ok(())
    }

-    pub fn put_checkpoint(&mut self, img: Bytes) -> Result<()> {
+    pub fn put_checkpoint(&mut self, img: Bytes) -> anyhow::Result<()> {
        self.put(CHECKPOINT_KEY, Value::Image(img));
        Ok(())
    }

-    pub fn drop_dbdir(&mut self, spcnode: Oid, dbnode: Oid) -> Result<()> {
+    pub fn drop_dbdir(&mut self, spcnode: Oid, dbnode: Oid) -> anyhow::Result<()> {
        let req_lsn = self.tline.get_last_record_lsn();

-        let total_blocks = self.tline.get_db_size(spcnode, dbnode, req_lsn, true)?;
+        let total_blocks = self
+            .tline
+            .get_db_size(spcnode, dbnode, req_lsn, true)
+            .no_ondemand_download()?;

        // Remove entry from dbdir
-        let buf = self.get(DBDIR_KEY)?;
+        let buf = self.get(DBDIR_KEY).no_ondemand_download()?;
        let mut dir = DbDirectory::des(&buf)?;
        if dir.dbdirs.remove(&(spcnode, dbnode)).is_some() {
            let buf = DbDirectory::ser(&dir)?;
@@ -698,11 +789,11 @@ impl<'a> DatadirModification<'a> {
    /// Create a relation fork.
    ///
    /// 'nblocks' is the initial size.
-    pub fn put_rel_creation(&mut self, rel: RelTag, nblocks: BlockNumber) -> Result<()> {
-        ensure!(rel.relnode != 0, "invalid relnode");
+    pub fn put_rel_creation(&mut self, rel: RelTag, nblocks: BlockNumber) -> anyhow::Result<()> {
+        anyhow::ensure!(rel.relnode != 0, "invalid relnode");
        // It's possible that this is the first rel for this db in this
        // tablespace.  Create the reldir entry for it if so.
-        let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY)?)?;
+        let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY).no_ondemand_download()?)?;
        let rel_dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode);
        let mut rel_dir = if dbdir.dbdirs.get(&(rel.spcnode, rel.dbnode)).is_none() {
            // Didn't exist. Update dbdir
@@ -714,12 +805,12 @@ impl<'a> DatadirModification<'a> {
            RelDirectory::default()
        } else {
            // reldir already exists, fetch it
-            RelDirectory::des(&self.get(rel_dir_key)?)?
+            RelDirectory::des(&self.get(rel_dir_key).no_ondemand_download()?)?
        };

        // Add the new relation to the rel directory entry, and write it back
        if !rel_dir.rels.insert((rel.relnode, rel.forknum)) {
-            bail!("rel {} already exists", rel);
+            anyhow::bail!("rel {rel} already exists");
        }
        self.put(
            rel_dir_key,
@@ -742,13 +833,17 @@ impl<'a> DatadirModification<'a> {
    }

    /// Truncate relation
-    pub fn put_rel_truncation(&mut self, rel: RelTag, nblocks: BlockNumber) -> Result<()> {
-        ensure!(rel.relnode != 0, "invalid relnode");
+    pub fn put_rel_truncation(&mut self, rel: RelTag, nblocks: BlockNumber) -> anyhow::Result<()> {
+        anyhow::ensure!(rel.relnode != 0, "invalid relnode");
        let last_lsn = self.tline.get_last_record_lsn();
-        if self.tline.get_rel_exists(rel, last_lsn, true)? {
+        if self
+            .tline
+            .get_rel_exists(rel, last_lsn, true)
+            .no_ondemand_download()?
+        {
            let size_key = rel_size_to_key(rel);
            // Fetch the old size first
-            let old_size = self.get(size_key)?.get_u32_le();
+            let old_size = self.get(size_key).no_ondemand_download()?.get_u32_le();

            // Update the entry with the new size.
            let buf = nblocks.to_le_bytes();
@@ -768,12 +863,12 @@ impl<'a> DatadirModification<'a> {

    /// Extend relation
    /// If new size is smaller, do nothing.
-    pub fn put_rel_extend(&mut self, rel: RelTag, nblocks: BlockNumber) -> Result<()> {
-        ensure!(rel.relnode != 0, "invalid relnode");
+    pub fn put_rel_extend(&mut self, rel: RelTag, nblocks: BlockNumber) -> anyhow::Result<()> {
+        anyhow::ensure!(rel.relnode != 0, "invalid relnode");

        // Put size
        let size_key = rel_size_to_key(rel);
-        let old_size = self.get(size_key)?.get_u32_le();
+        let old_size = self.get(size_key).no_ondemand_download()?.get_u32_le();

        // only extend relation here. never decrease the size
        if nblocks > old_size {
@@ -789,12 +884,12 @@ impl<'a> DatadirModification<'a> {
    }

    /// Drop a relation.
-    pub fn put_rel_drop(&mut self, rel: RelTag) -> Result<()> {
-        ensure!(rel.relnode != 0, "invalid relnode");
+    pub fn put_rel_drop(&mut self, rel: RelTag) -> anyhow::Result<()> {
+        anyhow::ensure!(rel.relnode != 0, "invalid relnode");

        // Remove it from the directory entry
        let dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode);
-        let buf = self.get(dir_key)?;
+        let buf = self.get(dir_key).no_ondemand_download()?;
        let mut dir = RelDirectory::des(&buf)?;

        if dir.rels.remove(&(rel.relnode, rel.forknum)) {
@@ -805,7 +900,7 @@ impl<'a> DatadirModification<'a> {

        // update logical size
        let size_key = rel_size_to_key(rel);
-        let old_size = self.get(size_key)?.get_u32_le();
+        let old_size = self.get(size_key).no_ondemand_download()?.get_u32_le();
        self.pending_nblocks -= old_size as i64;

        // Remove enty from relation size cache
@@ -822,14 +917,14 @@ impl<'a> DatadirModification<'a> {
        kind: SlruKind,
        segno: u32,
        nblocks: BlockNumber,
-    ) -> Result<()> {
+    ) -> anyhow::Result<()> {
        // Add it to the directory entry
        let dir_key = slru_dir_to_key(kind);
-        let buf = self.get(dir_key)?;
+        let buf = self.get(dir_key).no_ondemand_download()?;
        let mut dir = SlruSegmentDirectory::des(&buf)?;

        if !dir.segments.insert(segno) {
-            bail!("slru segment {:?}/{} already exists", kind, segno);
+            anyhow::bail!("slru segment {kind:?}/{segno} already exists");
        }
        self.put(
            dir_key,
@@ -852,7 +947,7 @@ impl<'a> DatadirModification<'a> {
        kind: SlruKind,
        segno: u32,
        nblocks: BlockNumber,
-    ) -> Result<()> {
+    ) -> anyhow::Result<()> {
        // Put size
        let size_key = slru_segment_size_to_key(kind, segno);
        let buf = nblocks.to_le_bytes();
@@ -861,10 +956,10 @@ impl<'a> DatadirModification<'a> {
    }

    /// This method is used for marking truncated SLRU files
-    pub fn drop_slru_segment(&mut self, kind: SlruKind, segno: u32) -> Result<()> {
+    pub fn drop_slru_segment(&mut self, kind: SlruKind, segno: u32) -> anyhow::Result<()> {
        // Remove it from the directory entry
        let dir_key = slru_dir_to_key(kind);
-        let buf = self.get(dir_key)?;
+        let buf = self.get(dir_key).no_ondemand_download()?;
        let mut dir = SlruSegmentDirectory::des(&buf)?;

        if !dir.segments.remove(&segno) {
@@ -882,15 +977,15 @@ impl<'a> DatadirModification<'a> {
    }

    /// Drop a relmapper file (pg_filenode.map)
-    pub fn drop_relmap_file(&mut self, _spcnode: Oid, _dbnode: Oid) -> Result<()> {
+    pub fn drop_relmap_file(&mut self, _spcnode: Oid, _dbnode: Oid) -> anyhow::Result<()> {
        // TODO
        Ok(())
    }

    /// This method is used for marking truncated SLRU files
-    pub fn drop_twophase_file(&mut self, xid: TransactionId) -> Result<()> {
+    pub fn drop_twophase_file(&mut self, xid: TransactionId) -> anyhow::Result<()> {
        // Remove it from the directory entry
-        let buf = self.get(TWOPHASEDIR_KEY)?;
+        let buf = self.get(TWOPHASEDIR_KEY).no_ondemand_download()?;
        let mut dir = TwoPhaseDirectory::des(&buf)?;

        if !dir.xids.remove(&xid) {
@@ -925,7 +1020,7 @@ impl<'a> DatadirModification<'a> {
    /// retains all the metadata, but data pages are flushed. That's again OK
    /// for bulk import, where you are just loading data pages and won't try to
    /// modify the same pages twice.
-    pub fn flush(&mut self) -> Result<()> {
+    pub fn flush(&mut self) -> anyhow::Result<()> {
        // Unless we have accumulated a decent amount of changes, it's not worth it
        // to scan through the pending_updates list.
        let pending_nblocks = self.pending_nblocks;
@@ -936,7 +1031,7 @@ impl<'a> DatadirModification<'a> {
        let writer = self.tline.writer();

        // Flush relation and  SLRU data blocks, keep metadata.
-        let mut result: Result<()> = Ok(());
+        let mut result: anyhow::Result<()> = Ok(());
        self.pending_updates.retain(|&key, value| {
            if result.is_ok() && (is_rel_block_key(key) || is_slru_block_key(key)) {
                result = writer.put(key, self.lsn, value);
@@ -984,7 +1079,7 @@ impl<'a> DatadirModification<'a> {

    // Internal helper functions to batch the modifications

-    fn get(&self, key: Key) -> Result<Bytes> {
+    fn get(&self, key: Key) -> PageReconstructResult<Bytes> {
        // Have we already updated the same key? Read the pending updated
        // version in that case.
        //
@@ -992,14 +1087,14 @@ impl<'a> DatadirModification<'a> {
        // value that has been removed, deletion only avoids leaking storage.
        if let Some(value) = self.pending_updates.get(&key) {
            if let Value::Image(img) = value {
-                Ok(img.clone())
+                PageReconstructResult::Success(img.clone())
            } else {
                // Currently, we never need to read back a WAL record that we
                // inserted in the same "transaction". All the metadata updates
                // work directly with Images, and we never need to read actual
                // data pages. We could handle this if we had to, by calling
                // the walredo manager, but let's keep it simple for now.
-                bail!("unexpected pending WAL record");
+                PageReconstructResult::from(anyhow::anyhow!("unexpected pending WAL record"))
            }
        } else {
            let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn);
@@ -1327,7 +1422,7 @@ fn twophase_key_range(xid: TransactionId) -> Range<Key> {
        field2: 0,
        field3: 0,
        field4: 0,
-        field5: if overflowed { 1 } else { 0 },
+        field5: u8::from(overflowed),
        field6: next_xid,
    }
 }
@@ -1354,7 +1449,7 @@ const CHECKPOINT_KEY: Key = Key {
 // Reverse mappings for a few Keys.
 // These are needed by WAL redo manager.

-pub fn key_to_rel_block(key: Key) -> Result<(RelTag, BlockNumber)> {
+pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> {
    Ok(match key.field1 {
        0x00 => (
            RelTag {
@@ -1365,7 +1460,7 @@ pub fn key_to_rel_block(key: Key) -> Result<(RelTag, BlockNumber)> {
            },
            key.field6,
        ),
-        _ => bail!("unexpected value kind 0x{:02x}", key.field1),
+        _ => anyhow::bail!("unexpected value kind 0x{:02x}", key.field1),
    })
 }

@@ -1384,21 +1479,21 @@ pub fn is_rel_vm_block_key(key: Key) -> bool {
        && key.field6 != 0xffffffff
 }

-pub fn key_to_slru_block(key: Key) -> Result<(SlruKind, u32, BlockNumber)> {
+pub fn key_to_slru_block(key: Key) -> anyhow::Result<(SlruKind, u32, BlockNumber)> {
    Ok(match key.field1 {
        0x01 => {
            let kind = match key.field2 {
                0x00 => SlruKind::Clog,
                0x01 => SlruKind::MultiXactMembers,
                0x02 => SlruKind::MultiXactOffsets,
-                _ => bail!("unrecognized slru kind 0x{:02x}", key.field2),
+                _ => anyhow::bail!("unrecognized slru kind 0x{:02x}", key.field2),
            };
            let segno = key.field4;
            let blknum = key.field6;

            (kind, segno, blknum)
        }
-        _ => bail!("unexpected value kind 0x{:02x}", key.field1),
+        _ => anyhow::bail!("unexpected value kind 0x{:02x}", key.field1),
    })
 }

@@ -1413,7 +1508,7 @@ pub fn create_test_timeline(
    tenant: &crate::tenant::Tenant,
    timeline_id: utils::id::TimelineId,
    pg_version: u32,
-) -> Result<std::sync::Arc<Timeline>> {
+) -> anyhow::Result<std::sync::Arc<Timeline>> {
    let tline = tenant
        .create_empty_timeline(timeline_id, Lsn(8), pg_version)?
        .initialize()?;
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -25,7 +25,6 @@
 //! the current task has been requested to shut down. You can use that with
 //! Tokio select!().
 //!
-//!
 //! TODO: This would be a good place to also handle panics in a somewhat sane way.
 //! Depending on what task panics, we might want to kill the whole server, or
 //! only a single tenant or timeline.
@@ -36,6 +35,7 @@
 #![allow(clippy::declare_interior_mutable_const)]

 use std::collections::HashMap;
+use std::fmt;
 use std::future::Future;
 use std::panic::AssertUnwindSafe;
 use std::sync::atomic::{AtomicU64, Ordering};
@@ -43,9 +43,9 @@ use std::sync::{Arc, Mutex};

 use futures::FutureExt;
 use tokio::runtime::Runtime;
-use tokio::sync::watch;
 use tokio::task::JoinHandle;
 use tokio::task_local;
+use tokio_util::sync::CancellationToken;

 use tracing::{debug, error, info, warn};

@@ -135,8 +135,15 @@ pub static BACKGROUND_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
        .expect("Failed to create background op runtime")
 });

+#[derive(Debug, Clone, Copy)]
 pub struct PageserverTaskId(u64);

+impl fmt::Display for PageserverTaskId {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        self.0.fmt(f)
+    }
+}
+
 /// Each task that we track is associated with a "task ID". It's just an
 /// increasing number that we assign. Note that it is different from tokio::task::Id.
 static NEXT_TASK_ID: AtomicU64 = AtomicU64::new(1);
@@ -146,11 +153,10 @@ static TASKS: Lazy<Mutex<HashMap<u64, Arc<PageServerTask>>>> =
    Lazy::new(|| Mutex::new(HashMap::new()));

 task_local! {
-    // There is a Tokio watch channel for each task, which can be used to signal the
-    // task that it needs to shut down. This task local variable holds the receiving
-    // end of the channel. The sender is kept in the global registry, so that anyone
-    // can send the signal to request task shutdown.
-    static SHUTDOWN_RX: watch::Receiver<bool>;
+    // This is a cancellation token which will be cancelled when a task needs to shut down. The
+    // root token is kept in the global registry, so that anyone can send the signal to request
+    // task shutdown.
+    static SHUTDOWN_TOKEN: CancellationToken;

    // Each task holds reference to its own PageServerTask here.
    static CURRENT_TASK: Arc<PageServerTask>;
@@ -200,11 +206,20 @@ pub enum TaskKind {
    // Task that uploads a file to remote storage
    RemoteUploadTask,

+    // Task that downloads a file from remote storage
+    RemoteDownloadTask,
+
    // task that handles the initial downloading of all tenants
    InitialLoad,

    // task that handles attaching a tenant
    Attach,
+
+    // task that handles metrics collection
+    MetricsCollection,
+
+    // task that drives downloading layers
+    DownloadAllRemoteLayers,
 }

 #[derive(Default)]
@@ -226,8 +241,8 @@ struct PageServerTask {

    name: String,

-    // To request task shutdown, send 'true' to the channel to notify the task.
-    shutdown_tx: watch::Sender<bool>,
+    // To request task shutdown, just cancel this token.
+    cancel: CancellationToken,

    mutable: Mutex<MutableTaskState>,
 }
@@ -247,13 +262,13 @@ pub fn spawn<F>(
 where
    F: Future<Output = anyhow::Result<()>> + Send + 'static,
 {
-    let (shutdown_tx, shutdown_rx) = watch::channel(false);
+    let cancel = CancellationToken::new();
    let task_id = NEXT_TASK_ID.fetch_add(1, Ordering::Relaxed);
    let task = Arc::new(PageServerTask {
        task_id: PageserverTaskId(task_id),
        kind,
        name: name.to_string(),
-        shutdown_tx,
+        cancel: cancel.clone(),
        mutable: Mutex::new(MutableTaskState {
            tenant_id,
            timeline_id,
@@ -271,7 +286,7 @@ where
        task_name,
        task_id,
        task_cloned,
-        shutdown_rx,
+        cancel,
        shutdown_process_on_error,
        future,
    ));
@@ -288,7 +303,7 @@ async fn task_wrapper<F>(
    task_name: String,
    task_id: u64,
    task: Arc<PageServerTask>,
-    shutdown_rx: watch::Receiver<bool>,
+    shutdown_token: CancellationToken,
    shutdown_process_on_error: bool,
    future: F,
 ) where
@@ -296,9 +311,9 @@ async fn task_wrapper<F>(
 {
    debug!("Starting task '{}'", task_name);

-    let result = SHUTDOWN_RX
+    let result = SHUTDOWN_TOKEN
        .scope(
-            shutdown_rx,
+            shutdown_token,
            CURRENT_TASK.scope(task, {
                // We use AssertUnwindSafe here so that the payload function
                // doesn't need to be UnwindSafe. We don't do anything after the
@@ -408,7 +423,7 @@ pub async fn shutdown_tasks(
                && (tenant_id.is_none() || task_mut.tenant_id == tenant_id)
                && (timeline_id.is_none() || task_mut.timeline_id == timeline_id)
            {
-                let _ = task.shutdown_tx.send_replace(true);
+                task.cancel.cancel();
                victim_tasks.push(Arc::clone(task));
            }
        }
@@ -436,24 +451,35 @@ pub fn current_task_kind() -> Option<TaskKind> {
    CURRENT_TASK.try_with(|ct| ct.kind).ok()
 }

+pub fn current_task_id() -> Option<PageserverTaskId> {
+    CURRENT_TASK.try_with(|ct| ct.task_id).ok()
+}
+
 /// A Future that can be used to check if the current task has been requested to
 /// shut down.
 pub async fn shutdown_watcher() {
-    let mut shutdown_rx = SHUTDOWN_RX
-        .try_with(|rx| rx.clone())
+    let token = SHUTDOWN_TOKEN
+        .try_with(|t| t.clone())
        .expect("shutdown_requested() called in an unexpected task or thread");

-    while !*shutdown_rx.borrow() {
-        if shutdown_rx.changed().await.is_err() {
-            break;
-        }
-    }
+    token.cancelled().await;
+}
+
+/// Clone the current task's cancellation token, which can be moved across tasks.
+///
+/// When the task which is currently executing is shutdown, the cancellation token will be
+/// cancelled. It can however be moved to other tasks, such as `tokio::task::spawn_blocking` or
+/// `tokio::task::JoinSet::spawn`.
+pub fn shutdown_token() -> CancellationToken {
+    SHUTDOWN_TOKEN
+        .try_with(|t| t.clone())
+        .expect("shutdown_token() called in an unexpected task or thread")
 }

 /// Has the current task been requested to shut down?
 pub fn is_shutdown_requested() -> bool {
-    if let Ok(shutdown_rx) = SHUTDOWN_RX.try_with(|rx| rx.clone()) {
-        *shutdown_rx.borrow()
+    if let Ok(cancel) = SHUTDOWN_TOKEN.try_with(|t| t.clone()) {
+        cancel.is_cancelled()
    } else {
        if !cfg!(test) {
            warn!("is_shutdown_requested() called in an unexpected task or thread");
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -45,24 +45,25 @@ use std::sync::{Mutex, RwLock};
 use std::time::{Duration, Instant};

 use self::metadata::TimelineMetadata;
+use self::remote_timeline_client::RemoteTimelineClient;
 use crate::config::PageServerConf;
 use crate::import_datadir;
 use crate::is_uninit_mark;
 use crate::metrics::{remove_tenant_metrics, STORAGE_TIME};
 use crate::repository::GcResult;
-use crate::storage_sync::create_remote_timeline_client;
-use crate::storage_sync::index::IndexPart;
-use crate::storage_sync::list_remote_timelines;
-use crate::storage_sync::RemoteTimelineClient;
 use crate::task_mgr;
 use crate::task_mgr::TaskKind;
+use crate::tenant::config::TenantConfOpt;
 use crate::tenant::metadata::load_metadata;
+use crate::tenant::remote_timeline_client::index::IndexPart;
+use crate::tenant::storage_layer::DeltaLayer;
+use crate::tenant::storage_layer::ImageLayer;
 use crate::tenant::storage_layer::Layer;
-use crate::tenant_config::TenantConfOpt;
+
 use crate::virtual_file::VirtualFile;
 use crate::walredo::PostgresRedoManager;
 use crate::walredo::WalRedoManager;
-use crate::{CheckpointConfig, TEMP_FILE_SUFFIX};
+use crate::TEMP_FILE_SUFFIX;
 pub use pageserver_api::models::TenantState;

 use toml_edit;
@@ -74,23 +75,25 @@ use utils::{

 mod blob_io;
 pub mod block_io;
-mod delta_layer;
 mod disk_btree;
 pub(crate) mod ephemeral_file;
-pub mod filename;
-mod image_layer;
-mod inmemory_layer;
 pub mod layer_map;

 pub mod metadata;
 mod par_fsync;
+mod remote_timeline_client;
 pub mod storage_layer;

+pub mod config;
+pub mod mgr;
+pub mod tasks;
+pub mod upload_queue;
+
 mod timeline;

 pub mod size;

-pub use timeline::Timeline;
+pub use timeline::{with_ondemand_download, PageReconstructError, PageReconstructResult, Timeline};

 // re-export this function so that page_cache.rs can use it.
 pub use crate::tenant::ephemeral_file::writeback as writeback_ephemeral_file;
@@ -125,11 +128,11 @@ pub struct Tenant {
    timelines: Mutex<HashMap<TimelineId, Arc<Timeline>>>,
    // This mutex prevents creation of new timelines during GC.
    // Adding yet another mutex (in addition to `timelines`) is needed because holding
-    // `timelines` mutex during all GC iteration (especially with enforced checkpoint)
+    // `timelines` mutex during all GC iteration
    // may block for a long time `get_timeline`, `get_timelines_state`,... and other operations
    // with timelines, which in turn may cause dropping replication connection, expiration of wait_for_lsn
    // timeout...
-    gc_cs: Mutex<()>,
+    gc_cs: tokio::sync::Mutex<()>,
    walredo_mgr: Arc<dyn WalRedoManager + Send + Sync>,

    // provides access to timeline data sitting in the remote storage
@@ -249,7 +252,7 @@ impl UninitializedTimeline<'_> {
                .context("Failed to import basebackup")
        })?;

-        // Flush loop needs to be spawned in order for checkpoint to be able to flush.
+        // Flush loop needs to be spawned in order to be able to flush.
        // We want to run proper checkpoint before we mark timeline as available to outside world
        // Thus spawning flush loop manually and skipping flush_loop setup in initialize_with_lock
        raw_timeline.maybe_spawn_flush_loop();
@@ -259,9 +262,9 @@ impl UninitializedTimeline<'_> {
        });

        raw_timeline
-            .checkpoint(CheckpointConfig::Flush)
+            .freeze_and_flush()
            .await
-            .context("Failed to checkpoint after basebackup import")?;
+            .context("Failed to flush after basebackup import")?;

        let timeline = self.initialize()?;

@@ -336,7 +339,7 @@ impl TimelineUninitMark {
        let uninit_mark_parent = uninit_mark_file
            .parent()
            .with_context(|| format!("Uninit mark file {uninit_mark_file:?} has no parent"))?;
-        ignore_absent_files(|| fs::remove_file(&uninit_mark_file)).with_context(|| {
+        ignore_absent_files(|| fs::remove_file(uninit_mark_file)).with_context(|| {
            format!("Failed to remove uninit mark file at path {uninit_mark_file:?}")
        })?;
        crashsafe::fsync(uninit_mark_parent).context("Failed to fsync uninit mark parent")?;
@@ -371,7 +374,7 @@ impl Drop for TimelineUninitMark {

 // We should not blindly overwrite local metadata with remote one.
 // For example, consider the following case:
-//     Checkpoint comes, we update local metadata and start upload task but after that
+//     Image layer is flushed to disk as a new delta layer, we update local metadata and start upload task but after that
 //     pageserver crashes. During startup we'll load new metadata, and then reset it
 //     to the state of remote one. But current layermap will have layers from the old
 //     metadata which is inconsistent.
@@ -480,7 +483,7 @@ impl Tenant {
            let timeline = UninitializedTimeline {
                owning_tenant: self,
                timeline_id,
-                raw_timeline: Some((Arc::new(dummy_timeline), TimelineUninitMark::dummy())),
+                raw_timeline: Some((dummy_timeline, TimelineUninitMark::dummy())),
            };
            // Do not start walreceiver here. We do need loaded layer map for reconcile_with_remote
            // But we shouldnt start walreceiver before we have all the data locally, because working walreceiver
@@ -510,7 +513,7 @@ impl Tenant {
                        )
                        })?;
                    broken_timeline.set_state(TimelineState::Broken);
-                    timelines_accessor.insert(timeline_id, Arc::new(broken_timeline));
+                    timelines_accessor.insert(timeline_id, broken_timeline);
                    Err(e)
                }
            }
@@ -645,8 +648,12 @@ impl Tenant {
            .as_ref()
            .ok_or_else(|| anyhow::anyhow!("cannot attach without remote storage"))?;

-        let remote_timelines =
-            list_remote_timelines(remote_storage, self.conf, self.tenant_id).await?;
+        let remote_timelines = remote_timeline_client::list_remote_timelines(
+            remote_storage,
+            self.conf,
+            self.tenant_id,
+        )
+        .await?;

        info!("found {} timelines", remote_timelines.len());

@@ -700,6 +707,22 @@ impl Tenant {
        Ok(())
    }

+    /// get size of all remote timelines
+    ///
+    /// This function relies on the index_part instead of listing the remote storage
+    ///
+    pub async fn get_remote_size(&self) -> anyhow::Result<u64> {
+        let mut size = 0;
+
+        for timeline in self.list_timelines().iter() {
+            if let Some(remote_client) = &timeline.remote_client {
+                size += remote_client.get_remote_physical_size();
+            }
+        }
+
+        Ok(size)
+    }
+
    #[instrument(skip(self, index_part, remote_metadata, remote_storage), fields(timeline_id=%timeline_id))]
    async fn load_remote_timeline(
        &self,
@@ -714,7 +737,7 @@ impl Tenant {
            .context("Failed to create new timeline directory")?;

        let remote_client =
-            create_remote_timeline_client(remote_storage, self.conf, self.tenant_id, timeline_id)?;
+            RemoteTimelineClient::new(remote_storage, self.conf, self.tenant_id, timeline_id)?;

        let ancestor = if let Some(ancestor_id) = remote_metadata.ancestor_timeline() {
            let timelines = self.timelines.lock().unwrap();
@@ -976,7 +999,7 @@ impl Tenant {
            .remote_storage
            .as_ref()
            .map(|remote_storage| {
-                create_remote_timeline_client(
+                RemoteTimelineClient::new(
                    remote_storage.clone(),
                    self.conf,
                    self.tenant_id,
@@ -1142,7 +1165,8 @@ impl Tenant {
                    ancestor_timeline.wait_lsn(*lsn).await?;
                }

-                self.branch_timeline(ancestor_timeline_id, new_timeline_id, ancestor_start_lsn)?
+                self.branch_timeline(ancestor_timeline_id, new_timeline_id, ancestor_start_lsn)
+                    .await?
            }
            None => self.bootstrap_timeline(new_timeline_id, pg_version).await?,
        };
@@ -1154,17 +1178,20 @@ impl Tenant {
    /// this function is periodically called by gc task.
    /// also it can be explicitly requested through page server api 'do_gc' command.
    ///
-    /// 'target_timeline_id' specifies the timeline to GC, or None for all.
-    /// `horizon` specifies delta from last lsn to preserve all object versions (pitr interval).
-    /// `checkpoint_before_gc` parameter is used to force compaction of storage before GC
-    /// to make tests more deterministic.
-    /// TODO Do we still need it or we can call checkpoint explicitly in tests where needed?
+    /// `target_timeline_id` specifies the timeline to GC, or None for all.
+    ///
+    /// The `horizon` and `pitr` parameters determine how much WAL history needs to be retained.
+    /// Also known as the retention period, or the GC cutoff point. `horizon` specifies
+    /// the amount of history, as LSN difference from current latest LSN on each timeline.
+    /// `pitr` specifies the same as a time difference from the current time. The effective
+    /// GC cutoff point is determined conservatively by either `horizon` or `pitr`, whichever
+    /// requires more history to be retained.
+    //
    pub async fn gc_iteration(
        &self,
        target_timeline_id: Option<TimelineId>,
        horizon: u64,
        pitr: Duration,
-        checkpoint_before_gc: bool,
    ) -> anyhow::Result<GcResult> {
        anyhow::ensure!(
            self.is_active(),
@@ -1179,7 +1206,7 @@ impl Tenant {
            let _timer = STORAGE_TIME
                .with_label_values(&["gc", &self.tenant_id.to_string(), &timeline_str])
                .start_timer();
-            self.gc_iteration_internal(target_timeline_id, horizon, pitr, checkpoint_before_gc)
+            self.gc_iteration_internal(target_timeline_id, horizon, pitr)
                .await
        }
    }
@@ -1222,24 +1249,21 @@ impl Tenant {
    ///
    /// Used at graceful shutdown.
    ///
-    pub async fn checkpoint(&self) -> anyhow::Result<()> {
+    pub async fn freeze_and_flush(&self) -> anyhow::Result<()> {
        // Scan through the hashmap and collect a list of all the timelines,
        // while holding the lock. Then drop the lock and actually perform the
-        // checkpoints. We don't want to block everything else while the
-        // checkpoint runs.
-        let timelines_to_checkpoint = {
+        // flushing. We don't want to block everything else while the
+        // flushing is performed.
+        let timelines_to_flush = {
            let timelines = self.timelines.lock().unwrap();
            timelines
                .iter()
-                .map(|(id, timeline)| (*id, Arc::clone(timeline)))
+                .map(|(_id, timeline)| Arc::clone(timeline))
                .collect::<Vec<_>>()
        };

-        for (id, timeline) in &timelines_to_checkpoint {
-            timeline
-                .checkpoint(CheckpointConfig::Flush)
-                .instrument(info_span!("checkpoint", timeline = %id, tenant = %self.tenant_id))
-                .await?;
+        for timeline in &timelines_to_flush {
+            timeline.freeze_and_flush().await?;
        }

        Ok(())
@@ -1274,26 +1298,62 @@ impl Tenant {
            timeline
        };

-        info!("waiting for layer_removal_cs.lock()");
-        // No timeout here, GC & Compaction should be responsive to the `TimelineState::Stopping` change.
-        let layer_removal_guard = timeline.layer_removal_cs.lock().await;
-        info!("got layer_removal_cs.lock(), deleting layer files");
+        // Now that the Timeline is in Stopping state, request all the related tasks to
+        // shut down.
+        //
+        // NB: If you call delete_timeline multiple times concurrently, they will
+        // all go through the motions here. Make sure the code here is idempotent,
+        // and don't error out if some of the shutdown tasks have already been
+        // completed!

-        // NB: storage_sync upload tasks that reference these layers have been cancelled
-        //     by the caller.
+        // Stop the walreceiver first.
+        debug!("waiting for wal receiver to shutdown");
+        task_mgr::shutdown_tasks(
+            Some(TaskKind::WalReceiverManager),
+            Some(self.tenant_id),
+            Some(timeline_id),
+        )
+        .await;
+        debug!("wal receiver shutdown confirmed");

-        let local_timeline_directory = self.conf.timeline_path(&timeline_id, &self.tenant_id);
-        // XXX make this atomic so that, if we crash-mid-way, the timeline won't be picked up
-        // with some layers missing.
-        std::fs::remove_dir_all(&local_timeline_directory).with_context(|| {
-            format!(
-                "Failed to remove local timeline directory '{}'",
-                local_timeline_directory.display()
-            )
-        })?;
-        info!("finished deleting layer files, releasing layer_removal_cs.lock()");
+        info!("waiting for timeline tasks to shutdown");
+        task_mgr::shutdown_tasks(None, Some(self.tenant_id), Some(timeline_id)).await;

-        drop(layer_removal_guard);
+        {
+            // Grab the layer_removal_cs lock, and actually perform the deletion.
+            //
+            // This lock prevents multiple concurrent delete_timeline calls from
+            // stepping on each other's toes, while deleting the files. It also
+            // prevents GC or compaction from running at the same time.
+            //
+            // Note that there are still other race conditions between
+            // GC, compaction and timeline deletion. GC task doesn't
+            // register itself properly with the timeline it's
+            // operating on. See
+            // https://github.com/neondatabase/neon/issues/2671
+            //
+            // No timeout here, GC & Compaction should be responsive to the
+            // `TimelineState::Stopping` change.
+            info!("waiting for layer_removal_cs.lock()");
+            let layer_removal_guard = timeline.layer_removal_cs.lock().await;
+            info!("got layer_removal_cs.lock(), deleting layer files");
+
+            // NB: storage_sync upload tasks that reference these layers have been cancelled
+            //     by the caller.
+
+            let local_timeline_directory = self.conf.timeline_path(&timeline_id, &self.tenant_id);
+            // XXX make this atomic so that, if we crash-mid-way, the timeline won't be picked up
+            // with some layers missing.
+            std::fs::remove_dir_all(&local_timeline_directory).with_context(|| {
+                format!(
+                    "Failed to remove local timeline directory '{}'",
+                    local_timeline_directory.display()
+                )
+            })?;
+
+            info!("finished deleting layer files, releasing layer_removal_cs.lock()");
+            drop(layer_removal_guard);
+        }

        // Remove the timeline from the map.
        let mut timelines = self.timelines.lock().unwrap();
@@ -1371,7 +1431,7 @@ impl Tenant {

                    // Spawn gc and compaction loops. The loops will shut themselves
                    // down when they notice that the tenant is inactive.
-                    crate::tenant_tasks::start_background_loops(self.tenant_id);
+                    tasks::start_background_loops(self.tenant_id);

                    for timeline in not_broken_timelines {
                        timeline.set_state(TimelineState::Active);
@@ -1595,7 +1655,7 @@ impl Tenant {
        new_metadata: TimelineMetadata,
        ancestor: Option<Arc<Timeline>>,
        remote_client: Option<RemoteTimelineClient>,
-    ) -> anyhow::Result<Timeline> {
+    ) -> anyhow::Result<Arc<Timeline>> {
        if let Some(ancestor_timeline_id) = new_metadata.ancestor_timeline() {
            anyhow::ensure!(
                ancestor.is_some(),
@@ -1631,7 +1691,7 @@ impl Tenant {
            conf,
            tenant_conf: Arc::new(RwLock::new(tenant_conf)),
            timelines: Mutex::new(HashMap::new()),
-            gc_cs: Mutex::new(()),
+            gc_cs: tokio::sync::Mutex::new(()),
            walredo_mgr,
            remote_storage,
            state,
@@ -1778,12 +1838,13 @@ impl Tenant {
        target_timeline_id: Option<TimelineId>,
        horizon: u64,
        pitr: Duration,
-        checkpoint_before_gc: bool,
    ) -> anyhow::Result<GcResult> {
        let mut totals: GcResult = Default::default();
        let now = Instant::now();

-        let gc_timelines = self.refresh_gc_info_internal(target_timeline_id, horizon, pitr)?;
+        let gc_timelines = self
+            .refresh_gc_info_internal(target_timeline_id, horizon, pitr)
+            .await?;

        utils::failpoint_sleep_millis_async!("gc_iteration_internal_after_getting_gc_timelines");

@@ -1805,18 +1866,6 @@ impl Tenant {
                // made.
                break;
            }
-
-            // If requested, force flush all in-memory layers to disk first,
-            // so that they too can be garbage collected. That's
-            // used in tests, so we want as deterministic results as possible.
-            if checkpoint_before_gc {
-                timeline.checkpoint(CheckpointConfig::Forced).await?;
-                info!(
-                    "timeline {} checkpoint_before_gc done",
-                    timeline.timeline_id
-                );
-            }
-
            let result = timeline.gc().await?;
            totals += result;
        }
@@ -1830,7 +1879,7 @@ impl Tenant {
    /// [`Tenant::get_gc_horizon`].
    ///
    /// This is usually executed as part of periodic gc, but can now be triggered more often.
-    pub fn refresh_gc_info(&self) -> anyhow::Result<Vec<Arc<Timeline>>> {
+    pub async fn refresh_gc_info(&self) -> anyhow::Result<Vec<Arc<Timeline>>> {
        // since this method can now be called at different rates than the configured gc loop, it
        // might be that these configuration values get applied faster than what it was previously,
        // since these were only read from the gc task.
@@ -1841,54 +1890,60 @@ impl Tenant {
        let target_timeline_id = None;

        self.refresh_gc_info_internal(target_timeline_id, horizon, pitr)
+            .await
    }

-    fn refresh_gc_info_internal(
+    async fn refresh_gc_info_internal(
        &self,
        target_timeline_id: Option<TimelineId>,
        horizon: u64,
        pitr: Duration,
    ) -> anyhow::Result<Vec<Arc<Timeline>>> {
        // grab mutex to prevent new timelines from being created here.
-        let gc_cs = self.gc_cs.lock().unwrap();
-
-        let timelines = self.timelines.lock().unwrap();
+        let gc_cs = self.gc_cs.lock().await;

        // Scan all timelines. For each timeline, remember the timeline ID and
        // the branch point where it was created.
-        let mut all_branchpoints: BTreeSet<(TimelineId, Lsn)> = BTreeSet::new();
-        let timeline_ids = {
-            if let Some(target_timeline_id) = target_timeline_id.as_ref() {
-                if timelines.get(target_timeline_id).is_none() {
-                    bail!("gc target timeline does not exist")
-                }
-            };
+        let (all_branchpoints, timeline_ids): (BTreeSet<(TimelineId, Lsn)>, _) = {
+            let timelines = self.timelines.lock().unwrap();
+            let mut all_branchpoints = BTreeSet::new();
+            let timeline_ids = {
+                if let Some(target_timeline_id) = target_timeline_id.as_ref() {
+                    if timelines.get(target_timeline_id).is_none() {
+                        bail!("gc target timeline does not exist")
+                    }
+                };

-            timelines
-                .iter()
-                .map(|(timeline_id, timeline_entry)| {
-                    if let Some(ancestor_timeline_id) = &timeline_entry.get_ancestor_timeline_id() {
-                        // If target_timeline is specified, we only need to know branchpoints of its children
-                        if let Some(timeline_id) = target_timeline_id {
-                            if ancestor_timeline_id == &timeline_id {
+                timelines
+                    .iter()
+                    .map(|(timeline_id, timeline_entry)| {
+                        if let Some(ancestor_timeline_id) =
+                            &timeline_entry.get_ancestor_timeline_id()
+                        {
+                            // If target_timeline is specified, we only need to know branchpoints of its children
+                            if let Some(timeline_id) = target_timeline_id {
+                                if ancestor_timeline_id == &timeline_id {
+                                    all_branchpoints.insert((
+                                        *ancestor_timeline_id,
+                                        timeline_entry.get_ancestor_lsn(),
+                                    ));
+                                }
+                            }
+                            // Collect branchpoints for all timelines
+                            else {
                                all_branchpoints.insert((
                                    *ancestor_timeline_id,
                                    timeline_entry.get_ancestor_lsn(),
                                ));
                            }
                        }
-                        // Collect branchpoints for all timelines
-                        else {
-                            all_branchpoints
-                                .insert((*ancestor_timeline_id, timeline_entry.get_ancestor_lsn()));
-                        }
-                    }

-                    *timeline_id
-                })
-                .collect::<Vec<_>>()
+                        *timeline_id
+                    })
+                    .collect::<Vec<_>>()
+            };
+            (all_branchpoints, timeline_ids)
        };
-        drop(timelines);

        // Ok, we now know all the branch points.
        // Update the GC information for each timeline.
@@ -1914,7 +1969,7 @@ impl Tenant {
                    ))
                    .map(|&x| x.1)
                    .collect();
-                timeline.update_gc_info(branchpoints, cutoff, pitr)?;
+                timeline.update_gc_info(branchpoints, cutoff, pitr).await?;

                gc_timelines.push(timeline);
            }
@@ -1924,7 +1979,7 @@ impl Tenant {
    }

    /// Branch an existing timeline
-    fn branch_timeline(
+    async fn branch_timeline(
        &self,
        src: TimelineId,
        dst: TimelineId,
@@ -1933,10 +1988,11 @@ impl Tenant {
        // We need to hold this lock to prevent GC from starting at the same time. GC scans the directory to learn
        // about timelines, so otherwise a race condition is possible, where we create new timeline and GC
        // concurrently removes data that is needed by the new timeline.
-        let _gc_cs = self.gc_cs.lock().unwrap();
-        let timelines = self.timelines.lock().unwrap();
-        let timeline_uninit_mark = self.create_timeline_uninit_mark(dst, &timelines)?;
-        drop(timelines);
+        let _gc_cs = self.gc_cs.lock().await;
+        let timeline_uninit_mark = {
+            let timelines = self.timelines.lock().unwrap();
+            self.create_timeline_uninit_mark(dst, &timelines)?
+        };

        // In order for the branch creation task to not wait for GC/compaction,
        // we need to make sure that the starting LSN of the child branch is not out of scope midway by
@@ -2105,8 +2161,13 @@ impl Tenant {
        });

        unfinished_timeline
-            .checkpoint(CheckpointConfig::Flush).await
-            .with_context(|| format!("Failed to checkpoint after pgdatadir import for timeline {tenant_id}/{timeline_id}"))?;
+            .freeze_and_flush()
+            .await
+            .with_context(|| {
+                format!(
+                    "Failed to flush after pgdatadir import for timeline {tenant_id}/{timeline_id}"
+                )
+            })?;

        let timeline = {
            let mut timelines = self.timelines.lock().unwrap();
@@ -2135,7 +2196,7 @@ impl Tenant {
        let tenant_id = self.tenant_id;

        let remote_client = if let Some(remote_storage) = self.remote_storage.as_ref() {
-            let remote_client = create_remote_timeline_client(
+            let remote_client = RemoteTimelineClient::new(
                remote_storage.clone(),
                self.conf,
                tenant_id,
@@ -2165,7 +2226,7 @@ impl Tenant {
                Ok(UninitializedTimeline {
                    owning_tenant: self,
                    timeline_id: new_timeline_id,
-                    raw_timeline: Some((Arc::new(new_timeline), uninit_mark)),
+                    raw_timeline: Some((new_timeline, uninit_mark)),
                })
            }
            Err(e) => {
@@ -2183,7 +2244,7 @@ impl Tenant {
        new_metadata: TimelineMetadata,
        ancestor: Option<Arc<Timeline>>,
        remote_client: Option<RemoteTimelineClient>,
-    ) -> anyhow::Result<Timeline> {
+    ) -> anyhow::Result<Arc<Timeline>> {
        let timeline_data = self
            .create_timeline_data(
                new_timeline_id,
@@ -2266,12 +2327,12 @@ impl Tenant {
        // See more for on the issue #2748 condenced out of the initial PR review.
        let mut shared_cache = self.cached_logical_sizes.lock().await;

-        size::gather_inputs(self, logical_sizes_at_once, &mut *shared_cache).await
+        size::gather_inputs(self, logical_sizes_at_once, &mut shared_cache).await
    }
 }

 fn remove_timeline_and_uninit_mark(timeline_dir: &Path, uninit_mark: &Path) -> anyhow::Result<()> {
-    fs::remove_dir_all(&timeline_dir)
+    fs::remove_dir_all(timeline_dir)
        .or_else(|e| {
            if e.kind() == std::io::ErrorKind::NotFound {
                // we can leave the uninit mark without a timeline dir,
@@ -2287,7 +2348,7 @@ fn remove_timeline_and_uninit_mark(timeline_dir: &Path, uninit_mark: &Path) -> a
                timeline_dir.display()
            )
        })?;
-    fs::remove_file(&uninit_mark).with_context(|| {
+    fs::remove_file(uninit_mark).with_context(|| {
        format!(
            "Failed to remove timeline uninit mark file {}",
            uninit_mark.display()
@@ -2387,7 +2448,7 @@ fn try_create_target_tenant_dir(
        anyhow::bail!("failpoint tenant-creation-before-tmp-rename");
    });

-    fs::rename(&temporary_tenant_dir, target_tenant_directory).with_context(|| {
+    fs::rename(temporary_tenant_dir, target_tenant_directory).with_context(|| {
        format!(
            "failed to move tenant {} temporary directory {} into the permanent one {}",
            tenant_id,
@@ -2441,9 +2502,9 @@ fn run_initdb(
    );

    let initdb_output = Command::new(&initdb_bin_path)
-        .args(&["-D", &initdb_target_dir.to_string_lossy()])
-        .args(&["-U", &conf.superuser])
-        .args(&["-E", "utf8"])
+        .args(["-D", &initdb_target_dir.to_string_lossy()])
+        .args(["-U", &conf.superuser])
+        .args(["-E", "utf8"])
        .arg("--no-instructions")
        // This is only used for a temporary installation that is deleted shortly after,
        // so no need to fsync it
@@ -2486,12 +2547,8 @@ pub fn dump_layerfile_from_path(path: &Path, verbose: bool) -> anyhow::Result<()
    file.read_exact_at(&mut header_buf, 0)?;

    match u16::from_be_bytes(header_buf) {
-        crate::IMAGE_FILE_MAGIC => {
-            image_layer::ImageLayer::new_for_path(path, file)?.dump(verbose)?
-        }
-        crate::DELTA_FILE_MAGIC => {
-            delta_layer::DeltaLayer::new_for_path(path, file)?.dump(verbose)?
-        }
+        crate::IMAGE_FILE_MAGIC => ImageLayer::new_for_path(path, file)?.dump(verbose)?,
+        crate::DELTA_FILE_MAGIC => DeltaLayer::new_for_path(path, file)?.dump(verbose)?,
        magic => bail!("unrecognized magic identifier: {:?}", magic),
    }

@@ -2528,7 +2585,7 @@ pub mod harness {
    };

    use super::*;
-    use crate::tenant_config::{TenantConf, TenantConfOpt};
+    use crate::tenant::config::{TenantConf, TenantConfOpt};
    use hex_literal::hex;
    use utils::id::{TenantId, TimelineId};

@@ -2605,9 +2662,11 @@ pub mod harness {

            // Disable automatic GC and compaction to make the unit tests more deterministic.
            // The tests perform them manually if needed.
-            let mut tenant_conf = TenantConf::dummy_conf();
-            tenant_conf.gc_period = Duration::ZERO;
-            tenant_conf.compaction_period = Duration::ZERO;
+            let tenant_conf = TenantConf {
+                gc_period: Duration::ZERO,
+                compaction_period: Duration::ZERO,
+                ..TenantConf::default()
+            };

            let tenant_id = TenantId::generate();
            fs::create_dir_all(conf.tenant_path(&tenant_id))?;
@@ -2726,9 +2785,18 @@ mod tests {
        writer.finish_write(Lsn(0x20));
        drop(writer);

-        assert_eq!(tline.get(*TEST_KEY, Lsn(0x10))?, TEST_IMG("foo at 0x10"));
-        assert_eq!(tline.get(*TEST_KEY, Lsn(0x1f))?, TEST_IMG("foo at 0x10"));
-        assert_eq!(tline.get(*TEST_KEY, Lsn(0x20))?, TEST_IMG("foo at 0x20"));
+        assert_eq!(
+            tline.get(*TEST_KEY, Lsn(0x10)).no_ondemand_download()?,
+            TEST_IMG("foo at 0x10")
+        );
+        assert_eq!(
+            tline.get(*TEST_KEY, Lsn(0x1f)).no_ondemand_download()?,
+            TEST_IMG("foo at 0x10")
+        );
+        assert_eq!(
+            tline.get(*TEST_KEY, Lsn(0x20)).no_ondemand_download()?,
+            TEST_IMG("foo at 0x20")
+        );

        Ok(())
    }
@@ -2793,7 +2861,9 @@ mod tests {
        //assert_current_logical_size(&tline, Lsn(0x40));

        // Branch the history, modify relation differently on the new timeline
-        tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x30)))?;
+        tenant
+            .branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x30)))
+            .await?;
        let newtline = tenant
            .get_timeline(NEW_TIMELINE_ID, true)
            .expect("Should have a local timeline");
@@ -2803,15 +2873,15 @@ mod tests {

        // Check page contents on both branches
        assert_eq!(
-            from_utf8(&tline.get(TEST_KEY_A, Lsn(0x40))?)?,
+            from_utf8(&tline.get(TEST_KEY_A, Lsn(0x40)).no_ondemand_download()?)?,
            "foo at 0x40"
        );
        assert_eq!(
-            from_utf8(&newtline.get(TEST_KEY_A, Lsn(0x40))?)?,
+            from_utf8(&newtline.get(TEST_KEY_A, Lsn(0x40)).no_ondemand_download()?)?,
            "bar at 0x40"
        );
        assert_eq!(
-            from_utf8(&newtline.get(TEST_KEY_B, Lsn(0x40))?)?,
+            from_utf8(&newtline.get(TEST_KEY_B, Lsn(0x40)).no_ondemand_download()?)?,
            "foobar at 0x20"
        );

@@ -2841,7 +2911,7 @@ mod tests {
            writer.finish_write(lsn);
            lsn += 0x10;
        }
-        tline.checkpoint(CheckpointConfig::Forced).await?;
+        tline.freeze_and_flush().await?;
        {
            let writer = tline.writer();
            writer.put(
@@ -2858,7 +2928,7 @@ mod tests {
            )?;
            writer.finish_write(lsn);
        }
-        tline.checkpoint(CheckpointConfig::Forced).await
+        tline.freeze_and_flush().await
    }

    #[tokio::test]
@@ -2873,15 +2943,18 @@ mod tests {
        make_some_layers(tline.as_ref(), Lsn(0x20)).await?;

        // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50
-        // FIXME: this doesn't actually remove any layer currently, given how the checkpointing
+        // FIXME: this doesn't actually remove any layer currently, given how the flushing
        // and compaction works. But it does set the 'cutoff' point so that the cross check
        // below should fail.
        tenant
-            .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)
+            .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO)
            .await?;

        // try to branch at lsn 25, should fail because we already garbage collected the data
-        match tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25))) {
+        match tenant
+            .branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25)))
+            .await
+        {
            Ok(_) => panic!("branching should have failed"),
            Err(err) => {
                assert!(err.to_string().contains("invalid branch start lsn"));
@@ -2906,7 +2979,10 @@ mod tests {
            .create_empty_timeline(TIMELINE_ID, Lsn(0x50), DEFAULT_PG_VERSION)?
            .initialize()?;
        // try to branch at lsn 0x25, should fail because initdb lsn is 0x50
-        match tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25))) {
+        match tenant
+            .branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25)))
+            .await
+        {
            Ok(_) => panic!("branching should have failed"),
            Err(err) => {
                assert!(&err.to_string().contains("invalid branch start lsn"));
@@ -2933,7 +3009,7 @@ mod tests {
        let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?;
        make_some_layers(tline.as_ref(), Lsn(0x20)).await?;

-        repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?;
+        repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO)?;
        let latest_gc_cutoff_lsn = tline.get_latest_gc_cutoff_lsn();
        assert!(*latest_gc_cutoff_lsn > Lsn(0x25));
        match tline.get(*TEST_KEY, Lsn(0x25)) {
@@ -2954,15 +3030,20 @@ mod tests {
            .initialize()?;
        make_some_layers(tline.as_ref(), Lsn(0x20)).await?;

-        tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?;
+        tenant
+            .branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))
+            .await?;
        let newtline = tenant
            .get_timeline(NEW_TIMELINE_ID, true)
            .expect("Should have a local timeline");
        // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50
        tenant
-            .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)
+            .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO)
            .await?;
-        assert!(newtline.get(*TEST_KEY, Lsn(0x25)).is_ok());
+        assert!(newtline
+            .get(*TEST_KEY, Lsn(0x25))
+            .no_ondemand_download()
+            .is_ok());

        Ok(())
    }
@@ -2976,7 +3057,9 @@ mod tests {
            .initialize()?;
        make_some_layers(tline.as_ref(), Lsn(0x20)).await?;

-        tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?;
+        tenant
+            .branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))
+            .await?;
        let newtline = tenant
            .get_timeline(NEW_TIMELINE_ID, true)
            .expect("Should have a local timeline");
@@ -2985,12 +3068,12 @@ mod tests {

        // run gc on parent
        tenant
-            .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)
+            .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO)
            .await?;

        // Check that the data is still accessible on the branch.
        assert_eq!(
-            newtline.get(*TEST_KEY, Lsn(0x50))?,
+            newtline.get(*TEST_KEY, Lsn(0x50)).no_ondemand_download()?,
            TEST_IMG(&format!("foo at {}", Lsn(0x40)))
        );

@@ -3007,7 +3090,6 @@ mod tests {
                .create_empty_timeline(TIMELINE_ID, Lsn(0x8000), DEFAULT_PG_VERSION)?
                .initialize()?;
            make_some_layers(tline.as_ref(), Lsn(0x8000)).await?;
-            tline.checkpoint(CheckpointConfig::Forced).await?;
        }

        let tenant = harness.load().await;
@@ -3030,16 +3112,16 @@ mod tests {
                .initialize()?;

            make_some_layers(tline.as_ref(), Lsn(0x20)).await?;
-            tline.checkpoint(CheckpointConfig::Forced).await?;

-            tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?;
+            tenant
+                .branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))
+                .await?;

            let newtline = tenant
                .get_timeline(NEW_TIMELINE_ID, true)
                .expect("Should have a local timeline");

            make_some_layers(newtline.as_ref(), Lsn(0x60)).await?;
-            tline.checkpoint(CheckpointConfig::Forced).await?;
        }

        // check that both of them are initially unloaded
@@ -3111,7 +3193,7 @@ mod tests {
        writer.finish_write(Lsn(0x10));
        drop(writer);

-        tline.checkpoint(CheckpointConfig::Forced).await?;
+        tline.freeze_and_flush().await?;
        tline.compact().await?;

        let writer = tline.writer();
@@ -3119,7 +3201,7 @@ mod tests {
        writer.finish_write(Lsn(0x20));
        drop(writer);

-        tline.checkpoint(CheckpointConfig::Forced).await?;
+        tline.freeze_and_flush().await?;
        tline.compact().await?;

        let writer = tline.writer();
@@ -3127,7 +3209,7 @@ mod tests {
        writer.finish_write(Lsn(0x30));
        drop(writer);

-        tline.checkpoint(CheckpointConfig::Forced).await?;
+        tline.freeze_and_flush().await?;
        tline.compact().await?;

        let writer = tline.writer();
@@ -3135,21 +3217,36 @@ mod tests {
        writer.finish_write(Lsn(0x40));
        drop(writer);

-        tline.checkpoint(CheckpointConfig::Forced).await?;
+        tline.freeze_and_flush().await?;
        tline.compact().await?;

-        assert_eq!(tline.get(*TEST_KEY, Lsn(0x10))?, TEST_IMG("foo at 0x10"));
-        assert_eq!(tline.get(*TEST_KEY, Lsn(0x1f))?, TEST_IMG("foo at 0x10"));
-        assert_eq!(tline.get(*TEST_KEY, Lsn(0x20))?, TEST_IMG("foo at 0x20"));
-        assert_eq!(tline.get(*TEST_KEY, Lsn(0x30))?, TEST_IMG("foo at 0x30"));
-        assert_eq!(tline.get(*TEST_KEY, Lsn(0x40))?, TEST_IMG("foo at 0x40"));
+        assert_eq!(
+            tline.get(*TEST_KEY, Lsn(0x10)).no_ondemand_download()?,
+            TEST_IMG("foo at 0x10")
+        );
+        assert_eq!(
+            tline.get(*TEST_KEY, Lsn(0x1f)).no_ondemand_download()?,
+            TEST_IMG("foo at 0x10")
+        );
+        assert_eq!(
+            tline.get(*TEST_KEY, Lsn(0x20)).no_ondemand_download()?,
+            TEST_IMG("foo at 0x20")
+        );
+        assert_eq!(
+            tline.get(*TEST_KEY, Lsn(0x30)).no_ondemand_download()?,
+            TEST_IMG("foo at 0x30")
+        );
+        assert_eq!(
+            tline.get(*TEST_KEY, Lsn(0x40)).no_ondemand_download()?,
+            TEST_IMG("foo at 0x40")
+        );

        Ok(())
    }

    //
-    // Insert 1000 key-value pairs with increasing keys, checkpoint,
-    // repeat 50 times.
+    // Insert 1000 key-value pairs with increasing keys, flush, compact, GC.
+    // Repeat 50 times.
    //
    #[tokio::test]
    async fn test_bulk_insert() -> anyhow::Result<()> {
@@ -3184,8 +3281,10 @@ mod tests {

            let cutoff = tline.get_last_record_lsn();

-            tline.update_gc_info(Vec::new(), cutoff, Duration::ZERO)?;
-            tline.checkpoint(CheckpointConfig::Forced).await?;
+            tline
+                .update_gc_info(Vec::new(), cutoff, Duration::ZERO)
+                .await?;
+            tline.freeze_and_flush().await?;
            tline.compact().await?;
            tline.gc().await?;
        }
@@ -3248,16 +3347,17 @@ mod tests {
            for (blknum, last_lsn) in updated.iter().enumerate() {
                test_key.field6 = blknum as u32;
                assert_eq!(
-                    tline.get(test_key, lsn)?,
+                    tline.get(test_key, lsn).no_ondemand_download()?,
                    TEST_IMG(&format!("{} at {}", blknum, last_lsn))
                );
            }

-            // Perform a cycle of checkpoint, compaction, and GC
-            println!("checkpointing {}", lsn);
+            // Perform a cycle of flush, compact, and GC
            let cutoff = tline.get_last_record_lsn();
-            tline.update_gc_info(Vec::new(), cutoff, Duration::ZERO)?;
-            tline.checkpoint(CheckpointConfig::Forced).await?;
+            tline
+                .update_gc_info(Vec::new(), cutoff, Duration::ZERO)
+                .await?;
+            tline.freeze_and_flush().await?;
            tline.compact().await?;
            tline.gc().await?;
        }
@@ -3305,7 +3405,9 @@ mod tests {
        let mut tline_id = TIMELINE_ID;
        for _ in 0..50 {
            let new_tline_id = TimelineId::generate();
-            tenant.branch_timeline(tline_id, new_tline_id, Some(lsn))?;
+            tenant
+                .branch_timeline(tline_id, new_tline_id, Some(lsn))
+                .await?;
            tline = tenant
                .get_timeline(new_tline_id, true)
                .expect("Should have the branched timeline");
@@ -3331,16 +3433,17 @@ mod tests {
            for (blknum, last_lsn) in updated.iter().enumerate() {
                test_key.field6 = blknum as u32;
                assert_eq!(
-                    tline.get(test_key, lsn)?,
+                    tline.get(test_key, lsn).no_ondemand_download()?,
                    TEST_IMG(&format!("{} at {}", blknum, last_lsn))
                );
            }

-            // Perform a cycle of checkpoint, compaction, and GC
-            println!("checkpointing {}", lsn);
+            // Perform a cycle of flush, compact, and GC
            let cutoff = tline.get_last_record_lsn();
-            tline.update_gc_info(Vec::new(), cutoff, Duration::ZERO)?;
-            tline.checkpoint(CheckpointConfig::Forced).await?;
+            tline
+                .update_gc_info(Vec::new(), cutoff, Duration::ZERO)
+                .await?;
+            tline.freeze_and_flush().await?;
            tline.compact().await?;
            tline.gc().await?;
        }
@@ -3370,7 +3473,9 @@ mod tests {
        #[allow(clippy::needless_range_loop)]
        for idx in 0..NUM_TLINES {
            let new_tline_id = TimelineId::generate();
-            tenant.branch_timeline(tline_id, new_tline_id, Some(lsn))?;
+            tenant
+                .branch_timeline(tline_id, new_tline_id, Some(lsn))
+                .await?;
            tline = tenant
                .get_timeline(new_tline_id, true)
                .expect("Should have the branched timeline");
@@ -3403,7 +3508,7 @@ mod tests {
                println!("checking [{idx}][{blknum}] at {lsn}");
                test_key.field6 = blknum as u32;
                assert_eq!(
-                    tline.get(test_key, *lsn)?,
+                    tline.get(test_key, *lsn).no_ondemand_download()?,
                    TEST_IMG(&format!("{idx} {blknum} at {lsn}"))
                );
            }
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -191,11 +191,10 @@ impl TenantConfOpt {
    }
 }

-impl TenantConf {
-    pub fn default() -> TenantConf {
+impl Default for TenantConf {
+    fn default() -> Self {
        use defaults::*;
-
-        TenantConf {
+        Self {
            checkpoint_distance: DEFAULT_CHECKPOINT_DISTANCE,
            checkpoint_timeout: humantime::parse_duration(DEFAULT_CHECKPOINT_TIMEOUT)
                .expect("cannot parse default checkpoint timeout"),
@@ -220,29 +219,4 @@ impl TenantConf {
            trace_read_requests: false,
        }
    }
-
-    pub fn dummy_conf() -> Self {
-        TenantConf {
-            checkpoint_distance: defaults::DEFAULT_CHECKPOINT_DISTANCE,
-            checkpoint_timeout: Duration::from_secs(600),
-            compaction_target_size: 4 * 1024 * 1024,
-            compaction_period: Duration::from_secs(10),
-            compaction_threshold: defaults::DEFAULT_COMPACTION_THRESHOLD,
-            gc_horizon: defaults::DEFAULT_GC_HORIZON,
-            gc_period: Duration::from_secs(10),
-            image_creation_threshold: defaults::DEFAULT_IMAGE_CREATION_THRESHOLD,
-            pitr_interval: Duration::from_secs(60 * 60),
-            walreceiver_connect_timeout: humantime::parse_duration(
-                defaults::DEFAULT_WALRECEIVER_CONNECT_TIMEOUT,
-            )
-            .unwrap(),
-            lagging_wal_timeout: humantime::parse_duration(
-                defaults::DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT,
-            )
-            .unwrap(),
-            max_lsn_wal_lag: NonZeroU64::new(defaults::DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG)
-                .unwrap(),
-            trace_read_requests: false,
-        }
-    }
 }
--- a/pageserver/src/tenant/disk_btree.rs
+++ b/pageserver/src/tenant/disk_btree.rs
@@ -139,7 +139,7 @@ impl<'a, const L: usize> OnDiskNode<'a, L> {
        off += keys_len as u64;

        let values_off = off as usize;
-        let values_len = num_children as usize * VALUE_SZ as usize;
+        let values_len = num_children as usize * VALUE_SZ;
        //off += values_len as u64;

        let prefix = &buf[prefix_off..prefix_off + prefix_len as usize];
@@ -177,7 +177,7 @@ impl<'a, const L: usize> OnDiskNode<'a, L> {
        while low < high {
            let mid = low + size / 2;

-            let key_off = mid as usize * self.suffix_len as usize;
+            let key_off = mid * self.suffix_len as usize;
            let suffix = &self.keys[key_off..key_off + self.suffix_len as usize];
            // Does this match?
            keybuf[self.prefix_len as usize..].copy_from_slice(suffix);
@@ -328,7 +328,7 @@ where
            while idx < node.num_children as usize {
                let suffix = &node.keys[key_off..key_off + suffix_len];
                keybuf[prefix_len..].copy_from_slice(suffix);
-                let value = node.value(idx as usize);
+                let value = node.value(idx);
                #[allow(clippy::collapsible_if)]
                if node.level == 0 {
                    // leaf
@@ -368,7 +368,7 @@ where
                key_off -= suffix_len;
                let suffix = &node.keys[key_off..key_off + suffix_len];
                keybuf[prefix_len..].copy_from_slice(suffix);
-                let value = node.value(idx as usize);
+                let value = node.value(idx);
                #[allow(clippy::collapsible_if)]
                if node.level == 0 {
                    // leaf
@@ -629,7 +629,7 @@ impl<const L: usize> BuildNode<L> {
        self.keys.extend(&key[self.prefix.len()..]);
        self.values.extend(value.0);

-        assert!(self.keys.len() == self.num_children as usize * self.suffix_len as usize);
+        assert!(self.keys.len() == self.num_children as usize * self.suffix_len);
        assert!(self.values.len() == self.num_children as usize * VALUE_SZ);

        self.size += self.suffix_len + VALUE_SZ;
@@ -674,7 +674,7 @@ impl<const L: usize> BuildNode<L> {
        self.size -= prefix_len * self.num_children as usize;
        self.size += prefix_len;

-        assert!(self.keys.len() == self.num_children as usize * self.suffix_len as usize);
+        assert!(self.keys.len() == self.num_children as usize * self.suffix_len);
        assert!(self.values.len() == self.num_children as usize * VALUE_SZ);

        true
@@ -684,7 +684,7 @@ impl<const L: usize> BuildNode<L> {
    /// Serialize the node to on-disk format.
    ///
    fn pack(&self) -> Bytes {
-        assert!(self.keys.len() == self.num_children as usize * self.suffix_len as usize);
+        assert!(self.keys.len() == self.num_children as usize * self.suffix_len);
        assert!(self.values.len() == self.num_children as usize * VALUE_SZ);
        assert!(self.num_children > 0);

@@ -940,7 +940,7 @@ mod tests {
            let t = -(f64::ln(u));
            let key_int = (t * 1000000.0) as u128;

-            all_data.insert(key_int as u128, idx as u64);
+            all_data.insert(key_int, idx as u64);
        }

        // Build a tree from it
--- a/pageserver/src/tenant/ephemeral_file.rs
+++ b/pageserver/src/tenant/ephemeral_file.rs
@@ -91,7 +91,7 @@ impl EphemeralFile {
                break;
            }

-            off += n as usize;
+            off += n;
        }
        Ok(())
    }
--- a/pageserver/src/tenant/layer_map.rs
+++ b/pageserver/src/tenant/layer_map.rs
@@ -12,7 +12,6 @@

 use crate::metrics::NUM_ONDISK_LAYERS;
 use crate::repository::Key;
-use crate::tenant::inmemory_layer::InMemoryLayer;
 use crate::tenant::storage_layer::{range_eq, range_overlaps};
 use amplify_num::i256;
 use anyhow::Result;
@@ -27,7 +26,7 @@ use std::sync::Arc;
 use tracing::*;
 use utils::lsn::Lsn;

-use super::storage_layer::Layer;
+use super::storage_layer::{InMemoryLayer, Layer};

 ///
 /// LayerMap tracks what layers exist on a timeline.
@@ -261,7 +260,7 @@ where
    /// contain the version, even if it's missing from the returned
    /// layer.
    ///
-    pub fn search(&self, key: Key, end_lsn: Lsn) -> Result<Option<SearchResult<L>>> {
+    pub fn search(&self, key: Key, end_lsn: Lsn) -> Option<SearchResult<L>> {
        // linear search
        // Find the latest image layer that covers the given key
        let mut latest_img: Option<Arc<L>> = None;
@@ -286,10 +285,10 @@ where
            assert!(img_lsn < end_lsn);
            if Lsn(img_lsn.0 + 1) == end_lsn {
                // found exact match
-                return Ok(Some(SearchResult {
+                return Some(SearchResult {
                    layer: Arc::clone(l),
                    lsn_floor: img_lsn,
-                }));
+                });
            }
            if img_lsn > latest_img_lsn.unwrap_or(Lsn(0)) {
                latest_img = Some(Arc::clone(l));
@@ -327,14 +326,16 @@ where
                latest_delta.replace(Arc::clone(l));
                break;
            }
-            // this layer's end LSN is smaller than the requested point. If there's
-            // nothing newer, this is what we need to return. Remember this.
-            if let Some(old_candidate) = &latest_delta {
-                if l.get_lsn_range().end > old_candidate.get_lsn_range().end {
+            if l.get_lsn_range().end > latest_img_lsn.unwrap_or(Lsn(0)) {
+                // this layer's end LSN is smaller than the requested point. If there's
+                // nothing newer, this is what we need to return. Remember this.
+                if let Some(old_candidate) = &latest_delta {
+                    if l.get_lsn_range().end > old_candidate.get_lsn_range().end {
+                        latest_delta.replace(Arc::clone(l));
+                    }
+                } else {
                    latest_delta.replace(Arc::clone(l));
                }
-            } else {
-                latest_delta.replace(Arc::clone(l));
            }
        }
        if let Some(l) = latest_delta {
@@ -346,19 +347,19 @@ where
                Lsn(latest_img_lsn.unwrap_or(Lsn(0)).0 + 1),
                l.get_lsn_range().start,
            );
-            Ok(Some(SearchResult {
+            Some(SearchResult {
                lsn_floor,
                layer: l,
-            }))
+            })
        } else if let Some(l) = latest_img {
            trace!("found img layer and no deltas for request on {key} at {end_lsn}");
-            Ok(Some(SearchResult {
+            Some(SearchResult {
                lsn_floor: latest_img_lsn.unwrap(),
                layer: l,
-            }))
+            })
        } else {
            trace!("no layer found for request on {key} at {end_lsn}");
-            Ok(None)
+            None
        }
    }

--- a/pageserver/src/tenant/metadata.rs
+++ b/pageserver/src/tenant/metadata.rs
@@ -255,8 +255,7 @@ pub fn save_metadata(
    // fsync the parent directory to ensure the directory entry is durable
    if first_save {
        let timeline_dir = File::open(
-            &path
-                .parent()
+            path.parent()
                .expect("Metadata should always have a parent dir"),
        )?;
        timeline_dir.sync_all()?;
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -17,8 +17,8 @@ use utils::crashsafe;

 use crate::config::PageServerConf;
 use crate::task_mgr::{self, TaskKind};
+use crate::tenant::config::TenantConfOpt;
 use crate::tenant::{Tenant, TenantState};
-use crate::tenant_config::TenantConfOpt;
 use crate::IGNORED_TENANT_FILE_NAME;

 use utils::fs_ext::PathExt;
@@ -196,7 +196,7 @@ pub async fn shutdown_all_tenants() {
        let tenant_id = tenant.tenant_id();
        debug!("shutdown tenant {tenant_id}");

-        if let Err(err) = tenant.checkpoint().await {
+        if let Err(err) = tenant.freeze_and_flush().await {
            error!("Could not checkpoint tenant {tenant_id} during shutdown: {err:?}");
        }
    }
@@ -216,8 +216,7 @@ pub async fn create_tenant(
        hash_map::Entry::Vacant(v) => {
            // Hold the write_tenants() lock, since all of this is local IO.
            // If this section ever becomes contentious, introduce a new `TenantState::Creating`.
-            let tenant_directory =
-                super::tenant::create_tenant_files(conf, tenant_conf, tenant_id)?;
+            let tenant_directory = super::create_tenant_files(conf, tenant_conf, tenant_id)?;
            let created_tenant =
                schedule_local_tenant_processing(conf, &tenant_directory, remote_storage)?;
            let crated_tenant_id = created_tenant.tenant_id();
@@ -262,27 +261,6 @@ pub async fn get_tenant(tenant_id: TenantId, active_only: bool) -> anyhow::Resul
 }

 pub async fn delete_timeline(tenant_id: TenantId, timeline_id: TimelineId) -> anyhow::Result<()> {
-    // Start with the shutdown of timeline tasks (this shuts down the walreceiver)
-    // It is important that we do not take locks here, and do not check whether the timeline exists
-    // because if we hold tenants_state::write_tenants() while awaiting for the tasks to join
-    // we cannot create new timelines and tenants, and that can take quite some time,
-    // it can even become stuck due to a bug making whole pageserver unavailable for some operations
-    // so this is the way how we deal with concurrent delete requests: shutdown everythig, wait for confirmation
-    // and then try to actually remove timeline from inmemory state and this is the point when concurrent requests
-    // will synchronize and either fail with the not found error or succeed
-
-    debug!("waiting for wal receiver to shutdown");
-    task_mgr::shutdown_tasks(
-        Some(TaskKind::WalReceiverManager),
-        Some(tenant_id),
-        Some(timeline_id),
-    )
-    .await;
-    debug!("wal receiver shutdown confirmed");
-
-    info!("waiting for timeline tasks to shutdown");
-    task_mgr::shutdown_tasks(None, Some(tenant_id), Some(timeline_id)).await;
-    info!("timeline task shutdown completed");
    match get_tenant(tenant_id, true).await {
        Ok(tenant) => {
            tenant.delete_timeline(timeline_id).await?;
@@ -496,7 +474,7 @@ pub async fn immediate_gc(
        async move {
            fail::fail_point!("immediate_gc_task_pre");
            let result = tenant
-                .gc_iteration(Some(timeline_id), gc_horizon, pitr, true)
+                .gc_iteration(Some(timeline_id), gc_horizon, pitr)
                .instrument(info_span!("manual_gc", tenant = %tenant_id, timeline = %timeline_id))
                .await;
                // FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -32,7 +32,8 @@
 //! the corresponding remote operation with the timeline's [`RemoteTimelineClient`]:
 //!
 //! - [`RemoteTimelineClient::schedule_layer_file_upload`]  when we've created a new layer file.
-//! - [`RemoteTimelineClient::schedule_index_upload`] when we've updated the timeline metadata file.
+//! - [`RemoteTimelineClient::schedule_index_upload_for_metadata_update`] when we've updated the timeline metadata file.
+//! - [`RemoteTimelineClient::schedule_index_upload_for_file_changes`] to upload an updated index file, after we've scheduled file uploads
 //! - [`RemoteTimelineClient::schedule_layer_file_deletion`] when we've deleted one or more layer files.
 //!
 //! Internally, these functions create [`UploadOp`]s and put them in a queue.
@@ -57,7 +58,7 @@
 //! To have a consistent remote structure, it's important that uploads and
 //! deletions are performed in the right order. For example, the index file
 //! contains a list of layer files, so it must not be uploaded until all the
-//! layer files that are in its list have been succesfully uploaded.
+//! layer files that are in its list have been successfully uploaded.
 //!
 //! The contract between client and its user is that the user is responsible of
 //! scheduling operations in an order that keeps the remote consistent as
@@ -139,7 +140,7 @@
 //! Note that if we crash during file deletion between the index update
 //! that removes the file from the list of files, and deleting the remote file,
 //! the file is leaked in the remote storage. Similarly, if a new file is created
-//! and uploaded, but the pageserver dies permantently before updating the
+//! and uploaded, but the pageserver dies permanently before updating the
 //! remote index file, the new file is leaked in remote storage. We accept and
 //! tolerate that for now.
 //! Note further that we cannot easily fix this by scheduling deletes for every
@@ -147,31 +148,43 @@
 //! following two cases:
 //! - (1) We had the file locally, deleted it locally, scheduled a remote delete,
 //!   but crashed before it finished remotely.
-//! - (2) We never had the file locally because we were still in tenant attach
-//!   when we crashed. (Similar case for on-demand download in the future.)
+//! - (2) We never had the file locally because we haven't on-demand downloaded
+//!   it yet.
 //!
-//! # Downloads (= Tenant Attach)
+//! # Downloads
 //!
 //! In addition to the upload queue, [`RemoteTimelineClient`] has functions for
-//! downloading files from the remote storage. Downloads are performed immediately,
-//! independently of the uploads.
+//! downloading files from the remote storage. Downloads are performed immediately
+//! against the `RemoteStorage`, independently of the upload queue.
 //!
 //! When we attach a tenant, we perform the following steps:
 //! - create `Tenant` object in `TenantState::Attaching` state
-//! - List timelines that are present in remote storage, and download their remote [`IndexPart`]s
-//! - For each timeline, create `Timeline` struct and a `RemoteTimelineClient`, and initialize the client's upload queue with its `IndexPart`
-//! - eagerly download all the remote layers using the client's download APIs
-//! - transition tenant from `TenantState::Attaching` to `TenantState::Active` state.
+//! - List timelines that are present in remote storage, and for each:
+//!   - download their remote [`IndexPart`]s
+//!   - create `Timeline` struct and a `RemoteTimelineClient`
+//!   - initialize the client's upload queue with its `IndexPart`
+//!   - create [`RemoteLayer`] instances for layers that are referenced by `IndexPart`
+//!     but not present locally
+//!   - schedule uploads for layers that are only present locally.
+//!   - if the remote `IndexPart`'s metadata was newer than the metadata in
+//!     the local filesystem, write the remote metadata to the local filesystem
+//! - After the above is done for each timeline, open the tenant for business by
+//!   transitioning it from `TenantState::Attaching` to `TenantState::Active` state.
+//!   This starts the timelines' WAL-receivers and the tenant's GC & Compaction loops.
 //!
-//! Most of the above happens in [`Timeline::reconcile_with_remote`].
+//! Most of the above steps happen in [`Timeline::reconcile_with_remote`] or its callers.
 //! We keep track of the fact that a client is in `Attaching` state in a marker
-//! file on the local disk.
-//! However, the distinction is moot for storage sync since we call
-//! `reconcile_with_remote` for tenants both with and without the marker file.
-//!
-//! In the future, downloading will be done on-demand and `reconcile_with_remote`
-//! will only be responsible for re-scheduling upload ops after a crash of an
-//! `Active` tenant.
+//! file on the local disk. This is critical because, when we restart the pageserver,
+//! we do not want to do the `List timelines` step for each tenant that has already
+//! been successfully attached (for performance & cost reasons).
+//! Instead, for a tenant without the attach marker file, we assume that the
+//! local state is in sync or ahead of the remote state. This includes the list
+//! of all of the tenant's timelines, which is particularly critical to be up-to-date:
+//! if there's a timeline on the remote that the pageserver doesn't know about,
+//! the GC will not consider its branch point, leading to data loss.
+//! So, for a tenant with the attach marker file, we know that we have not yet
+//! persisted all of the remote timelines' metadata files locally. To exclude the
+//! risk above, we re-run the procedure for such tenants.
 //!
 //! # Operating Without Remote Storage
 //!
@@ -194,38 +207,51 @@ mod upload;
 // re-export these
 pub use download::{is_temp_download_file, list_remote_timelines};

-use std::collections::{HashMap, VecDeque};
-use std::fmt::Debug;
-use std::ops::DerefMut;
 use std::sync::atomic::{AtomicU32, Ordering};
 use std::sync::{Arc, Mutex};

 use anyhow::ensure;
 use remote_storage::{DownloadError, GenericRemoteStorage};
+use std::ops::DerefMut;
 use tokio::runtime::Runtime;
 use tracing::{info, warn};
 use tracing::{info_span, Instrument};
-
 use utils::lsn::Lsn;

-use self::index::IndexPart;
-
 use crate::metrics::RemoteOpFileKind;
 use crate::metrics::RemoteOpKind;
 use crate::metrics::{MeasureRemoteOp, RemoteTimelineClientMetrics};
-use crate::tenant::filename::LayerFileName;
+use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
 use crate::{
    config::PageServerConf,
-    storage_sync::index::LayerFileMetadata,
    task_mgr,
    task_mgr::TaskKind,
    task_mgr::BACKGROUND_RUNTIME,
    tenant::metadata::TimelineMetadata,
+    tenant::upload_queue::{
+        UploadOp, UploadQueue, UploadQueueInitialized, UploadQueueStopped, UploadTask,
+    },
    {exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS},
 };

 use utils::id::{TenantId, TimelineId};

+use self::index::IndexPart;
+
+use super::storage_layer::LayerFileName;
+
+// Occasional network issues and such can cause remote operations to fail, and
+// that's expected. If a download fails, we log it at info-level, and retry.
+// But after FAILED_DOWNLOAD_WARN_THRESHOLD retries, we start to log it at WARN
+// level instead, as repeated failures can mean a more serious problem. If it
+// fails more than FAILED_DOWNLOAD_RETRIES times, we give up
+const FAILED_DOWNLOAD_WARN_THRESHOLD: u32 = 3;
+const FAILED_DOWNLOAD_RETRIES: u32 = 10;
+
+// Similarly log failed uploads and deletions at WARN level, after this many
+// retries. Uploads and deletions are retried forever, though.
+const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3;
+
 /// A client for accessing a timeline's data in remote storage.
 ///
 /// This takes care of managing the number of connections, and balancing them
@@ -260,200 +286,30 @@ pub struct RemoteTimelineClient {
    storage_impl: GenericRemoteStorage,
 }

-// clippy warns that Uninitialized is much smaller than Initialized, which wastes
-// memory for Uninitialized variants. Doesn't matter in practice, there are not
-// that many upload queues in a running pageserver, and most of them are initialized
-// anyway.
-#[allow(clippy::large_enum_variant)]
-enum UploadQueue {
-    Uninitialized,
-    Initialized(UploadQueueInitialized),
-    Stopped(UploadQueueStopped),
-}
-
-impl UploadQueue {
-    fn as_str(&self) -> &'static str {
-        match self {
-            UploadQueue::Uninitialized => "Uninitialized",
-            UploadQueue::Initialized(_) => "Initialized",
-            UploadQueue::Stopped(_) => "Stopped",
-        }
-    }
-}
-
-/// This keeps track of queued and in-progress tasks.
-struct UploadQueueInitialized {
-    /// Counter to assign task IDs
-    task_counter: u64,
-
-    /// All layer files stored in the remote storage, taking into account all
-    /// in-progress and queued operations
-    latest_files: HashMap<LayerFileName, LayerFileMetadata>,
-
-    /// Metadata stored in the remote storage, taking into account all
-    /// in-progress and queued operations.
-    /// DANGER: do not return to outside world, e.g., safekeepers.
-    latest_metadata: TimelineMetadata,
-
-    /// `disk_consistent_lsn` from the last metadata file that was successfully
-    /// uploaded. `Lsn(0)` if nothing was uploaded yet.
-    /// Unlike `latest_files` or `latest_metadata`, this value is never ahead.
-    /// Safekeeper can rely on it to make decisions for WAL storage.
-    last_uploaded_consistent_lsn: Lsn,
-
-    // Breakdown of different kinds of tasks currently in-progress
-    num_inprogress_layer_uploads: usize,
-    num_inprogress_metadata_uploads: usize,
-    num_inprogress_deletions: usize,
-
-    /// Tasks that are currently in-progress. In-progress means that a tokio Task
-    /// has been launched for it. An in-progress task can be busy uploading, but it can
-    /// also be waiting on the `concurrency_limiter` Semaphore in S3Bucket, or it can
-    /// be waiting for retry in `exponential_backoff`.
-    inprogress_tasks: HashMap<u64, Arc<UploadTask>>,
-
-    /// Queued operations that have not been launched yet. They might depend on previous
-    /// tasks to finish. For example, metadata upload cannot be performed before all
-    /// preceding layer file uploads have completed.
-    queued_operations: VecDeque<UploadOp>,
-}
-
-struct UploadQueueStopped {
-    last_uploaded_consistent_lsn: Lsn,
-}
-
-impl UploadQueue {
-    fn initialize_empty_remote(
-        &mut self,
-        metadata: &TimelineMetadata,
-    ) -> anyhow::Result<&mut UploadQueueInitialized> {
-        match self {
-            UploadQueue::Uninitialized => (),
-            UploadQueue::Initialized(_) | UploadQueue::Stopped(_) => {
-                anyhow::bail!("already initialized, state {}", self.as_str())
-            }
-        }
-
-        info!("initializing upload queue for empty remote");
-
-        let state = UploadQueueInitialized {
-            // As described in the doc comment, it's ok for `latest_files` and `latest_metadata` to be ahead.
-            latest_files: HashMap::new(),
-            latest_metadata: metadata.clone(),
-            // We haven't uploaded anything yet, so, `last_uploaded_consistent_lsn` must be 0 to prevent
-            // safekeepers from garbage-collecting anything.
-            last_uploaded_consistent_lsn: Lsn(0),
-            // what follows are boring default initializations
-            task_counter: 0,
-            num_inprogress_layer_uploads: 0,
-            num_inprogress_metadata_uploads: 0,
-            num_inprogress_deletions: 0,
-            inprogress_tasks: HashMap::new(),
-            queued_operations: VecDeque::new(),
-        };
-
-        *self = UploadQueue::Initialized(state);
-        Ok(self.initialized_mut().expect("we just set it"))
-    }
-
-    fn initialize_with_current_remote_index_part(
-        &mut self,
-        index_part: &IndexPart,
-    ) -> anyhow::Result<&mut UploadQueueInitialized> {
-        match self {
-            UploadQueue::Uninitialized => (),
-            UploadQueue::Initialized(_) | UploadQueue::Stopped(_) => {
-                anyhow::bail!("already initialized, state {}", self.as_str())
-            }
-        }
-
-        let mut files = HashMap::with_capacity(index_part.timeline_layers.len());
-        for layer_name in &index_part.timeline_layers {
-            let layer_metadata = index_part
-                .layer_metadata
-                .get(layer_name)
-                .map(LayerFileMetadata::from)
-                .unwrap_or(LayerFileMetadata::MISSING);
-            files.insert(layer_name.to_owned(), layer_metadata);
-        }
-
-        let index_part_metadata = index_part.parse_metadata()?;
-        info!(
-            "initializing upload queue with remote index_part.disk_consistent_lsn: {}",
-            index_part_metadata.disk_consistent_lsn()
-        );
-
-        let state = UploadQueueInitialized {
-            latest_files: files,
-            latest_metadata: index_part_metadata.clone(),
-            last_uploaded_consistent_lsn: index_part_metadata.disk_consistent_lsn(),
-            // what follows are boring default initializations
-            task_counter: 0,
-            num_inprogress_layer_uploads: 0,
-            num_inprogress_metadata_uploads: 0,
-            num_inprogress_deletions: 0,
-            inprogress_tasks: HashMap::new(),
-            queued_operations: VecDeque::new(),
-        };
-
-        *self = UploadQueue::Initialized(state);
-        Ok(self.initialized_mut().expect("we just set it"))
-    }
-
-    fn initialized_mut(&mut self) -> anyhow::Result<&mut UploadQueueInitialized> {
-        match self {
-            UploadQueue::Uninitialized | UploadQueue::Stopped(_) => {
-                anyhow::bail!("queue is in state {}", self.as_str())
-            }
-            UploadQueue::Initialized(x) => Ok(x),
-        }
-    }
-}
-
-/// An in-progress upload or delete task.
-#[derive(Debug)]
-struct UploadTask {
-    /// Unique ID of this task. Used as the key in `inprogress_tasks` above.
-    task_id: u64,
-    retries: AtomicU32,
-
-    op: UploadOp,
-}
-
-#[derive(Debug)]
-enum UploadOp {
-    /// Upload a layer file
-    UploadLayer(LayerFileName, LayerFileMetadata),
-
-    /// Upload the metadata file
-    UploadMetadata(IndexPart, Lsn),
-
-    /// Delete a file.
-    Delete(RemoteOpFileKind, LayerFileName),
-
-    /// Barrier. When the barrier operation is reached,
-    Barrier(tokio::sync::watch::Sender<()>),
-}
-
-impl std::fmt::Display for UploadOp {
-    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
-        match self {
-            UploadOp::UploadLayer(path, metadata) => {
-                write!(
-                    f,
-                    "UploadLayer({}, size={:?})",
-                    path.file_name(),
-                    metadata.file_size()
-                )
-            }
-            UploadOp::UploadMetadata(_, lsn) => write!(f, "UploadMetadata(lsn: {})", lsn),
-            UploadOp::Delete(_, path) => write!(f, "Delete({})", path.file_name()),
-            UploadOp::Barrier(_) => write!(f, "Barrier"),
-        }
-    }
-}
-
 impl RemoteTimelineClient {
+    ///
+    /// Create a remote storage client for given timeline
+    ///
+    /// Note: the caller must initialize the upload queue before any uploads can be scheduled,
+    /// by calling init_upload_queue.
+    ///
+    pub fn new(
+        remote_storage: GenericRemoteStorage,
+        conf: &'static PageServerConf,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+    ) -> anyhow::Result<RemoteTimelineClient> {
+        Ok(RemoteTimelineClient {
+            conf,
+            runtime: &BACKGROUND_RUNTIME,
+            tenant_id,
+            timeline_id,
+            storage_impl: remote_storage,
+            upload_queue: Mutex::new(UploadQueue::Uninitialized),
+            metrics: Arc::new(RemoteTimelineClientMetrics::new(&tenant_id, &timeline_id)),
+        })
+    }
+
    /// Initialize the upload queue for a remote storage that already received
    /// an index file upload, i.e., it's not empty.
    /// The given `index_part` must be the one on the remote.
@@ -488,9 +344,9 @@ impl RemoteTimelineClient {
        let size: u64 = if let Some(current_remote_index_part) = current_remote_index_part {
            current_remote_index_part
                .layer_metadata
-                .iter()
+                .values()
                // If we don't have the file size for the layer, don't account for it in the metric.
-                .map(|(_, ilmd)| ilmd.file_size.unwrap_or(0))
+                .map(|ilmd| ilmd.file_size.unwrap_or(0))
                .sum()
        } else {
            0
@@ -498,6 +354,10 @@ impl RemoteTimelineClient {
        self.metrics.remote_physical_size_gauge().set(size);
    }

+    pub fn get_remote_physical_size(&self) -> u64 {
+        self.metrics.remote_physical_size_gauge().get()
+    }
+
    //
    // Download operations.
    //
@@ -558,7 +418,9 @@ impl RemoteTimelineClient {
            let mut guard = self.upload_queue.lock().unwrap();
            let upload_queue = guard.initialized_mut()?;
            if let Some(upgraded) = upload_queue.latest_files.get_mut(layer_file_name) {
-                upgraded.merge(&new_metadata);
+                if upgraded.merge(&new_metadata) {
+                    upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
+                }
                // If we don't do an index file upload inbetween here and restart,
                // the value will go back down after pageserver restart, since we will
                // have lost this data point.
@@ -583,14 +445,20 @@ impl RemoteTimelineClient {
    //

    ///
-    /// Launch an index-file upload operation in the background.
+    /// Launch an index-file upload operation in the background, with
+    /// updated metadata.
    ///
    /// The upload will be added to the queue immediately, but it
    /// won't be performed until all previosuly scheduled layer file
    /// upload operations have completed successfully.  This is to
    /// ensure that when the index file claims that layers X, Y and Z
-    /// exist in remote storage, they really do.
-    pub fn schedule_index_upload(
+    /// exist in remote storage, they really do. To wait for the upload
+    /// to complete, use `wait_completion`.
+    ///
+    /// If there were any changes to the list of files, i.e. if any
+    /// layer file uploads were scheduled, since the last index file
+    /// upload, those will be included too.
+    pub fn schedule_index_upload_for_metadata_update(
        self: &Arc<Self>,
        metadata: &TimelineMetadata,
    ) -> anyhow::Result<()> {
@@ -601,26 +469,60 @@ impl RemoteTimelineClient {
        // ahead of what's _actually_ on the remote during index upload.
        upload_queue.latest_metadata = metadata.clone();

+        let metadata_bytes = upload_queue.latest_metadata.to_bytes()?;
+        self.schedule_index_upload(upload_queue, metadata_bytes);
+
+        Ok(())
+    }
+
+    ///
+    /// Launch an index-file upload operation in the background, if necessary.
+    ///
+    /// Use this function to schedule the update of the index file after
+    /// scheduling file uploads or deletions. If no file uploads or deletions
+    /// have been scheduled since the last index file upload, this does
+    /// nothing.
+    ///
+    /// Like schedule_index_upload_for_metadata_update(), this merely adds
+    /// the upload to the upload queue and returns quickly.
+    pub fn schedule_index_upload_for_file_changes(self: &Arc<Self>) -> anyhow::Result<()> {
+        let mut guard = self.upload_queue.lock().unwrap();
+        let upload_queue = guard.initialized_mut()?;
+
+        if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 {
+            let metadata_bytes = upload_queue.latest_metadata.to_bytes()?;
+            self.schedule_index_upload(upload_queue, metadata_bytes);
+        }
+
+        Ok(())
+    }
+
+    /// Launch an index-file upload operation in the background (internal function)
+    fn schedule_index_upload(
+        self: &Arc<Self>,
+        upload_queue: &mut UploadQueueInitialized,
+        metadata_bytes: Vec<u8>,
+    ) {
+        info!(
+            "scheduling metadata upload with {} files ({} changed)",
+            upload_queue.latest_files.len(),
+            upload_queue.latest_files_changes_since_metadata_upload_scheduled,
+        );
+
        let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn();

        let index_part = IndexPart::new(
            upload_queue.latest_files.clone(),
            disk_consistent_lsn,
-            upload_queue.latest_metadata.to_bytes()?,
+            metadata_bytes,
        );
        let op = UploadOp::UploadMetadata(index_part, disk_consistent_lsn);
        self.update_upload_queue_unfinished_metric(1, &op);
        upload_queue.queued_operations.push_back(op);
-
-        info!(
-            "scheduled metadata upload with {} files",
-            upload_queue.latest_files.len()
-        );
+        upload_queue.latest_files_changes_since_metadata_upload_scheduled = 0;

        // Launch the task immediately, if possible
        self.launch_queued_tasks(upload_queue);
-
-        Ok(())
    }

    ///
@@ -644,6 +546,7 @@ impl RemoteTimelineClient {
        upload_queue
            .latest_files
            .insert(layer_file_name.clone(), layer_metadata.clone());
+        upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;

        let op = UploadOp::UploadLayer(layer_file_name.clone(), layer_metadata.clone());
        self.update_upload_queue_unfinished_metric(1, &op);
@@ -662,8 +565,11 @@ impl RemoteTimelineClient {
    ///
    /// Launch a delete operation in the background.
    ///
-    /// The deletion won't actually be performed, until all preceding
-    /// upload operations have completed succesfully.
+    /// Note: This schedules an index file upload before the deletions.  The
+    /// deletion won't actually be performed, until any previously scheduled
+    /// upload operations, and the index file upload, have completed
+    /// successfully.
+    ///
    pub fn schedule_layer_file_deletion(
        self: &Arc<Self>,
        names: &[LayerFileName],
@@ -674,7 +580,6 @@ impl RemoteTimelineClient {
        // Deleting layers doesn't affect the values stored in TimelineMetadata,
        // so we don't need update it. Just serialize it.
        let metadata_bytes = upload_queue.latest_metadata.to_bytes()?;
-        let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn();

        // Update the remote index file, removing the to-be-deleted files from the index,
        // before deleting the actual files.
@@ -686,16 +591,12 @@ impl RemoteTimelineClient {
        let no_bail_here = || {
            for name in names {
                upload_queue.latest_files.remove(name);
+                upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
            }

-            let index_part = IndexPart::new(
-                upload_queue.latest_files.clone(),
-                disk_consistent_lsn,
-                metadata_bytes,
-            );
-            let op = UploadOp::UploadMetadata(index_part, disk_consistent_lsn);
-            self.update_upload_queue_unfinished_metric(1, &op);
-            upload_queue.queued_operations.push_back(op);
+            if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 {
+                self.schedule_index_upload(upload_queue, metadata_bytes);
+            }

            // schedule the actual deletions
            for name in names {
@@ -929,12 +830,14 @@ impl RemoteTimelineClient {
                Err(e) => {
                    let retries = task.retries.fetch_add(1, Ordering::SeqCst);

-                    // uploads may fail due to rate limts (IAM, S3) or spurious network and external errors
-                    // such issues are relatively regular, so don't use WARN or ERROR to avoid alerting
-                    // people and tests until the retries are definitely causing delays.
-                    if retries < 3 {
+                    // Uploads can fail due to rate limits (IAM, S3), spurious network problems,
+                    // or other external reasons. Such issues are relatively regular, so log them
+                    // at info level at first, and only WARN if the operation fails repeatedly.
+                    //
+                    // (See similar logic for downloads in `download::download_retry`)
+                    if retries < FAILED_UPLOAD_WARN_THRESHOLD {
                        info!(
-                            "failed to perform remote task {}, will retry (attempt {}): {:?}",
+                            "failed to perform remote task {}, will retry (attempt {}): {:#}",
                            task.op, retries, e
                        );
                    } else {
@@ -1077,29 +980,6 @@ impl RemoteTimelineClient {
    }
 }

-///
-/// Create a remote storage client for given timeline
-///
-/// Note: the caller must initialize the upload queue before any uploads can be scheduled,
-/// by calling init_upload_queue.
-///
-pub fn create_remote_timeline_client(
-    remote_storage: GenericRemoteStorage,
-    conf: &'static PageServerConf,
-    tenant_id: TenantId,
-    timeline_id: TimelineId,
-) -> anyhow::Result<RemoteTimelineClient> {
-    Ok(RemoteTimelineClient {
-        conf,
-        runtime: &BACKGROUND_RUNTIME,
-        tenant_id,
-        timeline_id,
-        storage_impl: remote_storage,
-        upload_queue: Mutex::new(UploadQueue::Uninitialized),
-        metrics: Arc::new(RemoteTimelineClientMetrics::new(&tenant_id, &timeline_id)),
-    })
-}
-
 #[cfg(test)]
 mod tests {
    use super::*;
@@ -1244,15 +1124,19 @@ mod tests {
            assert!(upload_queue.queued_operations.is_empty());
            assert!(upload_queue.inprogress_tasks.len() == 2);
            assert!(upload_queue.num_inprogress_layer_uploads == 2);
+
+            // also check that `latest_files_changes_since_metadata_upload_scheduled` was updated
+            assert!(upload_queue.latest_files_changes_since_metadata_upload_scheduled == 2);
        }

        // Schedule upload of index. Check that it is queued
        let metadata = dummy_metadata(Lsn(0x20));
-        client.schedule_index_upload(&metadata)?;
+        client.schedule_index_upload_for_metadata_update(&metadata)?;
        {
            let mut guard = client.upload_queue.lock().unwrap();
            let upload_queue = guard.initialized_mut().unwrap();
            assert!(upload_queue.queued_operations.len() == 1);
+            assert!(upload_queue.latest_files_changes_since_metadata_upload_scheduled == 0);
        }

        // Wait for the uploads to finish
@@ -1288,6 +1172,7 @@ mod tests {
            assert!(upload_queue.inprogress_tasks.len() == 1);
            assert!(upload_queue.num_inprogress_layer_uploads == 1);
            assert!(upload_queue.num_inprogress_deletions == 0);
+            assert!(upload_queue.latest_files_changes_since_metadata_upload_scheduled == 0);
        }
        assert_remote_files(&["foo", "bar", "index_part.json"], &remote_timeline_dir);

--- a/pageserver/src/tenant/remote_timeline_client/delete.rs
+++ b/pageserver/src/tenant/remote_timeline_client/delete.rs
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -1,21 +1,27 @@
 //! Helper functions to download files from remote storage with a RemoteStorage
+//!
+//! The functions in this module retry failed operations automatically, according
+//! to the FAILED_DOWNLOAD_RETRIES constant.
+
 use std::collections::HashSet;
+use std::future::Future;
 use std::path::Path;

-use anyhow::{bail, Context};
+use anyhow::{anyhow, Context};
 use futures::stream::{FuturesUnordered, StreamExt};
 use tokio::fs;
 use tokio::io::AsyncWriteExt;
-use tracing::{debug, info_span, Instrument};
+use tracing::{debug, error, info, info_span, warn, Instrument};

 use crate::config::PageServerConf;
-use crate::storage_sync::index::LayerFileMetadata;
-use crate::tenant::filename::LayerFileName;
+use crate::tenant::storage_layer::LayerFileName;
+use crate::{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS};
 use remote_storage::{DownloadError, GenericRemoteStorage};
 use utils::crashsafe::path_with_suffix_extension;
 use utils::id::{TenantId, TimelineId};

-use super::index::{IndexPart, IndexPartUnclean};
+use super::index::{IndexPart, IndexPartUnclean, LayerFileMetadata};
+use super::{FAILED_DOWNLOAD_RETRIES, FAILED_DOWNLOAD_WARN_THRESHOLD};

 async fn fsync_path(path: impl AsRef<std::path::Path>) -> Result<(), std::io::Error> {
    fs::File::open(path).await?.sync_all().await
@@ -33,12 +39,14 @@ pub async fn download_layer_file<'a>(
    timeline_id: TimelineId,
    layer_file_name: &'a LayerFileName,
    layer_metadata: &'a LayerFileMetadata,
-) -> anyhow::Result<u64> {
+) -> Result<u64, DownloadError> {
    let timeline_path = conf.timeline_path(&timeline_id, &tenant_id);

    let local_path = timeline_path.join(layer_file_name.file_name());

-    let remote_path = conf.remote_path(&local_path)?;
+    let remote_path = conf
+        .remote_path(&local_path)
+        .map_err(DownloadError::Other)?;

    // Perform a rename inspired by durable_rename from file_utils.c.
    // The sequence:
@@ -52,21 +60,30 @@ pub async fn download_layer_file<'a>(
    // If pageserver crashes the temp file will be deleted on startup and re-downloaded.
    let temp_file_path = path_with_suffix_extension(&local_path, TEMP_DOWNLOAD_EXTENSION);

-    // TODO: this doesn't use the cached fd for some reason?
-    let mut destination_file = fs::File::create(&temp_file_path).await.with_context(|| {
-        format!(
-            "Failed to create a destination file for layer '{}'",
-            temp_file_path.display()
-        )
-    })?;
-    let mut download = storage.download(&remote_path).await.with_context(|| {
-        format!(
-            "Failed to open a download stream for layer with remote storage path '{remote_path:?}'"
-        )
-    })?;
-    let bytes_amount = tokio::io::copy(&mut download.download_stream, &mut destination_file).await.with_context(|| {
-        format!("Failed to download layer with remote storage path '{remote_path:?}' into file {temp_file_path:?}")
-    })?;
+    let (mut destination_file, bytes_amount) = download_retry(
+        || async {
+            // TODO: this doesn't use the cached fd for some reason?
+            let mut destination_file = fs::File::create(&temp_file_path).await.with_context(|| {
+                format!(
+                    "Failed to create a destination file for layer '{}'",
+                    temp_file_path.display()
+                )
+            })
+            .map_err(DownloadError::Other)?;
+            let mut download = storage.download(&remote_path).await.with_context(|| {
+                format!(
+                    "Failed to open a download stream for layer with remote storage path '{remote_path:?}'"
+                )
+            })
+            .map_err(DownloadError::Other)?;
+            let bytes_amount = tokio::io::copy(&mut download.download_stream, &mut destination_file).await.with_context(|| {
+                format!("Failed to download layer with remote storage path '{remote_path:?}' into file {temp_file_path:?}")
+            })
+            .map_err(DownloadError::Other)?;
+            Ok((destination_file, bytes_amount))
+        },
+        &format!("download {remote_path:?}"),
+    ).await?;

    // Tokio doc here: https://docs.rs/tokio/1.17.0/tokio/fs/struct.File.html states that:
    // A file will not be closed immediately when it goes out of scope if there are any IO operations
@@ -76,19 +93,23 @@ pub async fn download_layer_file<'a>(
    // From the tokio code I see that it waits for pending operations to complete. There shouldt be any because
    // we assume that `destination_file` file is fully written. I e there is no pending .write(...).await operations.
    // But for additional safety lets check/wait for any pending operations.
-    destination_file.flush().await.with_context(|| {
-        format!(
-            "failed to flush source file at {}",
-            temp_file_path.display()
-        )
-    })?;
+    destination_file
+        .flush()
+        .await
+        .with_context(|| {
+            format!(
+                "failed to flush source file at {}",
+                temp_file_path.display()
+            )
+        })
+        .map_err(DownloadError::Other)?;

    match layer_metadata.file_size() {
        Some(expected) if expected != bytes_amount => {
-            anyhow::bail!(
-                "According to layer file metadata should had downloaded {expected} bytes but downloaded {bytes_amount} bytes into file '{}'",
+            return Err(DownloadError::Other(anyhow!(
+                "According to layer file metadata should have downloaded {expected} bytes but downloaded {bytes_amount} bytes into file '{}'",
                temp_file_path.display()
-            );
+            )));
        }
        Some(_) | None => {
            // matches, or upgrading from an earlier IndexPart version
@@ -96,23 +117,38 @@ pub async fn download_layer_file<'a>(
    }

    // not using sync_data because it can lose file size update
-    destination_file.sync_all().await.with_context(|| {
-        format!(
-            "failed to fsync source file at {}",
-            temp_file_path.display()
-        )
-    })?;
+    destination_file
+        .sync_all()
+        .await
+        .with_context(|| {
+            format!(
+                "failed to fsync source file at {}",
+                temp_file_path.display()
+            )
+        })
+        .map_err(DownloadError::Other)?;
    drop(destination_file);

    fail::fail_point!("remote-storage-download-pre-rename", |_| {
-        bail!("remote-storage-download-pre-rename failpoint triggered")
+        Err(DownloadError::Other(anyhow!(
+            "remote-storage-download-pre-rename failpoint triggered"
+        )))
    });

-    fs::rename(&temp_file_path, &local_path).await?;
+    fs::rename(&temp_file_path, &local_path)
+        .await
+        .with_context(|| {
+            format!(
+                "Could not rename download layer file to {}",
+                local_path.display(),
+            )
+        })
+        .map_err(DownloadError::Other)?;

    fsync_path(&local_path)
        .await
-        .with_context(|| format!("Could not fsync layer file {}", local_path.display(),))?;
+        .with_context(|| format!("Could not fsync layer file {}", local_path.display(),))
+        .map_err(DownloadError::Other)?;

    tracing::info!("download complete: {}", local_path.display());

@@ -143,14 +179,15 @@ pub async fn list_remote_timelines<'a>(
    let tenant_path = conf.timelines_path(&tenant_id);
    let tenant_storage_path = conf.remote_path(&tenant_path)?;

-    let timelines = storage
-        .list_prefixes(Some(&tenant_storage_path))
-        .await
-        .with_context(|| {
-            format!(
-                "Failed to list tenant storage path {tenant_storage_path:?} to get remote timelines to download"
-            )
-        })?;
+    fail::fail_point!("storage-sync-list-remote-timelines", |_| {
+        anyhow::bail!("storage-sync-list-remote-timelines");
+    });
+
+    let timelines = download_retry(
+        || storage.list_prefixes(Some(&tenant_storage_path)),
+        &format!("list prefixes for {tenant_path:?}"),
+    )
+    .await?;

    if timelines.is_empty() {
        anyhow::bail!("no timelines found on the remote storage")
@@ -209,16 +246,25 @@ pub async fn download_index_part(
        .remote_path(&index_part_path)
        .map_err(DownloadError::BadInput)?;

-    let mut index_part_download = storage.download(&part_storage_path).await?;
+    let index_part_bytes = download_retry(
+        || async {
+            let mut index_part_download = storage.download(&part_storage_path).await?;

-    let mut index_part_bytes = Vec::new();
-    tokio::io::copy(
-        &mut index_part_download.download_stream,
-        &mut index_part_bytes,
+            let mut index_part_bytes = Vec::new();
+            tokio::io::copy(
+                &mut index_part_download.download_stream,
+                &mut index_part_bytes,
+            )
+            .await
+            .with_context(|| {
+                format!("Failed to download an index part into file {index_part_path:?}")
+            })
+            .map_err(DownloadError::Other)?;
+            Ok(index_part_bytes)
+        },
+        &format!("download {part_storage_path:?}"),
    )
-    .await
-    .with_context(|| format!("Failed to download an index part into file {index_part_path:?}"))
-    .map_err(DownloadError::Other)?;
+    .await?;

    let index_part: IndexPartUnclean = serde_json::from_slice(&index_part_bytes)
        .with_context(|| {
@@ -230,3 +276,56 @@ pub async fn download_index_part(

    Ok(index_part)
 }
+
+///
+/// Helper function to handle retries for a download operation.
+///
+/// Remote operations can fail due to rate limits (IAM, S3), spurious network
+/// problems, or other external reasons. Retry FAILED_DOWNLOAD_RETRIES times,
+/// with backoff.
+///
+/// (See similar logic for uploads in `perform_upload_task`)
+async fn download_retry<T, O, F>(mut op: O, description: &str) -> Result<T, DownloadError>
+where
+    O: FnMut() -> F,
+    F: Future<Output = Result<T, DownloadError>>,
+{
+    let mut attempts = 0;
+    loop {
+        let result = op().await;
+        match result {
+            Ok(_) => {
+                if attempts > 0 {
+                    info!("{description} succeeded after {attempts} retries");
+                }
+                return result;
+            }
+
+            // These are "permanent" errors that should not be retried.
+            Err(DownloadError::BadInput(_)) | Err(DownloadError::NotFound) => {
+                return result;
+            }
+            // Assume that any other failure might be transient, and the operation might
+            // succeed if we just keep trying.
+            Err(DownloadError::Other(err)) if attempts < FAILED_DOWNLOAD_WARN_THRESHOLD => {
+                info!("{description} failed, will retry (attempt {attempts}): {err:#}");
+            }
+            Err(DownloadError::Other(err)) if attempts < FAILED_DOWNLOAD_RETRIES => {
+                warn!("{description} failed, will retry (attempt {attempts}): {err:#}");
+            }
+            Err(DownloadError::Other(ref err)) => {
+                // Operation still failed after FAILED_DOWNLOAD_RETRIES retries. Time to give up.
+                error!("{description} still failed after {attempts} retries, giving up: {err:?}");
+                return result;
+            }
+        }
+        // sleep and retry
+        exponential_backoff(
+            attempts,
+            DEFAULT_BASE_BACKOFF_SECONDS,
+            DEFAULT_MAX_BACKOFF_SECONDS,
+        )
+        .await;
+        attempts += 1;
+    }
+}
--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -8,7 +8,7 @@ use serde::{Deserialize, Serialize};
 use serde_with::{serde_as, DisplayFromStr};
 use tracing::warn;

-use crate::tenant::{filename::LayerFileName, metadata::TimelineMetadata};
+use crate::tenant::{metadata::TimelineMetadata, storage_layer::LayerFileName};

 use utils::lsn::Lsn;

@@ -48,9 +48,17 @@ impl LayerFileMetadata {
    /// Metadata has holes due to version upgrades. This method is called to upgrade self with the
    /// other value.
    ///
-    /// This is called on the possibly outdated version.
-    pub fn merge(&mut self, other: &Self) {
-        self.file_size = other.file_size.or(self.file_size);
+    /// This is called on the possibly outdated version. Returns true if any changes
+    /// were made.
+    pub fn merge(&mut self, other: &Self) -> bool {
+        // `other` takes precedence when it carries a size; otherwise ours is kept.
+        // Compare against the merged value (not `other`) so that a missing size in
+        // `other` is not reported as a change.
+        let merged_size = other.file_size.or(self.file_size);
+        let changed = self.file_size != merged_size;
+        self.file_size = merged_size;
+
+        changed
    }
 }

--- a/pageserver/src/tenant/remote_timeline_client/upload.rs
+++ b/pageserver/src/tenant/remote_timeline_client/upload.rs
@@ -5,12 +5,12 @@ use fail::fail_point;
 use std::path::Path;
 use tokio::fs;

-use super::index::IndexPart;
-use crate::config::PageServerConf;
-use crate::storage_sync::LayerFileMetadata;
+use crate::{config::PageServerConf, tenant::remote_timeline_client::index::IndexPart};
 use remote_storage::GenericRemoteStorage;
 use utils::id::{TenantId, TimelineId};

+use super::index::LayerFileMetadata;
+
 /// Serializes and uploads the given index part data to the remote storage.
 pub(super) async fn upload_index_part<'a>(
    conf: &'static PageServerConf,
--- a/pageserver/src/tenant/size.rs
+++ b/pageserver/src/tenant/size.rs
@@ -3,8 +3,11 @@ use std::collections::{HashMap, HashSet};
 use std::sync::Arc;

 use anyhow::Context;
+use tokio::sync::oneshot::error::RecvError;
 use tokio::sync::Semaphore;

+use crate::pgdatadir_mapping::CalculateLogicalSizeError;
+
 use super::Tenant;
 use utils::id::TimelineId;
 use utils::lsn::Lsn;
@@ -67,6 +70,7 @@ pub(super) async fn gather_inputs(

    let timelines = tenant
        .refresh_gc_info()
+        .await
        .context("Failed to refresh gc_info before gathering inputs")?;

    if timelines.is_empty() {
@@ -93,8 +97,6 @@ pub(super) async fn gather_inputs(
    // used to determine the `retention_period` for the size model
    let mut max_cutoff_distance = None;

-    // this will probably conflict with on-demand downloaded layers, or at least force them all
-    // to be downloaded
    for timeline in timelines {
        let last_record_lsn = timeline.get_last_record_lsn();

@@ -212,11 +214,30 @@ pub(super) async fn gather_inputs(
    let mut have_any_error = false;

    while let Some(res) = joinset.join_next().await {
-        // each of these come with Result<Result<_, JoinError>, JoinError>
+        // each of these comes with Result<Result<_, RecvError>, JoinError>
        // because of spawn + spawn_blocking
-        let res = res.and_then(|inner| inner);
        match res {
-            Ok(TimelineAtLsnSizeResult(timeline, lsn, Ok(size))) => {
+            Err(join_error) if join_error.is_cancelled() => {
+                unreachable!("we are not cancelling any of the futures, nor should be");
+            }
+            Err(join_error) => {
+                // cannot really do anything, as this panic is likely a bug
+                error!("task that calls spawn_ondemand_logical_size_calculation panicked: {join_error:#}");
+                have_any_error = true;
+            }
+            Ok(Err(recv_result_error)) => {
+                // cannot really do anything: the sender was dropped without sending a result, which is likely a bug
+                error!("failed to receive logical size query result: {recv_result_error:#}");
+                have_any_error = true;
+            }
+            Ok(Ok(TimelineAtLsnSizeResult(timeline, lsn, Err(error)))) => {
+                warn!(
+                    timeline_id=%timeline.timeline_id,
+                    "failed to calculate logical size at {lsn}: {error:#}"
+                );
+                have_any_error = true;
+            }
+            Ok(Ok(TimelineAtLsnSizeResult(timeline, lsn, Ok(size)))) => {
                debug!(timeline_id=%timeline.timeline_id, %lsn, size, "size calculated");

                logical_size_cache.insert((timeline.timeline_id, lsn), size);
@@ -228,21 +249,6 @@ pub(super) async fn gather_inputs(
                    command: Command::Update(size),
                });
            }
-            Ok(TimelineAtLsnSizeResult(timeline, lsn, Err(error))) => {
-                warn!(
-                    timeline_id=%timeline.timeline_id,
-                    "failed to calculate logical size at {lsn}: {error:#}"
-                );
-                have_any_error = true;
-            }
-            Err(join_error) if join_error.is_cancelled() => {
-                unreachable!("we are not cancelling any of the futures, nor should be");
-            }
-            Err(join_error) => {
-                // cannot really do anything, as this panic is likely a bug
-                error!("logical size query panicked: {join_error:#}");
-                have_any_error = true;
-            }
        }
    }

@@ -351,7 +357,7 @@ enum LsnKind {
 struct TimelineAtLsnSizeResult(
    Arc<crate::tenant::Timeline>,
    utils::lsn::Lsn,
-    anyhow::Result<u64>,
+    Result<u64, CalculateLogicalSizeError>,
 );

 #[instrument(skip_all, fields(timeline_id=%timeline.timeline_id, lsn=%lsn))]
@@ -359,17 +365,15 @@ async fn calculate_logical_size(
    limit: Arc<tokio::sync::Semaphore>,
    timeline: Arc<crate::tenant::Timeline>,
    lsn: utils::lsn::Lsn,
-) -> Result<TimelineAtLsnSizeResult, tokio::task::JoinError> {
-    let permit = tokio::sync::Semaphore::acquire_owned(limit)
+) -> Result<TimelineAtLsnSizeResult, RecvError> {
+    let _permit = tokio::sync::Semaphore::acquire_owned(limit)
        .await
        .expect("global semaphore should not had been closed");

-    tokio::task::spawn_blocking(move || {
-        let _permit = permit;
-        let size_res = timeline.calculate_logical_size(lsn);
-        TimelineAtLsnSizeResult(timeline, lsn, size_res)
-    })
-    .await
+    let size_res = timeline
+        .spawn_ondemand_logical_size_calculation(lsn)
+        .await?;
+    Ok(TimelineAtLsnSizeResult(timeline, lsn, size_res))
 }

 #[test]
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -1,6 +1,10 @@
-//!
 //! Common traits and structs for layers
-//!
+
+mod delta_layer;
+mod filename;
+mod image_layer;
+mod inmemory_layer;
+mod remote_layer;

 use crate::repository::{Key, Value};
 use crate::walrecord::NeonWalRecord;
@@ -8,13 +12,19 @@ use anyhow::Result;
 use bytes::Bytes;
 use std::ops::Range;
 use std::path::PathBuf;
+use std::sync::Arc;

 use utils::{
    id::{TenantId, TimelineId},
    lsn::Lsn,
 };

-use super::filename::LayerFileName;
+pub use delta_layer::{DeltaLayer, DeltaLayerWriter};
+pub use filename::{DeltaFileName, ImageFileName, LayerFileName, PathOrConf};
+pub use image_layer::{ImageLayer, ImageLayerWriter};
+pub use inmemory_layer::InMemoryLayer;
+pub use remote_layer::RemoteLayer;
+
 pub fn range_overlaps<T>(a: &Range<T>, b: &Range<T>) -> bool
 where
    T: PartialOrd<T>,
@@ -116,6 +126,12 @@ pub trait Layer: Send + Sync {
    fn dump(&self, verbose: bool) -> Result<()>;
 }

+/// Returned by [`Layer::iter`]
+pub type LayerIter<'i> = Box<dyn Iterator<Item = Result<(Key, Lsn, Value)>> + 'i>;
+
+/// Returned by [`Layer::key_iter`]
+pub type LayerKeyIter<'i> = Box<dyn Iterator<Item = (Key, Lsn, u64)> + 'i>;
+
 /// A Layer contains all data in a "rectangle" consisting of a range of keys and
 /// range of LSNs.
 ///
@@ -141,17 +157,42 @@ pub trait PersistentLayer: Layer {
    fn filename(&self) -> LayerFileName;

    // Path to the layer file in the local filesystem.
-    fn local_path(&self) -> PathBuf;
+    // `None` for `RemoteLayer`.
+    fn local_path(&self) -> Option<PathBuf>;

    /// Iterate through all keys and values stored in the layer
-    fn iter(&self) -> Box<dyn Iterator<Item = Result<(Key, Lsn, Value)>> + '_>;
+    fn iter(&self) -> Result<LayerIter<'_>>;

    /// Iterate through all keys stored in the layer. Returns key, lsn and value size
    /// It is used only for compaction and so is currently implemented only for DeltaLayer
-    fn key_iter(&self) -> Box<dyn Iterator<Item = (Key, Lsn, u64)> + '_> {
+    fn key_iter(&self) -> Result<LayerKeyIter<'_>> {
        panic!("Not implemented")
    }

    /// Permanently remove this layer from disk.
    fn delete(&self) -> Result<()>;
+
+    fn downcast_remote_layer(self: Arc<Self>) -> Option<std::sync::Arc<RemoteLayer>> {
+        None
+    }
+
+    fn is_remote_layer(&self) -> bool {
+        false
+    }
+
+    /// Returns None if the layer file size is not known.
+    ///
+    /// Should not change over the lifetime of the layer object because
+    /// current_physical_size is computed as the sum of this value.
+    fn file_size(&self) -> Option<u64>;
+}
+
+pub fn downcast_remote_layer(
+    layer: &Arc<dyn PersistentLayer>,
+) -> Option<std::sync::Arc<RemoteLayer>> {
+    if layer.is_remote_layer() {
+        Arc::clone(layer).downcast_remote_layer()
+    } else {
+        None
+    }
 }
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -29,7 +29,6 @@ use crate::repository::{Key, Value, KEY_SIZE};
 use crate::tenant::blob_io::{BlobCursor, BlobWriter, WriteBlobWriter};
 use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockReader, FileBlockReader};
 use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
-use crate::tenant::filename::{DeltaFileName, PathOrConf};
 use crate::tenant::storage_layer::{
    PersistentLayer, ValueReconstructResult, ValueReconstructState,
 };
@@ -39,7 +38,7 @@ use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
 use anyhow::{bail, ensure, Context, Result};
 use rand::{distributions::Alphanumeric, Rng};
 use serde::{Deserialize, Serialize};
-use std::fs;
+use std::fs::{self, File};
 use std::io::{BufWriter, Write};
 use std::io::{Seek, SeekFrom};
 use std::ops::Range;
@@ -54,8 +53,7 @@ use utils::{
    lsn::Lsn,
 };

-use super::filename::LayerFileName;
-use super::storage_layer::Layer;
+use super::{DeltaFileName, Layer, LayerFileName, LayerIter, LayerKeyIter, PathOrConf};

 ///
 /// Header stored in the beginning of the file
@@ -183,6 +181,8 @@ pub struct DeltaLayer {
    pub key_range: Range<Key>,
    pub lsn_range: Range<Lsn>,

+    pub file_size: u64,
+
    inner: RwLock<DeltaLayerInner>,
 }

@@ -387,32 +387,23 @@ impl PersistentLayer for DeltaLayer {
        self.layer_name().into()
    }

-    fn local_path(&self) -> PathBuf {
-        self.path()
+    fn local_path(&self) -> Option<PathBuf> {
+        Some(self.path())
    }

-    fn iter<'a>(&'a self) -> Box<dyn Iterator<Item = anyhow::Result<(Key, Lsn, Value)>> + 'a> {
-        let inner = match self.load() {
-            Ok(inner) => inner,
-            Err(e) => panic!("Failed to load a delta layer: {e:?}"),
-        };
-
-        match DeltaValueIter::new(inner) {
+    fn iter(&self) -> Result<LayerIter<'_>> {
+        let inner = self.load().context("load delta layer")?;
+        Ok(match DeltaValueIter::new(inner) {
            Ok(iter) => Box::new(iter),
            Err(err) => Box::new(std::iter::once(Err(err))),
-        }
+        })
    }

-    fn key_iter<'a>(&'a self) -> Box<dyn Iterator<Item = (Key, Lsn, u64)> + 'a> {
-        let inner = match self.load() {
-            Ok(inner) => inner,
-            Err(e) => panic!("Failed to load a delta layer: {e:?}"),
-        };
-
-        match DeltaKeyIter::new(inner) {
-            Ok(iter) => Box::new(iter),
-            Err(e) => panic!("Layer index is corrupted: {e:?}"),
-        }
+    fn key_iter(&self) -> Result<LayerKeyIter<'_>> {
+        let inner = self.load()?;
+        Ok(Box::new(
+            DeltaKeyIter::new(inner).context("Layer index is corrupted")?,
+        ))
    }

    fn delete(&self) -> Result<()> {
@@ -420,6 +411,10 @@ impl PersistentLayer for DeltaLayer {
        fs::remove_file(self.path())?;
        Ok(())
    }
+
+    fn file_size(&self) -> Option<u64> {
+        Some(self.file_size)
+    }
 }

 impl DeltaLayer {
@@ -544,6 +539,7 @@ impl DeltaLayer {
        timeline_id: TimelineId,
        tenant_id: TenantId,
        filename: &DeltaFileName,
+        file_size: u64,
    ) -> DeltaLayer {
        DeltaLayer {
            path_or_conf: PathOrConf::Conf(conf),
@@ -551,6 +547,7 @@ impl DeltaLayer {
            tenant_id,
            key_range: filename.key_range.clone(),
            lsn_range: filename.lsn_range.clone(),
+            file_size,
            inner: RwLock::new(DeltaLayerInner {
                loaded: false,
                file: None,
@@ -563,21 +560,23 @@ impl DeltaLayer {
    /// Create a DeltaLayer struct representing an existing file on disk.
    ///
    /// This variant is only used for debugging purposes, by the 'pageserver_binutils' binary.
-    pub fn new_for_path<F>(path: &Path, file: F) -> Result<Self>
-    where
-        F: FileExt,
-    {
+    pub fn new_for_path(path: &Path, file: File) -> Result<Self> {
        let mut summary_buf = Vec::new();
        summary_buf.resize(PAGE_SZ, 0);
        file.read_exact_at(&mut summary_buf, 0)?;
        let summary = Summary::des_prefix(&summary_buf)?;

+        let metadata = file
+            .metadata()
+            .context("get file metadata to determine size")?;
+
        Ok(DeltaLayer {
            path_or_conf: PathOrConf::Path(path.to_path_buf()),
            timeline_id: summary.timeline_id,
            tenant_id: summary.tenant_id,
            key_range: summary.key_range,
            lsn_range: summary.lsn_range,
+            file_size: metadata.len(),
            inner: RwLock::new(DeltaLayerInner {
                loaded: false,
                file: None,
@@ -734,6 +733,10 @@ impl DeltaLayerWriterInner {
        file.seek(SeekFrom::Start(0))?;
        Summary::ser_into(&summary, &mut file)?;

+        let metadata = file
+            .metadata()
+            .context("get file metadata to determine size")?;
+
        // Note: Because we opened the file in write-only mode, we cannot
        // reuse the same VirtualFile for reading later. That's why we don't
        // set inner.file here. The first read will have to re-open it.
@@ -743,6 +746,7 @@ impl DeltaLayerWriterInner {
            timeline_id: self.timeline_id,
            key_range: self.key_start..key_end,
            lsn_range: self.lsn_range.clone(),
+            file_size: metadata.len(),
            inner: RwLock::new(DeltaLayerInner {
                loaded: false,
                file: None,
--- a/pageserver/src/tenant/storage_layer/filename.rs
+++ b/pageserver/src/tenant/storage_layer/filename.rs
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -21,11 +21,10 @@
 //! actual page images are stored in the "values" part.
 use crate::config::PageServerConf;
 use crate::page_cache::PAGE_SZ;
-use crate::repository::{Key, Value, KEY_SIZE};
+use crate::repository::{Key, KEY_SIZE};
 use crate::tenant::blob_io::{BlobCursor, BlobWriter, WriteBlobWriter};
 use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader};
 use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
-use crate::tenant::filename::{ImageFileName, PathOrConf};
 use crate::tenant::storage_layer::{
    PersistentLayer, ValueReconstructResult, ValueReconstructState,
 };
@@ -36,10 +35,11 @@ use bytes::Bytes;
 use hex;
 use rand::{distributions::Alphanumeric, Rng};
 use serde::{Deserialize, Serialize};
-use std::fs;
+use std::fs::{self, File};
 use std::io::Write;
 use std::io::{Seek, SeekFrom};
 use std::ops::Range;
+use std::os::unix::prelude::FileExt;
 use std::path::{Path, PathBuf};
 use std::sync::{RwLock, RwLockReadGuard};
 use tracing::*;
@@ -50,8 +50,8 @@ use utils::{
    lsn::Lsn,
 };

-use super::filename::LayerFileName;
-use super::storage_layer::Layer;
+use super::filename::{ImageFileName, LayerFileName, PathOrConf};
+use super::{Layer, LayerIter};

 ///
 /// Header stored in the beginning of the file
@@ -105,6 +105,7 @@ pub struct ImageLayer {
    pub tenant_id: TenantId,
    pub timeline_id: TimelineId,
    pub key_range: Range<Key>,
+    pub file_size: u64,

    // This entry contains an image of all pages as of this LSN
    pub lsn: Lsn,
@@ -208,8 +209,8 @@ impl PersistentLayer for ImageLayer {
        self.layer_name().into()
    }

-    fn local_path(&self) -> PathBuf {
-        self.path()
+    fn local_path(&self) -> Option<PathBuf> {
+        Some(self.path())
    }

    fn get_tenant_id(&self) -> TenantId {
@@ -219,7 +220,7 @@ impl PersistentLayer for ImageLayer {
    fn get_timeline_id(&self) -> TimelineId {
        self.timeline_id
    }
-    fn iter(&self) -> Box<dyn Iterator<Item = Result<(Key, Lsn, Value)>>> {
+    fn iter(&self) -> Result<LayerIter<'_>> {
        unimplemented!();
    }

@@ -228,6 +229,10 @@ impl PersistentLayer for ImageLayer {
        fs::remove_file(self.path())?;
        Ok(())
    }
+
+    fn file_size(&self) -> Option<u64> {
+        Some(self.file_size)
+    }
 }

 impl ImageLayer {
@@ -344,6 +349,7 @@ impl ImageLayer {
        timeline_id: TimelineId,
        tenant_id: TenantId,
        filename: &ImageFileName,
+        file_size: u64,
    ) -> ImageLayer {
        ImageLayer {
            path_or_conf: PathOrConf::Conf(conf),
@@ -351,6 +357,7 @@ impl ImageLayer {
            tenant_id,
            key_range: filename.key_range.clone(),
            lsn: filename.lsn,
+            file_size,
            inner: RwLock::new(ImageLayerInner {
                loaded: false,
                file: None,
@@ -363,21 +370,21 @@ impl ImageLayer {
    /// Create an ImageLayer struct representing an existing file on disk.
    ///
    /// This variant is only used for debugging purposes, by the 'pageserver_binutils' binary.
-    pub fn new_for_path<F>(path: &Path, file: F) -> Result<ImageLayer>
-    where
-        F: std::os::unix::prelude::FileExt,
-    {
+    pub fn new_for_path(path: &Path, file: File) -> Result<ImageLayer> {
        let mut summary_buf = Vec::new();
        summary_buf.resize(PAGE_SZ, 0);
        file.read_exact_at(&mut summary_buf, 0)?;
        let summary = Summary::des_prefix(&summary_buf)?;
-
+        let metadata = file
+            .metadata()
+            .context("get file metadata to determine size")?;
        Ok(ImageLayer {
            path_or_conf: PathOrConf::Path(path.to_path_buf()),
            timeline_id: summary.timeline_id,
            tenant_id: summary.tenant_id,
            key_range: summary.key_range,
            lsn: summary.lsn,
+            file_size: metadata.len(),
            inner: RwLock::new(ImageLayerInner {
                file: None,
                loaded: false,
@@ -523,6 +530,10 @@ impl ImageLayerWriterInner {
        file.seek(SeekFrom::Start(0))?;
        Summary::ser_into(&summary, &mut file)?;

+        let metadata = file
+            .metadata()
+            .context("get metadata to determine file size")?;
+
        // Note: Because we open the file in write-only mode, we cannot
        // reuse the same VirtualFile for reading later. That's why we don't
        // set inner.file here. The first read will have to re-open it.
@@ -532,6 +543,7 @@ impl ImageLayerWriterInner {
            tenant_id: self.tenant_id,
            key_range: self.key_range.clone(),
            lsn: self.lsn,
+            file_size: metadata.len(),
            inner: RwLock::new(ImageLayerInner {
                loaded: false,
                file: None,
@@ -556,7 +568,7 @@ impl ImageLayerWriterInner {
                lsn: self.lsn,
            },
        );
-        std::fs::rename(self.path, &final_path)?;
+        std::fs::rename(self.path, final_path)?;

        trace!("created image layer {}", layer.path().display());

--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -8,7 +8,6 @@ use crate::config::PageServerConf;
 use crate::repository::{Key, Value};
 use crate::tenant::blob_io::{BlobCursor, BlobWriter};
 use crate::tenant::block_io::BlockReader;
-use crate::tenant::delta_layer::{DeltaLayer, DeltaLayerWriter};
 use crate::tenant::ephemeral_file::EphemeralFile;
 use crate::tenant::storage_layer::{ValueReconstructResult, ValueReconstructState};
 use crate::walrecord;
@@ -28,7 +27,7 @@ use std::fmt::Write as _;
 use std::ops::Range;
 use std::sync::RwLock;

-use super::storage_layer::Layer;
+use super::{DeltaLayer, DeltaLayerWriter, Layer};

 thread_local! {
    /// A buffer for serializing object during [`InMemoryLayer::put_value`].
@@ -97,6 +96,7 @@ impl Layer for InMemoryLayer {
        };
        self.start_lsn..end_lsn
    }
+
    fn is_incremental(&self) -> bool {
        // in-memory layer is always considered incremental.
        true
--- a/pageserver/src/tenant/storage_layer/remote_layer.rs
+++ b/pageserver/src/tenant/storage_layer/remote_layer.rs
@@ -0,0 +1,210 @@
+//! A RemoteLayer is an in-memory placeholder for a layer file that exists
+//! in remote storage.
+//!
+use crate::config::PageServerConf;
+use crate::repository::Key;
+use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
+use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
+use anyhow::{bail, Result};
+use std::ops::Range;
+use std::path::PathBuf;
+use std::sync::Arc;
+
+use utils::{
+    id::{TenantId, TimelineId},
+    lsn::Lsn,
+};
+
+use super::filename::{DeltaFileName, ImageFileName, LayerFileName};
+use super::image_layer::ImageLayer;
+use super::{DeltaLayer, LayerIter, LayerKeyIter, PersistentLayer};
+
+#[derive(Debug)]
+pub struct RemoteLayer {
+    tenantid: TenantId,
+    timelineid: TimelineId,
+    key_range: Range<Key>,
+    lsn_range: Range<Lsn>,
+
+    pub file_name: LayerFileName,
+
+    pub layer_metadata: LayerFileMetadata,
+
+    is_delta: bool,
+
+    is_incremental: bool,
+
+    pub(crate) ongoing_download: Arc<tokio::sync::Semaphore>,
+}
+
+impl Layer for RemoteLayer {
+    fn get_key_range(&self) -> Range<Key> {
+        self.key_range.clone()
+    }
+
+    fn get_lsn_range(&self) -> Range<Lsn> {
+        self.lsn_range.clone()
+    }
+
+    fn get_value_reconstruct_data(
+        &self,
+        _key: Key,
+        _lsn_range: Range<Lsn>,
+        _reconstruct_state: &mut ValueReconstructState,
+    ) -> Result<ValueReconstructResult> {
+        bail!(
+            "layer {} needs to be downloaded",
+            self.filename().file_name()
+        );
+    }
+
+    fn is_incremental(&self) -> bool {
+        self.is_incremental
+    }
+
+    /// debugging function to print out the contents of the layer
+    fn dump(&self, _verbose: bool) -> Result<()> {
+        println!(
+            "----- remote layer for ten {} tli {} keys {}-{} lsn {}-{} ----",
+            self.tenantid,
+            self.timelineid,
+            self.key_range.start,
+            self.key_range.end,
+            self.lsn_range.start,
+            self.lsn_range.end
+        );
+
+        Ok(())
+    }
+
+    fn short_id(&self) -> String {
+        self.filename().file_name()
+    }
+}
+
+impl PersistentLayer for RemoteLayer {
+    fn get_tenant_id(&self) -> TenantId {
+        self.tenantid
+    }
+
+    fn get_timeline_id(&self) -> TimelineId {
+        self.timelineid
+    }
+
+    fn filename(&self) -> LayerFileName {
+        if self.is_delta {
+            DeltaFileName {
+                key_range: self.key_range.clone(),
+                lsn_range: self.lsn_range.clone(),
+            }
+            .into()
+        } else {
+            ImageFileName {
+                key_range: self.key_range.clone(),
+                lsn: self.lsn_range.start,
+            }
+            .into()
+        }
+    }
+
+    fn local_path(&self) -> Option<PathBuf> {
+        None
+    }
+
+    fn iter(&self) -> Result<LayerIter<'_>> {
+        bail!("cannot iterate a remote layer");
+    }
+
+    fn key_iter(&self) -> Result<LayerKeyIter<'_>> {
+        bail!("cannot iterate a remote layer");
+    }
+
+    fn delete(&self) -> Result<()> {
+        Ok(())
+    }
+
+    fn downcast_remote_layer<'a>(self: Arc<Self>) -> Option<std::sync::Arc<RemoteLayer>> {
+        Some(self)
+    }
+
+    fn is_remote_layer(&self) -> bool {
+        true
+    }
+
+    fn file_size(&self) -> Option<u64> {
+        self.layer_metadata.file_size()
+    }
+}
+
+impl RemoteLayer {
+    pub fn new_img(
+        tenantid: TenantId,
+        timelineid: TimelineId,
+        fname: &ImageFileName,
+        layer_metadata: &LayerFileMetadata,
+    ) -> RemoteLayer {
+        RemoteLayer {
+            tenantid,
+            timelineid,
+            key_range: fname.key_range.clone(),
+            lsn_range: fname.lsn..(fname.lsn + 1),
+            is_delta: false,
+            is_incremental: false,
+            file_name: fname.to_owned().into(),
+            layer_metadata: layer_metadata.clone(),
+            ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)),
+        }
+    }
+
+    pub fn new_delta(
+        tenantid: TenantId,
+        timelineid: TimelineId,
+        fname: &DeltaFileName,
+        layer_metadata: &LayerFileMetadata,
+    ) -> RemoteLayer {
+        RemoteLayer {
+            tenantid,
+            timelineid,
+            key_range: fname.key_range.clone(),
+            lsn_range: fname.lsn_range.clone(),
+            is_delta: true,
+            is_incremental: true,
+            file_name: fname.to_owned().into(),
+            layer_metadata: layer_metadata.clone(),
+            ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)),
+        }
+    }
+
+    /// Create a Layer struct representing this layer, after it has been downloaded.
+    pub fn create_downloaded_layer(
+        &self,
+        conf: &'static PageServerConf,
+        file_size: u64,
+    ) -> Arc<dyn PersistentLayer> {
+        if self.is_delta {
+            let fname = DeltaFileName {
+                key_range: self.key_range.clone(),
+                lsn_range: self.lsn_range.clone(),
+            };
+            Arc::new(DeltaLayer::new(
+                conf,
+                self.timelineid,
+                self.tenantid,
+                &fname,
+                file_size,
+            ))
+        } else {
+            let fname = ImageFileName {
+                key_range: self.key_range.clone(),
+                lsn: self.lsn_range.start,
+            };
+            Arc::new(ImageLayer::new(
+                conf,
+                self.timelineid,
+                self.tenantid,
+                &fname,
+                file_size,
+            ))
+        }
+    }
+}
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -8,8 +8,8 @@ use std::time::Duration;
 use crate::metrics::TENANT_TASK_EVENTS;
 use crate::task_mgr;
 use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME};
+use crate::tenant::mgr;
 use crate::tenant::{Tenant, TenantState};
-use crate::tenant_mgr;
 use tracing::*;
 use utils::id::TenantId;

@@ -127,7 +127,7 @@ async fn gc_loop(tenant_id: TenantId) {
            } else {
                // Run gc
                if gc_horizon > 0 {
-                    if let Err(e) = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), false).await
+                    if let Err(e) = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval()).await
                    {
                        sleep_duration = wait_duration;
                        error!("Gc failed, retrying in {:?}: {e:?}", sleep_duration);
@@ -155,7 +155,7 @@ async fn wait_for_active_tenant(
    wait: Duration,
 ) -> ControlFlow<(), Arc<Tenant>> {
    let tenant = loop {
-        match tenant_mgr::get_tenant(tenant_id, false).await {
+        match mgr::get_tenant(tenant_id, false).await {
            Ok(tenant) => break tenant,
            Err(e) => {
                error!("Failed to get a tenant {tenant_id}: {e:#}");
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
--- a/pageserver/src/tenant/upload_queue.rs
+++ b/pageserver/src/tenant/upload_queue.rs
@@ -0,0 +1,213 @@
+use crate::metrics::RemoteOpFileKind;
+
+use super::storage_layer::LayerFileName;
+use crate::tenant::metadata::TimelineMetadata;
+use crate::tenant::remote_timeline_client::index::IndexPart;
+use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
+use std::collections::{HashMap, VecDeque};
+use std::fmt::Debug;
+
+use std::sync::Arc;
+use tracing::info;
+
+use std::sync::atomic::AtomicU32;
+use utils::lsn::Lsn;
+
+// clippy warns that Uninitialized is much smaller than Initialized, which wastes
+// memory for Uninitialized variants. Doesn't matter in practice, there are not
+// that many upload queues in a running pageserver, and most of them are initialized
+// anyway.
+#[allow(clippy::large_enum_variant)]
+pub(crate) enum UploadQueue {
+    Uninitialized,
+    Initialized(UploadQueueInitialized),
+    Stopped(UploadQueueStopped),
+}
+
+impl UploadQueue {
+    fn as_str(&self) -> &'static str {
+        match self {
+            UploadQueue::Uninitialized => "Uninitialized",
+            UploadQueue::Initialized(_) => "Initialized",
+            UploadQueue::Stopped(_) => "Stopped",
+        }
+    }
+}
+
+/// This keeps track of queued and in-progress tasks.
+pub(crate) struct UploadQueueInitialized {
+    /// Counter to assign task IDs
+    pub(crate) task_counter: u64,
+
+    /// All layer files stored in the remote storage, taking into account all
+    /// in-progress and queued operations
+    pub(crate) latest_files: HashMap<LayerFileName, LayerFileMetadata>,
+
+    /// How many file uploads or deletions have been scheduled since the
+    /// last (scheduling of) metadata index upload?
+    pub(crate) latest_files_changes_since_metadata_upload_scheduled: u64,
+
+    /// Metadata stored in the remote storage, taking into account all
+    /// in-progress and queued operations.
+    /// DANGER: do not return to outside world, e.g., safekeepers.
+    pub(crate) latest_metadata: TimelineMetadata,
+
+    /// `disk_consistent_lsn` from the last metadata file that was successfully
+    /// uploaded. `Lsn(0)` if nothing was uploaded yet.
+    /// Unlike `latest_files` or `latest_metadata`, this value is never ahead.
+    /// Safekeeper can rely on it to make decisions for WAL storage.
+    pub(crate) last_uploaded_consistent_lsn: Lsn,
+
+    // Breakdown of different kinds of tasks currently in-progress
+    pub(crate) num_inprogress_layer_uploads: usize,
+    pub(crate) num_inprogress_metadata_uploads: usize,
+    pub(crate) num_inprogress_deletions: usize,
+
+    /// Tasks that are currently in-progress. In-progress means that a tokio Task
+    /// has been launched for it. An in-progress task can be busy uploading, but it can
+    /// also be waiting on the `concurrency_limiter` Semaphore in S3Bucket, or it can
+    /// be waiting for retry in `exponential_backoff`.
+    pub(crate) inprogress_tasks: HashMap<u64, Arc<UploadTask>>,
+
+    /// Queued operations that have not been launched yet. They might depend on previous
+    /// tasks to finish. For example, metadata upload cannot be performed before all
+    /// preceding layer file uploads have completed.
+    pub(crate) queued_operations: VecDeque<UploadOp>,
+}
+
+pub(crate) struct UploadQueueStopped {
+    pub(crate) last_uploaded_consistent_lsn: Lsn,
+}
+
+impl UploadQueue {
+    pub(crate) fn initialize_empty_remote(
+        &mut self,
+        metadata: &TimelineMetadata,
+    ) -> anyhow::Result<&mut UploadQueueInitialized> {
+        match self {
+            UploadQueue::Uninitialized => (),
+            UploadQueue::Initialized(_) | UploadQueue::Stopped(_) => {
+                anyhow::bail!("already initialized, state {}", self.as_str())
+            }
+        }
+
+        info!("initializing upload queue for empty remote");
+
+        let state = UploadQueueInitialized {
+            // As described in the doc comment, it's ok for `latest_files` and `latest_metadata` to be ahead.
+            latest_files: HashMap::new(),
+            latest_files_changes_since_metadata_upload_scheduled: 0,
+            latest_metadata: metadata.clone(),
+            // We haven't uploaded anything yet, so, `last_uploaded_consistent_lsn` must be 0 to prevent
+            // safekeepers from garbage-collecting anything.
+            last_uploaded_consistent_lsn: Lsn(0),
+            // what follows are boring default initializations
+            task_counter: 0,
+            num_inprogress_layer_uploads: 0,
+            num_inprogress_metadata_uploads: 0,
+            num_inprogress_deletions: 0,
+            inprogress_tasks: HashMap::new(),
+            queued_operations: VecDeque::new(),
+        };
+
+        *self = UploadQueue::Initialized(state);
+        Ok(self.initialized_mut().expect("we just set it"))
+    }
+
+    pub(crate) fn initialize_with_current_remote_index_part(
+        &mut self,
+        index_part: &IndexPart,
+    ) -> anyhow::Result<&mut UploadQueueInitialized> {
+        match self {
+            UploadQueue::Uninitialized => (),
+            UploadQueue::Initialized(_) | UploadQueue::Stopped(_) => {
+                anyhow::bail!("already initialized, state {}", self.as_str())
+            }
+        }
+
+        let mut files = HashMap::with_capacity(index_part.timeline_layers.len());
+        for layer_name in &index_part.timeline_layers {
+            let layer_metadata = index_part
+                .layer_metadata
+                .get(layer_name)
+                .map(LayerFileMetadata::from)
+                .unwrap_or(LayerFileMetadata::MISSING);
+            files.insert(layer_name.to_owned(), layer_metadata);
+        }
+
+        let index_part_metadata = index_part.parse_metadata()?;
+        info!(
+            "initializing upload queue with remote index_part.disk_consistent_lsn: {}",
+            index_part_metadata.disk_consistent_lsn()
+        );
+
+        let state = UploadQueueInitialized {
+            latest_files: files,
+            latest_files_changes_since_metadata_upload_scheduled: 0,
+            latest_metadata: index_part_metadata.clone(),
+            last_uploaded_consistent_lsn: index_part_metadata.disk_consistent_lsn(),
+            // what follows are boring default initializations
+            task_counter: 0,
+            num_inprogress_layer_uploads: 0,
+            num_inprogress_metadata_uploads: 0,
+            num_inprogress_deletions: 0,
+            inprogress_tasks: HashMap::new(),
+            queued_operations: VecDeque::new(),
+        };
+
+        *self = UploadQueue::Initialized(state);
+        Ok(self.initialized_mut().expect("we just set it"))
+    }
+
+    pub(crate) fn initialized_mut(&mut self) -> anyhow::Result<&mut UploadQueueInitialized> {
+        match self {
+            UploadQueue::Uninitialized | UploadQueue::Stopped(_) => {
+                anyhow::bail!("queue is in state {}", self.as_str())
+            }
+            UploadQueue::Initialized(x) => Ok(x),
+        }
+    }
+}
+
+/// An in-progress upload or delete task.
+#[derive(Debug)]
+pub(crate) struct UploadTask {
+    /// Unique ID of this task. Used as the key in `inprogress_tasks` above.
+    pub(crate) task_id: u64,
+    pub(crate) retries: AtomicU32,
+
+    pub(crate) op: UploadOp,
+}
+
+#[derive(Debug)]
+pub(crate) enum UploadOp {
+    /// Upload a layer file
+    UploadLayer(LayerFileName, LayerFileMetadata),
+
+    /// Upload the metadata file
+    UploadMetadata(IndexPart, Lsn),
+
+    /// Delete a file.
+    Delete(RemoteOpFileKind, LayerFileName),
+
+    /// Barrier. Holds a watch-channel sender; what happens when the barrier operation is reached is not stated here (TODO: complete this comment).
+    Barrier(tokio::sync::watch::Sender<()>),
+}
+
+impl std::fmt::Display for UploadOp {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        match self {
+            UploadOp::UploadLayer(path, metadata) => {
+                write!(
+                    f,
+                    "UploadLayer({}, size={:?})",
+                    path.file_name(),
+                    metadata.file_size()
+                )
+            }
+            UploadOp::UploadMetadata(_, lsn) => write!(f, "UploadMetadata(lsn: {})", lsn),
+            UploadOp::Delete(_, path) => write!(f, "Delete({})", path.file_name()),
+            UploadOp::Barrier(_) => write!(f, "Barrier"),
+        }
+    }
+}
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -12,7 +12,7 @@
 //!
 use crate::metrics::{STORAGE_IO_SIZE, STORAGE_IO_TIME};
 use once_cell::sync::OnceCell;
-use std::fs::{File, OpenOptions};
+use std::fs::{self, File, OpenOptions};
 use std::io::{Error, ErrorKind, Read, Seek, SeekFrom, Write};
 use std::os::unix::fs::FileExt;
 use std::path::{Path, PathBuf};
@@ -240,6 +240,10 @@ impl VirtualFile {
        self.with_file("fsync", |file| file.sync_all())?
    }

+    pub fn metadata(&self) -> Result<fs::Metadata, Error> {
+        self.with_file("metadata", |file| file.metadata())?
+    }
+
    /// Helper function that looks up the underlying File for this VirtualFile,
    /// opening it and evicting some other File if necessary. It calls 'func'
    /// with the physical File.
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
--- a/Show More
+++ b/Show More