diff --git a/.github/PULL_REQUEST_TEMPLATE/pull_request_template.md b/.github/PULL_REQUEST_TEMPLATE/pull_request_template.md
new file mode 100644
index 0000000000..3f32b80ca8
--- /dev/null
+++ b/.github/PULL_REQUEST_TEMPLATE/pull_request_template.md
@@ -0,0 +1,10 @@
+## Describe your changes
+
+## Issue ticket number and link
+
+## Checklist before requesting a review
+- [ ] I have performed a self-review of my code.
+- [ ] If it is a core feature, I have added thorough tests.
+- [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard?
+- [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section.
+
diff --git a/.github/PULL_REQUEST_TEMPLATE/release-pr.md b/.github/PULL_REQUEST_TEMPLATE/release-pr.md
index 8fcc3bd4af..a848077e6a 100644
--- a/.github/PULL_REQUEST_TEMPLATE/release-pr.md
+++ b/.github/PULL_REQUEST_TEMPLATE/release-pr.md
@@ -14,7 +14,7 @@
 - [ ] Check [#dev-production-stream](https://neondb.slack.com/archives/C03F5SM1N02) Slack channel
 - [ ] Check [stuck projects page](https://console.neon.tech/admin/projects?sort=last_active&order=desc&stuck=true)
 - [ ] Check [recent operation failures](https://console.neon.tech/admin/operations?action=create_timeline%2Cstart_compute%2Cstop_compute%2Csuspend_compute%2Capply_config%2Cdelete_timeline%2Cdelete_tenant%2Ccreate_branch%2Ccheck_availability&sort=updated_at&order=desc&had_retries=some)
-- [ ] Check [cloud SLO dashboard](https://observer.zenith.tech/d/_oWcBMJ7k/cloud-slos?orgId=1)
-- [ ] Check [compute startup metrics dashboard](https://observer.zenith.tech/d/5OkYJEmVz/compute-startup-time)
+- [ ] Check [cloud SLO dashboard](https://neonprod.grafana.net/d/_oWcBMJ7k/cloud-slos?orgId=1)
+- [ ] Check [compute startup metrics dashboard](https://neonprod.grafana.net/d/5OkYJEmVz/compute-startup-time)
 
 <!-- List everything that should be done **after** release, any admin UI configuration / Grafana dashboard / alert changes / setting changes / etc -->
diff --git a/.github/ansible/neon-stress.hosts.yaml b/.github/ansible/neon-stress.hosts.yaml
deleted file mode 100644
index 5d5df5a6d5..0000000000
--- a/.github/ansible/neon-stress.hosts.yaml
+++ /dev/null
@@ -1,32 +0,0 @@
-storage:
-  vars:
-    bucket_name: neon-storage-ireland
-    bucket_region: eu-west-1
-    console_mgmt_base_url: http://neon-stress-console.local
-    broker_endpoint: http://storage-broker.neon-stress.local:50051
-    safekeeper_enable_s3_offload: 'false'
-    pageserver_config_stub:
-      pg_distrib_dir: /usr/local
-      remote_storage:
-        bucket_name: "{{ bucket_name }}"
-        bucket_region: "{{ bucket_region }}"
-        prefix_in_bucket: "{{ inventory_hostname }}"
-    safekeeper_s3_prefix: neon-stress/wal
-    hostname_suffix: ".local"
-    remote_user: admin
-    sentry_environment: development
-  children:
-    pageservers:
-      hosts:
-        neon-stress-ps-1:
-          console_region_id: aws-eu-west-1
-        neon-stress-ps-2:
-          console_region_id: aws-eu-west-1
-    safekeepers:
-      hosts:
-        neon-stress-sk-1:
-          console_region_id: aws-eu-west-1
-        neon-stress-sk-2:
-          console_region_id: aws-eu-west-1
-        neon-stress-sk-3:
-          console_region_id: aws-eu-west-1
diff --git a/.github/ansible/staging.eu-west-1.hosts.yaml b/.github/ansible/staging.eu-west-1.hosts.yaml
index cfcc3a9ae8..fce450ed39 100644
--- a/.github/ansible/staging.eu-west-1.hosts.yaml
+++ b/.github/ansible/staging.eu-west-1.hosts.yaml
@@ -6,6 +6,8 @@ storage:
     broker_endpoint: http://storage-broker-lb.zeta.eu-west-1.internal.aws.neon.build:50051
     pageserver_config_stub:
       pg_distrib_dir: /usr/local
+      metric_collection_endpoint: http://console-staging.local/billing/api/v1/usage_events
+      metric_collection_interval: 10min
       remote_storage:
         bucket_name: "{{ bucket_name }}"
         bucket_region: "{{ bucket_region }}"
diff --git a/.github/ansible/staging.hosts.yaml b/.github/ansible/staging.hosts.yaml
deleted file mode 100644
index 79acfd1d2a..0000000000
--- a/.github/ansible/staging.hosts.yaml
+++ /dev/null
@@ -1,35 +0,0 @@
-storage:
-  vars:
-    bucket_name: zenith-staging-storage-us-east-1
-    bucket_region: us-east-1
-    console_mgmt_base_url: http://console-staging.local
-    broker_endpoint: http://storage-broker.staging.local:50051
-    pageserver_config_stub:
-      pg_distrib_dir: /usr/local
-      remote_storage:
-        bucket_name: "{{ bucket_name }}"
-        bucket_region: "{{ bucket_region }}"
-        prefix_in_bucket: "{{ inventory_hostname }}"
-    safekeeper_s3_prefix: us-stage/wal
-    hostname_suffix: ".local"
-    remote_user: admin
-    sentry_environment: development
-
-  children:
-    pageservers:
-      hosts:
-        zenith-us-stage-ps-2:
-          console_region_id: aws-us-east-1
-        zenith-us-stage-ps-3:
-          console_region_id: aws-us-east-1
-        zenith-us-stage-ps-4:
-          console_region_id: aws-us-east-1
-
-    safekeepers:
-      hosts:
-        zenith-us-stage-sk-4:
-          console_region_id: aws-us-east-1
-        zenith-us-stage-sk-5:
-          console_region_id: aws-us-east-1
-        zenith-us-stage-sk-6:
-          console_region_id: aws-us-east-1
diff --git a/.github/ansible/staging.us-east-2.hosts.yaml b/.github/ansible/staging.us-east-2.hosts.yaml
index 78a4582e57..1d1b8dbfa4 100644
--- a/.github/ansible/staging.us-east-2.hosts.yaml
+++ b/.github/ansible/staging.us-east-2.hosts.yaml
@@ -6,6 +6,8 @@ storage:
     broker_endpoint: http://storage-broker-lb.beta.us-east-2.internal.aws.neon.build:50051
     pageserver_config_stub:
       pg_distrib_dir: /usr/local
+      metric_collection_endpoint: http://console-staging.local/billing/api/v1/usage_events
+      metric_collection_interval: 10min
       remote_storage:
         bucket_name: "{{ bucket_name }}"
         bucket_region: "{{ bucket_region }}"
@@ -25,6 +27,8 @@ storage:
           ansible_host: i-0c3e70929edb5d691
         pageserver-1.us-east-2.aws.neon.build:
           ansible_host: i-0565a8b4008aa3f40
+        pageserver-2.us-east-2.aws.neon.build:
+          ansible_host: i-01e31cdf7e970586a
 
     safekeepers:
       hosts:
diff --git a/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml b/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml
index ae9c1f2e40..08304503c5 100644
--- a/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml
+++ b/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml
@@ -9,6 +9,7 @@ settings:
   authEndpoint: "http://console-staging.local/management/api/v2"
   domain: "*.eu-west-1.aws.neon.build"
   sentryEnvironment: "development"
+  wssPort: 8443
 
 # -- Additional labels for neon-proxy pods
 podLabels:
@@ -23,6 +24,7 @@ exposedService:
     service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
     service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
     external-dns.alpha.kubernetes.io/hostname: eu-west-1.aws.neon.build
+  httpsPort: 443
 
 #metrics:
 #  enabled: true
diff --git a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml
index a2f932e4fb..be0fc329c9 100644
--- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml
+++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml
@@ -9,6 +9,7 @@ settings:
   authEndpoint: "http://console-staging.local/management/api/v2"
   domain: "*.cloud.stage.neon.tech"
   sentryEnvironment: "development"
+  wssPort: 8443
 
 # -- Additional labels for neon-proxy pods
 podLabels:
@@ -23,6 +24,7 @@ exposedService:
     service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
     service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
     external-dns.alpha.kubernetes.io/hostname: neon-proxy-scram-legacy.beta.us-east-2.aws.neon.build
+  httpsPort: 443
 
 #metrics:
 #  enabled: true
diff --git a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml
index 1138536e94..b7f712585b 100644
--- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml
+++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml
@@ -9,6 +9,7 @@ settings:
   authEndpoint: "http://console-staging.local/management/api/v2"
   domain: "*.us-east-2.aws.neon.build"
   sentryEnvironment: "development"
+  wssPort: 8443
 
 # -- Additional labels for neon-proxy pods
 podLabels:
@@ -23,6 +24,7 @@ exposedService:
     service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
     service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
     external-dns.alpha.kubernetes.io/hostname: us-east-2.aws.neon.build
+  httpsPort: 443
 
 #metrics:
 #  enabled: true
diff --git a/.github/helm-values/neon-stress.neon-storage-broker.yaml b/.github/helm-values/neon-stress.neon-storage-broker.yaml
deleted file mode 100644
index e11e5d4214..0000000000
--- a/.github/helm-values/neon-stress.neon-storage-broker.yaml
+++ /dev/null
@@ -1,56 +0,0 @@
-# Helm chart values for neon-storage-broker
-podLabels:
-  neon_env: neon-stress
-  neon_service: storage-broker
-
-# Use L4 LB
-service:
-  # service.annotations -- Annotations to add to the service
-  annotations:
-    service.beta.kubernetes.io/aws-load-balancer-type: external  # use newer AWS Load Balancer Controller
-    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
-    service.beta.kubernetes.io/aws-load-balancer-scheme: internal  # deploy LB to private subnet
-    # assign service to this name at external-dns
-    external-dns.alpha.kubernetes.io/hostname: storage-broker.neon-stress.local
-  # service.type -- Service type
-  type: LoadBalancer
-  # service.port -- broker listen port
-  port: 50051
-
-ingress:
-  enabled: false
-
-metrics:
-  enabled: true
-  serviceMonitor:
-    enabled: true
-    selector:
-      release: kube-prometheus-stack
-
-extraManifests:
-  - apiVersion: operator.victoriametrics.com/v1beta1
-    kind: VMServiceScrape
-    metadata:
-      name: "{{ include \"neon-storage-broker.fullname\" . }}"
-      labels:
-        helm.sh/chart: neon-storage-broker-{{ .Chart.Version }}
-        app.kubernetes.io/name: neon-storage-broker
-        app.kubernetes.io/instance: neon-storage-broker
-        app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
-        app.kubernetes.io/managed-by: Helm
-      namespace: "{{ .Release.Namespace }}"
-    spec:
-      selector:
-        matchLabels:
-          app.kubernetes.io/name: "neon-storage-broker"
-      endpoints:
-        - port: broker
-          path: /metrics
-          interval: 10s
-          scrapeTimeout: 10s
-      namespaceSelector:
-        matchNames:
-          - "{{ .Release.Namespace }}"
-
-settings:
-  sentryEnvironment: "development"
diff --git a/.github/helm-values/neon-stress.proxy-scram.yaml b/.github/helm-values/neon-stress.proxy-scram.yaml
deleted file mode 100644
index ed580349fc..0000000000
--- a/.github/helm-values/neon-stress.proxy-scram.yaml
+++ /dev/null
@@ -1,52 +0,0 @@
-fullnameOverride: "neon-stress-proxy-scram"
-
-settings:
-  authBackend: "console"
-  authEndpoint: "http://neon-stress-console.local/management/api/v2"
-  domain: "*.stress.neon.tech"
-  sentryEnvironment: "development"
-
-podLabels:
-  zenith_service: proxy-scram
-  zenith_env: staging
-  zenith_region: eu-west-1
-  zenith_region_slug: ireland
-
-exposedService:
-  annotations:
-    service.beta.kubernetes.io/aws-load-balancer-type: external
-    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
-    service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
-    external-dns.alpha.kubernetes.io/hostname: '*.stress.neon.tech'
-
-metrics:
-  enabled: true
-  serviceMonitor:
-    enabled: true
-    selector:
-      release: kube-prometheus-stack
-
-extraManifests:
-  - apiVersion: operator.victoriametrics.com/v1beta1
-    kind: VMServiceScrape
-    metadata:
-      name: "{{ include \"neon-proxy.fullname\" . }}"
-      labels:
-        helm.sh/chart: neon-proxy-{{ .Chart.Version }}
-        app.kubernetes.io/name: neon-proxy
-        app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}"
-        app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
-        app.kubernetes.io/managed-by: Helm
-      namespace: "{{ .Release.Namespace }}"
-    spec:
-      selector:
-        matchLabels:
-          app.kubernetes.io/name: "neon-proxy"
-      endpoints:
-        - port: http
-          path: /metrics
-          interval: 10s
-          scrapeTimeout: 10s
-      namespaceSelector:
-        matchNames:
-          - "{{ .Release.Namespace }}"
diff --git a/.github/helm-values/neon-stress.proxy.yaml b/.github/helm-values/neon-stress.proxy.yaml
deleted file mode 100644
index 94270ced09..0000000000
--- a/.github/helm-values/neon-stress.proxy.yaml
+++ /dev/null
@@ -1,61 +0,0 @@
-fullnameOverride: "neon-stress-proxy"
-
-settings:
-  authBackend: "link"
-  authEndpoint: "https://console.dev.neon.tech/authenticate_proxy_request/"
-  uri: "https://console.dev.neon.tech/psql_session/"
-  sentryEnvironment: "development"
-
-# -- Additional labels for zenith-proxy pods
-podLabels:
-  zenith_service: proxy
-  zenith_env: staging
-  zenith_region: eu-west-1
-  zenith_region_slug: ireland
-
-service:
-  annotations:
-    service.beta.kubernetes.io/aws-load-balancer-type: external
-    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
-    service.beta.kubernetes.io/aws-load-balancer-scheme: internal
-    external-dns.alpha.kubernetes.io/hostname: neon-stress-proxy.local
-  type: LoadBalancer
-
-exposedService:
-  annotations:
-    service.beta.kubernetes.io/aws-load-balancer-type: external
-    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
-    service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
-    external-dns.alpha.kubernetes.io/hostname: connect.dev.neon.tech
-
-metrics:
-  enabled: true
-  serviceMonitor:
-    enabled: true
-    selector:
-      release: kube-prometheus-stack
-
-extraManifests:
-  - apiVersion: operator.victoriametrics.com/v1beta1
-    kind: VMServiceScrape
-    metadata:
-      name: "{{ include \"neon-proxy.fullname\" . }}"
-      labels:
-        helm.sh/chart: neon-proxy-{{ .Chart.Version }}
-        app.kubernetes.io/name: neon-proxy
-        app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}"
-        app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
-        app.kubernetes.io/managed-by: Helm
-      namespace: "{{ .Release.Namespace }}"
-    spec:
-      selector:
-        matchLabels:
-          app.kubernetes.io/name: "neon-proxy"
-      endpoints:
-        - port: http
-          path: /metrics
-          interval: 10s
-          scrapeTimeout: 10s
-      namespaceSelector:
-        matchNames:
-          - "{{ .Release.Namespace }}"
diff --git a/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml b/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml
index 4e4aff1f9e..e9e89aff7c 100644
--- a/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml
+++ b/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml
@@ -9,6 +9,7 @@ settings:
   authEndpoint: "http://console-release.local/management/api/v2"
   domain: "*.ap-southeast-1.aws.neon.tech"
   sentryEnvironment: "production"
+  wssPort: 8443
 
 # -- Additional labels for neon-proxy pods
 podLabels:
@@ -23,6 +24,7 @@ exposedService:
     service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
     service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
     external-dns.alpha.kubernetes.io/hostname: ap-southeast-1.aws.neon.tech
+  httpsPort: 443
 
 #metrics:
 #  enabled: true
diff --git a/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml b/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml
index 94290a87e1..5366ba4ae5 100644
--- a/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml
+++ b/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml
@@ -9,6 +9,7 @@ settings:
   authEndpoint: "http://console-release.local/management/api/v2"
   domain: "*.eu-central-1.aws.neon.tech"
   sentryEnvironment: "production"
+  wssPort: 8443
 
 # -- Additional labels for neon-proxy pods
 podLabels:
@@ -23,6 +24,7 @@ exposedService:
     service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
     service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
     external-dns.alpha.kubernetes.io/hostname: eu-central-1.aws.neon.tech
+  httpsPort: 443
 
 #metrics:
 #  enabled: true
diff --git a/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml b/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml
index 1a4023708b..e71e457f13 100644
--- a/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml
+++ b/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml
@@ -9,6 +9,7 @@ settings:
   authEndpoint: "http://console-release.local/management/api/v2"
   domain: "*.us-east-2.aws.neon.tech"
   sentryEnvironment: "production"
+  wssPort: 8443
 
 # -- Additional labels for neon-proxy pods
 podLabels:
@@ -23,6 +24,7 @@ exposedService:
     service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
     service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
     external-dns.alpha.kubernetes.io/hostname: us-east-2.aws.neon.tech
+  httpsPort: 443
 
 #metrics:
 #  enabled: true
diff --git a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml
index 2942d6a2aa..9afe94edd1 100644
--- a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml
+++ b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml
@@ -9,6 +9,7 @@ settings:
   authEndpoint: "http://console-release.local/management/api/v2"
   domain: "*.us-west-2.aws.neon.tech"
   sentryEnvironment: "production"
+  wssPort: 8443
 
 # -- Additional labels for neon-proxy pods
 podLabels:
@@ -23,6 +24,7 @@ exposedService:
     service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
     service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
     external-dns.alpha.kubernetes.io/hostname: us-west-2.aws.neon.tech
+  httpsPort: 443
 
 #metrics:
 #  enabled: true
diff --git a/.github/helm-values/production.proxy-scram.yaml b/.github/helm-values/production.proxy-scram.yaml
index c7143cd61a..8143f7e575 100644
--- a/.github/helm-values/production.proxy-scram.yaml
+++ b/.github/helm-values/production.proxy-scram.yaml
@@ -3,6 +3,7 @@ settings:
   authEndpoint: "http://console-release.local/management/api/v2"
   domain: "*.cloud.neon.tech"
   sentryEnvironment: "production"
+  wssPort: 8443
 
 podLabels:
   zenith_service: proxy-scram
@@ -16,6 +17,7 @@ exposedService:
     service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
     service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
     external-dns.alpha.kubernetes.io/hostname: '*.cloud.neon.tech'
+  httpsPort: 443
 
 metrics:
   enabled: true
diff --git a/.github/helm-values/staging.neon-storage-broker.yaml b/.github/helm-values/staging.neon-storage-broker.yaml
deleted file mode 100644
index 6b21c286a1..0000000000
--- a/.github/helm-values/staging.neon-storage-broker.yaml
+++ /dev/null
@@ -1,56 +0,0 @@
-# Helm chart values for neon-storage-broker
-podLabels:
-  neon_env: staging
-  neon_service: storage-broker
-
-# Use L4 LB
-service:
-  # service.annotations -- Annotations to add to the service
-  annotations:
-    service.beta.kubernetes.io/aws-load-balancer-type: external  # use newer AWS Load Balancer Controller
-    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
-    service.beta.kubernetes.io/aws-load-balancer-scheme: internal  # deploy LB to private subnet
-    # assign service to this name at external-dns
-    external-dns.alpha.kubernetes.io/hostname: storage-broker.staging.local
-  # service.type -- Service type
-  type: LoadBalancer
-  # service.port -- broker listen port
-  port: 50051
-
-ingress:
-  enabled: false
-
-metrics:
-  enabled: true
-  serviceMonitor:
-    enabled: true
-    selector:
-      release: kube-prometheus-stack
-
-extraManifests:
-  - apiVersion: operator.victoriametrics.com/v1beta1
-    kind: VMServiceScrape
-    metadata:
-      name: "{{ include \"neon-storage-broker.fullname\" . }}"
-      labels:
-        helm.sh/chart: neon-storage-broker-{{ .Chart.Version }}
-        app.kubernetes.io/name: neon-storage-broker
-        app.kubernetes.io/instance: neon-storage-broker
-        app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
-        app.kubernetes.io/managed-by: Helm
-      namespace: "{{ .Release.Namespace }}"
-    spec:
-      selector:
-        matchLabels:
-          app.kubernetes.io/name: "neon-storage-broker"
-      endpoints:
-        - port: broker
-          path: /metrics
-          interval: 10s
-          scrapeTimeout: 10s
-      namespaceSelector:
-        matchNames:
-          - "{{ .Release.Namespace }}"
-
-settings:
-  sentryEnvironment: "development"
diff --git a/.github/helm-values/staging.proxy-scram.yaml b/.github/helm-values/staging.proxy-scram.yaml
deleted file mode 100644
index 66f9921c9a..0000000000
--- a/.github/helm-values/staging.proxy-scram.yaml
+++ /dev/null
@@ -1,57 +0,0 @@
-# Helm chart values for zenith-proxy.
-# This is a YAML-formatted file.
-
-image:
-  repository: neondatabase/neon
-
-settings:
-  authBackend: "console"
-  authEndpoint: "http://console-staging.local/management/api/v2"
-  domain: "*.cloud.stage.neon.tech"
-  sentryEnvironment: "development"
-
-# -- Additional labels for zenith-proxy pods
-podLabels:
-  zenith_service: proxy-scram
-  zenith_env: staging
-  zenith_region: us-east-1
-  zenith_region_slug: virginia
-
-exposedService:
-  annotations:
-    service.beta.kubernetes.io/aws-load-balancer-type: external
-    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
-    service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
-    external-dns.alpha.kubernetes.io/hostname: cloud.stage.neon.tech
-
-metrics:
-  enabled: true
-  serviceMonitor:
-    enabled: true
-    selector:
-      release: kube-prometheus-stack
-
-extraManifests:
-  - apiVersion: operator.victoriametrics.com/v1beta1
-    kind: VMServiceScrape
-    metadata:
-      name: "{{ include \"neon-proxy.fullname\" . }}"
-      labels:
-        helm.sh/chart: neon-proxy-{{ .Chart.Version }}
-        app.kubernetes.io/name: neon-proxy
-        app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}"
-        app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
-        app.kubernetes.io/managed-by: Helm
-      namespace: "{{ .Release.Namespace }}"
-    spec:
-      selector:
-        matchLabels:
-          app.kubernetes.io/name: "neon-proxy"
-      endpoints:
-        - port: http
-          path: /metrics
-          interval: 10s
-          scrapeTimeout: 10s
-      namespaceSelector:
-        matchNames:
-          - "{{ .Release.Namespace }}"
diff --git a/.github/helm-values/staging.proxy.yaml b/.github/helm-values/staging.proxy.yaml
deleted file mode 100644
index a22082e625..0000000000
--- a/.github/helm-values/staging.proxy.yaml
+++ /dev/null
@@ -1,57 +0,0 @@
-# Helm chart values for zenith-proxy.
-# This is a YAML-formatted file.
-
-image:
-  repository: neondatabase/neon
-
-settings:
-  authBackend: "link"
-  authEndpoint: "https://console.stage.neon.tech/authenticate_proxy_request/"
-  uri: "https://console.stage.neon.tech/psql_session/"
-  sentryEnvironment: "development"
-
-# -- Additional labels for zenith-proxy pods
-podLabels:
-  zenith_service: proxy
-  zenith_env: staging
-  zenith_region: us-east-1
-  zenith_region_slug: virginia
-
-exposedService:
-  annotations:
-    service.beta.kubernetes.io/aws-load-balancer-type: external
-    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
-    service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
-    external-dns.alpha.kubernetes.io/hostname: connect.stage.neon.tech
-
-metrics:
-  enabled: true
-  serviceMonitor:
-    enabled: true
-    selector:
-      release: kube-prometheus-stack
-
-extraManifests:
-  - apiVersion: operator.victoriametrics.com/v1beta1
-    kind: VMServiceScrape
-    metadata:
-      name: "{{ include \"neon-proxy.fullname\" . }}"
-      labels:
-        helm.sh/chart: neon-proxy-{{ .Chart.Version }}
-        app.kubernetes.io/name: neon-proxy
-        app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}"
-        app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
-        app.kubernetes.io/managed-by: Helm
-      namespace: "{{ .Release.Namespace }}"
-    spec:
-      selector:
-        matchLabels:
-          app.kubernetes.io/name: "neon-proxy"
-      endpoints:
-        - port: http
-          path: /metrics
-          interval: 10s
-          scrapeTimeout: 10s
-      namespaceSelector:
-        matchNames:
-          - "{{ .Release.Namespace }}"
diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml
index e3e0f1e820..59317f0a47 100644
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -18,6 +18,7 @@ on:
       region_id:
         description: 'Use a particular region. If not set the default region will be used'
         required: false
+        default: 'aws-us-east-2'
       save_perf_report:
         type: boolean
         description: 'Publish perf report or not. If not set, the report is published only for the main branch'
@@ -115,13 +116,10 @@ jobs:
         # neon-captest-prefetch: Same, with prefetching enabled (new project)
         # rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs
         # rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage
-        platform: [ neon-captest-new, neon-captest-reuse, neon-captest-prefetch, rds-postgres ]
+        platform: [ neon-captest-reuse, neon-captest-prefetch, rds-postgres ]
         db_size: [ 10gb ]
         runner: [ us-east-2 ]
         include:
-          - platform: neon-captest-new
-            db_size: 50gb
-            runner: us-east-2
           - platform: neon-captest-prefetch
             db_size: 50gb
             runner: us-east-2
@@ -409,7 +407,7 @@ jobs:
 
     runs-on: [ self-hosted, us-east-2, x64 ]
     container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rustlegacy:pinned
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
       options: --init
 
     timeout-minutes: 360 # 6h
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 43b855a2b0..1bbba8e3fd 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -111,6 +111,7 @@ jobs:
       # Some of our rust modules use FFI and need those to be checked
       - name: Get postgres headers
         run: make postgres-headers -j$(nproc)
+
       - name: Run cargo clippy
         run: ./run_clippy.sh
 
@@ -126,6 +127,11 @@ jobs:
           cargo hakari generate --diff  # workspace-hack Cargo.toml is up-to-date
           cargo hakari manage-deps --dry-run  # all workspace crates depend on workspace-hack
 
+      # https://github.com/EmbarkStudios/cargo-deny
+      - name: Check rust licenses/bans/advisories/sources
+        if: ${{ !cancelled() }}
+        run: cargo deny check
+
   build-neon:
     runs-on: [ self-hosted, dev, x64 ]
     container:
@@ -177,13 +183,12 @@ jobs:
       # corresponding Cargo.toml files for their descriptions.
       - name: Set env variables
         run: |
+          CARGO_FEATURES="--features testing"
           if [[ $BUILD_TYPE == "debug" ]]; then
             cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run"
-            CARGO_FEATURES="--features testing"
             CARGO_FLAGS="--locked $CARGO_FEATURES"
           elif [[ $BUILD_TYPE == "release" ]]; then
             cov_prefix=""
-            CARGO_FEATURES="--features testing,profiling"
             CARGO_FLAGS="--locked --release $CARGO_FEATURES"
           fi
           echo "cov_prefix=${cov_prefix}" >> $GITHUB_ENV
@@ -555,10 +560,14 @@ jobs:
       - name: Kaniko build compute tools
         run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-tools --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}}
 
-  compute-node-image-v14:
+  compute-node-image:
     runs-on: [ self-hosted, dev, x64 ]
     container: gcr.io/kaniko-project/executor:v1.9.0-debug
     needs: [ tag ]
+    strategy:
+      fail-fast: false
+      matrix:
+        version: [ v14, v15 ]
     defaults:
       run:
         shell: sh -eu {0}
@@ -573,32 +582,40 @@ jobs:
       - name: Configure ECR login
         run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
 
-      - name: Kaniko build compute node with extensions v14
-        run: /kaniko/executor --skip-unused-stages  --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache  --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v14 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}}
+      - name: Kaniko build compute node with extensions
+        run: /kaniko/executor --skip-unused-stages  --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache  --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-${{ matrix.version }} --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
 
-  compute-node-image-v15:
+  vm-compute-node-image:
     runs-on: [ self-hosted, dev, x64 ]
-    container: gcr.io/kaniko-project/executor:v1.9.0-debug
-    needs: [ tag ]
+    needs: [ tag, compute-node-image ]
+    strategy:
+      fail-fast: false
+      matrix:
+        version: [ v14, v15 ]
     defaults:
       run:
         shell: sh -eu {0}
 
     steps:
-      - name: Checkout
-        uses: actions/checkout@v1 # v3 won't work with kaniko
-        with:
-          submodules: true
-          fetch-depth: 0
+      - name: Downloading latest vm-builder
+        run: |
+          curl -L https://github.com/neondatabase/neonvm/releases/latest/download/vm-builder -o vm-builder
+          chmod +x vm-builder
 
-      - name: Configure ECR login
-        run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
+      - name: Pulling compute-node image
+        run: |
+          docker pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
 
-      - name: Kaniko build compute node with extensions v15
-        run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v15 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}}
+      - name: Build vm image
+        run: |
+          ./vm-builder -src=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
+
+      - name: Pushing vm-compute-node image
+        run: |
+          docker push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
 
   test-images:
-    needs: [ tag, neon-image, compute-node-image-v14, compute-node-image-v15, compute-tools-image ]
+    needs: [ tag, neon-image, compute-node-image, compute-tools-image ]
     runs-on: [ self-hosted, dev, x64 ]
 
     steps:
@@ -642,13 +659,13 @@ jobs:
 
   promote-images:
     runs-on: [ self-hosted, dev, x64 ]
-    needs: [ tag, test-images ]
+    needs: [ tag, test-images, vm-compute-node-image ]
     if: github.event_name != 'workflow_dispatch'
     container: amazon/aws-cli
     strategy:
       fail-fast: false
       matrix:
-        name: [ neon, compute-node-v14, compute-node-v15, compute-tools ]
+        name: [ neon, compute-node-v14, vm-compute-node-v14, compute-node-v15, vm-compute-node-v15, compute-tools]
 
     steps:
       - name: Promote image to latest
@@ -681,9 +698,15 @@ jobs:
       - name: Pull compute node v14 image from ECR
         run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} compute-node-v14
 
+      - name: Pull vm compute node v14 image from ECR
+        run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} vm-compute-node-v14
+
       - name: Pull compute node v15 image from ECR
         run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} compute-node-v15
 
+      - name: Pull vm compute node v15 image from ECR
+        run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} vm-compute-node-v15
+
       - name: Pull rust image from ECR
         run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned rust
 
@@ -695,7 +718,9 @@ jobs:
           crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/neon:latest
           crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest
           crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:latest
+          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:latest
           crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:latest
+          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:latest
 
       - name: Configure Docker Hub login
         run: |
@@ -712,9 +737,15 @@ jobs:
       - name: Push compute node v14 image to Docker Hub
         run: crane push compute-node-v14 neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}}
 
+      - name: Push vm compute node v14 image to Docker Hub
+        run: crane push vm-compute-node-v14 neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}}
+
       - name: Push compute node v15 image to Docker Hub
         run: crane push compute-node-v15 neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}}
 
+      - name: Push vm compute node v15 image to Docker Hub
+        run: crane push vm-compute-node-v15 neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}}
+
       - name: Push rust image to Docker Hub
         run: crane push rust neondatabase/rust:pinned
 
@@ -726,26 +757,25 @@ jobs:
           crane tag neondatabase/neon:${{needs.tag.outputs.build-tag}} latest
           crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest
           crane tag neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}} latest
+          crane tag neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest
           crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
+          crane tag neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest
 
   calculate-deploy-targets:
     runs-on: [ self-hosted, dev, x64 ]
     if: |
-      (github.ref_name == 'main' || github.ref_name == 'release') &&
+      github.ref_name == 'release' &&
       github.event_name != 'workflow_dispatch'
     outputs:
       matrix-include: ${{ steps.set-matrix.outputs.include }}
     steps:
       - id: set-matrix
         run: |
-          if [[ "$GITHUB_REF_NAME" == "main" ]]; then
-            STAGING='{"env_name": "staging", "proxy_job": "neon-proxy", "proxy_config": "staging.proxy", "storage_broker_ns": "neon-storage-broker", "storage_broker_config": "staging.neon-storage-broker", "kubeconfig_secret": "STAGING_KUBECONFIG_DATA", "console_api_key_secret": "NEON_STAGING_API_KEY"}'
-            echo "include=[$STAGING]" >> $GITHUB_OUTPUT
-          elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
+          if [[ "$GITHUB_REF_NAME" == "release" ]]; then
             PRODUCTION='{"env_name": "production", "proxy_job": "neon-proxy", "proxy_config": "production.proxy", "storage_broker_ns": "neon-storage-broker", "storage_broker_config": "production.neon-storage-broker", "kubeconfig_secret": "PRODUCTION_KUBECONFIG_DATA", "console_api_key_secret": "NEON_PRODUCTION_API_KEY"}'
             echo "include=[$PRODUCTION]" >> $GITHUB_OUTPUT
           else
-            echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
+            echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to 'release'"
             exit 1
           fi
 
@@ -756,7 +786,7 @@ jobs:
     # If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly
     needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
     if: |
-      (github.ref_name == 'main' || github.ref_name == 'release') &&
+      github.ref_name == 'release' &&
       github.event_name != 'workflow_dispatch'
     defaults:
       run:
@@ -764,6 +794,8 @@ jobs:
     strategy:
       matrix:
         include: ${{fromJSON(needs.calculate-deploy-targets.outputs.matrix-include)}}
+    environment:
+      name: prod-old
     steps:
       - name: Checkout
         uses: actions/checkout@v3
@@ -800,7 +832,7 @@ jobs:
     container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
     # We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version.
     # If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly
-    needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
+    needs: [ push-docker-hub, tag, regress-tests ]
     if: |
       (github.ref_name == 'main') &&
       github.event_name != 'workflow_dispatch'
@@ -809,7 +841,9 @@ jobs:
         shell: bash
     strategy:
       matrix:
-        target_region: [ us-east-2 ]
+        target_region: [ eu-west-1, us-east-2 ]
+    environment:
+      name: dev-${{ matrix.target_region }}
     steps:
       - name: Checkout
         uses: actions/checkout@v3
@@ -881,6 +915,8 @@ jobs:
     strategy:
       matrix:
         target_region: [ us-east-2, us-west-2, eu-central-1, ap-southeast-1 ]
+    environment:
+      name: prod-${{ matrix.target_region }}
     steps:
       - name: Checkout
         uses: actions/checkout@v3
@@ -912,7 +948,7 @@ jobs:
     # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently.
     needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
     if: |
-      (github.ref_name == 'main' || github.ref_name == 'release') &&
+      github.ref_name == 'release' &&
       github.event_name != 'workflow_dispatch'
     defaults:
       run:
@@ -920,6 +956,8 @@ jobs:
     strategy:
       matrix:
         include: ${{fromJSON(needs.calculate-deploy-targets.outputs.matrix-include)}}
+    environment:
+      name: prod-old
     env:
       KUBECONFIG: .kubeconfig
     steps:
@@ -945,8 +983,8 @@ jobs:
       - name: Re-deploy proxy
         run: |
           DOCKER_TAG=${{needs.tag.outputs.build-tag}}
-          helm upgrade ${{ matrix.proxy_job }}       neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}.yaml       --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
-          helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
+          helm upgrade ${{ matrix.proxy_job }}       neondatabase/neon-proxy --namespace neon-proxy --install --atomic -f .github/helm-values/${{ matrix.proxy_config }}.yaml       --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
+          helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace neon-proxy --install --atomic -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
 
   deploy-storage-broker:
     name: deploy storage broker on old staging and old prod
@@ -955,7 +993,7 @@ jobs:
     # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently.
     needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
     if: |
-      (github.ref_name == 'main' || github.ref_name == 'release') &&
+      github.ref_name == 'release' &&
       github.event_name != 'workflow_dispatch'
     defaults:
       run:
@@ -963,6 +1001,8 @@ jobs:
     strategy:
       matrix:
         include: ${{fromJSON(needs.calculate-deploy-targets.outputs.matrix-include)}}
+    environment:
+      name: prod-old
     env:
       KUBECONFIG: .kubeconfig
     steps:
@@ -1011,6 +1051,8 @@ jobs:
             target_cluster: dev-eu-west-1-zeta
             deploy_link_proxy: false
             deploy_legacy_scram_proxy: false
+    environment:
+      name: dev-${{ matrix.target_region }}
     steps:
       - name: Checkout
         uses: actions/checkout@v3
@@ -1026,19 +1068,19 @@ jobs:
       - name: Re-deploy scram proxy
         run: |
           DOCKER_TAG=${{needs.tag.outputs.build-tag}}
-          helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
+          helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
 
       - name: Re-deploy link proxy
         if: matrix.deploy_link_proxy
         run: |
           DOCKER_TAG=${{needs.tag.outputs.build-tag}}
-          helm upgrade neon-proxy-link neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-link.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
+          helm upgrade neon-proxy-link neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-link.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
 
       - name: Re-deploy legacy scram proxy
         if: matrix.deploy_legacy_scram_proxy
         run: |
           DOCKER_TAG=${{needs.tag.outputs.build-tag}}
-          helm upgrade neon-proxy-scram-legacy neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram-legacy.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
+          helm upgrade neon-proxy-scram-legacy neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram-legacy.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
 
   deploy-storage-broker-dev-new:
     runs-on: [ self-hosted, dev, x64 ]
@@ -1058,6 +1100,8 @@ jobs:
             target_cluster: dev-us-east-2-beta
           - target_region:  eu-west-1
             target_cluster: dev-eu-west-1-zeta
+    environment:
+      name: dev-${{ matrix.target_region }}
     steps:
       - name: Checkout
         uses: actions/checkout@v3
@@ -1096,6 +1140,8 @@ jobs:
             target_cluster: prod-eu-central-1-gamma
           - target_region: ap-southeast-1
             target_cluster: prod-ap-southeast-1-epsilon
+    environment:
+      name: prod-${{ matrix.target_region }}
     steps:
       - name: Checkout
         uses: actions/checkout@v3
@@ -1111,7 +1157,7 @@ jobs:
       - name: Re-deploy proxy
         run: |
           DOCKER_TAG=${{needs.tag.outputs.build-tag}}
-          helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
+          helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
 
   deploy-storage-broker-prod-new:
     runs-on: prod
@@ -1135,6 +1181,8 @@ jobs:
             target_cluster: prod-eu-central-1-gamma
           - target_region: ap-southeast-1
             target_cluster: prod-ap-southeast-1-epsilon
+    environment:
+      name: prod-${{ matrix.target_region }}
     steps:
       - name: Checkout
         uses: actions/checkout@v3
diff --git a/Cargo.lock b/Cargo.lock
index 913b39da0f..1649e28faa 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -66,12 +66,6 @@ dependencies = [
  "backtrace",
 ]
 
-[[package]]
-name = "arrayvec"
-version = "0.7.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8da52d66c7071e2e3fa2a1e5c6d088fec47b593032b254f5e980de8ea54454d6"
-
 [[package]]
 name = "asn1-rs"
 version = "0.5.1"
@@ -563,6 +557,12 @@ version = "0.13.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8"
 
+[[package]]
+name = "base64"
+version = "0.20.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0ea22880d78093b0cbe17c89f64a7d457941e65759157ec6cb31a31d652b05e5"
+
 [[package]]
 name = "bincode"
 version = "1.3.3"
@@ -627,12 +627,6 @@ version = "3.11.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "572f695136211188308f16ad2ca5c851a712c464060ae6974944458eb83880ba"
 
-[[package]]
-name = "bytemuck"
-version = "1.12.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "aaa3a8d9a1ca92e282c96a32d6511b695d7d994d1d102ba85d279f9b2756947f"
-
 [[package]]
 name = "byteorder"
 version = "1.4.3"
@@ -893,7 +887,7 @@ dependencies = [
  "clap 4.0.29",
  "comfy-table",
  "git-version",
- "nix 0.25.1",
+ "nix",
  "once_cell",
  "pageserver_api",
  "postgres",
@@ -928,15 +922,6 @@ version = "0.8.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "5827cebf4670468b8772dd191856768aedcb1b0278a04f989f7766351917b9dc"
 
-[[package]]
-name = "cpp_demangle"
-version = "0.3.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "eeaa953eaad386a53111e47172c2fedba671e5684c8dd601a5f474f4f118710f"
-dependencies = [
- "cfg-if",
-]
-
 [[package]]
 name = "cpufeatures"
 version = "0.2.5"
@@ -1060,7 +1045,7 @@ dependencies = [
  "crossterm_winapi",
  "libc",
  "mio",
- "parking_lot 0.12.1",
+ "parking_lot",
  "signal-hook",
  "signal-hook-mio",
  "winapi",
@@ -1170,15 +1155,6 @@ version = "2.3.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "23d8666cb01533c39dde32bcbab8e227b4ed6679b2c925eba05feabea39508fb"
 
-[[package]]
-name = "debugid"
-version = "0.7.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d6ee87af31d84ef885378aebca32be3d682b0e0dc119d5b4860a2c5bb5046730"
-dependencies = [
- "uuid 0.8.2",
-]
-
 [[package]]
 name = "debugid"
 version = "0.8.0"
@@ -1186,7 +1162,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "bef552e6f588e446098f6ba40d89ac146c8c7b64aade83c051ee00bb5d2bc18d"
 dependencies = [
  "serde",
- "uuid 1.2.2",
+ "uuid",
 ]
 
 [[package]]
@@ -1312,18 +1288,6 @@ dependencies = [
  "windows-sys 0.42.0",
 ]
 
-[[package]]
-name = "findshlibs"
-version = "0.10.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "40b9e59cd0f7e0806cca4be089683ecb6434e602038df21fe6bf6711b2f07f64"
-dependencies = [
- "cc",
- "lazy_static",
- "libc",
- "winapi",
-]
-
 [[package]]
 name = "fixedbitset"
 version = "0.4.2"
@@ -1336,21 +1300,6 @@ version = "1.0.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
 
-[[package]]
-name = "foreign-types"
-version = "0.3.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1"
-dependencies = [
- "foreign-types-shared",
-]
-
-[[package]]
-name = "foreign-types-shared"
-version = "0.1.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b"
-
 [[package]]
 name = "form_urlencoded"
 version = "1.1.0"
@@ -1752,16 +1701,16 @@ dependencies = [
 ]
 
 [[package]]
-name = "hyper-tls"
-version = "0.5.0"
+name = "hyper-tungstenite"
+version = "0.8.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d6183ddfa99b85da61a140bea0efc93fdf56ceaa041b37d553518030827f9905"
+checksum = "d62004bcd4f6f85d9e2aa4206f1466ee67031f5ededcb6c6e62d48f9306ad879"
 dependencies = [
- "bytes",
  "hyper",
- "native-tls",
+ "pin-project",
  "tokio",
- "tokio-native-tls",
+ "tokio-tungstenite",
+ "tungstenite",
 ]
 
 [[package]]
@@ -1815,24 +1764,6 @@ dependencies = [
  "serde",
 ]
 
-[[package]]
-name = "inferno"
-version = "0.10.12"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "de3886428c6400486522cf44b8626e7b94ad794c14390290f2a274dcf728a58f"
-dependencies = [
- "ahash",
- "atty",
- "indexmap",
- "itoa",
- "lazy_static",
- "log",
- "num-format",
- "quick-xml",
- "rgb",
- "str_stack",
-]
-
 [[package]]
 name = "inotify"
 version = "0.9.6"
@@ -1920,7 +1851,7 @@ version = "8.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "09f4f04699947111ec1733e71778d763555737579e44b85844cae8e1940a1828"
 dependencies = [
- "base64",
+ "base64 0.13.1",
  "pem",
  "ring",
  "serde",
@@ -2059,15 +1990,6 @@ version = "2.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d"
 
-[[package]]
-name = "memmap2"
-version = "0.5.8"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4b182332558b18d807c4ce1ca8ca983b34c3ee32765e47b3f0f69b90355cc1dc"
-dependencies = [
- "libc",
-]
-
 [[package]]
 name = "memoffset"
 version = "0.6.5"
@@ -2135,37 +2057,6 @@ version = "0.8.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a"
 
-[[package]]
-name = "native-tls"
-version = "0.2.11"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "07226173c32f2926027b63cce4bcd8076c3552846cbe7925f3aaffeac0a3b92e"
-dependencies = [
- "lazy_static",
- "libc",
- "log",
- "openssl",
- "openssl-probe",
- "openssl-sys",
- "schannel",
- "security-framework",
- "security-framework-sys",
- "tempfile",
-]
-
-[[package]]
-name = "nix"
-version = "0.23.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8f3790c00a0150112de0f4cd161e3d7fc4b2d8a5542ffc35f099a2562aecb35c"
-dependencies = [
- "bitflags",
- "cc",
- "cfg-if",
- "libc",
- "memoffset 0.6.5",
-]
-
 [[package]]
 name = "nix"
 version = "0.25.1"
@@ -2229,16 +2120,6 @@ dependencies = [
  "num-traits",
 ]
 
-[[package]]
-name = "num-format"
-version = "0.4.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a652d9771a63711fd3c3deb670acfbe5c30a4072e664d7a3bf5a9e1056ac72c3"
-dependencies = [
- "arrayvec",
- "itoa",
-]
-
 [[package]]
 name = "num-integer"
 version = "0.1.45"
@@ -2299,51 +2180,12 @@ version = "11.1.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575"
 
-[[package]]
-name = "openssl"
-version = "0.10.44"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "29d971fd5722fec23977260f6e81aa67d2f22cadbdc2aa049f1022d9a3be1566"
-dependencies = [
- "bitflags",
- "cfg-if",
- "foreign-types",
- "libc",
- "once_cell",
- "openssl-macros",
- "openssl-sys",
-]
-
-[[package]]
-name = "openssl-macros"
-version = "0.1.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b501e44f11665960c7e7fcf062c7d96a14ade4aa98116c004b2e37b5be7d736c"
-dependencies = [
- "proc-macro2",
- "quote",
- "syn",
-]
-
 [[package]]
 name = "openssl-probe"
 version = "0.1.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf"
 
-[[package]]
-name = "openssl-sys"
-version = "0.9.79"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5454462c0eced1e97f2ec09036abc8da362e66802f66fd20f86854d9d8cbcbc4"
-dependencies = [
- "autocfg",
- "cc",
- "libc",
- "pkg-config",
- "vcpkg",
-]
-
 [[package]]
 name = "os_info"
 version = "3.5.1"
@@ -2394,7 +2236,7 @@ dependencies = [
  "hyper",
  "itertools",
  "metrics",
- "nix 0.25.1",
+ "nix",
  "num-traits",
  "once_cell",
  "pageserver_api",
@@ -2404,11 +2246,11 @@ dependencies = [
  "postgres-types",
  "postgres_connection",
  "postgres_ffi",
- "pprof",
  "pq_proto",
  "rand",
  "regex",
  "remote_storage",
+ "reqwest",
  "rstar",
  "scopeguard",
  "serde",
@@ -2417,12 +2259,12 @@ dependencies = [
  "signal-hook",
  "storage_broker",
  "svg_fmt",
- "tar",
  "tempfile",
  "tenant_size_model",
  "thiserror",
  "tokio",
  "tokio-postgres",
+ "tokio-tar",
  "tokio-util",
  "toml_edit",
  "tracing",
@@ -2447,17 +2289,6 @@ dependencies = [
  "workspace_hack",
 ]
 
-[[package]]
-name = "parking_lot"
-version = "0.11.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7d17b78036a60663b797adeaee46f5c9dfebb86948d1255007a1d6be0271ff99"
-dependencies = [
- "instant",
- "lock_api",
- "parking_lot_core 0.8.5",
-]
-
 [[package]]
 name = "parking_lot"
 version = "0.12.1"
@@ -2465,21 +2296,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f"
 dependencies = [
  "lock_api",
- "parking_lot_core 0.9.5",
-]
-
-[[package]]
-name = "parking_lot_core"
-version = "0.8.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d76e8e1493bcac0d2766c42737f34458f1c8c50c0d23bcb24ea953affb273216"
-dependencies = [
- "cfg-if",
- "instant",
- "libc",
- "redox_syscall",
- "smallvec",
- "winapi",
+ "parking_lot_core",
 ]
 
 [[package]]
@@ -2507,7 +2324,7 @@ version = "1.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "03c64931a1a212348ec4f3b4362585eca7159d0d09cbdf4a7f74f02173596fd4"
 dependencies = [
- "base64",
+ "base64 0.13.1",
 ]
 
 [[package]]
@@ -2528,18 +2345,18 @@ dependencies = [
 
 [[package]]
 name = "phf"
-version = "0.10.1"
+version = "0.11.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fabbf1ead8a5bcbc20f5f8b939ee3f5b0f6f281b6ad3468b84656b658b455259"
+checksum = "928c6535de93548188ef63bb7c4036bd415cd8f36ad25af44b9789b2ee72a48c"
 dependencies = [
  "phf_shared",
 ]
 
 [[package]]
 name = "phf_shared"
-version = "0.10.0"
+version = "0.11.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b6796ad771acdc0123d2a88dc428b5e38ef24456743ddb1744ed628f9815c096"
+checksum = "e1fb5f6f826b772a8d4c0394209441e7d37cbbb967ae9c7e0e8134365c9ee676"
 dependencies = [
  "siphasher",
 ]
@@ -2576,12 +2393,6 @@ version = "0.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184"
 
-[[package]]
-name = "pkg-config"
-version = "0.3.26"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6ac9a59f73473f1b8d852421e59e64809f025994837ef743615c6d0c5b305160"
-
 [[package]]
 name = "plotters"
 version = "0.3.4"
@@ -2612,12 +2423,12 @@ dependencies = [
 
 [[package]]
 name = "postgres"
-version = "0.19.2"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=d052ee8b86fff9897c77b0fe89ea9daba0e1fa38#d052ee8b86fff9897c77b0fe89ea9daba0e1fa38"
+version = "0.19.4"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=43e6db254a97fdecbce33d8bc0890accfd74495e#43e6db254a97fdecbce33d8bc0890accfd74495e"
 dependencies = [
  "bytes",
  "fallible-iterator",
- "futures",
+ "futures-util",
  "log",
  "tokio",
  "tokio-postgres",
@@ -2626,9 +2437,9 @@ dependencies = [
 [[package]]
 name = "postgres-protocol"
 version = "0.6.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=d052ee8b86fff9897c77b0fe89ea9daba0e1fa38#d052ee8b86fff9897c77b0fe89ea9daba0e1fa38"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=43e6db254a97fdecbce33d8bc0890accfd74495e#43e6db254a97fdecbce33d8bc0890accfd74495e"
 dependencies = [
- "base64",
+ "base64 0.20.0",
  "byteorder",
  "bytes",
  "fallible-iterator",
@@ -2643,8 +2454,8 @@ dependencies = [
 
 [[package]]
 name = "postgres-types"
-version = "0.2.3"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=d052ee8b86fff9897c77b0fe89ea9daba0e1fa38#d052ee8b86fff9897c77b0fe89ea9daba0e1fa38"
+version = "0.2.4"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=43e6db254a97fdecbce33d8bc0890accfd74495e#43e6db254a97fdecbce33d8bc0890accfd74495e"
 dependencies = [
  "bytes",
  "fallible-iterator",
@@ -2688,25 +2499,6 @@ dependencies = [
  "workspace_hack",
 ]
 
-[[package]]
-name = "pprof"
-version = "0.6.1"
-source = "git+https://github.com/neondatabase/pprof-rs.git?branch=wallclock-profiling#4e011a87d22fb4d21d15cc38bce81ff1c75e4bc9"
-dependencies = [
- "backtrace",
- "cfg-if",
- "findshlibs",
- "inferno",
- "lazy_static",
- "libc",
- "log",
- "nix 0.23.2",
- "parking_lot 0.11.2",
- "symbolic-demangle",
- "tempfile",
- "thiserror",
-]
-
 [[package]]
 name = "ppv-lite86"
 version = "0.2.17"
@@ -2723,6 +2515,7 @@ dependencies = [
  "postgres-protocol",
  "rand",
  "serde",
+ "thiserror",
  "tokio",
  "tracing",
  "workspace_hack",
@@ -2801,7 +2594,7 @@ dependencies = [
  "lazy_static",
  "libc",
  "memchr",
- "parking_lot 0.12.1",
+ "parking_lot",
  "procfs",
  "thiserror",
 ]
@@ -2868,7 +2661,7 @@ dependencies = [
  "anyhow",
  "async-trait",
  "atty",
- "base64",
+ "base64 0.13.1",
  "bstr",
  "bytes",
  "clap 4.0.29",
@@ -2878,15 +2671,17 @@ dependencies = [
  "hex",
  "hmac",
  "hyper",
+ "hyper-tungstenite",
  "itertools",
  "md5",
  "metrics",
  "once_cell",
- "parking_lot 0.12.1",
+ "parking_lot",
  "pin-project-lite",
  "pq_proto",
  "rand",
  "rcgen",
+ "regex",
  "reqwest",
  "routerify",
  "rstest",
@@ -2898,6 +2693,7 @@ dependencies = [
  "sha2",
  "socket2",
  "thiserror",
+ "tls-listener",
  "tokio",
  "tokio-postgres",
  "tokio-postgres-rustls",
@@ -2906,20 +2702,12 @@ dependencies = [
  "tracing-subscriber",
  "url",
  "utils",
- "uuid 1.2.2",
+ "uuid",
+ "webpki-roots",
  "workspace_hack",
  "x509-parser",
 ]
 
-[[package]]
-name = "quick-xml"
-version = "0.22.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8533f14c8382aaad0d592c812ac3b826162128b65662331e1127b45c3d18536b"
-dependencies = [
- "memchr",
-]
-
 [[package]]
 name = "quote"
 version = "1.0.21"
@@ -3078,7 +2866,7 @@ version = "0.11.13"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "68cc60575865c7831548863cc02356512e3f1dc2f3f82cb837d7fc4cc8f3c97c"
 dependencies = [
- "base64",
+ "base64 0.13.1",
  "bytes",
  "encoding_rs",
  "futures-core",
@@ -3088,12 +2876,10 @@ dependencies = [
  "http-body",
  "hyper",
  "hyper-rustls",
- "hyper-tls",
  "ipnet",
  "js-sys",
  "log",
  "mime",
- "native-tls",
  "once_cell",
  "percent-encoding",
  "pin-project-lite",
@@ -3103,7 +2889,6 @@ dependencies = [
  "serde_json",
  "serde_urlencoded",
  "tokio",
- "tokio-native-tls",
  "tokio-rustls",
  "tower-service",
  "url",
@@ -3114,15 +2899,6 @@ dependencies = [
  "winreg",
 ]
 
-[[package]]
-name = "rgb"
-version = "0.8.34"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3603b7d71ca82644f79b5a06d1220e9a58ede60bd32255f698cb1af8838b8db3"
-dependencies = [
- "bytemuck",
-]
-
 [[package]]
 name = "ring"
 version = "0.16.20"
@@ -3261,7 +3037,7 @@ version = "1.0.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0864aeff53f8c05aa08d86e5ef839d3dfcf07aeba2db32f12db0ef716e87bd55"
 dependencies = [
- "base64",
+ "base64 0.13.1",
 ]
 
 [[package]]
@@ -3303,9 +3079,9 @@ dependencies = [
  "humantime",
  "hyper",
  "metrics",
- "nix 0.25.1",
+ "nix",
  "once_cell",
- "parking_lot 0.12.1",
+ "parking_lot",
  "postgres",
  "postgres-protocol",
  "postgres_ffi",
@@ -3417,14 +3193,15 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "17ad137b9df78294b98cab1a650bef237cc6c950e82e5ce164655e674d07c5cc"
 dependencies = [
  "httpdate",
- "native-tls",
  "reqwest",
+ "rustls",
  "sentry-backtrace",
  "sentry-contexts",
  "sentry-core",
  "sentry-panic",
  "tokio",
  "ureq",
+ "webpki-roots",
 ]
 
 [[package]]
@@ -3482,7 +3259,7 @@ version = "0.29.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ccc95faa4078768a6bf8df45e2b894bbf372b3dbbfb364e9429c1c58ab7545c6"
 dependencies = [
- "debugid 0.8.0",
+ "debugid",
  "getrandom",
  "hex",
  "serde",
@@ -3490,7 +3267,7 @@ dependencies = [
  "thiserror",
  "time",
  "url",
- "uuid 1.2.2",
+ "uuid",
 ]
 
 [[package]]
@@ -3542,7 +3319,7 @@ version = "2.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "25bf4a5a814902cd1014dbccfa4d4560fb8432c779471e96e035602519f82eef"
 dependencies = [
- "base64",
+ "base64 0.13.1",
  "chrono",
  "hex",
  "indexmap",
@@ -3564,6 +3341,17 @@ dependencies = [
  "syn",
 ]
 
+[[package]]
+name = "sha-1"
+version = "0.10.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f5058ada175748e33390e40e872bd0fe59a19f265d0158daa551c5a88a76009c"
+dependencies = [
+ "cfg-if",
+ "cpufeatures",
+ "digest",
+]
+
 [[package]]
 name = "sha1"
 version = "0.10.5"
@@ -3712,7 +3500,7 @@ dependencies = [
  "hyper",
  "metrics",
  "once_cell",
- "parking_lot 0.12.1",
+ "parking_lot",
  "prost",
  "tokio",
  "tokio-stream",
@@ -3723,12 +3511,6 @@ dependencies = [
  "workspace_hack",
 ]
 
-[[package]]
-name = "str_stack"
-version = "0.1.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9091b6114800a5f2141aee1d1b9d6ca3592ac062dc5decb3764ec5895a47b4eb"
-
 [[package]]
 name = "stringprep"
 version = "0.1.2"
@@ -3776,29 +3558,6 @@ version = "0.4.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "8fb1df15f412ee2e9dfc1c504260fa695c1c3f10fe9f4a6ee2d2184d7d6450e2"
 
-[[package]]
-name = "symbolic-common"
-version = "8.8.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f551f902d5642e58039aee6a9021a61037926af96e071816361644983966f540"
-dependencies = [
- "debugid 0.7.3",
- "memmap2",
- "stable_deref_trait",
- "uuid 0.8.2",
-]
-
-[[package]]
-name = "symbolic-demangle"
-version = "8.8.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4564ca7b4e6eb14105aa8bbbce26e080f6b5d9c4373e67167ab31f7b86443750"
-dependencies = [
- "cpp_demangle",
- "rustc-demangle",
- "symbolic-common",
-]
-
 [[package]]
 name = "syn"
 version = "1.0.105"
@@ -3957,10 +3716,24 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c"
 
 [[package]]
-name = "tokio"
-version = "1.21.1"
+name = "tls-listener"
+version = "0.5.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0020c875007ad96677dcc890298f4b942882c5d4eb7cc8f439fc3bf813dc9c95"
+checksum = "c9d4ff21187d434ac7709bfc7441ca88f63681247e5ad99f0f08c8c91ddc103d"
+dependencies = [
+ "futures-util",
+ "hyper",
+ "pin-project-lite",
+ "thiserror",
+ "tokio",
+ "tokio-rustls",
+]
+
+[[package]]
+name = "tokio"
+version = "1.24.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1d9f76183f91ecfb55e1d7d5602bd1d979e38a3a522fe900241cf195624d67ae"
 dependencies = [
  "autocfg",
  "bytes",
@@ -3968,12 +3741,11 @@ dependencies = [
  "memchr",
  "mio",
  "num_cpus",
- "once_cell",
  "pin-project-lite",
  "signal-hook-registry",
  "socket2",
  "tokio-macros",
- "winapi",
+ "windows-sys 0.42.0",
 ]
 
 [[package]]
@@ -3997,28 +3769,19 @@ dependencies = [
  "syn",
 ]
 
-[[package]]
-name = "tokio-native-tls"
-version = "0.3.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f7d995660bd2b7f8c1568414c1126076c13fbb725c40112dc0120b78eb9b717b"
-dependencies = [
- "native-tls",
- "tokio",
-]
-
 [[package]]
 name = "tokio-postgres"
-version = "0.7.6"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=d052ee8b86fff9897c77b0fe89ea9daba0e1fa38#d052ee8b86fff9897c77b0fe89ea9daba0e1fa38"
+version = "0.7.7"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=43e6db254a97fdecbce33d8bc0890accfd74495e#43e6db254a97fdecbce33d8bc0890accfd74495e"
 dependencies = [
  "async-trait",
  "byteorder",
  "bytes",
  "fallible-iterator",
- "futures",
+ "futures-channel",
+ "futures-util",
  "log",
- "parking_lot 0.12.1",
+ "parking_lot",
  "percent-encoding",
  "phf",
  "pin-project-lite",
@@ -4065,6 +3828,32 @@ dependencies = [
  "tokio",
 ]
 
+[[package]]
+name = "tokio-tar"
+version = "0.3.0"
+source = "git+https://github.com/neondatabase/tokio-tar.git?rev=404df61437de0feef49ba2ccdbdd94eb8ad6e142#404df61437de0feef49ba2ccdbdd94eb8ad6e142"
+dependencies = [
+ "filetime",
+ "futures-core",
+ "libc",
+ "redox_syscall",
+ "tokio",
+ "tokio-stream",
+ "xattr",
+]
+
+[[package]]
+name = "tokio-tungstenite"
+version = "0.17.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f714dd15bead90401d77e04243611caec13726c2408afd5b31901dfcdcb3b181"
+dependencies = [
+ "futures-util",
+ "log",
+ "tokio",
+ "tungstenite",
+]
+
 [[package]]
 name = "tokio-util"
 version = "0.7.4"
@@ -4109,7 +3898,7 @@ dependencies = [
  "async-stream",
  "async-trait",
  "axum",
- "base64",
+ "base64 0.13.1",
  "bytes",
  "futures-core",
  "futures-util",
@@ -4291,6 +4080,25 @@ version = "0.2.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "59547bce71d9c38b83d9c0e92b6066c4253371f15005def0c30d9657f50c7642"
 
+[[package]]
+name = "tungstenite"
+version = "0.17.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e27992fd6a8c29ee7eef28fc78349aa244134e10ad447ce3b9f0ac0ed0fa4ce0"
+dependencies = [
+ "base64 0.13.1",
+ "byteorder",
+ "bytes",
+ "http",
+ "httparse",
+ "log",
+ "rand",
+ "sha-1",
+ "thiserror",
+ "url",
+ "utf-8",
+]
+
 [[package]]
 name = "typenum"
 version = "1.16.0"
@@ -4351,12 +4159,14 @@ version = "2.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b97acb4c28a254fd7a4aeec976c46a7fa404eac4d7c134b30c75144846d7cb8f"
 dependencies = [
- "base64",
+ "base64 0.13.1",
  "chunked_transfer",
  "log",
- "native-tls",
  "once_cell",
+ "rustls",
  "url",
+ "webpki",
+ "webpki-roots",
 ]
 
 [[package]]
@@ -4377,6 +4187,12 @@ version = "2.1.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e8db7427f936968176eaa7cdf81b7f98b980b18495ec28f1b5791ac3bfe3eea9"
 
+[[package]]
+name = "utf-8"
+version = "0.7.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9"
+
 [[package]]
 name = "utils"
 version = "0.1.0"
@@ -4393,7 +4209,7 @@ dependencies = [
  "hyper",
  "jsonwebtoken",
  "metrics",
- "nix 0.25.1",
+ "nix",
  "once_cell",
  "pq_proto",
  "rand",
@@ -4417,12 +4233,6 @@ dependencies = [
  "workspace_hack",
 ]
 
-[[package]]
-name = "uuid"
-version = "0.8.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bc5cf98d8186244414c848017f0e2676b3fcb46807f6668a97dfe67359a3c4b7"
-
 [[package]]
 name = "uuid"
 version = "1.2.2"
@@ -4439,12 +4249,6 @@ version = "0.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d"
 
-[[package]]
-name = "vcpkg"
-version = "0.2.15"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426"
-
 [[package]]
 name = "version_check"
 version = "0.9.4"
@@ -4743,9 +4547,9 @@ dependencies = [
 name = "workspace_hack"
 version = "0.1.0"
 dependencies = [
- "ahash",
  "anyhow",
  "bytes",
+ "chrono",
  "clap 4.0.29",
  "crossbeam-utils",
  "either",
@@ -4766,11 +4570,10 @@ dependencies = [
  "rand",
  "regex",
  "regex-syntax",
- "reqwest",
  "scopeguard",
  "serde",
+ "serde_json",
  "socket2",
- "stable_deref_trait",
  "syn",
  "tokio",
  "tokio-util",
@@ -4787,7 +4590,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e0ecbeb7b67ce215e40e3cc7f2ff902f94a223acf44995934763467e7b1febc8"
 dependencies = [
  "asn1-rs",
- "base64",
+ "base64 0.13.1",
  "data-encoding",
  "der-parser",
  "lazy_static",
diff --git a/Cargo.toml b/Cargo.toml
index 2f73215d3f..927900d5c8 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -86,4 +86,4 @@ lto = true
 # This is only needed for proxy's tests.
 # TODO: we should probably fork `tokio-postgres-rustls` instead.
 [patch.crates-io]
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
diff --git a/Dockerfile.compute-node-v14 b/Dockerfile.compute-node-v14
index ad036338a0..e7fba49bb1 100644
--- a/Dockerfile.compute-node-v14
+++ b/Dockerfile.compute-node-v14
@@ -170,9 +170,6 @@ RUN cd /usr/local/pgsql/bin && rm ecpg raster2pgsql shp2pgsql pgtopo_export pgto
 # Remove headers that we won't need anymore - we've completed installation of all extensions
 RUN rm -r /usr/local/pgsql/include
 
-# Remove now-useless PGXS src infrastructure
-RUN rm -r /usr/local/pgsql/lib/pgxs/src
-
 # Remove static postgresql libraries - all compilation is finished, so we
 # can now remove these files - they must be included in other binaries by now
 # if they were to be used by other libraries.
@@ -207,7 +204,8 @@ RUN apt update &&  \
         libgeos-c1v5 \
         libgdal28 \
         libproj19 \
-        libprotobuf-c1 && \
+        libprotobuf-c1 \
+        gdb && \
     rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
 
 USER postgres
diff --git a/Dockerfile.compute-node-v15 b/Dockerfile.compute-node-v15
index 4526644421..cd03525b97 100644
--- a/Dockerfile.compute-node-v15
+++ b/Dockerfile.compute-node-v15
@@ -170,9 +170,6 @@ RUN cd /usr/local/pgsql/bin && rm ecpg raster2pgsql shp2pgsql pgtopo_export pgto
 # Remove headers that we won't need anymore - we've completed installation of all extensions
 RUN rm -r /usr/local/pgsql/include
 
-# Remove now-useless PGXS src infrastructure
-RUN rm -r /usr/local/pgsql/lib/pgxs/src
-
 # Remove static postgresql libraries - all compilation is finished, so we
 # can now remove these files - they must be included in other binaries by now
 # if they were to be used by other libraries.
@@ -207,7 +204,8 @@ RUN apt update &&  \
         libgeos-c1v5 \
         libgdal28 \
         libproj19 \
-        libprotobuf-c1 && \
+        libprotobuf-c1 \
+        gdb && \
     rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
 
 USER postgres
diff --git a/Makefile b/Makefile
index 4711dc1c7d..92a4532684 100644
--- a/Makefile
+++ b/Makefile
@@ -61,146 +61,115 @@ all: neon postgres neon-pg-ext
 #
 # The 'postgres_ffi' depends on the Postgres headers.
 .PHONY: neon
-neon: postgres-v14-headers postgres-v15-headers
+neon: postgres-headers
 	+@echo "Compiling Neon"
 	$(CARGO_CMD_PREFIX) cargo build $(CARGO_BUILD_FLAGS)
 
 ### PostgreSQL parts
-# The rules are duplicated for Postgres v14 and 15. We may want to refactor
+# Some rules are duplicated for Postgres v14 and 15. We may want to refactor
 # to avoid the duplication in the future, but it's tolerable for now.
 #
-$(POSTGRES_INSTALL_DIR)/build/v14/config.status:
-	+@echo "Configuring Postgres v14 build"
-	mkdir -p $(POSTGRES_INSTALL_DIR)/build/v14
-	(cd $(POSTGRES_INSTALL_DIR)/build/v14 && \
-	env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-v14/configure \
+$(POSTGRES_INSTALL_DIR)/build/%/config.status:
+	+@echo "Configuring Postgres $* build"
+	mkdir -p $(POSTGRES_INSTALL_DIR)/build/$*
+	(cd $(POSTGRES_INSTALL_DIR)/build/$* && \
+	env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-$*/configure \
 		CFLAGS='$(PG_CFLAGS)' \
 		$(PG_CONFIGURE_OPTS) \
-		--prefix=$(abspath $(POSTGRES_INSTALL_DIR))/v14 > configure.log)
-
-$(POSTGRES_INSTALL_DIR)/build/v15/config.status:
-	+@echo "Configuring Postgres v15 build"
-	mkdir -p $(POSTGRES_INSTALL_DIR)/build/v15
-	(cd $(POSTGRES_INSTALL_DIR)/build/v15 && \
-	env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-v15/configure \
-		CFLAGS='$(PG_CFLAGS)' \
-		$(PG_CONFIGURE_OPTS) \
-		--prefix=$(abspath $(POSTGRES_INSTALL_DIR))/v15 > configure.log)
+		--prefix=$(abspath $(POSTGRES_INSTALL_DIR))/$* > configure.log)
 
 # nicer alias to run 'configure'
-.PHONY: postgres-v14-configure
-postgres-v14-configure: $(POSTGRES_INSTALL_DIR)/build/v14/config.status
-
-.PHONY: postgres-v15-configure
-postgres-v15-configure: $(POSTGRES_INSTALL_DIR)/build/v15/config.status
+# Note: I've been unable to use templates for this part of our configuration.
+# I'm not sure why it wouldn't work, but this is the only place (apart from
+# the "build-all-versions" entry points) where direct mention of PostgreSQL
+# versions is used.
+.PHONY: postgres-configure-v15
+postgres-configure-v15: $(POSTGRES_INSTALL_DIR)/build/v15/config.status
+.PHONY: postgres-configure-v14
+postgres-configure-v14: $(POSTGRES_INSTALL_DIR)/build/v14/config.status
 
 # Install the PostgreSQL header files into $(POSTGRES_INSTALL_DIR)/<version>/include
-.PHONY: postgres-v14-headers
-postgres-v14-headers: postgres-v14-configure
-	+@echo "Installing PostgreSQL v14 headers"
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/src/include MAKELEVEL=0 install
-
-.PHONY: postgres-v15-headers
-postgres-v15-headers: postgres-v15-configure
-	+@echo "Installing PostgreSQL v15 headers"
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/src/include MAKELEVEL=0 install
+.PHONY: postgres-headers-%
+postgres-headers-%: postgres-configure-%
+	+@echo "Installing PostgreSQL $* headers"
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/src/include MAKELEVEL=0 install
 
 # Compile and install PostgreSQL
-.PHONY: postgres-v14
-postgres-v14: postgres-v14-configure \
-		  postgres-v14-headers # to prevent `make install` conflicts with neon's `postgres-headers`
-	+@echo "Compiling PostgreSQL v14"
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14 MAKELEVEL=0 install
-	+@echo "Compiling libpq v14"
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/src/interfaces/libpq install
-	+@echo "Compiling pg_prewarm v14"
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/contrib/pg_prewarm install
-	+@echo "Compiling pg_buffercache v14"
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/contrib/pg_buffercache install
-	+@echo "Compiling pageinspect v14"
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/contrib/pageinspect install
+.PHONY: postgres-%
+postgres-%: postgres-configure-% \
+		  postgres-headers-% # to prevent `make install` conflicts with neon's `postgres-headers`
+	+@echo "Compiling PostgreSQL $*"
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$* MAKELEVEL=0 install
+	+@echo "Compiling libpq $*"
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/src/interfaces/libpq install
+	+@echo "Compiling pg_prewarm $*"
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_prewarm install
+	+@echo "Compiling pg_buffercache $*"
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_buffercache install
+	+@echo "Compiling pageinspect $*"
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pageinspect install
 
-.PHONY: postgres-v15
-postgres-v15: postgres-v15-configure \
-		  postgres-v15-headers # to prevent `make install` conflicts with neon's `postgres-headers`
-	+@echo "Compiling PostgreSQL v15"
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15 MAKELEVEL=0 install
-	+@echo "Compiling libpq v15"
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/src/interfaces/libpq install
-	+@echo "Compiling pg_prewarm v15"
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/contrib/pg_prewarm install
-	+@echo "Compiling pg_buffercache v15"
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/contrib/pg_buffercache install
-	+@echo "Compiling pageinspect v15"
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/contrib/pageinspect install
+.PHONY: postgres-clean-%
+postgres-clean-%:
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$* MAKELEVEL=0 clean
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_buffercache clean
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pageinspect clean
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/src/interfaces/libpq clean
 
-# shorthand to build all Postgres versions
-postgres: postgres-v14 postgres-v15
+.PHONY: neon-pg-ext-%
+neon-pg-ext-%: postgres-%
+	+@echo "Compiling neon $*"
+	mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-$*
+	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
+		-C $(POSTGRES_INSTALL_DIR)/build/neon-$* \
+		-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile install
+	+@echo "Compiling neon_walredo $*"
+	mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-walredo-$*
+	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
+		-C $(POSTGRES_INSTALL_DIR)/build/neon-walredo-$* \
+		-f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile install
+	+@echo "Compiling neon_test_utils $*"
+	mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-$*
+	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
+		-C $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-$* \
+		-f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile install
 
-.PHONY: postgres-v14-clean
-postgres-v14-clean:
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14 MAKELEVEL=0 clean
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/contrib/pg_buffercache clean
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/contrib/pageinspect clean
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/src/interfaces/libpq clean
+.PHONY: neon-pg-ext-clean-%
+neon-pg-ext-clean-%:
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/pgxn/neon-$* -f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile clean
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/pgxn/neon_walredo-$* -f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile clean
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/pgxn/neon_test_utils-$* -f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile clean
 
-.PHONY: postgres-v15-clean
-postgres-v15-clean:
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15 MAKELEVEL=0 clean
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/contrib/pg_buffercache clean
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/contrib/pageinspect clean
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/src/interfaces/libpq clean
-
-neon-pg-ext-v14: postgres-v14
-	+@echo "Compiling neon v14"
-	mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-v14
-	(cd $(POSTGRES_INSTALL_DIR)/build/neon-v14 && \
-	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v14/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
-		-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile install)
-	+@echo "Compiling neon_walredo v14"
-	mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-walredo-v14
-	(cd $(POSTGRES_INSTALL_DIR)/build/neon-walredo-v14 && \
-	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v14/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
-		-f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile install)
-	+@echo "Compiling neon_test_utils" v14
-	mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v14
-	(cd $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v14 && \
-	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v14/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
-		-f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile install)
-
-neon-pg-ext-v15: postgres-v15
-	+@echo "Compiling neon v15"
-	mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-v15
-	(cd $(POSTGRES_INSTALL_DIR)/build/neon-v15 && \
-	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v15/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
-		-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile install)
-	+@echo "Compiling neon_walredo v15"
-	mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-walredo-v15
-	(cd $(POSTGRES_INSTALL_DIR)/build/neon-walredo-v15 && \
-	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v15/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
-		-f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile install)
-	+@echo "Compiling neon_test_utils" v15
-	mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v15
-	(cd $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v15 && \
-	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v15/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
-		-f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile install)
+.PHONY: neon-pg-ext
+neon-pg-ext: \
+	neon-pg-ext-v14 \
+	neon-pg-ext-v15
 
 .PHONY: neon-pg-ext-clean
-	$(MAKE) -C $(ROOT_PROJECT_DIR)/pgxn/neon clean
-	$(MAKE) -C $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils clean
+neon-pg-ext-clean: \
+	neon-pg-ext-clean-v14 \
+	neon-pg-ext-clean-v15
 
-neon-pg-ext: neon-pg-ext-v14 neon-pg-ext-v15
-postgres-headers: postgres-v14-headers postgres-v15-headers
-postgres-clean: postgres-v14-clean postgres-v15-clean
+# shorthand to build all Postgres versions
+.PHONY: postgres
+postgres: \
+	postgres-v14 \
+	postgres-v15
+
+.PHONY: postgres-headers
+postgres-headers: \
+	postgres-headers-v14 \
+	postgres-headers-v15
+
+.PHONY: postgres-clean
+postgres-clean: \
+	postgres-clean-v14 \
+	postgres-clean-v15
 
 # This doesn't remove the effects of 'configure'.
 .PHONY: clean
-clean:
-	cd $(POSTGRES_INSTALL_DIR)/build/v14 && $(MAKE) clean
-	cd $(POSTGRES_INSTALL_DIR)/build/v15 && $(MAKE) clean
+clean: postgres-clean neon-pg-ext-clean
 	$(CARGO_CMD_PREFIX) cargo clean
-	cd pgxn/neon && $(MAKE) clean
-	cd pgxn/neon_test_utils && $(MAKE) clean
 
 # This removes everything
 .PHONY: distclean
diff --git a/README.md b/README.md
index 30bde949a9..7b629e71a5 100644
--- a/README.md
+++ b/README.md
@@ -31,7 +31,8 @@ libssl-dev clang pkg-config libpq-dev cmake postgresql-client protobuf-compiler
 * On Fedora, these packages are needed:
 ```bash
 dnf install flex bison readline-devel zlib-devel openssl-devel \
-  libseccomp-devel perl clang cmake postgresql postgresql-contrib protobuf-compiler
+  libseccomp-devel perl clang cmake postgresql postgresql-contrib protobuf-compiler \
+  protobuf-devel
 ```
 
 2. [Install Rust](https://www.rust-lang.org/tools/install)
@@ -117,11 +118,8 @@ Python (3.9 or higher), and install python3 packages using `./scripts/pysync` (r
 # Later that would be responsibility of a package install script
 > ./target/debug/neon_local init
 Starting pageserver at '127.0.0.1:64000' in '.neon'.
-pageserver started, pid: 2545906
-Successfully initialized timeline de200bd42b49cc1814412c7e592dd6e9
-Stopped pageserver 1 process with pid 2545906
 
-# start pageserver and safekeeper
+# start pageserver, safekeeper, and broker for their intercommunication
 > ./target/debug/neon_local start
 Starting neon broker at 127.0.0.1:50051
 storage_broker started, pid: 2918372
@@ -130,6 +128,12 @@ pageserver started, pid: 2918386
 Starting safekeeper at '127.0.0.1:5454' in '.neon/safekeepers/sk1'.
 safekeeper 1 started, pid: 2918437
 
+# create initial tenant and use it as a default for every future neon_local invocation
+> ./target/debug/neon_local tenant create --set-default
+tenant 9ef87a5bf0d92544f6fafeeb3239695c successfully created on the pageserver
+Created an initial timeline 'de200bd42b49cc1814412c7e592dd6e9' at Lsn 0/16B5A50 for tenant: 9ef87a5bf0d92544f6fafeeb3239695c
+Setting tenant 9ef87a5bf0d92544f6fafeeb3239695c as a default one
+
 # start postgres compute node
 > ./target/debug/neon_local pg start main
 Starting new postgres (v14) main on timeline de200bd42b49cc1814412c7e592dd6e9 ...
diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml
index a35cef197d..4c65649610 100644
--- a/compute_tools/Cargo.toml
+++ b/compute_tools/Cargo.toml
@@ -2,6 +2,7 @@
 name = "compute_tools"
 version = "0.1.0"
 edition = "2021"
+license = "Apache-2.0"
 
 [dependencies]
 anyhow = "1.0"
@@ -12,12 +13,12 @@ futures = "0.3.13"
 hyper = { version = "0.14", features = ["full"] }
 log = { version = "0.4", features = ["std", "serde"] }
 notify = "5.0.0"
-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
+postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
 regex = "1"
 serde = { version = "1.0", features = ["derive"] }
 serde_json = "1"
 tar = "0.4"
 tokio = { version = "1.17", features = ["macros", "rt", "rt-multi-thread"] }
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
 url = "2.2.2"
 workspace_hack = { version = "0.1", path = "../workspace_hack" }
diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs
index 7786d7af9c..f3b787209d 100644
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -105,7 +105,7 @@ fn main() -> Result<()> {
         tenant,
         timeline,
         pageserver_connstr,
-        metrics: ComputeMetrics::new(),
+        metrics: ComputeMetrics::default(),
         state: RwLock::new(ComputeState::new()),
     };
     let compute = Arc::new(compute_state);
diff --git a/compute_tools/src/checker.rs b/compute_tools/src/checker.rs
index b6ba1692f9..ee1605c814 100644
--- a/compute_tools/src/checker.rs
+++ b/compute_tools/src/checker.rs
@@ -5,7 +5,7 @@ use tokio_postgres::NoTls;
 
 use crate::compute::ComputeNode;
 
-pub fn create_writablity_check_data(client: &mut Client) -> Result<()> {
+pub fn create_writability_check_data(client: &mut Client) -> Result<()> {
     let query = "
     CREATE TABLE IF NOT EXISTS health_check (
         id serial primary key,
diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs
index bfdd2340ec..c2c9ab2230 100644
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -23,11 +23,11 @@ use std::sync::RwLock;
 
 use anyhow::{Context, Result};
 use chrono::{DateTime, Utc};
-use log::info;
+use log::{info, warn};
 use postgres::{Client, NoTls};
 use serde::{Serialize, Serializer};
 
-use crate::checker::create_writablity_check_data;
+use crate::checker::create_writability_check_data;
 use crate::config;
 use crate::pg_helpers::*;
 use crate::spec::*;
@@ -91,7 +91,7 @@ pub enum ComputeStatus {
     Failed,
 }
 
-#[derive(Serialize)]
+#[derive(Default, Serialize)]
 pub struct ComputeMetrics {
     pub sync_safekeepers_ms: AtomicU64,
     pub basebackup_ms: AtomicU64,
@@ -99,23 +99,6 @@ pub struct ComputeMetrics {
     pub total_startup_ms: AtomicU64,
 }
 
-impl ComputeMetrics {
-    pub fn new() -> Self {
-        Self {
-            sync_safekeepers_ms: AtomicU64::new(0),
-            basebackup_ms: AtomicU64::new(0),
-            config_ms: AtomicU64::new(0),
-            total_startup_ms: AtomicU64::new(0),
-        }
-    }
-}
-
-impl Default for ComputeMetrics {
-    fn default() -> Self {
-        Self::new()
-    }
-}
-
 impl ComputeNode {
     pub fn set_status(&self, status: ComputeStatus) {
         self.state.write().unwrap().status = status;
@@ -175,7 +158,7 @@ impl ComputeNode {
         let start_time = Utc::now();
 
         let sync_handle = Command::new(&self.pgbin)
-            .args(&["--sync-safekeepers"])
+            .args(["--sync-safekeepers"])
             .env("PGDATA", &self.pgdata) // we cannot use -D in this mode
             .stdout(Stdio::piped())
             .spawn()
@@ -253,7 +236,7 @@ impl ComputeNode {
 
         // Run postgres as a child process.
         let mut pg = Command::new(&self.pgbin)
-            .args(&["-D", &self.pgdata])
+            .args(["-D", &self.pgdata])
             .spawn()
             .expect("cannot start postgres process");
 
@@ -292,7 +275,7 @@ impl ComputeNode {
         handle_databases(&self.spec, &mut client)?;
         handle_role_deletions(self, &mut client)?;
         handle_grants(self, &mut client)?;
-        create_writablity_check_data(&mut client)?;
+        create_writability_check_data(&mut client)?;
 
         // 'Close' connection
         drop(client);
@@ -328,6 +311,9 @@ impl ComputeNode {
             .wait()
             .expect("failed to start waiting on Postgres process");
 
+        self.check_for_core_dumps()
+            .expect("failed to check for core dumps");
+
         Ok(ecode)
     }
 
@@ -343,4 +329,68 @@ impl ComputeNode {
         self.prepare_pgdata()?;
         self.run()
     }
+
+    // Look for core dumps and collect backtraces.
+    //
+    // EKS worker nodes have following core dump settings:
+    //   /proc/sys/kernel/core_pattern -> core
+    //   /proc/sys/kernel/core_uses_pid -> 1
+    //   ulimint -c -> unlimited
+    // which results in core dumps being written to postgres data directory as core.<pid>.
+    //
+    // Use that as a default location and pattern, except macos where core dumps are written
+    // to /cores/ directory by default.
+    fn check_for_core_dumps(&self) -> Result<()> {
+        let core_dump_dir = match std::env::consts::OS {
+            "macos" => Path::new("/cores/"),
+            _ => Path::new(&self.pgdata),
+        };
+
+        // Collect core dump paths if any
+        info!("checking for core dumps in {}", core_dump_dir.display());
+        let files = fs::read_dir(core_dump_dir)?;
+        let cores = files.filter_map(|entry| {
+            let entry = entry.ok()?;
+            let _ = entry.file_name().to_str()?.strip_prefix("core.")?;
+            Some(entry.path())
+        });
+
+        // Print backtrace for each core dump
+        for core_path in cores {
+            warn!(
+                "core dump found: {}, collecting backtrace",
+                core_path.display()
+            );
+
+            // Try first with gdb
+            let backtrace = Command::new("gdb")
+                .args(["--batch", "-q", "-ex", "bt", &self.pgbin])
+                .arg(&core_path)
+                .output();
+
+            // Try lldb if no gdb is found -- that is handy for local testing on macOS
+            let backtrace = match backtrace {
+                Err(ref e) if e.kind() == std::io::ErrorKind::NotFound => {
+                    warn!("cannot find gdb, trying lldb");
+                    Command::new("lldb")
+                        .arg("-c")
+                        .arg(&core_path)
+                        .args(["--batch", "-o", "bt all", "-o", "quit"])
+                        .output()
+                }
+                _ => backtrace,
+            }?;
+
+            warn!(
+                "core dump backtrace: {}",
+                String::from_utf8_lossy(&backtrace.stdout)
+            );
+            warn!(
+                "debugger stderr: {}",
+                String::from_utf8_lossy(&backtrace.stderr)
+            );
+        }
+
+        Ok(())
+    }
 }
diff --git a/compute_tools/src/http/api.rs b/compute_tools/src/http/api.rs
index 4c8bbc608b..44f83e5003 100644
--- a/compute_tools/src/http/api.rs
+++ b/compute_tools/src/http/api.rs
@@ -9,29 +9,11 @@ use hyper::{Body, Method, Request, Response, Server, StatusCode};
 use log::{error, info};
 use serde_json;
 
-use crate::compute::{ComputeNode, ComputeStatus};
+use crate::compute::ComputeNode;
 
 // Service function to handle all available routes.
 async fn routes(req: Request<Body>, compute: Arc<ComputeNode>) -> Response<Body> {
     match (req.method(), req.uri().path()) {
-        // Timestamp of the last Postgres activity in the plain text.
-        // DEPRECATED in favour of /status
-        (&Method::GET, "/last_activity") => {
-            info!("serving /last_active GET request");
-            let state = compute.state.read().unwrap();
-
-            // Use RFC3339 format for consistency.
-            Response::new(Body::from(state.last_active.to_rfc3339()))
-        }
-
-        // Has compute setup process finished? -> true/false.
-        // DEPRECATED in favour of /status
-        (&Method::GET, "/ready") => {
-            info!("serving /ready GET request");
-            let status = compute.get_status();
-            Response::new(Body::from(format!("{}", status == ComputeStatus::Running)))
-        }
-
         // Serialized compute state.
         (&Method::GET, "/status") => {
             info!("serving /status GET request");
@@ -46,16 +28,6 @@ async fn routes(req: Request<Body>, compute: Arc<ComputeNode>) -> Response<Body>
             Response::new(Body::from(serde_json::to_string(&compute.metrics).unwrap()))
         }
 
-        // DEPRECATED, use POST instead
-        (&Method::GET, "/check_writability") => {
-            info!("serving /check_writability GET request");
-            let res = crate::checker::check_writability(&compute).await;
-            match res {
-                Ok(_) => Response::new(Body::from("true")),
-                Err(e) => Response::new(Body::from(e.to_string())),
-            }
-        }
-
         (&Method::POST, "/check_writability") => {
             info!("serving /check_writability POST request");
             let res = crate::checker::check_writability(&compute).await;
diff --git a/compute_tools/src/http/openapi_spec.yaml b/compute_tools/src/http/openapi_spec.yaml
index 9c0f8e3ccd..a857531d26 100644
--- a/compute_tools/src/http/openapi_spec.yaml
+++ b/compute_tools/src/http/openapi_spec.yaml
@@ -37,58 +37,7 @@ paths:
               schema:
                 $ref: "#/components/schemas/ComputeMetrics"
 
-  /ready:
-    get:
-      deprecated: true
-      tags:
-      - "info"
-      summary: Check whether compute startup process finished successfully
-      description: ""
-      operationId: computeIsReady
-      responses:
-        "200":
-          description: Compute is ready ('true') or not ('false')
-          content:
-            text/plain:
-              schema:
-                type: string
-                example: "true"
-
-  /last_activity:
-    get:
-      deprecated: true
-      tags:
-      - "info"
-      summary: Get timestamp of the last compute activity
-      description: ""
-      operationId: getLastComputeActivityTS
-      responses:
-        "200":
-          description: Timestamp of the last compute activity
-          content:
-            text/plain:
-              schema:
-                type: string
-                example: "2022-10-12T07:20:50.52Z"
-
   /check_writability:
-    get:
-      deprecated: true
-      tags:
-      - "check"
-      summary: Check that we can write new data on this compute
-      description: ""
-      operationId: checkComputeWritabilityDeprecated
-      responses:
-        "200":
-          description: Check result
-          content:
-            text/plain:
-              schema:
-                type: string
-                description: Error text or 'true' if check passed
-                example: "true"
-
     post:
       tags:
       - "check"
diff --git a/compute_tools/src/monitor.rs b/compute_tools/src/monitor.rs
index 58cdf796bc..c871422e78 100644
--- a/compute_tools/src/monitor.rs
+++ b/compute_tools/src/monitor.rs
@@ -52,10 +52,16 @@ fn watch_compute_activity(compute: &ComputeNode) {
                     let mut idle_backs: Vec<DateTime<Utc>> = vec![];
 
                     for b in backs.into_iter() {
-                        let state: String = b.get("state");
-                        let change: String = b.get("state_change");
+                        let state: String = match b.try_get("state") {
+                            Ok(state) => state,
+                            Err(_) => continue,
+                        };
 
                         if state == "idle" {
+                            let change: String = match b.try_get("state_change") {
+                                Ok(state_change) => state_change,
+                                Err(_) => continue,
+                            };
                             let change = DateTime::parse_from_rfc3339(&change);
                             match change {
                                 Ok(t) => idle_backs.push(t.with_timezone(&Utc)),
@@ -74,10 +80,8 @@ fn watch_compute_activity(compute: &ComputeNode) {
                         }
                     }
 
-                    // Sort idle backend `state_change` timestamps. The last one corresponds
-                    // to the last activity.
-                    idle_backs.sort();
-                    if let Some(last) = idle_backs.last() {
+                    // Get idle backend `state_change` with the max timestamp.
+                    if let Some(last) = idle_backs.iter().max() {
                         last_active = *last;
                     }
                 }
diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs
index 289f223bda..ff422f1cf5 100644
--- a/compute_tools/src/pg_helpers.rs
+++ b/compute_tools/src/pg_helpers.rs
@@ -119,16 +119,9 @@ pub trait GenericOptionsSearch {
 impl GenericOptionsSearch for GenericOptions {
     /// Lookup option by name
     fn find(&self, name: &str) -> Option<String> {
-        match &self {
-            Some(ops) => {
-                let op = ops.iter().find(|s| s.name == name);
-                match op {
-                    Some(op) => op.value.clone(),
-                    None => None,
-                }
-            }
-            None => None,
-        }
+        let ops = self.as_ref()?;
+        let op = ops.iter().find(|s| s.name == name)?;
+        op.value.clone()
     }
 }
 
@@ -161,6 +154,14 @@ impl Role {
 }
 
 impl Database {
+    pub fn new(name: PgIdent, owner: PgIdent) -> Self {
+        Self {
+            name,
+            owner,
+            options: None,
+        }
+    }
+
     /// Serialize a list of database parameters into a Postgres-acceptable
     /// string of arguments.
     /// NB: `TEMPLATE` is actually also an identifier, but so far we only need
@@ -219,11 +220,7 @@ pub fn get_existing_dbs(client: &mut Client) -> Result<Vec<Database>> {
             &[],
         )?
         .iter()
-        .map(|row| Database {
-            name: row.get("datname"),
-            owner: row.get("owner"),
-            options: None,
-        })
+        .map(|row| Database::new(row.get("datname"), row.get("owner")))
         .collect();
 
     Ok(postgres_dbs)
diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs
index 58c94d74ae..81e01fe555 100644
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -1,5 +1,6 @@
 use std::path::Path;
 use std::str::FromStr;
+use std::time::Instant;
 
 use anyhow::Result;
 use log::{info, log_enabled, warn, Level};
@@ -197,22 +198,18 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
 
 /// Reassign all dependent objects and delete requested roles.
 pub fn handle_role_deletions(node: &ComputeNode, client: &mut Client) -> Result<()> {
-    let spec = &node.spec;
-
-    // First, reassign all dependent objects to db owners.
-    if let Some(ops) = &spec.delta_operations {
+    if let Some(ops) = &node.spec.delta_operations {
+        // First, reassign all dependent objects to db owners.
         info!("reassigning dependent objects of to-be-deleted roles");
         for op in ops {
             if op.action == "delete_role" {
                 reassign_owned_objects(node, &op.name)?;
             }
         }
-    }
 
-    // Second, proceed with role deletions.
-    let mut xact = client.transaction()?;
-    if let Some(ops) = &spec.delta_operations {
+        // Second, proceed with role deletions.
         info!("processing role deletions");
+        let mut xact = client.transaction()?;
         for op in ops {
             // We do not check either role exists or not,
             // Postgres will take care of it for us
@@ -223,6 +220,7 @@ pub fn handle_role_deletions(node: &ComputeNode, client: &mut Client) -> Result<
                 xact.execute(query.as_str(), &[])?;
             }
         }
+        xact.commit()?;
     }
 
     Ok(())
@@ -317,6 +315,7 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
         // XXX: with a limited number of databases it is fine, but consider making it a HashMap
         let pg_db = existing_dbs.iter().find(|r| r.name == *name);
 
+        let start_time = Instant::now();
         if let Some(r) = pg_db {
             // XXX: db owner name is returned as quoted string from Postgres,
             // when quoting is needed.
@@ -335,6 +334,8 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
                 info_print!(" -> update");
 
                 client.execute(query.as_str(), &[])?;
+                let elapsed = start_time.elapsed().as_millis();
+                info_print!(" ({} ms)", elapsed);
             }
         } else {
             let mut query: String = format!("CREATE DATABASE {} ", name.pg_quote());
@@ -342,6 +343,9 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
 
             query.push_str(&db.to_pg_options());
             client.execute(query.as_str(), &[])?;
+
+            let elapsed = start_time.elapsed().as_millis();
+            info_print!(" ({} ms)", elapsed);
         }
 
         info_print!("\n");
diff --git a/compute_tools/tests/pg_helpers_tests.rs b/compute_tools/tests/pg_helpers_tests.rs
index 24cad4663a..431d9794bc 100644
--- a/compute_tools/tests/pg_helpers_tests.rs
+++ b/compute_tools/tests/pg_helpers_tests.rs
@@ -38,4 +38,33 @@ mod pg_helpers_tests {
 
         assert_eq!(ident.pg_quote(), "\"\"\"name\"\";\\n select 1;\"");
     }
+
+    #[test]
+    fn generic_options_search() {
+        let generic_options: GenericOptions = Some(vec![
+            GenericOption {
+                name: "present_value".into(),
+                value: Some("value".into()),
+                vartype: "string".into(),
+            },
+            GenericOption {
+                name: "missed_value".into(),
+                value: None,
+                vartype: "int".into(),
+            },
+        ]);
+        assert_eq!(generic_options.find("present_value"), Some("value".into()));
+        assert_eq!(generic_options.find("missed_value"), None);
+        assert_eq!(generic_options.find("invalid_value"), None);
+
+        let empty_generic_options: GenericOptions = Some(vec![]);
+        assert_eq!(empty_generic_options.find("present_value"), None);
+        assert_eq!(empty_generic_options.find("missed_value"), None);
+        assert_eq!(empty_generic_options.find("invalid_value"), None);
+
+        let none_generic_options: GenericOptions = None;
+        assert_eq!(none_generic_options.find("present_value"), None);
+        assert_eq!(none_generic_options.find("missed_value"), None);
+        assert_eq!(none_generic_options.find("invalid_value"), None);
+    }
 }
diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml
index 00b34aafb1..1c6cd6d882 100644
--- a/control_plane/Cargo.toml
+++ b/control_plane/Cargo.toml
@@ -2,6 +2,7 @@
 name = "control_plane"
 version = "0.1.0"
 edition = "2021"
+license = "Apache-2.0"
 
 [dependencies]
 anyhow = "1.0"
@@ -10,7 +11,7 @@ comfy-table = "6.1"
 git-version = "0.3.5"
 nix = "0.25"
 once_cell = "1.13.0"
-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
+postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "43e6db254a97fdecbce33d8bc0890accfd74495e" }
 regex = "1"
 reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] }
 serde = { version = "1.0", features = ["derive"] }
diff --git a/control_plane/src/background_process.rs b/control_plane/src/background_process.rs
index 8909e27c94..1f3f8f45ea 100644
--- a/control_plane/src/background_process.rs
+++ b/control_plane/src/background_process.rs
@@ -136,22 +136,6 @@ where
     anyhow::bail!("{process_name} did not start in {RETRY_UNTIL_SECS} seconds");
 }
 
-/// Send SIGTERM to child process
-pub fn send_stop_child_process(child: &std::process::Child) -> anyhow::Result<()> {
-    let pid = child.id();
-    match kill(
-        nix::unistd::Pid::from_raw(pid.try_into().unwrap()),
-        Signal::SIGTERM,
-    ) {
-        Ok(()) => Ok(()),
-        Err(Errno::ESRCH) => {
-            println!("child process with pid {pid} does not exist");
-            Ok(())
-        }
-        Err(e) => anyhow::bail!("Failed to send signal to child process with pid {pid}: {e}"),
-    }
-}
-
 /// Stops the process, using the pid file given. Returns Ok also if the process is already not running.
 pub fn stop_process(immediate: bool, process_name: &str, pid_file: &Path) -> anyhow::Result<()> {
     let pid = match pid_file::read(pid_file)
diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs
index f0c3b983f0..4b2aa3c957 100644
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -263,7 +263,7 @@ fn get_tenant_id(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::R
     } else if let Some(default_id) = env.default_tenant_id {
         Ok(default_id)
     } else {
-        bail!("No tenant id. Use --tenant-id, or set 'default_tenant_id' in the config file");
+        anyhow::bail!("No tenant id. Use --tenant-id, or set a default tenant");
     }
 }
 
@@ -284,8 +284,6 @@ fn parse_timeline_id(sub_match: &ArgMatches) -> anyhow::Result<Option<TimelineId
 }
 
 fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
-    let initial_timeline_id_arg = parse_timeline_id(init_match)?;
-
     // Create config file
     let toml_file: String = if let Some(config_path) = init_match.get_one::<PathBuf>("config") {
         // load and parse the file
@@ -309,30 +307,16 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
         LocalEnv::parse_config(&toml_file).context("Failed to create neon configuration")?;
     env.init(pg_version)
         .context("Failed to initialize neon repository")?;
-    let initial_tenant_id = env
-        .default_tenant_id
-        .expect("default_tenant_id should be generated by the `env.init()` call above");
 
     // Initialize pageserver, create initial tenant and timeline.
     let pageserver = PageServerNode::from_env(&env);
-    let initial_timeline_id = pageserver
-        .initialize(
-            Some(initial_tenant_id),
-            initial_timeline_id_arg,
-            &pageserver_config_overrides(init_match),
-            pg_version,
-        )
+    pageserver
+        .initialize(&pageserver_config_overrides(init_match))
         .unwrap_or_else(|e| {
             eprintln!("pageserver init failed: {e:?}");
             exit(1);
         });
 
-    env.register_branch_mapping(
-        DEFAULT_BRANCH_NAME.to_owned(),
-        initial_tenant_id,
-        initial_timeline_id,
-    )?;
-
     Ok(env)
 }
 
@@ -388,6 +372,17 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
             println!(
                 "Created an initial timeline '{new_timeline_id}' at Lsn {last_record_lsn} for tenant: {new_tenant_id}",
             );
+
+            if create_match.get_flag("set-default") {
+                println!("Setting tenant {new_tenant_id} as a default one");
+                env.default_tenant_id = Some(new_tenant_id);
+            }
+        }
+        Some(("set-default", set_default_match)) => {
+            let tenant_id =
+                parse_tenant_id(set_default_match)?.context("No tenant id specified")?;
+            println!("Setting tenant {tenant_id} as a default one");
+            env.default_tenant_id = Some(tenant_id);
         }
         Some(("config", create_match)) => {
             let tenant_id = get_tenant_id(create_match, env)?;
@@ -549,7 +544,7 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
 
             table.load_preset(comfy_table::presets::NOTHING);
 
-            table.set_header(&[
+            table.set_header([
                 "NODE",
                 "ADDRESS",
                 "TIMELINE",
@@ -584,7 +579,7 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
                     .map(|name| name.as_str())
                     .unwrap_or("?");
 
-                table.add_row(&[
+                table.add_row([
                     node_name.as_str(),
                     &node.address.to_string(),
                     &node.timeline_id.to_string(),
@@ -747,7 +742,7 @@ fn get_safekeeper(env: &local_env::LocalEnv, id: NodeId) -> Result<SafekeeperNod
     if let Some(node) = env.safekeepers.iter().find(|node| node.id == id) {
         Ok(SafekeeperNode::from_env(env, node))
     } else {
-        bail!("could not find safekeeper '{}'", id)
+        bail!("could not find safekeeper {id}")
     }
 }
 
@@ -806,22 +801,22 @@ fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul
 }
 
 fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::Result<()> {
-    broker::start_broker_process(env)?;
-    let pageserver = PageServerNode::from_env(env);
-
     // Postgres nodes are not started automatically
 
+    broker::start_broker_process(env)?;
+
+    let pageserver = PageServerNode::from_env(env);
     if let Err(e) = pageserver.start(&pageserver_config_overrides(sub_match)) {
-        eprintln!("pageserver start failed: {e}");
-        try_stop_storage_broker_process(env);
+        eprintln!("pageserver {} start failed: {:#}", env.pageserver.id, e);
+        try_stop_all(env, true);
         exit(1);
     }
 
     for node in env.safekeepers.iter() {
         let safekeeper = SafekeeperNode::from_env(env, node);
         if let Err(e) = safekeeper.start() {
-            eprintln!("safekeeper '{}' start failed: {e}", safekeeper.id);
-            try_stop_storage_broker_process(env);
+            eprintln!("safekeeper {} start failed: {:#}", safekeeper.id, e);
+            try_stop_all(env, false);
             exit(1);
         }
     }
@@ -832,35 +827,41 @@ fn handle_stop_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<
     let immediate =
         sub_match.get_one::<String>("stop-mode").map(|s| s.as_str()) == Some("immediate");
 
+    try_stop_all(env, immediate);
+
+    Ok(())
+}
+
+fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
     let pageserver = PageServerNode::from_env(env);
 
     // Stop all compute nodes
-    let cplane = ComputeControlPlane::load(env.clone())?;
-    for (_k, node) in cplane.nodes {
-        if let Err(e) = node.stop(false) {
-            eprintln!("postgres stop failed: {}", e);
+    match ComputeControlPlane::load(env.clone()) {
+        Ok(cplane) => {
+            for (_k, node) in cplane.nodes {
+                if let Err(e) = node.stop(false) {
+                    eprintln!("postgres stop failed: {e:#}");
+                }
+            }
+        }
+        Err(e) => {
+            eprintln!("postgres stop failed, could not restore control plane data from env: {e:#}")
         }
     }
 
     if let Err(e) = pageserver.stop(immediate) {
-        eprintln!("pageserver stop failed: {}", e);
+        eprintln!("pageserver {} stop failed: {:#}", env.pageserver.id, e);
     }
 
     for node in env.safekeepers.iter() {
         let safekeeper = SafekeeperNode::from_env(env, node);
         if let Err(e) = safekeeper.stop(immediate) {
-            eprintln!("safekeeper '{}' stop failed: {}", safekeeper.id, e);
+            eprintln!("safekeeper {} stop failed: {:#}", safekeeper.id, e);
         }
     }
 
-    try_stop_storage_broker_process(env);
-
-    Ok(())
-}
-
-fn try_stop_storage_broker_process(env: &local_env::LocalEnv) {
     if let Err(e) = broker::stop_broker_process(env) {
-        eprintln!("neon broker stop failed: {e}");
+        eprintln!("neon broker stop failed: {e:#}");
     }
 }
 
@@ -900,6 +901,7 @@ fn cli() -> Command {
     let stop_mode_arg = Arg::new("stop-mode")
         .short('m')
         .value_parser(["fast", "immediate"])
+        .default_value("fast")
         .help("If 'immediate', don't flush repository data at shutdown")
         .required(false)
         .value_name("stop-mode");
@@ -921,9 +923,8 @@ fn cli() -> Command {
         .version(GIT_VERSION)
         .subcommand(
             Command::new("init")
-                .about("Initialize a new Neon repository")
+                .about("Initialize a new Neon repository, preparing configs for services to start with")
                 .arg(pageserver_config_args.clone())
-                .arg(timeline_id_arg.clone().help("Use a specific timeline id when creating a tenant and its initial timeline"))
                 .arg(
                     Arg::new("config")
                         .long("config")
@@ -985,11 +986,14 @@ fn cli() -> Command {
                 .arg(timeline_id_arg.clone().help("Use a specific timeline id when creating a tenant and its initial timeline"))
                 .arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false))
                 .arg(pg_version_arg.clone())
+                .arg(Arg::new("set-default").long("set-default").action(ArgAction::SetTrue).required(false)
+                    .help("Use this tenant in future CLI commands where tenant_id is needed, but not specified"))
                 )
+            .subcommand(Command::new("set-default").arg(tenant_id_arg.clone().required(true))
+                .about("Set a particular tenant as default in future CLI commands where tenant_id is needed, but not specified"))
             .subcommand(Command::new("config")
                 .arg(tenant_id_arg.clone())
-                .arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false))
-                )
+                .arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false)))
         )
         .subcommand(
             Command::new("pageserver")
diff --git a/control_plane/src/broker.rs b/control_plane/src/broker.rs
index bd60580012..6c0604a076 100644
--- a/control_plane/src/broker.rs
+++ b/control_plane/src/broker.rs
@@ -17,7 +17,7 @@ pub fn start_broker_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
         "storage_broker",
         &env.base_data_dir,
         &env.storage_broker_bin(),
-        &args,
+        args,
         [],
         background_process::InitialPidFile::Create(&storage_broker_pid_file_path(env)),
         || {
diff --git a/control_plane/src/compute.rs b/control_plane/src/compute.rs
index 0eec25c51e..8731cf2583 100644
--- a/control_plane/src/compute.rs
+++ b/control_plane/src/compute.rs
@@ -44,7 +44,7 @@ impl ComputeControlPlane {
         let mut nodes = BTreeMap::default();
         let pgdatadirspath = &env.pg_data_dirs_path();
 
-        for tenant_dir in fs::read_dir(&pgdatadirspath)
+        for tenant_dir in fs::read_dir(pgdatadirspath)
             .with_context(|| format!("failed to list {}", pgdatadirspath.display()))?
         {
             let tenant_dir = tenant_dir?;
@@ -67,8 +67,8 @@ impl ComputeControlPlane {
     fn get_port(&mut self) -> u16 {
         1 + self
             .nodes
-            .iter()
-            .map(|(_name, node)| node.address.port())
+            .values()
+            .map(|node| node.address.port())
             .max()
             .unwrap_or(self.base_port)
     }
@@ -183,7 +183,7 @@ impl PostgresNode {
 
     fn sync_safekeepers(&self, auth_token: &Option<String>, pg_version: u32) -> Result<Lsn> {
         let pg_path = self.env.pg_bin_dir(pg_version)?.join("postgres");
-        let mut cmd = Command::new(&pg_path);
+        let mut cmd = Command::new(pg_path);
 
         cmd.arg("--sync-safekeepers")
             .env_clear()
@@ -201,7 +201,7 @@ impl PostgresNode {
             .stderr(Stdio::piped());
 
         if let Some(token) = auth_token {
-            cmd.env("ZENITH_AUTH_TOKEN", token);
+            cmd.env("NEON_AUTH_TOKEN", token);
         }
 
         let sync_handle = cmd
@@ -261,7 +261,7 @@ impl PostgresNode {
     }
 
     fn create_pgdata(&self) -> Result<()> {
-        fs::create_dir_all(&self.pgdata()).with_context(|| {
+        fs::create_dir_all(self.pgdata()).with_context(|| {
             format!(
                 "could not create data directory {}",
                 self.pgdata().display()
@@ -304,17 +304,17 @@ impl PostgresNode {
 
             // Set up authentication
             //
-            // $ZENITH_AUTH_TOKEN will be replaced with value from environment
+            // $NEON_AUTH_TOKEN will be replaced with value from environment
             // variable during compute pg startup. It is done this way because
             // otherwise user will be able to retrieve the value using SHOW
             // command or pg_settings
             let password = if let AuthType::NeonJWT = auth_type {
-                "$ZENITH_AUTH_TOKEN"
+                "$NEON_AUTH_TOKEN"
             } else {
                 ""
             };
             // NOTE avoiding spaces in connection string, because it is less error prone if we forward it somewhere.
-            // Also note that not all parameters are supported here. Because in compute we substitute $ZENITH_AUTH_TOKEN
+            // Also note that not all parameters are supported here. Because in compute we substitute $NEON_AUTH_TOKEN
             // We parse this string and build it back with token from env var, and for simplicity rebuild
             // uses only needed variables namely host, port, user, password.
             format!("postgresql://no_user:{password}@{host}:{port}")
@@ -323,7 +323,7 @@ impl PostgresNode {
         conf.append_line("");
         conf.append("neon.pageserver_connstring", &pageserver_connstr);
         if let AuthType::NeonJWT = auth_type {
-            conf.append("neon.safekeeper_token_env", "$ZENITH_AUTH_TOKEN");
+            conf.append("neon.safekeeper_token_env", "$NEON_AUTH_TOKEN");
         }
         conf.append("neon.tenant_id", &self.tenant_id.to_string());
         conf.append("neon.timeline_id", &self.timeline_id.to_string());
@@ -448,7 +448,7 @@ impl PostgresNode {
             self.env.pg_lib_dir(self.pg_version)?.to_str().unwrap(),
         );
         if let Some(token) = auth_token {
-            cmd.env("ZENITH_AUTH_TOKEN", token);
+            cmd.env("NEON_AUTH_TOKEN", token);
         }
 
         let pg_ctl = cmd.output().context("pg_ctl failed")?;
@@ -478,7 +478,7 @@ impl PostgresNode {
                 postgresql_conf_path.to_str().unwrap()
             )
         })?;
-        fs::remove_dir_all(&self.pgdata())?;
+        fs::remove_dir_all(self.pgdata())?;
         self.create_pgdata()?;
 
         // 2. Bring back config files
@@ -514,7 +514,7 @@ impl PostgresNode {
                 "Destroying postgres data directory '{}'",
                 self.pgdata().to_str().unwrap()
             );
-            fs::remove_dir_all(&self.pgdata())?;
+            fs::remove_dir_all(self.pgdata())?;
         } else {
             self.pg_ctl(&["stop"], &None)?;
         }
diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs
index ed9e467eee..003152c578 100644
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -296,11 +296,6 @@ impl LocalEnv {
             env.neon_distrib_dir = env::current_exe()?.parent().unwrap().to_owned();
         }
 
-        // If no initial tenant ID was given, generate it.
-        if env.default_tenant_id.is_none() {
-            env.default_tenant_id = Some(TenantId::generate());
-        }
-
         env.base_data_dir = base_path();
 
         Ok(env)
@@ -404,7 +399,7 @@ impl LocalEnv {
             }
         }
 
-        fs::create_dir(&base_path)?;
+        fs::create_dir(base_path)?;
 
         // generate keys for jwt
         // openssl genrsa -out private_key.pem 2048
@@ -413,7 +408,7 @@ impl LocalEnv {
             private_key_path = base_path.join("auth_private_key.pem");
             let keygen_output = Command::new("openssl")
                 .arg("genrsa")
-                .args(&["-out", private_key_path.to_str().unwrap()])
+                .args(["-out", private_key_path.to_str().unwrap()])
                 .arg("2048")
                 .stdout(Stdio::null())
                 .output()
@@ -430,10 +425,10 @@ impl LocalEnv {
             // openssl rsa -in private_key.pem -pubout -outform PEM -out public_key.pem
             let keygen_output = Command::new("openssl")
                 .arg("rsa")
-                .args(&["-in", private_key_path.to_str().unwrap()])
+                .args(["-in", private_key_path.to_str().unwrap()])
                 .arg("-pubout")
-                .args(&["-outform", "PEM"])
-                .args(&["-out", public_key_path.to_str().unwrap()])
+                .args(["-outform", "PEM"])
+                .args(["-out", public_key_path.to_str().unwrap()])
                 .stdout(Stdio::null())
                 .output()
                 .context("failed to generate auth private key")?;
diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs
index 3575e75db9..9cebe028e4 100644
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -7,7 +7,7 @@ use std::path::PathBuf;
 use std::process::{Child, Command};
 use std::{io, result};
 
-use anyhow::{bail, ensure, Context};
+use anyhow::{bail, Context};
 use pageserver_api::models::{
     TenantConfigRequest, TenantCreateRequest, TenantInfo, TimelineCreateRequest, TimelineInfo,
 };
@@ -130,83 +130,15 @@ impl PageServerNode {
         overrides
     }
 
-    /// Initializes a pageserver node by creating its config with the overrides provided,
-    /// and creating an initial tenant and timeline afterwards.
-    pub fn initialize(
-        &self,
-        create_tenant: Option<TenantId>,
-        initial_timeline_id: Option<TimelineId>,
-        config_overrides: &[&str],
-        pg_version: u32,
-    ) -> anyhow::Result<TimelineId> {
+    /// Initializes a pageserver node by creating its config with the overrides provided.
+    pub fn initialize(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
         // First, run `pageserver --init` and wait for it to write a config into FS and exit.
         self.pageserver_init(config_overrides).with_context(|| {
             format!(
                 "Failed to run init for pageserver node {}",
                 self.env.pageserver.id,
             )
-        })?;
-
-        // Then, briefly start it fully to run HTTP commands on it,
-        // to create initial tenant and timeline.
-        // We disable the remote storage, since we stop pageserver right after the timeline creation,
-        // hence most of the uploads will either aborted or not started: no point to start them at all.
-        let disabled_remote_storage_override = "remote_storage={}";
-        let mut pageserver_process = self
-            .start_node(
-                &[disabled_remote_storage_override],
-                // Previous overrides will be taken from the config created before, don't overwrite them.
-                false,
-            )
-            .with_context(|| {
-                format!(
-                    "Failed to start a process for pageserver node {}",
-                    self.env.pageserver.id,
-                )
-            })?;
-
-        let init_result = self
-            .try_init_timeline(create_tenant, initial_timeline_id, pg_version)
-            .context("Failed to create initial tenant and timeline for pageserver");
-        match &init_result {
-            Ok(initial_timeline_id) => {
-                println!("Successfully initialized timeline {initial_timeline_id}")
-            }
-            Err(e) => eprintln!("{e:#}"),
-        }
-        background_process::send_stop_child_process(&pageserver_process)?;
-
-        let exit_code = pageserver_process.wait()?;
-        ensure!(
-            exit_code.success(),
-            format!(
-                "pageserver init failed with exit code {:?}",
-                exit_code.code()
-            )
-        );
-        println!(
-            "Stopped pageserver {} process with pid {}",
-            self.env.pageserver.id,
-            pageserver_process.id(),
-        );
-        init_result
-    }
-
-    fn try_init_timeline(
-        &self,
-        new_tenant_id: Option<TenantId>,
-        new_timeline_id: Option<TimelineId>,
-        pg_version: u32,
-    ) -> anyhow::Result<TimelineId> {
-        let initial_tenant_id = self.tenant_create(new_tenant_id, HashMap::new())?;
-        let initial_timeline_info = self.timeline_create(
-            initial_tenant_id,
-            new_timeline_id,
-            None,
-            None,
-            Some(pg_version),
-        )?;
-        Ok(initial_timeline_info.timeline_id)
+        })
     }
 
     pub fn repo_path(&self) -> PathBuf {
@@ -241,7 +173,7 @@ impl PageServerNode {
         let mut args = self.pageserver_basic_args(config_overrides, datadir_path_str);
         args.push(Cow::Borrowed("--init"));
 
-        let init_output = Command::new(&self.env.pageserver_bin())
+        let init_output = Command::new(self.env.pageserver_bin())
             .args(args.iter().map(Cow::as_ref))
             .envs(self.pageserver_env_variables()?)
             .output()
@@ -320,7 +252,7 @@ impl PageServerNode {
             let token = self
                 .env
                 .generate_auth_token(&Claims::new(None, Scope::SafekeeperData))?;
-            vec![("ZENITH_AUTH_TOKEN".to_owned(), token)]
+            vec![("NEON_AUTH_TOKEN".to_owned(), token)]
         } else {
             Vec::new()
         })
diff --git a/deny.toml b/deny.toml
new file mode 100644
index 0000000000..3a0fe36f87
--- /dev/null
+++ b/deny.toml
@@ -0,0 +1,90 @@
+# This file was auto-generated using `cargo deny init`.
+# cargo-deny is a cargo plugin that lets you lint your project's
+# dependency graph to ensure all your dependencies conform
+# to your expectations and requirements.
+
+# Root options
+targets = []
+all-features = false
+no-default-features = false
+feature-depth = 1
+
+# This section is considered when running `cargo deny check advisories`
+# More documentation for the advisories section can be found here:
+# https://embarkstudios.github.io/cargo-deny/checks/advisories/cfg.html
+[advisories]
+db-urls = ["https://github.com/rustsec/advisory-db"]
+vulnerability = "deny"
+unmaintained = "warn"
+yanked = "warn"
+notice = "warn"
+ignore = []
+
+# This section is considered when running `cargo deny check licenses`
+# More documentation for the licenses section can be found here:
+# https://embarkstudios.github.io/cargo-deny/checks/licenses/cfg.html
+[licenses]
+unlicensed = "deny"
+allow = [
+    "Apache-2.0",
+    "Artistic-2.0",
+    "BSD-2-Clause",
+    "BSD-3-Clause",
+    "ISC",
+    "MIT",
+    "MPL-2.0",
+    "OpenSSL",
+    "Unicode-DFS-2016",
+]
+deny = []
+copyleft = "warn"
+allow-osi-fsf-free = "neither"
+default = "deny"
+confidence-threshold = 0.8
+exceptions = [
+    # Zlib license has some restrictions if we decide to change sth
+    { allow = ["Zlib"], name = "const_format_proc_macros", version = "*" },
+    { allow = ["Zlib"], name = "const_format", version = "*" },
+]
+
+[[licenses.clarify]]
+name = "ring"
+version = "*"
+expression = "MIT AND ISC AND OpenSSL"
+license-files = [
+    { path = "LICENSE", hash = 0xbd0eed23 },
+]
+
+[licenses.private]
+ignore = true
+registries = []
+
+# This section is considered when running `cargo deny check bans`.
+# More documentation about the 'bans' section can be found here:
+# https://embarkstudios.github.io/cargo-deny/checks/bans/cfg.html
+[bans]
+multiple-versions = "warn"
+wildcards = "allow"
+highlight = "all"
+workspace-default-features = "allow"
+external-default-features = "allow"
+allow = []
+deny = []
+skip = []
+skip-tree = []
+
+# This section is considered when running `cargo deny check sources`.
+# More documentation about the 'sources' section can be found here:
+# https://embarkstudios.github.io/cargo-deny/checks/sources/cfg.html
+[sources]
+unknown-registry = "warn"
+unknown-git = "warn"
+allow-registry = ["https://github.com/rust-lang/crates.io-index"]
+allow-git = []
+
+[sources.allow-org]
+github = [
+    "neondatabase",
+]
+gitlab = []
+bitbucket = []
diff --git a/docs/authentication.md b/docs/authentication.md
index 0752fae19f..e22d7b700f 100644
--- a/docs/authentication.md
+++ b/docs/authentication.md
@@ -65,7 +65,7 @@ There is no administrative API except those provided by PostgreSQL.
 
 #### Outgoing connections
 Compute connects to Pageserver for getting pages.
-The connection string is configured by the `neon.pageserver_connstring` PostgreSQL GUC, e.g. `postgresql://no_user:$ZENITH_AUTH_TOKEN@localhost:15028`.
+The connection string is configured by the `neon.pageserver_connstring` PostgreSQL GUC, e.g. `postgresql://no_user:$NEON_AUTH_TOKEN@localhost:15028`.
 The environment variable inside the connection string is substituted with
 the JWT token.
 
@@ -77,7 +77,7 @@ If the GUC is unset, no token is passed.
 
 Note that both tokens can be (and typically are) the same;
 the scope is the tenant and the token is usually passed through the
-`$ZENITH_AUTH_TOKEN` environment variable.
+`$NEON_AUTH_TOKEN` environment variable.
 
 ### Pageserver
 #### Overview
@@ -114,7 +114,7 @@ either of three values:
 Pageserver makes a connection to a Safekeeper for each active timeline.
 As Pageserver may want to access any timeline it has on the disk,
 it is given a blanket JWT token to access any data on any Safekeeper.
-This token is passed through an environment variable called `ZENITH_AUTH_TOKEN`
+This token is passed through an environment variable called `NEON_AUTH_TOKEN`
 (non-configurable as of writing this text).
 
 A better way _may be_ to store JWT token for each timeline next to it,
diff --git a/libs/metrics/Cargo.toml b/libs/metrics/Cargo.toml
index d0cd46d2a9..d155f1e07d 100644
--- a/libs/metrics/Cargo.toml
+++ b/libs/metrics/Cargo.toml
@@ -2,6 +2,7 @@
 name = "metrics"
 version = "0.1.0"
 edition = "2021"
+license = "Apache-2.0"
 
 [dependencies]
 prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
diff --git a/libs/pageserver_api/Cargo.toml b/libs/pageserver_api/Cargo.toml
index 2102ae5373..68d4c609f0 100644
--- a/libs/pageserver_api/Cargo.toml
+++ b/libs/pageserver_api/Cargo.toml
@@ -2,6 +2,7 @@
 name = "pageserver_api"
 version = "0.1.0"
 edition = "2021"
+license = "Apache-2.0"
 
 [dependencies]
 serde = { version = "1.0", features = ["derive"] }
diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index e49b7051d2..d954e5d21f 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -163,6 +163,8 @@ pub struct TenantInfo {
     #[serde_as(as = "DisplayFromStr")]
     pub id: TenantId,
     pub state: TenantState,
+    /// Sum of the size of all layer files.
+    /// If a layer is present in both local FS and S3, it counts only once.
     pub current_physical_size: Option<u64>, // physical size is only included in `tenant_status` endpoint
     pub has_in_progress_downloads: Option<bool>,
 }
@@ -191,9 +193,12 @@ pub struct TimelineInfo {
     #[serde_as(as = "DisplayFromStr")]
     pub remote_consistent_lsn: Lsn,
     pub current_logical_size: Option<u64>, // is None when timeline is Unloaded
+    /// Sum of the size of all layer files.
+    /// If a layer is present in both local FS and S3, it counts only once.
     pub current_physical_size: Option<u64>, // is None when timeline is Unloaded
     pub current_logical_size_non_incremental: Option<u64>,
-    pub current_physical_size_non_incremental: Option<u64>,
+
+    pub timeline_dir_layer_file_size_sum: Option<u64>,
 
     pub wal_source_connstr: Option<String>,
     #[serde_as(as = "Option<DisplayFromStr>")]
@@ -203,29 +208,22 @@ pub struct TimelineInfo {
     pub pg_version: u32,
 
     pub state: TimelineState,
-
-    // Some of the above fields are duplicated in 'local' and 'remote', for backwards-
-    // compatility with older clients.
-    pub local: LocalTimelineInfo,
-    pub remote: RemoteTimelineInfo,
 }
 
-#[serde_as]
 #[derive(Debug, Serialize, Deserialize, Clone)]
-pub struct LocalTimelineInfo {
-    #[serde_as(as = "Option<DisplayFromStr>")]
-    pub ancestor_timeline_id: Option<TimelineId>,
-    #[serde_as(as = "Option<DisplayFromStr>")]
-    pub ancestor_lsn: Option<Lsn>,
-    pub current_logical_size: Option<u64>, // is None when timeline is Unloaded
-    pub current_physical_size: Option<u64>, // is None when timeline is Unloaded
+pub struct DownloadRemoteLayersTaskInfo {
+    pub task_id: String,
+    pub state: DownloadRemoteLayersTaskState,
+    pub total_layer_count: u64,         // stable once `completed`
+    pub successful_download_count: u64, // stable once `completed`
+    pub failed_download_count: u64,     // stable once `completed`
 }
 
-#[serde_as]
 #[derive(Debug, Serialize, Deserialize, Clone)]
-pub struct RemoteTimelineInfo {
-    #[serde_as(as = "Option<DisplayFromStr>")]
-    pub remote_consistent_lsn: Option<Lsn>,
+pub enum DownloadRemoteLayersTaskState {
+    Running,
+    Completed,
+    ShutDown,
 }
 
 pub type ConfigureFailpointsRequest = Vec<FailpointConfig>;
@@ -325,7 +323,7 @@ impl PagestreamFeMessage {
         match self {
             Self::Exists(req) => {
                 bytes.put_u8(0);
-                bytes.put_u8(if req.latest { 1 } else { 0 });
+                bytes.put_u8(u8::from(req.latest));
                 bytes.put_u64(req.lsn.0);
                 bytes.put_u32(req.rel.spcnode);
                 bytes.put_u32(req.rel.dbnode);
@@ -335,7 +333,7 @@ impl PagestreamFeMessage {
 
             Self::Nblocks(req) => {
                 bytes.put_u8(1);
-                bytes.put_u8(if req.latest { 1 } else { 0 });
+                bytes.put_u8(u8::from(req.latest));
                 bytes.put_u64(req.lsn.0);
                 bytes.put_u32(req.rel.spcnode);
                 bytes.put_u32(req.rel.dbnode);
@@ -345,7 +343,7 @@ impl PagestreamFeMessage {
 
             Self::GetPage(req) => {
                 bytes.put_u8(2);
-                bytes.put_u8(if req.latest { 1 } else { 0 });
+                bytes.put_u8(u8::from(req.latest));
                 bytes.put_u64(req.lsn.0);
                 bytes.put_u32(req.rel.spcnode);
                 bytes.put_u32(req.rel.dbnode);
@@ -356,7 +354,7 @@ impl PagestreamFeMessage {
 
             Self::DbSize(req) => {
                 bytes.put_u8(3);
-                bytes.put_u8(if req.latest { 1 } else { 0 });
+                bytes.put_u8(u8::from(req.latest));
                 bytes.put_u64(req.lsn.0);
                 bytes.put_u32(req.dbnode);
             }
diff --git a/libs/postgres_connection/Cargo.toml b/libs/postgres_connection/Cargo.toml
index 314f3c6f1c..12b7abcc93 100644
--- a/libs/postgres_connection/Cargo.toml
+++ b/libs/postgres_connection/Cargo.toml
@@ -2,14 +2,15 @@
 name = "postgres_connection"
 version = "0.1.0"
 edition = "2021"
+license = "Apache-2.0"
 
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
 [dependencies]
 anyhow = "1.0"
 itertools = "0.10.3"
-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
+postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "43e6db254a97fdecbce33d8bc0890accfd74495e" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
 url = "2.2.2"
 workspace_hack = { version = "0.1", path = "../../workspace_hack" }
 
diff --git a/libs/postgres_ffi/Cargo.toml b/libs/postgres_ffi/Cargo.toml
index 01ff6ab60e..aa076b08d3 100644
--- a/libs/postgres_ffi/Cargo.toml
+++ b/libs/postgres_ffi/Cargo.toml
@@ -2,6 +2,7 @@
 name = "postgres_ffi"
 version = "0.1.0"
 edition = "2021"
+license = "Apache-2.0"
 
 [dependencies]
 rand = "0.8.3"
@@ -21,7 +22,7 @@ workspace_hack = { version = "0.1", path = "../../workspace_hack" }
 
 [dev-dependencies]
 env_logger = "0.9"
-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
+postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
 wal_craft = { path = "wal_craft" }
 
 [build-dependencies]
diff --git a/libs/postgres_ffi/src/nonrelfile_utils.rs b/libs/postgres_ffi/src/nonrelfile_utils.rs
index 01e5554b8a..5acf90be70 100644
--- a/libs/postgres_ffi/src/nonrelfile_utils.rs
+++ b/libs/postgres_ffi/src/nonrelfile_utils.rs
@@ -14,8 +14,8 @@ pub fn transaction_id_set_status(xid: u32, status: u8, page: &mut BytesMut) {
         status
     );
 
-    let byteno: usize = ((xid as u32 % pg_constants::CLOG_XACTS_PER_PAGE as u32)
-        / pg_constants::CLOG_XACTS_PER_BYTE) as usize;
+    let byteno: usize =
+        ((xid % pg_constants::CLOG_XACTS_PER_PAGE) / pg_constants::CLOG_XACTS_PER_BYTE) as usize;
 
     let bshift: u8 =
         ((xid % pg_constants::CLOG_XACTS_PER_BYTE) * pg_constants::CLOG_BITS_PER_XACT as u32) as u8;
@@ -25,13 +25,13 @@ pub fn transaction_id_set_status(xid: u32, status: u8, page: &mut BytesMut) {
 }
 
 pub fn transaction_id_get_status(xid: u32, page: &[u8]) -> u8 {
-    let byteno: usize = ((xid as u32 % pg_constants::CLOG_XACTS_PER_PAGE as u32)
-        / pg_constants::CLOG_XACTS_PER_BYTE) as usize;
+    let byteno: usize =
+        ((xid % pg_constants::CLOG_XACTS_PER_PAGE) / pg_constants::CLOG_XACTS_PER_BYTE) as usize;
 
     let bshift: u8 =
         ((xid % pg_constants::CLOG_XACTS_PER_BYTE) * pg_constants::CLOG_BITS_PER_XACT as u32) as u8;
 
-    ((page[byteno] >> bshift) & pg_constants::CLOG_XACT_BITMASK) as u8
+    (page[byteno] >> bshift) & pg_constants::CLOG_XACT_BITMASK
 }
 
 // See CLOGPagePrecedes in clog.c
diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs
index 953723a8f0..272c4d6dcc 100644
--- a/libs/postgres_ffi/src/xlog_utils.rs
+++ b/libs/postgres_ffi/src/xlog_utils.rs
@@ -333,7 +333,7 @@ impl CheckPoint {
 // We need this segment to start compute node.
 //
 pub fn generate_wal_segment(segno: u64, system_id: u64) -> Result<Bytes, SerializeError> {
-    let mut seg_buf = BytesMut::with_capacity(WAL_SEGMENT_SIZE as usize);
+    let mut seg_buf = BytesMut::with_capacity(WAL_SEGMENT_SIZE);
 
     let pageaddr = XLogSegNoOffsetToRecPtr(segno, 0, WAL_SEGMENT_SIZE);
     let hdr = XLogLongPageHeaderData {
@@ -574,7 +574,7 @@ mod tests {
 
         // Rename file to partial to actually find last valid lsn, then rename it back.
         fs::rename(
-            cfg.wal_dir().join(&last_segment),
+            cfg.wal_dir().join(last_segment),
             cfg.wal_dir().join(format!("{}.partial", last_segment)),
         )
         .unwrap();
diff --git a/libs/postgres_ffi/wal_craft/Cargo.toml b/libs/postgres_ffi/wal_craft/Cargo.toml
index 4c35c5a650..abfc263550 100644
--- a/libs/postgres_ffi/wal_craft/Cargo.toml
+++ b/libs/postgres_ffi/wal_craft/Cargo.toml
@@ -2,7 +2,7 @@
 name = "wal_craft"
 version = "0.1.0"
 edition = "2021"
-
+license = "Apache-2.0"
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
 [dependencies]
@@ -11,7 +11,7 @@ clap = "4.0"
 env_logger = "0.9"
 log = "0.4"
 once_cell = "1.13.0"
-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
+postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
 postgres_ffi = { path = "../" }
 tempfile = "3.2"
 workspace_hack = { version = "0.1", path = "../../../workspace_hack" }
diff --git a/libs/postgres_ffi/wal_craft/src/lib.rs b/libs/postgres_ffi/wal_craft/src/lib.rs
index feec3b2ace..969befc8e7 100644
--- a/libs/postgres_ffi/wal_craft/src/lib.rs
+++ b/libs/postgres_ffi/wal_craft/src/lib.rs
@@ -81,7 +81,7 @@ impl Conf {
             .new_pg_command("initdb")?
             .arg("-D")
             .arg(self.datadir.as_os_str())
-            .args(&["-U", "postgres", "--no-instructions", "--no-sync"])
+            .args(["-U", "postgres", "--no-instructions", "--no-sync"])
             .output()?;
         debug!("initdb output: {:?}", output);
         ensure!(
@@ -105,12 +105,12 @@ impl Conf {
         let unix_socket_dir_path = unix_socket_dir.path().to_owned();
         let server_process = self
             .new_pg_command("postgres")?
-            .args(&["-c", "listen_addresses="])
+            .args(["-c", "listen_addresses="])
             .arg("-k")
             .arg(unix_socket_dir_path.as_os_str())
             .arg("-D")
             .arg(self.datadir.as_os_str())
-            .args(&["-c", "logging_collector=on"]) // stderr will mess up with tests output
+            .args(["-c", "logging_collector=on"]) // stderr will mess up with tests output
             .args(REQUIRED_POSTGRES_CONFIG.iter().flat_map(|cfg| ["-c", cfg]))
             .stderr(Stdio::from(log_file))
             .spawn()?;
@@ -142,7 +142,7 @@ impl Conf {
         );
         let output = self
             .new_pg_command("pg_waldump")?
-            .args(&[
+            .args([
                 &first_segment_file.as_os_str(),
                 &last_segment_file.as_os_str(),
             ])
diff --git a/libs/pq_proto/Cargo.toml b/libs/pq_proto/Cargo.toml
index 4d48e431b4..b9c6a1eab0 100644
--- a/libs/pq_proto/Cargo.toml
+++ b/libs/pq_proto/Cargo.toml
@@ -2,15 +2,17 @@
 name = "pq_proto"
 version = "0.1.0"
 edition = "2021"
+license = "Apache-2.0"
 
 [dependencies]
 anyhow = "1.0"
 bytes = "1.0.1"
 pin-project-lite = "0.2.7"
-postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
+postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
 rand = "0.8.3"
 serde = { version = "1.0", features = ["derive"] }
 tokio = { version = "1.17", features = ["macros"] }
 tracing = "0.1"
+thiserror = "1.0"
 
 workspace_hack = { version = "0.1", path = "../../workspace_hack" }
diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs
index 2e311dd6e3..c5e4dbd1f0 100644
--- a/libs/pq_proto/src/lib.rs
+++ b/libs/pq_proto/src/lib.rs
@@ -5,7 +5,7 @@
 // Tools for calling certain async methods in sync contexts.
 pub mod sync;
 
-use anyhow::{bail, ensure, Context, Result};
+use anyhow::{ensure, Context, Result};
 use bytes::{Buf, BufMut, Bytes, BytesMut};
 use postgres_protocol::PG_EPOCH;
 use serde::{Deserialize, Serialize};
@@ -194,6 +194,35 @@ macro_rules! retry_read {
     };
 }
 
+/// An error occured during connection being open.
+#[derive(thiserror::Error, Debug)]
+pub enum ConnectionError {
+    /// IO error during writing to or reading from the connection socket.
+    #[error("Socket IO error: {0}")]
+    Socket(std::io::Error),
+    /// Invalid packet was received from client
+    #[error("Protocol error: {0}")]
+    Protocol(String),
+    /// Failed to parse a protocol mesage
+    #[error("Message parse error: {0}")]
+    MessageParse(anyhow::Error),
+}
+
+impl From<anyhow::Error> for ConnectionError {
+    fn from(e: anyhow::Error) -> Self {
+        Self::MessageParse(e)
+    }
+}
+
+impl ConnectionError {
+    pub fn into_io_error(self) -> io::Error {
+        match self {
+            ConnectionError::Socket(io) => io,
+            other => io::Error::new(io::ErrorKind::Other, other.to_string()),
+        }
+    }
+}
+
 impl FeMessage {
     /// Read one message from the stream.
     /// This function returns `Ok(None)` in case of EOF.
@@ -216,7 +245,9 @@ impl FeMessage {
     /// }
     /// ```
     #[inline(never)]
-    pub fn read(stream: &mut (impl io::Read + Unpin)) -> anyhow::Result<Option<FeMessage>> {
+    pub fn read(
+        stream: &mut (impl io::Read + Unpin),
+    ) -> Result<Option<FeMessage>, ConnectionError> {
         Self::read_fut(&mut AsyncishRead(stream)).wait()
     }
 
@@ -224,7 +255,7 @@ impl FeMessage {
     /// See documentation for `Self::read`.
     pub fn read_fut<Reader>(
         stream: &mut Reader,
-    ) -> SyncFuture<Reader, impl Future<Output = anyhow::Result<Option<FeMessage>>> + '_>
+    ) -> SyncFuture<Reader, impl Future<Output = Result<Option<FeMessage>, ConnectionError>> + '_>
     where
         Reader: tokio::io::AsyncRead + Unpin,
     {
@@ -238,17 +269,21 @@ impl FeMessage {
             let tag = match retry_read!(stream.read_u8().await) {
                 Ok(b) => b,
                 Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => return Ok(None),
-                Err(e) => return Err(e.into()),
+                Err(e) => return Err(ConnectionError::Socket(e)),
             };
 
             // The message length includes itself, so it better be at least 4.
-            let len = retry_read!(stream.read_u32().await)?
+            let len = retry_read!(stream.read_u32().await)
+                .map_err(ConnectionError::Socket)?
                 .checked_sub(4)
-                .context("invalid message length")?;
+                .ok_or_else(|| ConnectionError::Protocol("invalid message length".to_string()))?;
 
             let body = {
                 let mut buffer = vec![0u8; len as usize];
-                stream.read_exact(&mut buffer).await?;
+                stream
+                    .read_exact(&mut buffer)
+                    .await
+                    .map_err(ConnectionError::Socket)?;
                 Bytes::from(buffer)
             };
 
@@ -265,7 +300,11 @@ impl FeMessage {
                 b'c' => Ok(Some(FeMessage::CopyDone)),
                 b'f' => Ok(Some(FeMessage::CopyFail)),
                 b'p' => Ok(Some(FeMessage::PasswordMessage(body))),
-                tag => bail!("unknown message tag: {},'{:?}'", tag, body),
+                tag => {
+                    return Err(ConnectionError::Protocol(format!(
+                        "unknown message tag: {tag},'{body:?}'"
+                    )))
+                }
             }
         })
     }
@@ -275,7 +314,9 @@ impl FeStartupPacket {
     /// Read startup message from the stream.
     // XXX: It's tempting yet undesirable to accept `stream` by value,
     // since such a change will cause user-supplied &mut references to be consumed
-    pub fn read(stream: &mut (impl io::Read + Unpin)) -> anyhow::Result<Option<FeMessage>> {
+    pub fn read(
+        stream: &mut (impl io::Read + Unpin),
+    ) -> Result<Option<FeMessage>, ConnectionError> {
         Self::read_fut(&mut AsyncishRead(stream)).wait()
     }
 
@@ -284,7 +325,7 @@ impl FeStartupPacket {
     // since such a change will cause user-supplied &mut references to be consumed
     pub fn read_fut<Reader>(
         stream: &mut Reader,
-    ) -> SyncFuture<Reader, impl Future<Output = anyhow::Result<Option<FeMessage>>> + '_>
+    ) -> SyncFuture<Reader, impl Future<Output = Result<Option<FeMessage>, ConnectionError>> + '_>
     where
         Reader: tokio::io::AsyncRead + Unpin,
     {
@@ -302,31 +343,41 @@ impl FeStartupPacket {
             let len = match retry_read!(stream.read_u32().await) {
                 Ok(len) => len as usize,
                 Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => return Ok(None),
-                Err(e) => return Err(e.into()),
+                Err(e) => return Err(ConnectionError::Socket(e)),
             };
 
             #[allow(clippy::manual_range_contains)]
             if len < 4 || len > MAX_STARTUP_PACKET_LENGTH {
-                bail!("invalid message length");
+                return Err(ConnectionError::Protocol(format!(
+                    "invalid message length {len}"
+                )));
             }
 
-            let request_code = retry_read!(stream.read_u32().await)?;
+            let request_code =
+                retry_read!(stream.read_u32().await).map_err(ConnectionError::Socket)?;
 
             // the rest of startup packet are params
             let params_len = len - 8;
             let mut params_bytes = vec![0u8; params_len];
-            stream.read_exact(params_bytes.as_mut()).await?;
+            stream
+                .read_exact(params_bytes.as_mut())
+                .await
+                .map_err(ConnectionError::Socket)?;
 
             // Parse params depending on request code
             let req_hi = request_code >> 16;
             let req_lo = request_code & ((1 << 16) - 1);
             let message = match (req_hi, req_lo) {
                 (RESERVED_INVALID_MAJOR_VERSION, CANCEL_REQUEST_CODE) => {
-                    ensure!(params_len == 8, "expected 8 bytes for CancelRequest params");
+                    if params_len != 8 {
+                        return Err(ConnectionError::Protocol(
+                            "expected 8 bytes for CancelRequest params".to_string(),
+                        ));
+                    }
                     let mut cursor = Cursor::new(params_bytes);
                     FeStartupPacket::CancelRequest(CancelKeyData {
-                        backend_pid: cursor.read_i32().await?,
-                        cancel_key: cursor.read_i32().await?,
+                        backend_pid: cursor.read_i32().await.map_err(ConnectionError::Socket)?,
+                        cancel_key: cursor.read_i32().await.map_err(ConnectionError::Socket)?,
                     })
                 }
                 (RESERVED_INVALID_MAJOR_VERSION, NEGOTIATE_SSL_CODE) => {
@@ -338,7 +389,9 @@ impl FeStartupPacket {
                     FeStartupPacket::GssEncRequest
                 }
                 (RESERVED_INVALID_MAJOR_VERSION, unrecognized_code) => {
-                    bail!("Unrecognized request code {}", unrecognized_code)
+                    return Err(ConnectionError::Protocol(format!(
+                        "Unrecognized request code {unrecognized_code}"
+                    )));
                 }
                 // TODO bail if protocol major_version is not 3?
                 (major_version, minor_version) => {
@@ -346,15 +399,21 @@ impl FeStartupPacket {
                     // See `postgres: ProcessStartupPacket, build_startup_packet`.
                     let mut tokens = str::from_utf8(&params_bytes)
                         .context("StartupMessage params: invalid utf-8")?
-                        .strip_suffix('\0') // drop packet's own null terminator
-                        .context("StartupMessage params: missing null terminator")?
+                        .strip_suffix('\0') // drop packet's own null
+                        .ok_or_else(|| {
+                            ConnectionError::Protocol(
+                                "StartupMessage params: missing null terminator".to_string(),
+                            )
+                        })?
                         .split_terminator('\0');
 
                     let mut params = HashMap::new();
                     while let Some(name) = tokens.next() {
-                        let value = tokens
-                            .next()
-                            .context("StartupMessage params: key without value")?;
+                        let value = tokens.next().ok_or_else(|| {
+                            ConnectionError::Protocol(
+                                "StartupMessage params: key without value".to_string(),
+                            )
+                        })?;
 
                         params.insert(name.to_owned(), value.to_owned());
                     }
@@ -458,12 +517,15 @@ pub enum BeMessage<'a> {
     CloseComplete,
     // None means column is NULL
     DataRow(&'a [Option<&'a [u8]>]),
-    ErrorResponse(&'a str),
+    ErrorResponse(&'a str, Option<&'a [u8; 5]>),
     /// Single byte - used in response to SSLRequest/GSSENCRequest.
     EncryptionResponse(bool),
     NoData,
     ParameterDescription,
-    ParameterStatus(BeParameterStatusMessage<'a>),
+    ParameterStatus {
+        name: &'a [u8],
+        value: &'a [u8],
+    },
     ParseComplete,
     ReadyForQuery,
     RowDescription(&'a [RowDescriptor<'a>]),
@@ -472,6 +534,28 @@ pub enum BeMessage<'a> {
     KeepAlive(WalSndKeepAlive),
 }
 
+/// Common shorthands.
+impl<'a> BeMessage<'a> {
+    /// A [`BeMessage::ParameterStatus`] holding the client encoding, i.e. UTF-8.
+    /// This is a sensible default, given that:
+    ///  * rust strings only support this encoding out of the box.
+    ///  * tokio-postgres, postgres-jdbc (and probably more) mandate it.
+    ///
+    /// TODO: do we need to report `server_encoding` as well?
+    pub const CLIENT_ENCODING: Self = Self::ParameterStatus {
+        name: b"client_encoding",
+        value: b"UTF8",
+    };
+
+    /// Build a [`BeMessage::ParameterStatus`] holding the server version.
+    pub fn server_version(version: &'a str) -> Self {
+        Self::ParameterStatus {
+            name: b"server_version",
+            value: version.as_bytes(),
+        }
+    }
+}
+
 #[derive(Debug)]
 pub enum BeAuthenticationSaslMessage<'a> {
     Methods(&'a [&'a str]),
@@ -485,12 +569,6 @@ pub enum BeParameterStatusMessage<'a> {
     ServerVersion(&'a str),
 }
 
-impl BeParameterStatusMessage<'static> {
-    pub fn encoding() -> BeMessage<'static> {
-        BeMessage::ParameterStatus(Self::Encoding("UTF8"))
-    }
-}
-
 // One row description in RowDescription packet.
 #[derive(Debug)]
 pub struct RowDescriptor<'a> {
@@ -587,14 +665,15 @@ fn write_body<R>(buf: &mut BytesMut, f: impl FnOnce(&mut BytesMut) -> R) -> R {
 }
 
 /// Safe write of s into buf as cstring (String in the protocol).
-fn write_cstr(s: &[u8], buf: &mut BytesMut) -> Result<(), io::Error> {
-    if s.contains(&0) {
+fn write_cstr(s: impl AsRef<[u8]>, buf: &mut BytesMut) -> io::Result<()> {
+    let bytes = s.as_ref();
+    if bytes.contains(&0) {
         return Err(io::Error::new(
             io::ErrorKind::InvalidInput,
             "string contains embedded null",
         ));
     }
-    buf.put_slice(s);
+    buf.put_slice(bytes);
     buf.put_u8(0);
     Ok(())
 }
@@ -606,6 +685,8 @@ fn read_cstr(buf: &mut Bytes) -> anyhow::Result<Bytes> {
     Ok(result)
 }
 
+pub const SQLSTATE_INTERNAL_ERROR: &[u8; 5] = b"XX000";
+
 impl<'a> BeMessage<'a> {
     /// Write message to the given buf.
     // Unlike the reading side, we use BytesMut
@@ -644,7 +725,7 @@ impl<'a> BeMessage<'a> {
                         Methods(methods) => {
                             buf.put_i32(10); // Specifies that SASL auth method is used.
                             for method in methods.iter() {
-                                write_cstr(method.as_bytes(), buf)?;
+                                write_cstr(method, buf)?;
                             }
                             buf.put_u8(0); // zero terminator for the list
                         }
@@ -745,10 +826,7 @@ impl<'a> BeMessage<'a> {
             // First byte of each field represents type of this field. Set just enough fields
             // to satisfy rust-postgres client: 'S' -- severity, 'C' -- error, 'M' -- error
             // message text.
-            BeMessage::ErrorResponse(error_msg) => {
-                // For all the errors set Severity to Error and error code to
-                // 'internal error'.
-
+            BeMessage::ErrorResponse(error_msg, pg_error_code) => {
                 // 'E' signalizes ErrorResponse messages
                 buf.put_u8(b'E');
                 write_body(buf, |buf| {
@@ -756,10 +834,12 @@ impl<'a> BeMessage<'a> {
                     buf.put_slice(b"ERROR\0");
 
                     buf.put_u8(b'C'); // SQLSTATE error code
-                    buf.put_slice(b"CXX000\0");
+                    buf.put_slice(&terminate_code(
+                        pg_error_code.unwrap_or(SQLSTATE_INTERNAL_ERROR),
+                    ));
 
                     buf.put_u8(b'M'); // the message
-                    write_cstr(error_msg.as_bytes(), buf)?;
+                    write_cstr(error_msg, buf)?;
 
                     buf.put_u8(0); // terminator
                     Ok::<_, io::Error>(())
@@ -779,7 +859,7 @@ impl<'a> BeMessage<'a> {
                     buf.put_slice(b"NOTICE\0");
 
                     buf.put_u8(b'C'); // SQLSTATE error code
-                    buf.put_slice(b"CXX000\0");
+                    buf.put_slice(&terminate_code(SQLSTATE_INTERNAL_ERROR));
 
                     buf.put_u8(b'M'); // the message
                     write_cstr(error_msg.as_bytes(), buf)?;
@@ -799,24 +879,12 @@ impl<'a> BeMessage<'a> {
                 buf.put_u8(response);
             }
 
-            BeMessage::ParameterStatus(param) => {
-                use std::io::{IoSlice, Write};
-                use BeParameterStatusMessage::*;
-
-                let [name, value] = match param {
-                    Encoding(name) => [b"client_encoding", name.as_bytes()],
-                    ServerVersion(version) => [b"server_version", version.as_bytes()],
-                };
-
-                // Parameter names and values are passed as null-terminated strings
-                let iov = &mut [name, b"\0", value, b"\0"].map(IoSlice::new);
-                let mut buffer = [0u8; 64]; // this should be enough
-                let cnt = buffer.as_mut().write_vectored(iov).unwrap();
-
+            BeMessage::ParameterStatus { name, value } => {
                 buf.put_u8(b'S');
                 write_body(buf, |buf| {
-                    buf.put_slice(&buffer[..cnt]);
-                });
+                    write_cstr(name, buf)?;
+                    write_cstr(value, buf)
+                })?;
             }
 
             BeMessage::ParameterDescription => {
@@ -873,7 +941,7 @@ impl<'a> BeMessage<'a> {
                     buf.put_u8(b'k');
                     buf.put_u64(req.sent_ptr);
                     buf.put_i64(req.timestamp);
-                    buf.put_u8(if req.request_reply { 1 } else { 0 });
+                    buf.put_u8(u8::from(req.request_reply));
                 });
             }
         }
@@ -1079,3 +1147,12 @@ mod tests {
         let _ = FeStartupPacket::read_fut(stream).await;
     }
 }
+
+fn terminate_code(code: &[u8; 5]) -> [u8; 6] {
+    let mut terminated = [0; 6];
+    for (i, &elem) in code.iter().enumerate() {
+        terminated[i] = elem;
+    }
+
+    terminated
+}
diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml
index ebd30fc1eb..5a39f27209 100644
--- a/libs/remote_storage/Cargo.toml
+++ b/libs/remote_storage/Cargo.toml
@@ -2,6 +2,7 @@
 name = "remote_storage"
 version = "0.1.0"
 edition = "2021"
+license = "Apache-2.0"
 
 [dependencies]
 anyhow = { version = "1.0", features = ["backtrace"] }
diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs
index 28858fcbab..1091a8bd5c 100644
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -7,6 +7,7 @@
 //!
 mod local_fs;
 mod s3_bucket;
+mod simulate_failures;
 
 use std::{
     collections::HashMap,
@@ -24,7 +25,7 @@ use tokio::io;
 use toml_edit::Item;
 use tracing::info;
 
-pub use self::{local_fs::LocalFs, s3_bucket::S3Bucket};
+pub use self::{local_fs::LocalFs, s3_bucket::S3Bucket, simulate_failures::UnreliableWrapper};
 
 /// How many different timelines can be processed simultaneously when synchronizing layers with the remote storage.
 /// During regular work, pageserver produces one layer file per timeline checkpoint, with bursts of concurrency
@@ -77,7 +78,10 @@ pub trait RemoteStorage: Send + Sync + 'static {
     /// Note: here we assume that if the prefix is passed it was obtained via remote_object_id
     /// which already takes into account any kind of global prefix (prefix_in_bucket for S3 or storage_root for LocalFS)
     /// so this method doesnt need to.
-    async fn list_prefixes(&self, prefix: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>>;
+    async fn list_prefixes(
+        &self,
+        prefix: Option<&RemotePath>,
+    ) -> Result<Vec<RemotePath>, DownloadError>;
 
     /// Streams the local file contents into remote into the remote storage entry.
     async fn upload(
@@ -150,6 +154,7 @@ impl std::error::Error for DownloadError {}
 pub enum GenericRemoteStorage {
     LocalFs(LocalFs),
     AwsS3(Arc<S3Bucket>),
+    Unreliable(Arc<UnreliableWrapper>),
 }
 
 impl Deref for GenericRemoteStorage {
@@ -159,27 +164,30 @@ impl Deref for GenericRemoteStorage {
         match self {
             GenericRemoteStorage::LocalFs(local_fs) => local_fs,
             GenericRemoteStorage::AwsS3(s3_bucket) => s3_bucket.as_ref(),
+            GenericRemoteStorage::Unreliable(s) => s.as_ref(),
         }
     }
 }
 
 impl GenericRemoteStorage {
-    pub fn from_config(
-        storage_config: &RemoteStorageConfig,
-    ) -> anyhow::Result<GenericRemoteStorage> {
+    pub fn from_config(storage_config: &RemoteStorageConfig) -> anyhow::Result<Self> {
         Ok(match &storage_config.storage {
             RemoteStorageKind::LocalFs(root) => {
                 info!("Using fs root '{}' as a remote storage", root.display());
-                GenericRemoteStorage::LocalFs(LocalFs::new(root.clone())?)
+                Self::LocalFs(LocalFs::new(root.clone())?)
             }
             RemoteStorageKind::AwsS3(s3_config) => {
                 info!("Using s3 bucket '{}' in region '{}' as a remote storage, prefix in bucket: '{:?}', bucket endpoint: '{:?}'",
                       s3_config.bucket_name, s3_config.bucket_region, s3_config.prefix_in_bucket, s3_config.endpoint);
-                GenericRemoteStorage::AwsS3(Arc::new(S3Bucket::new(s3_config)?))
+                Self::AwsS3(Arc::new(S3Bucket::new(s3_config)?))
             }
         })
     }
 
+    pub fn unreliable_wrapper(s: Self, fail_first: u64) -> Self {
+        Self::Unreliable(Arc::new(UnreliableWrapper::new(s, fail_first)))
+    }
+
     /// Takes storage object contents and its size and uploads to remote storage,
     /// mapping `from_path` to the corresponding remote object id in the storage.
     ///
diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs
index 50a84eb33f..f1289569ae 100644
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -92,13 +92,17 @@ impl RemoteStorage for LocalFs {
             .collect())
     }
 
-    async fn list_prefixes(&self, prefix: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
+    async fn list_prefixes(
+        &self,
+        prefix: Option<&RemotePath>,
+    ) -> Result<Vec<RemotePath>, DownloadError> {
         let path = match prefix {
             Some(prefix) => Cow::Owned(prefix.with_base(&self.storage_root)),
             None => Cow::Borrowed(&self.storage_root),
         };
         Ok(get_all_files(path.as_ref(), false)
-            .await?
+            .await
+            .map_err(DownloadError::Other)?
             .into_iter()
             .map(|path| {
                 path.strip_prefix(&self.storage_root)
diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs
index 740f3753d8..18a2c5dedd 100644
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -286,7 +286,10 @@ impl RemoteStorage for S3Bucket {
 
     /// See the doc for `RemoteStorage::list_prefixes`
     /// Note: it wont include empty "directories"
-    async fn list_prefixes(&self, prefix: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
+    async fn list_prefixes(
+        &self,
+        prefix: Option<&RemotePath>,
+    ) -> Result<Vec<RemotePath>, DownloadError> {
         // get the passed prefix or if it is not set use prefix_in_bucket value
         let list_prefix = prefix
             .map(|p| self.relative_path_to_s3_object(p))
@@ -308,7 +311,8 @@ impl RemoteStorage for S3Bucket {
                 .concurrency_limiter
                 .acquire()
                 .await
-                .context("Concurrency limiter semaphore got closed during S3 list")?;
+                .context("Concurrency limiter semaphore got closed during S3 list")
+                .map_err(DownloadError::Other)?;
 
             metrics::inc_list_objects();
 
@@ -324,7 +328,9 @@ impl RemoteStorage for S3Bucket {
                 .map_err(|e| {
                     metrics::inc_list_objects_fail();
                     e
-                })?;
+                })
+                .context("Failed to list S3 prefixes")
+                .map_err(DownloadError::Other)?;
 
             document_keys.extend(
                 fetch_response
diff --git a/libs/remote_storage/src/simulate_failures.rs b/libs/remote_storage/src/simulate_failures.rs
new file mode 100644
index 0000000000..643bb99dce
--- /dev/null
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -0,0 +1,129 @@
+//! This module provides a wrapper around a real RemoteStorage implementation that
+//! causes the first N attempts at each upload or download operatio to fail. For
+//! testing purposes.
+use std::collections::hash_map::Entry;
+use std::collections::HashMap;
+use std::sync::Mutex;
+
+use crate::{Download, DownloadError, RemotePath, RemoteStorage, StorageMetadata};
+
+pub struct UnreliableWrapper {
+    inner: crate::GenericRemoteStorage,
+
+    // This many attempts of each operation will fail, then we let it succeed.
+    attempts_to_fail: u64,
+
+    // Tracks how many failed attempts of each operation has been made.
+    attempts: Mutex<HashMap<RemoteOp, u64>>,
+}
+
+/// Used to identify retries of different unique operation.
+#[derive(Debug, Hash, Eq, PartialEq)]
+enum RemoteOp {
+    List,
+    ListPrefixes(Option<RemotePath>),
+    Upload(RemotePath),
+    Download(RemotePath),
+    Delete(RemotePath),
+}
+
+impl UnreliableWrapper {
+    pub fn new(inner: crate::GenericRemoteStorage, attempts_to_fail: u64) -> Self {
+        assert!(attempts_to_fail > 0);
+        UnreliableWrapper {
+            inner,
+            attempts_to_fail,
+            attempts: Mutex::new(HashMap::new()),
+        }
+    }
+
+    ///
+    /// Common functionality for all operations.
+    ///
+    /// On the first attempts of this operation, return an error. After 'attempts_to_fail'
+    /// attempts, let the operation go ahead, and clear the counter.
+    ///
+    fn attempt(&self, op: RemoteOp) -> Result<u64, DownloadError> {
+        let mut attempts = self.attempts.lock().unwrap();
+
+        match attempts.entry(op) {
+            Entry::Occupied(mut e) => {
+                let attempts_before_this = {
+                    let p = e.get_mut();
+                    *p += 1;
+                    *p
+                };
+
+                if attempts_before_this >= self.attempts_to_fail {
+                    // let it succeed
+                    e.remove();
+                    Ok(attempts_before_this)
+                } else {
+                    let error =
+                        anyhow::anyhow!("simulated failure of remote operation {:?}", e.key());
+                    Err(DownloadError::Other(error))
+                }
+            }
+            Entry::Vacant(e) => {
+                let error = anyhow::anyhow!("simulated failure of remote operation {:?}", e.key());
+                e.insert(1);
+                Err(DownloadError::Other(error))
+            }
+        }
+    }
+}
+
+#[async_trait::async_trait]
+impl RemoteStorage for UnreliableWrapper {
+    /// Lists all items the storage has right now.
+    async fn list(&self) -> anyhow::Result<Vec<RemotePath>> {
+        self.attempt(RemoteOp::List)?;
+        self.inner.list().await
+    }
+
+    async fn list_prefixes(
+        &self,
+        prefix: Option<&RemotePath>,
+    ) -> Result<Vec<RemotePath>, DownloadError> {
+        self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))?;
+        self.inner.list_prefixes(prefix).await
+    }
+
+    async fn upload(
+        &self,
+        data: Box<(dyn tokio::io::AsyncRead + Unpin + Send + Sync + 'static)>,
+        // S3 PUT request requires the content length to be specified,
+        // otherwise it starts to fail with the concurrent connection count increasing.
+        data_size_bytes: usize,
+        to: &RemotePath,
+        metadata: Option<StorageMetadata>,
+    ) -> anyhow::Result<()> {
+        self.attempt(RemoteOp::Upload(to.clone()))?;
+        self.inner.upload(data, data_size_bytes, to, metadata).await
+    }
+
+    async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
+        self.attempt(RemoteOp::Download(from.clone()))?;
+        self.inner.download(from).await
+    }
+
+    async fn download_byte_range(
+        &self,
+        from: &RemotePath,
+        start_inclusive: u64,
+        end_exclusive: Option<u64>,
+    ) -> Result<Download, DownloadError> {
+        // Note: We treat any download_byte_range as an "attempt" of the same
+        // operation. We don't pay attention to the ranges. That's good enough
+        // for now.
+        self.attempt(RemoteOp::Download(from.clone()))?;
+        self.inner
+            .download_byte_range(from, start_inclusive, end_exclusive)
+            .await
+    }
+
+    async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
+        self.attempt(RemoteOp::Delete(path.clone()))?;
+        self.inner.delete(path).await
+    }
+}
diff --git a/libs/safekeeper_api/Cargo.toml b/libs/safekeeper_api/Cargo.toml
index 15bdecd71d..32cda78be4 100644
--- a/libs/safekeeper_api/Cargo.toml
+++ b/libs/safekeeper_api/Cargo.toml
@@ -2,6 +2,7 @@
 name = "safekeeper_api"
 version = "0.1.0"
 edition = "2021"
+license = "Apache-2.0"
 
 [dependencies]
 serde = { version = "1.0", features = ["derive"] }
diff --git a/libs/tenant_size_model/Cargo.toml b/libs/tenant_size_model/Cargo.toml
index 1aabf5a4f9..3a1a0f7915 100644
--- a/libs/tenant_size_model/Cargo.toml
+++ b/libs/tenant_size_model/Cargo.toml
@@ -3,6 +3,7 @@ name = "tenant_size_model"
 version = "0.1.0"
 edition = "2021"
 publish = false
+license = "Apache-2.0"
 
 [dependencies]
 workspace_hack = { version = "0.1", path = "../../workspace_hack" }
diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml
index 47639e8205..9c7fcafe23 100644
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -2,9 +2,10 @@
 name = "utils"
 version = "0.1.0"
 edition = "2021"
+license = "Apache-2.0"
 
 [dependencies]
-sentry = "0.29.0"
+sentry = { version = "0.29.0", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] }
 async-trait = "0.1"
 anyhow = "1.0"
 bincode = "1.3"
diff --git a/libs/utils/src/crashsafe.rs b/libs/utils/src/crashsafe.rs
index 3726779cb2..2c7e6e20ab 100644
--- a/libs/utils/src/crashsafe.rs
+++ b/libs/utils/src/crashsafe.rs
@@ -157,34 +157,34 @@ mod tests {
         assert_eq!(err.kind(), io::ErrorKind::AlreadyExists);
 
         let invalid_dir_path = file_path.join("folder");
-        create_dir_all(&invalid_dir_path).unwrap_err();
+        create_dir_all(invalid_dir_path).unwrap_err();
     }
 
     #[test]
     fn test_path_with_suffix_extension() {
         let p = PathBuf::from("/foo/bar");
         assert_eq!(
-            &path_with_suffix_extension(&p, "temp").to_string_lossy(),
+            &path_with_suffix_extension(p, "temp").to_string_lossy(),
             "/foo/bar.temp"
         );
         let p = PathBuf::from("/foo/bar");
         assert_eq!(
-            &path_with_suffix_extension(&p, "temp.temp").to_string_lossy(),
+            &path_with_suffix_extension(p, "temp.temp").to_string_lossy(),
             "/foo/bar.temp.temp"
         );
         let p = PathBuf::from("/foo/bar.baz");
         assert_eq!(
-            &path_with_suffix_extension(&p, "temp.temp").to_string_lossy(),
+            &path_with_suffix_extension(p, "temp.temp").to_string_lossy(),
             "/foo/bar.baz.temp.temp"
         );
         let p = PathBuf::from("/foo/bar.baz");
         assert_eq!(
-            &path_with_suffix_extension(&p, ".temp").to_string_lossy(),
+            &path_with_suffix_extension(p, ".temp").to_string_lossy(),
             "/foo/bar.baz..temp"
         );
         let p = PathBuf::from("/foo/bar/dir/");
         assert_eq!(
-            &path_with_suffix_extension(&p, ".temp").to_string_lossy(),
+            &path_with_suffix_extension(p, ".temp").to_string_lossy(),
             "/foo/bar/dir..temp"
         );
     }
diff --git a/libs/utils/src/postgres_backend.rs b/libs/utils/src/postgres_backend.rs
index 89f7197718..f3e3835bda 100644
--- a/libs/utils/src/postgres_backend.rs
+++ b/libs/utils/src/postgres_backend.rs
@@ -3,11 +3,11 @@
 //! implementation determining how to process the queries. Currently its API
 //! is rather narrow, but we can extend it once required.
 
+use crate::postgres_backend_async::{log_query_error, short_error, QueryError};
 use crate::sock_split::{BidiStream, ReadStream, WriteStream};
-use anyhow::{bail, ensure, Context, Result};
+use anyhow::Context;
 use bytes::{Bytes, BytesMut};
-use pq_proto::{BeMessage, BeParameterStatusMessage, FeMessage, FeStartupPacket};
-use rand::Rng;
+use pq_proto::{BeMessage, FeMessage, FeStartupPacket};
 use serde::{Deserialize, Serialize};
 use std::fmt;
 use std::io::{self, Write};
@@ -22,25 +22,32 @@ pub trait Handler {
     /// postgres_backend will issue ReadyForQuery after calling this (this
     /// might be not what we want after CopyData streaming, but currently we don't
     /// care).
-    fn process_query(&mut self, pgb: &mut PostgresBackend, query_string: &str) -> Result<()>;
+    fn process_query(
+        &mut self,
+        pgb: &mut PostgresBackend,
+        query_string: &str,
+    ) -> Result<(), QueryError>;
 
     /// Called on startup packet receival, allows to process params.
     ///
     /// If Ok(false) is returned postgres_backend will skip auth -- that is needed for new users
     /// creation is the proxy code. That is quite hacky and ad-hoc solution, may be we could allow
     /// to override whole init logic in implementations.
-    fn startup(&mut self, _pgb: &mut PostgresBackend, _sm: &FeStartupPacket) -> Result<()> {
+    fn startup(
+        &mut self,
+        _pgb: &mut PostgresBackend,
+        _sm: &FeStartupPacket,
+    ) -> Result<(), QueryError> {
         Ok(())
     }
 
-    /// Check auth md5
-    fn check_auth_md5(&mut self, _pgb: &mut PostgresBackend, _md5_response: &[u8]) -> Result<()> {
-        bail!("MD5 auth failed")
-    }
-
     /// Check auth jwt
-    fn check_auth_jwt(&mut self, _pgb: &mut PostgresBackend, _jwt_response: &[u8]) -> Result<()> {
-        bail!("JWT auth failed")
+    fn check_auth_jwt(
+        &mut self,
+        _pgb: &mut PostgresBackend,
+        _jwt_response: &[u8],
+    ) -> Result<(), QueryError> {
+        Err(QueryError::Other(anyhow::anyhow!("JWT auth failed")))
     }
 
     fn is_shutdown_requested(&self) -> bool {
@@ -61,7 +68,6 @@ pub enum ProtoState {
 #[derive(Debug, PartialEq, Eq, Clone, Copy, Serialize, Deserialize)]
 pub enum AuthType {
     Trust,
-    MD5,
     // This mimics postgres's AuthenticationCleartextPassword but instead of password expects JWT
     NeonJWT,
 }
@@ -72,9 +78,8 @@ impl FromStr for AuthType {
     fn from_str(s: &str) -> Result<Self, Self::Err> {
         match s {
             "Trust" => Ok(Self::Trust),
-            "MD5" => Ok(Self::MD5),
             "NeonJWT" => Ok(Self::NeonJWT),
-            _ => bail!("invalid value \"{s}\" for auth type"),
+            _ => anyhow::bail!("invalid value \"{s}\" for auth type"),
         }
     }
 }
@@ -83,7 +88,6 @@ impl fmt::Display for AuthType {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         f.write_str(match self {
             AuthType::Trust => "Trust",
-            AuthType::MD5 => "MD5",
             AuthType::NeonJWT => "NeonJWT",
         })
     }
@@ -134,7 +138,6 @@ pub struct PostgresBackend {
 
     pub state: ProtoState,
 
-    md5_salt: [u8; 4],
     auth_type: AuthType,
 
     peer_addr: SocketAddr,
@@ -164,7 +167,7 @@ pub fn is_socket_read_timed_out(error: &anyhow::Error) -> bool {
 }
 
 // Cast a byte slice to a string slice, dropping null terminator if there's one.
-fn cstr_to_str(bytes: &[u8]) -> Result<&str> {
+fn cstr_to_str(bytes: &[u8]) -> anyhow::Result<&str> {
     let without_null = bytes.strip_suffix(&[0]).unwrap_or(bytes);
     std::str::from_utf8(without_null).map_err(|e| e.into())
 }
@@ -187,7 +190,6 @@ impl PostgresBackend {
             stream: Some(Stream::Bidirectional(BidiStream::from_tcp(socket))),
             buf_out: BytesMut::with_capacity(10 * 1024),
             state: ProtoState::Initialization,
-            md5_salt: [0u8; 4],
             auth_type,
             tls_config,
             peer_addr,
@@ -199,10 +201,10 @@ impl PostgresBackend {
     }
 
     /// Get direct reference (into the Option) to the read stream.
-    fn get_stream_in(&mut self) -> Result<&mut BidiStream> {
+    fn get_stream_in(&mut self) -> anyhow::Result<&mut BidiStream> {
         match &mut self.stream {
             Some(Stream::Bidirectional(stream)) => Ok(stream),
-            _ => bail!("reader taken"),
+            _ => anyhow::bail!("reader taken"),
         }
     }
 
@@ -226,7 +228,7 @@ impl PostgresBackend {
     }
 
     /// Read full message or return None if connection is closed.
-    pub fn read_message(&mut self) -> Result<Option<FeMessage>> {
+    pub fn read_message(&mut self) -> Result<Option<FeMessage>, QueryError> {
         let (state, stream) = (self.state, self.get_stream_in()?);
 
         use ProtoState::*;
@@ -234,6 +236,7 @@ impl PostgresBackend {
             Initialization | Encrypted => FeStartupPacket::read(stream),
             Authentication | Established => FeMessage::read(stream),
         }
+        .map_err(QueryError::from)
     }
 
     /// Write message into internal output buffer.
@@ -257,7 +260,7 @@ impl PostgresBackend {
     }
 
     // Wrapper for run_message_loop() that shuts down socket when we are done
-    pub fn run(mut self, handler: &mut impl Handler) -> Result<()> {
+    pub fn run(mut self, handler: &mut impl Handler) -> Result<(), QueryError> {
         let ret = self.run_message_loop(handler);
         if let Some(stream) = self.stream.as_mut() {
             let _ = stream.shutdown(Shutdown::Both);
@@ -265,7 +268,7 @@ impl PostgresBackend {
         ret
     }
 
-    fn run_message_loop(&mut self, handler: &mut impl Handler) -> Result<()> {
+    fn run_message_loop(&mut self, handler: &mut impl Handler) -> Result<(), QueryError> {
         trace!("postgres backend to {:?} started", self.peer_addr);
 
         let mut unnamed_query_string = Bytes::new();
@@ -274,7 +277,7 @@ impl PostgresBackend {
             match self.read_message() {
                 Ok(message) => {
                     if let Some(msg) = message {
-                        trace!("got message {:?}", msg);
+                        trace!("got message {msg:?}");
 
                         match self.process_message(handler, msg, &mut unnamed_query_string)? {
                             ProcessMsgResult::Continue => continue,
@@ -285,10 +288,12 @@ impl PostgresBackend {
                     }
                 }
                 Err(e) => {
-                    // If it is a timeout error, continue the loop
-                    if !is_socket_read_timed_out(&e) {
-                        return Err(e);
+                    if let QueryError::Other(e) = &e {
+                        if is_socket_read_timed_out(e) {
+                            continue;
+                        }
                     }
+                    return Err(e);
                 }
             }
         }
@@ -306,7 +311,7 @@ impl PostgresBackend {
             }
             stream => {
                 self.stream = stream;
-                bail!("can't start TLs without bidi stream");
+                anyhow::bail!("can't start TLs without bidi stream");
             }
         }
     }
@@ -316,17 +321,16 @@ impl PostgresBackend {
         handler: &mut impl Handler,
         msg: FeMessage,
         unnamed_query_string: &mut Bytes,
-    ) -> Result<ProcessMsgResult> {
+    ) -> Result<ProcessMsgResult, QueryError> {
         // Allow only startup and password messages during auth. Otherwise client would be able to bypass auth
         // TODO: change that to proper top-level match of protocol state with separate message handling for each state
-        if self.state < ProtoState::Established {
-            ensure!(
-                matches!(
-                    msg,
-                    FeMessage::PasswordMessage(_) | FeMessage::StartupPacket(_)
-                ),
-                "protocol violation"
-            );
+        if self.state < ProtoState::Established
+            && !matches!(
+                msg,
+                FeMessage::PasswordMessage(_) | FeMessage::StartupPacket(_)
+            )
+        {
+            return Err(QueryError::Other(anyhow::anyhow!("protocol violation")));
         }
 
         let have_tls = self.tls_config.is_some();
@@ -350,8 +354,13 @@ impl PostgresBackend {
                     }
                     FeStartupPacket::StartupMessage { .. } => {
                         if have_tls && !matches!(self.state, ProtoState::Encrypted) {
-                            self.write_message(&BeMessage::ErrorResponse("must connect with TLS"))?;
-                            bail!("client did not connect with TLS");
+                            self.write_message(&BeMessage::ErrorResponse(
+                                "must connect with TLS",
+                                None,
+                            ))?;
+                            return Err(QueryError::Other(anyhow::anyhow!(
+                                "client did not connect with TLS"
+                            )));
                         }
 
                         // NB: startup() may change self.auth_type -- we are using that in proxy code
@@ -361,21 +370,12 @@ impl PostgresBackend {
                         match self.auth_type {
                             AuthType::Trust => {
                                 self.write_message_noflush(&BeMessage::AuthenticationOk)?
-                                    .write_message_noflush(&BeParameterStatusMessage::encoding())?
+                                    .write_message_noflush(&BeMessage::CLIENT_ENCODING)?
                                     // The async python driver requires a valid server_version
-                                    .write_message_noflush(&BeMessage::ParameterStatus(
-                                        BeParameterStatusMessage::ServerVersion("14.1"),
-                                    ))?
+                                    .write_message_noflush(&BeMessage::server_version("14.1"))?
                                     .write_message(&BeMessage::ReadyForQuery)?;
                                 self.state = ProtoState::Established;
                             }
-                            AuthType::MD5 => {
-                                rand::thread_rng().fill(&mut self.md5_salt);
-                                self.write_message(&BeMessage::AuthenticationMD5Password(
-                                    self.md5_salt,
-                                ))?;
-                                self.state = ProtoState::Authentication;
-                            }
                             AuthType::NeonJWT => {
                                 self.write_message(&BeMessage::AuthenticationCleartextPassword)?;
                                 self.state = ProtoState::Authentication;
@@ -395,25 +395,20 @@ impl PostgresBackend {
 
                 match self.auth_type {
                     AuthType::Trust => unreachable!(),
-                    AuthType::MD5 => {
-                        let (_, md5_response) = m.split_last().context("protocol violation")?;
-
-                        if let Err(e) = handler.check_auth_md5(self, md5_response) {
-                            self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?;
-                            bail!("auth failed: {}", e);
-                        }
-                    }
                     AuthType::NeonJWT => {
                         let (_, jwt_response) = m.split_last().context("protocol violation")?;
 
                         if let Err(e) = handler.check_auth_jwt(self, jwt_response) {
-                            self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?;
-                            bail!("auth failed: {}", e);
+                            self.write_message(&BeMessage::ErrorResponse(
+                                &e.to_string(),
+                                Some(e.pg_error_code()),
+                            ))?;
+                            return Err(e);
                         }
                     }
                 }
                 self.write_message_noflush(&BeMessage::AuthenticationOk)?
-                    .write_message_noflush(&BeParameterStatusMessage::encoding())?
+                    .write_message_noflush(&BeMessage::CLIENT_ENCODING)?
                     .write_message(&BeMessage::ReadyForQuery)?;
                 self.state = ProtoState::Established;
             }
@@ -422,33 +417,14 @@ impl PostgresBackend {
                 // remove null terminator
                 let query_string = cstr_to_str(&body)?;
 
-                trace!("got query {:?}", query_string);
-                // xxx distinguish fatal and recoverable errors?
+                trace!("got query {query_string:?}");
                 if let Err(e) = handler.process_query(self, query_string) {
-                    // ":?" uses the alternate formatting style, which makes anyhow display the
-                    // full cause of the error, not just the top-level context + its trace.
-                    // We don't want to send that in the ErrorResponse though,
-                    // because it's not relevant to the compute node logs.
-                    //
-                    // We also don't want to log full stacktrace when the error is primitive,
-                    // such as usual connection closed.
-                    let short_error = format!("{:#}", e);
-                    let root_cause = e.root_cause().to_string();
-                    if root_cause.contains("connection closed unexpectedly")
-                        || root_cause.contains("Broken pipe (os error 32)")
-                    {
-                        error!(
-                            "query handler for '{}' failed: {}",
-                            query_string, short_error
-                        );
-                    } else {
-                        error!("query handler for '{}' failed: {:?}", query_string, e);
-                    }
-                    self.write_message_noflush(&BeMessage::ErrorResponse(&short_error))?;
-                    // TODO: untangle convoluted control flow
-                    if e.to_string().contains("failed to run") {
-                        return Ok(ProcessMsgResult::Break);
-                    }
+                    log_query_error(query_string, &e);
+                    let short_error = short_error(&e);
+                    self.write_message_noflush(&BeMessage::ErrorResponse(
+                        &short_error,
+                        Some(e.pg_error_code()),
+                    ))?;
                 }
                 self.write_message(&BeMessage::ReadyForQuery)?;
             }
@@ -473,11 +449,13 @@ impl PostgresBackend {
 
             FeMessage::Execute(_) => {
                 let query_string = cstr_to_str(unnamed_query_string)?;
-                trace!("got execute {:?}", query_string);
-                // xxx distinguish fatal and recoverable errors?
+                trace!("got execute {query_string:?}");
                 if let Err(e) = handler.process_query(self, query_string) {
-                    error!("query handler for '{}' failed: {:?}", query_string, e);
-                    self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?;
+                    log_query_error(query_string, &e);
+                    self.write_message(&BeMessage::ErrorResponse(
+                        &e.to_string(),
+                        Some(e.pg_error_code()),
+                    ))?;
                 }
                 // NOTE there is no ReadyForQuery message. This handler is used
                 // for basebackup and it uses CopyOut which doesn't require
@@ -496,7 +474,9 @@ impl PostgresBackend {
             // We prefer explicit pattern matching to wildcards, because
             // this helps us spot the places where new variants are missing
             FeMessage::CopyData(_) | FeMessage::CopyDone | FeMessage::CopyFail => {
-                bail!("unexpected message type: {:?}", msg);
+                return Err(QueryError::Other(anyhow::anyhow!(
+                    "unexpected message type: {msg:?}"
+                )));
             }
         }
 
diff --git a/libs/utils/src/postgres_backend_async.rs b/libs/utils/src/postgres_backend_async.rs
index 376819027b..95b7b3fd15 100644
--- a/libs/utils/src/postgres_backend_async.rs
+++ b/libs/utils/src/postgres_backend_async.rs
@@ -4,45 +4,87 @@
 //! is rather narrow, but we can extend it once required.
 
 use crate::postgres_backend::AuthType;
-use anyhow::{bail, Context, Result};
-use bytes::{Bytes, BytesMut};
-use pq_proto::{BeMessage, BeParameterStatusMessage, FeMessage, FeStartupPacket};
-use rand::Rng;
+use anyhow::Context;
+use bytes::{Buf, Bytes, BytesMut};
+use pq_proto::{BeMessage, ConnectionError, FeMessage, FeStartupPacket, SQLSTATE_INTERNAL_ERROR};
 use std::future::Future;
+use std::io;
 use std::net::SocketAddr;
 use std::pin::Pin;
 use std::sync::Arc;
 use std::task::Poll;
-use tracing::{debug, error, trace};
+use tracing::{debug, error, info, trace};
 
 use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt, BufReader};
 use tokio_rustls::TlsAcceptor;
 
+pub fn is_expected_io_error(e: &io::Error) -> bool {
+    use io::ErrorKind::*;
+    matches!(
+        e.kind(),
+        ConnectionRefused | ConnectionAborted | ConnectionReset
+    )
+}
+
+/// An error, occurred during query processing:
+/// either during the connection ([`ConnectionError`]) or before/after it.
+#[derive(thiserror::Error, Debug)]
+pub enum QueryError {
+    /// The connection was lost while processing the query.
+    #[error(transparent)]
+    Disconnected(#[from] ConnectionError),
+    /// Some other error
+    #[error(transparent)]
+    Other(#[from] anyhow::Error),
+}
+
+impl From<io::Error> for QueryError {
+    fn from(e: io::Error) -> Self {
+        Self::Disconnected(ConnectionError::Socket(e))
+    }
+}
+
+impl QueryError {
+    pub fn pg_error_code(&self) -> &'static [u8; 5] {
+        match self {
+            Self::Disconnected(_) => b"08006",         // connection failure
+            Self::Other(_) => SQLSTATE_INTERNAL_ERROR, // internal error
+        }
+    }
+}
+
 #[async_trait::async_trait]
 pub trait Handler {
     /// Handle single query.
     /// postgres_backend will issue ReadyForQuery after calling this (this
     /// might be not what we want after CopyData streaming, but currently we don't
     /// care).
-    async fn process_query(&mut self, pgb: &mut PostgresBackend, query_string: &str) -> Result<()>;
+    async fn process_query(
+        &mut self,
+        pgb: &mut PostgresBackend,
+        query_string: &str,
+    ) -> Result<(), QueryError>;
 
     /// Called on startup packet receival, allows to process params.
     ///
     /// If Ok(false) is returned postgres_backend will skip auth -- that is needed for new users
     /// creation is the proxy code. That is quite hacky and ad-hoc solution, may be we could allow
     /// to override whole init logic in implementations.
-    fn startup(&mut self, _pgb: &mut PostgresBackend, _sm: &FeStartupPacket) -> Result<()> {
+    fn startup(
+        &mut self,
+        _pgb: &mut PostgresBackend,
+        _sm: &FeStartupPacket,
+    ) -> Result<(), QueryError> {
         Ok(())
     }
 
-    /// Check auth md5
-    fn check_auth_md5(&mut self, _pgb: &mut PostgresBackend, _md5_response: &[u8]) -> Result<()> {
-        bail!("MD5 auth failed")
-    }
-
     /// Check auth jwt
-    fn check_auth_jwt(&mut self, _pgb: &mut PostgresBackend, _jwt_response: &[u8]) -> Result<()> {
-        bail!("JWT auth failed")
+    fn check_auth_jwt(
+        &mut self,
+        _pgb: &mut PostgresBackend,
+        _jwt_response: &[u8],
+    ) -> Result<(), QueryError> {
+        Err(QueryError::Other(anyhow::anyhow!("JWT auth failed")))
     }
 }
 
@@ -76,17 +118,14 @@ impl AsyncWrite for Stream {
         self: Pin<&mut Self>,
         cx: &mut std::task::Context<'_>,
         buf: &[u8],
-    ) -> Poll<Result<usize, std::io::Error>> {
+    ) -> Poll<io::Result<usize>> {
         match self.get_mut() {
             Self::Unencrypted(stream) => Pin::new(stream).poll_write(cx, buf),
             Self::Tls(stream) => Pin::new(stream).poll_write(cx, buf),
             Self::Broken => unreachable!(),
         }
     }
-    fn poll_flush(
-        self: Pin<&mut Self>,
-        cx: &mut std::task::Context<'_>,
-    ) -> Poll<Result<(), std::io::Error>> {
+    fn poll_flush(self: Pin<&mut Self>, cx: &mut std::task::Context<'_>) -> Poll<io::Result<()>> {
         match self.get_mut() {
             Self::Unencrypted(stream) => Pin::new(stream).poll_flush(cx),
             Self::Tls(stream) => Pin::new(stream).poll_flush(cx),
@@ -96,7 +135,7 @@ impl AsyncWrite for Stream {
     fn poll_shutdown(
         self: Pin<&mut Self>,
         cx: &mut std::task::Context<'_>,
-    ) -> Poll<Result<(), std::io::Error>> {
+    ) -> Poll<io::Result<()>> {
         match self.get_mut() {
             Self::Unencrypted(stream) => Pin::new(stream).poll_shutdown(cx),
             Self::Tls(stream) => Pin::new(stream).poll_shutdown(cx),
@@ -109,7 +148,7 @@ impl AsyncRead for Stream {
         self: Pin<&mut Self>,
         cx: &mut std::task::Context<'_>,
         buf: &mut tokio::io::ReadBuf<'_>,
-    ) -> Poll<Result<(), std::io::Error>> {
+    ) -> Poll<io::Result<()>> {
         match self.get_mut() {
             Self::Unencrypted(stream) => Pin::new(stream).poll_read(cx, buf),
             Self::Tls(stream) => Pin::new(stream).poll_read(cx, buf),
@@ -120,12 +159,14 @@ impl AsyncRead for Stream {
 
 pub struct PostgresBackend {
     stream: Stream,
+
     // Output buffer. c.f. BeMessage::write why we are using BytesMut here.
+    // The data between 0 and "current position" as tracked by the bytes::Buf
+    // implementation of BytesMut, have already been written.
     buf_out: BytesMut,
 
     pub state: ProtoState,
 
-    md5_salt: [u8; 4],
     auth_type: AuthType,
 
     peer_addr: SocketAddr,
@@ -143,7 +184,7 @@ pub fn query_from_cstring(query_string: Bytes) -> Vec<u8> {
 }
 
 // Cast a byte slice to a string slice, dropping null terminator if there's one.
-fn cstr_to_str(bytes: &[u8]) -> Result<&str> {
+fn cstr_to_str(bytes: &[u8]) -> anyhow::Result<&str> {
     let without_null = bytes.strip_suffix(&[0]).unwrap_or(bytes);
     std::str::from_utf8(without_null).map_err(|e| e.into())
 }
@@ -153,14 +194,13 @@ impl PostgresBackend {
         socket: tokio::net::TcpStream,
         auth_type: AuthType,
         tls_config: Option<Arc<rustls::ServerConfig>>,
-    ) -> std::io::Result<Self> {
+    ) -> io::Result<Self> {
         let peer_addr = socket.peer_addr()?;
 
         Ok(Self {
             stream: Stream::Unencrypted(BufReader::new(socket)),
             buf_out: BytesMut::with_capacity(10 * 1024),
             state: ProtoState::Initialization,
-            md5_salt: [0u8; 4],
             auth_type,
             tls_config,
             peer_addr,
@@ -172,30 +212,68 @@ impl PostgresBackend {
     }
 
     /// Read full message or return None if connection is closed.
-    pub async fn read_message(&mut self) -> Result<Option<FeMessage>> {
+    pub async fn read_message(&mut self) -> Result<Option<FeMessage>, QueryError> {
         use ProtoState::*;
         match self.state {
             Initialization | Encrypted => FeStartupPacket::read_fut(&mut self.stream).await,
             Authentication | Established => FeMessage::read_fut(&mut self.stream).await,
             Closed => Ok(None),
         }
+        .map_err(QueryError::from)
     }
 
     /// Flush output buffer into the socket.
-    pub async fn flush(&mut self) -> std::io::Result<&mut Self> {
-        self.stream.write_all(&self.buf_out).await?;
+    pub async fn flush(&mut self) -> io::Result<()> {
+        while self.buf_out.has_remaining() {
+            let bytes_written = self.stream.write(self.buf_out.chunk()).await?;
+            self.buf_out.advance(bytes_written);
+        }
         self.buf_out.clear();
-        Ok(self)
+        Ok(())
     }
 
     /// Write message into internal output buffer.
-    pub fn write_message(&mut self, message: &BeMessage<'_>) -> Result<&mut Self, std::io::Error> {
+    pub fn write_message(&mut self, message: &BeMessage<'_>) -> io::Result<&mut Self> {
         BeMessage::write(&mut self.buf_out, message)?;
         Ok(self)
     }
 
+    /// Returns an AsyncWrite implementation that wraps all the data written
+    /// to it in CopyData messages, and writes them to the connection
+    ///
+    /// The caller is responsible for sending CopyOutResponse and CopyDone messages.
+    pub fn copyout_writer(&mut self) -> CopyDataWriter {
+        CopyDataWriter { pgb: self }
+    }
+
+    /// A polling function that tries to write all the data from 'buf_out' to the
+    /// underlying stream.
+    fn poll_write_buf(
+        &mut self,
+        cx: &mut std::task::Context<'_>,
+    ) -> Poll<Result<(), std::io::Error>> {
+        while self.buf_out.has_remaining() {
+            match Pin::new(&mut self.stream).poll_write(cx, self.buf_out.chunk()) {
+                Poll::Ready(Ok(bytes_written)) => {
+                    self.buf_out.advance(bytes_written);
+                }
+                Poll::Ready(Err(err)) => return Poll::Ready(Err(err)),
+                Poll::Pending => return Poll::Pending,
+            }
+        }
+        Poll::Ready(Ok(()))
+    }
+
+    fn poll_flush(&mut self, cx: &mut std::task::Context<'_>) -> Poll<Result<(), std::io::Error>> {
+        Pin::new(&mut self.stream).poll_flush(cx)
+    }
+
     // Wrapper for run_message_loop() that shuts down socket when we are done
-    pub async fn run<F, S>(mut self, handler: &mut impl Handler, shutdown_watcher: F) -> Result<()>
+    pub async fn run<F, S>(
+        mut self,
+        handler: &mut impl Handler,
+        shutdown_watcher: F,
+    ) -> Result<(), QueryError>
     where
         F: Fn() -> S,
         S: Future,
@@ -209,7 +287,7 @@ impl PostgresBackend {
         &mut self,
         handler: &mut impl Handler,
         shutdown_watcher: F,
-    ) -> Result<()>
+    ) -> Result<(), QueryError>
     where
         F: Fn() -> S,
         S: Future,
@@ -245,7 +323,7 @@ impl PostgresBackend {
                         return Ok(());
                     }
                 }
-                Ok::<(), anyhow::Error>(())
+                Ok::<(), QueryError>(())
             } => {
                 // Handshake complete.
                 result?;
@@ -290,14 +368,14 @@ impl PostgresBackend {
             self.stream = Stream::Tls(Box::new(tls_stream));
             return Ok(());
         };
-        bail!("TLS already started");
+        anyhow::bail!("TLS already started");
     }
 
     async fn process_handshake_message(
         &mut self,
         handler: &mut impl Handler,
         msg: FeMessage,
-    ) -> Result<ProcessMsgResult> {
+    ) -> Result<ProcessMsgResult, QueryError> {
         assert!(self.state < ProtoState::Established);
         let have_tls = self.tls_config.is_some();
         match msg {
@@ -320,8 +398,13 @@ impl PostgresBackend {
                     }
                     FeStartupPacket::StartupMessage { .. } => {
                         if have_tls && !matches!(self.state, ProtoState::Encrypted) {
-                            self.write_message(&BeMessage::ErrorResponse("must connect with TLS"))?;
-                            bail!("client did not connect with TLS");
+                            self.write_message(&BeMessage::ErrorResponse(
+                                "must connect with TLS",
+                                None,
+                            ))?;
+                            return Err(QueryError::Other(anyhow::anyhow!(
+                                "client did not connect with TLS"
+                            )));
                         }
 
                         // NB: startup() may change self.auth_type -- we are using that in proxy code
@@ -331,21 +414,12 @@ impl PostgresBackend {
                         match self.auth_type {
                             AuthType::Trust => {
                                 self.write_message(&BeMessage::AuthenticationOk)?
-                                    .write_message(&BeParameterStatusMessage::encoding())?
+                                    .write_message(&BeMessage::CLIENT_ENCODING)?
                                     // The async python driver requires a valid server_version
-                                    .write_message(&BeMessage::ParameterStatus(
-                                        BeParameterStatusMessage::ServerVersion("14.1"),
-                                    ))?
+                                    .write_message(&BeMessage::server_version("14.1"))?
                                     .write_message(&BeMessage::ReadyForQuery)?;
                                 self.state = ProtoState::Established;
                             }
-                            AuthType::MD5 => {
-                                rand::thread_rng().fill(&mut self.md5_salt);
-                                self.write_message(&BeMessage::AuthenticationMD5Password(
-                                    self.md5_salt,
-                                ))?;
-                                self.state = ProtoState::Authentication;
-                            }
                             AuthType::NeonJWT => {
                                 self.write_message(&BeMessage::AuthenticationCleartextPassword)?;
                                 self.state = ProtoState::Authentication;
@@ -366,25 +440,20 @@ impl PostgresBackend {
 
                 match self.auth_type {
                     AuthType::Trust => unreachable!(),
-                    AuthType::MD5 => {
-                        let (_, md5_response) = m.split_last().context("protocol violation")?;
-
-                        if let Err(e) = handler.check_auth_md5(self, md5_response) {
-                            self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?;
-                            bail!("auth failed: {}", e);
-                        }
-                    }
                     AuthType::NeonJWT => {
                         let (_, jwt_response) = m.split_last().context("protocol violation")?;
 
                         if let Err(e) = handler.check_auth_jwt(self, jwt_response) {
-                            self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?;
-                            bail!("auth failed: {}", e);
+                            self.write_message(&BeMessage::ErrorResponse(
+                                &e.to_string(),
+                                Some(e.pg_error_code()),
+                            ))?;
+                            return Err(e);
                         }
                     }
                 }
                 self.write_message(&BeMessage::AuthenticationOk)?
-                    .write_message(&BeParameterStatusMessage::encoding())?
+                    .write_message(&BeMessage::CLIENT_ENCODING)?
                     .write_message(&BeMessage::ReadyForQuery)?;
                 self.state = ProtoState::Established;
             }
@@ -402,33 +471,28 @@ impl PostgresBackend {
         handler: &mut impl Handler,
         msg: FeMessage,
         unnamed_query_string: &mut Bytes,
-    ) -> Result<ProcessMsgResult> {
+    ) -> Result<ProcessMsgResult, QueryError> {
         // Allow only startup and password messages during auth. Otherwise client would be able to bypass auth
         // TODO: change that to proper top-level match of protocol state with separate message handling for each state
         assert!(self.state == ProtoState::Established);
 
         match msg {
             FeMessage::StartupPacket(_) | FeMessage::PasswordMessage(_) => {
-                bail!("protocol violation");
+                return Err(QueryError::Other(anyhow::anyhow!("protocol violation")));
             }
 
             FeMessage::Query(body) => {
                 // remove null terminator
                 let query_string = cstr_to_str(&body)?;
 
-                trace!("got query {:?}", query_string);
-                // xxx distinguish fatal and recoverable errors?
+                trace!("got query {query_string:?}");
                 if let Err(e) = handler.process_query(self, query_string).await {
-                    // ":?" uses the alternate formatting style, which makes anyhow display the
-                    // full cause of the error, not just the top-level context + its trace.
-                    // We don't want to send that in the ErrorResponse though,
-                    // because it's not relevant to the compute node logs.
-                    error!("query handler for '{}' failed: {:?}", query_string, e);
-                    self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?;
-                    // TODO: untangle convoluted control flow
-                    if e.to_string().contains("failed to run") {
-                        return Ok(ProcessMsgResult::Break);
-                    }
+                    log_query_error(query_string, &e);
+                    let short_error = short_error(&e);
+                    self.write_message(&BeMessage::ErrorResponse(
+                        &short_error,
+                        Some(e.pg_error_code()),
+                    ))?;
                 }
                 self.write_message(&BeMessage::ReadyForQuery)?;
             }
@@ -453,11 +517,13 @@ impl PostgresBackend {
 
             FeMessage::Execute(_) => {
                 let query_string = cstr_to_str(unnamed_query_string)?;
-                trace!("got execute {:?}", query_string);
-                // xxx distinguish fatal and recoverable errors?
+                trace!("got execute {query_string:?}");
                 if let Err(e) = handler.process_query(self, query_string).await {
-                    error!("query handler for '{}' failed: {:?}", query_string, e);
-                    self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?;
+                    log_query_error(query_string, &e);
+                    self.write_message(&BeMessage::ErrorResponse(
+                        &e.to_string(),
+                        Some(e.pg_error_code()),
+                    ))?;
                 }
                 // NOTE there is no ReadyForQuery message. This handler is used
                 // for basebackup and it uses CopyOut which doesn't require
@@ -476,10 +542,99 @@ impl PostgresBackend {
             // We prefer explicit pattern matching to wildcards, because
             // this helps us spot the places where new variants are missing
             FeMessage::CopyData(_) | FeMessage::CopyDone | FeMessage::CopyFail => {
-                bail!("unexpected message type: {:?}", msg);
+                return Err(QueryError::Other(anyhow::anyhow!(
+                    "unexpected message type: {:?}",
+                    msg
+                )));
             }
         }
 
         Ok(ProcessMsgResult::Continue)
     }
 }
+
+///
+/// A futures::AsyncWrite implementation that wraps all data written to it in CopyData
+/// messages.
+///
+
+pub struct CopyDataWriter<'a> {
+    pgb: &'a mut PostgresBackend,
+}
+
+impl<'a> AsyncWrite for CopyDataWriter<'a> {
+    fn poll_write(
+        self: Pin<&mut Self>,
+        cx: &mut std::task::Context<'_>,
+        buf: &[u8],
+    ) -> Poll<Result<usize, std::io::Error>> {
+        let this = self.get_mut();
+
+        // It's not strictly required to flush between each message, but makes it easier
+        // to view in wireshark, and usually the messages that the callers write are
+        // decently-sized anyway.
+        match this.pgb.poll_write_buf(cx) {
+            Poll::Ready(Ok(())) => {}
+            Poll::Ready(Err(err)) => return Poll::Ready(Err(err)),
+            Poll::Pending => return Poll::Pending,
+        }
+
+        // CopyData
+        // XXX: if the input is large, we should split it into multiple messages.
+        // Not sure what the threshold should be, but the ultimate hard limit is that
+        // the length cannot exceed u32.
+        this.pgb.write_message(&BeMessage::CopyData(buf))?;
+
+        Poll::Ready(Ok(buf.len()))
+    }
+
+    fn poll_flush(
+        self: Pin<&mut Self>,
+        cx: &mut std::task::Context<'_>,
+    ) -> Poll<Result<(), std::io::Error>> {
+        let this = self.get_mut();
+        match this.pgb.poll_write_buf(cx) {
+            Poll::Ready(Ok(())) => {}
+            Poll::Ready(Err(err)) => return Poll::Ready(Err(err)),
+            Poll::Pending => return Poll::Pending,
+        }
+        this.pgb.poll_flush(cx)
+    }
+    fn poll_shutdown(
+        self: Pin<&mut Self>,
+        cx: &mut std::task::Context<'_>,
+    ) -> Poll<Result<(), std::io::Error>> {
+        let this = self.get_mut();
+        match this.pgb.poll_write_buf(cx) {
+            Poll::Ready(Ok(())) => {}
+            Poll::Ready(Err(err)) => return Poll::Ready(Err(err)),
+            Poll::Pending => return Poll::Pending,
+        }
+        this.pgb.poll_flush(cx)
+    }
+}
+
+pub fn short_error(e: &QueryError) -> String {
+    match e {
+        QueryError::Disconnected(connection_error) => connection_error.to_string(),
+        QueryError::Other(e) => format!("{e:#}"),
+    }
+}
+
+pub(super) fn log_query_error(query: &str, e: &QueryError) {
+    match e {
+        QueryError::Disconnected(ConnectionError::Socket(io_error)) => {
+            if is_expected_io_error(io_error) {
+                info!("query handler for '{query}' failed with expected io error: {io_error}");
+            } else {
+                error!("query handler for '{query}' failed with io error: {io_error}");
+            }
+        }
+        QueryError::Disconnected(other_connection_error) => {
+            error!("query handler for '{query}' failed with connection error: {other_connection_error:?}")
+        }
+        QueryError::Other(e) => {
+            error!("query handler for '{query}' failed: {e:?}");
+        }
+    }
+}
diff --git a/libs/utils/src/seqwait.rs b/libs/utils/src/seqwait.rs
index bf330a482c..e3f0b505da 100644
--- a/libs/utils/src/seqwait.rs
+++ b/libs/utils/src/seqwait.rs
@@ -11,11 +11,13 @@ use tokio::time::timeout;
 
 /// An error happened while waiting for a number
 #[derive(Debug, PartialEq, Eq, thiserror::Error)]
-#[error("SeqWaitError")]
 pub enum SeqWaitError {
     /// The wait timeout was reached
+    #[error("seqwait timeout was reached")]
     Timeout,
+
     /// [`SeqWait::shutdown`] was called
+    #[error("SeqWait::shutdown was called")]
     Shutdown,
 }
 
diff --git a/libs/utils/src/sock_split.rs b/libs/utils/src/sock_split.rs
index 5e4598daf1..b0e5a0bf6a 100644
--- a/libs/utils/src/sock_split.rs
+++ b/libs/utils/src/sock_split.rs
@@ -50,7 +50,7 @@ impl BufStream {
 
     /// Returns a reference to the underlying TcpStream.
     fn get_ref(&self) -> &TcpStream {
-        &*self.0.get_ref().0
+        &self.0.get_ref().0
     }
 }
 
diff --git a/libs/utils/tests/ssl_test.rs b/libs/utils/tests/ssl_test.rs
index 248400c2c1..fae707f049 100644
--- a/libs/utils/tests/ssl_test.rs
+++ b/libs/utils/tests/ssl_test.rs
@@ -9,7 +9,10 @@ use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt};
 use bytes::{Buf, BufMut, Bytes, BytesMut};
 use once_cell::sync::Lazy;
 
-use utils::postgres_backend::{AuthType, Handler, PostgresBackend};
+use utils::{
+    postgres_backend::{AuthType, Handler, PostgresBackend},
+    postgres_backend_async::QueryError,
+};
 
 fn make_tcp_pair() -> (TcpStream, TcpStream) {
     let listener = TcpListener::bind("127.0.0.1:0").unwrap();
@@ -105,7 +108,7 @@ fn ssl() {
             &mut self,
             _pgb: &mut PostgresBackend,
             query_string: &str,
-        ) -> anyhow::Result<()> {
+        ) -> Result<(), QueryError> {
             self.got_query = query_string == QUERY;
             Ok(())
         }
@@ -152,7 +155,7 @@ fn no_ssl() {
             &mut self,
             _pgb: &mut PostgresBackend,
             _query_string: &str,
-        ) -> anyhow::Result<()> {
+        ) -> Result<(), QueryError> {
             panic!()
         }
     }
@@ -212,7 +215,7 @@ fn server_forces_ssl() {
             &mut self,
             _pgb: &mut PostgresBackend,
             _query_string: &str,
-        ) -> anyhow::Result<()> {
+        ) -> Result<(), QueryError> {
             panic!()
         }
     }
diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml
index 54bbe4714d..1854b6762f 100644
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -2,6 +2,7 @@
 name = "pageserver"
 version = "0.1.0"
 edition = "2021"
+license = "Apache-2.0"
 
 [features]
 default = []
@@ -9,8 +10,6 @@ default = []
 # which adds some runtime cost to run tests on outage conditions
 testing = ["fail/failpoints"]
 
-profiling = ["pprof"]
-
 [dependencies]
 amplify_num = { git = "https://github.com/hlinnaka/rust-amplify.git", branch = "unsigned-int-perf" }
 anyhow = { version = "1.0", features = ["backtrace"] }
@@ -18,7 +17,7 @@ async-stream = "0.3"
 async-trait = "0.1"
 byteorder = "1.4.3"
 bytes = "1.0.1"
-chrono = { version = "0.4.23", default-features = false, features = ["clock"] }
+chrono = { version = "0.4.23", default-features = false, features = ["clock", "serde"] }
 clap = { version = "4.0", features = ["string"] }
 close_fds = "0.3.2"
 const_format = "0.2.21"
@@ -36,23 +35,22 @@ nix = "0.25"
 num-traits = "0.2.15"
 once_cell = "1.13.0"
 pin-project-lite = "0.2.7"
-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
-postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
-postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
-pprof = { git = "https://github.com/neondatabase/pprof-rs.git", branch = "wallclock-profiling", features = ["flamegraph"], optional = true }
+postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
+postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
+postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
 rand = "0.8.3"
 regex = "1.4.5"
 rstar = "0.9.3"
 scopeguard = "1.1.0"
 serde = { version = "1.0", features = ["derive"] }
-serde_json = "1"
+serde_json = { version = "1.0", features = ["raw_value"] }
 serde_with = "2.0"
 signal-hook = "0.3.10"
 svg_fmt = "0.4.1"
-tar = "0.4.33"
+tokio-tar = { git = "https://github.com/neondatabase/tokio-tar.git", rev="404df61437de0feef49ba2ccdbdd94eb8ad6e142" }
 thiserror = "1.0"
 tokio = { version = "1.17", features = ["process", "sync", "macros", "fs", "rt", "io-util", "time"] }
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
 tokio-util = { version = "0.7.3", features = ["io", "io-util"] }
 toml_edit = { version = "0.14", features = ["easy"] }
 tracing = "0.1.36"
@@ -69,6 +67,7 @@ storage_broker = { version = "0.1", path = "../storage_broker" }
 tenant_size_model = { path = "../libs/tenant_size_model" }
 utils = { path = "../libs/utils" }
 workspace_hack = { version = "0.1", path = "../workspace_hack" }
+reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] }
 
 [dev-dependencies]
 criterion = "0.4"
diff --git a/pageserver/benches/bench_layer_map.rs b/pageserver/benches/bench_layer_map.rs
index 6001377811..6a01fdfc6f 100644
--- a/pageserver/benches/bench_layer_map.rs
+++ b/pageserver/benches/bench_layer_map.rs
@@ -1,8 +1,7 @@
 use anyhow::Result;
 use pageserver::repository::Key;
-use pageserver::tenant::filename::{DeltaFileName, ImageFileName};
 use pageserver::tenant::layer_map::LayerMap;
-use pageserver::tenant::storage_layer::ValueReconstructState;
+use pageserver::tenant::storage_layer::{DeltaFileName, ImageFileName, ValueReconstructState};
 use pageserver::tenant::storage_layer::{Layer, ValueReconstructResult};
 use rand::prelude::{SeedableRng, SliceRandom, StdRng};
 use std::cmp::{max, min};
@@ -163,7 +162,7 @@ fn bench_from_captest_env(c: &mut Criterion) {
     c.bench_function("captest_uniform_queries", |b| {
         b.iter(|| {
             for q in queries.clone().into_iter() {
-                layer_map.search(q.0, q.1).unwrap();
+                layer_map.search(q.0, q.1);
             }
         });
     });
@@ -192,7 +191,7 @@ fn bench_from_real_project(c: &mut Criterion) {
     c.bench_function("real_map_uniform_queries", |b| {
         b.iter(|| {
             for q in queries.clone().into_iter() {
-                layer_map.search(q.0, q.1).unwrap();
+                layer_map.search(q.0, q.1);
             }
         });
     });
@@ -238,7 +237,7 @@ fn bench_sequential(c: &mut Criterion) {
         // Run the search queries
         b.iter(|| {
             for q in queries.clone().into_iter() {
-                layer_map.search(q.0, q.1).unwrap();
+                layer_map.search(q.0, q.1);
             }
         });
     });
diff --git a/pageserver/benches/bench_walredo.rs b/pageserver/benches/bench_walredo.rs
index 8f53fce027..61011c9f36 100644
--- a/pageserver/benches/bench_walredo.rs
+++ b/pageserver/benches/bench_walredo.rs
@@ -84,7 +84,7 @@ fn add_multithreaded_walredo_requesters(
 
                             barrier.wait();
 
-                            execute_all(input, &*manager).unwrap();
+                            execute_all(input, &manager).unwrap();
 
                             barrier.wait();
                         }
diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs
index 973c3cd3a6..1978becf83 100644
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -10,19 +10,24 @@
 //! This module is responsible for creation of such tarball
 //! from data stored in object storage.
 //!
-use anyhow::{anyhow, bail, ensure, Context, Result};
+use anyhow::{anyhow, bail, ensure, Context};
 use bytes::{BufMut, BytesMut};
 use fail::fail_point;
-use itertools::Itertools;
 use std::fmt::Write as FmtWrite;
-use std::io;
-use std::io::Write;
-use std::sync::Arc;
 use std::time::SystemTime;
-use tar::{Builder, EntryType, Header};
+use tokio::io;
+use tokio::io::AsyncWrite;
 use tracing::*;
 
-use crate::tenant::Timeline;
+/// NB: This relies on a modified version of tokio_tar that does *not* write the
+/// end-of-archive marker (1024 zero bytes), when the Builder struct is dropped
+/// without explicitly calling 'finish' or 'into_inner'!
+///
+/// See https://github.com/neondatabase/tokio-tar/pull/1
+///
+use tokio_tar::{Builder, EntryType, Header};
+
+use crate::tenant::{with_ondemand_download, Timeline};
 use pageserver_api::reltag::{RelTag, SlruKind};
 
 use postgres_ffi::pg_constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID};
@@ -33,116 +38,130 @@ use postgres_ffi::PG_TLI;
 use postgres_ffi::{BLCKSZ, RELSEG_SIZE, WAL_SEGMENT_SIZE};
 use utils::lsn::Lsn;
 
+/// Create basebackup with non-rel data in it.
+/// Only include relational data if 'full_backup' is true.
+///
+/// Currently we use empty 'req_lsn' in two cases:
+///  * During the basebackup right after timeline creation
+///  * When working without safekeepers. In this situation it is important to match the lsn
+///    we are taking basebackup on with the lsn that is used in pageserver's walreceiver
+///    to start the replication.
+pub async fn send_basebackup_tarball<'a, W>(
+    write: &'a mut W,
+    timeline: &'a Timeline,
+    req_lsn: Option<Lsn>,
+    prev_lsn: Option<Lsn>,
+    full_backup: bool,
+) -> anyhow::Result<()>
+where
+    W: AsyncWrite + Send + Sync + Unpin,
+{
+    // Compute postgres doesn't have any previous WAL files, but the first
+    // record that it's going to write needs to include the LSN of the
+    // previous record (xl_prev). We include prev_record_lsn in the
+    // "zenith.signal" file, so that postgres can read it during startup.
+    //
+    // We don't keep full history of record boundaries in the page server,
+    // however, only the predecessor of the latest record on each
+    // timeline. So we can only provide prev_record_lsn when you take a
+    // base backup at the end of the timeline, i.e. at last_record_lsn.
+    // Even at the end of the timeline, we sometimes don't have a valid
+    // prev_lsn value; that happens if the timeline was just branched from
+    // an old LSN and it doesn't have any WAL of its own yet. We will set
+    // prev_lsn to Lsn(0) if we cannot provide the correct value.
+    let (backup_prev, backup_lsn) = if let Some(req_lsn) = req_lsn {
+        // Backup was requested at a particular LSN. The caller should've
+        // already checked that it's a valid LSN.
+
+        // If the requested point is the end of the timeline, we can
+        // provide prev_lsn. (get_last_record_rlsn() might return it as
+        // zero, though, if no WAL has been generated on this timeline
+        // yet.)
+        let end_of_timeline = timeline.get_last_record_rlsn();
+        if req_lsn == end_of_timeline.last {
+            (end_of_timeline.prev, req_lsn)
+        } else {
+            (Lsn(0), req_lsn)
+        }
+    } else {
+        // Backup was requested at end of the timeline.
+        let end_of_timeline = timeline.get_last_record_rlsn();
+        (end_of_timeline.prev, end_of_timeline.last)
+    };
+
+    // Consolidate the derived and the provided prev_lsn values
+    let prev_lsn = if let Some(provided_prev_lsn) = prev_lsn {
+        if backup_prev != Lsn(0) {
+            ensure!(backup_prev == provided_prev_lsn);
+        }
+        provided_prev_lsn
+    } else {
+        backup_prev
+    };
+
+    info!(
+        "taking basebackup lsn={}, prev_lsn={} (full_backup={})",
+        backup_lsn, prev_lsn, full_backup
+    );
+
+    let basebackup = Basebackup {
+        ar: Builder::new_non_terminated(write),
+        timeline,
+        lsn: backup_lsn,
+        prev_record_lsn: prev_lsn,
+        full_backup,
+    };
+    basebackup
+        .send_tarball()
+        .instrument(info_span!("send_tarball", backup_lsn=%backup_lsn))
+        .await
+}
+
 /// This is short-living object only for the time of tarball creation,
 /// created mostly to avoid passing a lot of parameters between various functions
 /// used for constructing tarball.
-pub struct Basebackup<'a, W>
+struct Basebackup<'a, W>
 where
-    W: Write,
+    W: AsyncWrite + Send + Sync + Unpin,
 {
-    ar: Builder<AbortableWrite<W>>,
-    timeline: &'a Arc<Timeline>,
-    pub lsn: Lsn,
+    ar: Builder<&'a mut W>,
+    timeline: &'a Timeline,
+    lsn: Lsn,
     prev_record_lsn: Lsn,
     full_backup: bool,
-    finished: bool,
 }
 
-// Create basebackup with non-rel data in it.
-// Only include relational data if 'full_backup' is true.
-//
-// Currently we use empty lsn in two cases:
-//  * During the basebackup right after timeline creation
-//  * When working without safekeepers. In this situation it is important to match the lsn
-//    we are taking basebackup on with the lsn that is used in pageserver's walreceiver
-//    to start the replication.
 impl<'a, W> Basebackup<'a, W>
 where
-    W: Write,
+    W: AsyncWrite + Send + Sync + Unpin,
 {
-    pub fn new(
-        write: W,
-        timeline: &'a Arc<Timeline>,
-        req_lsn: Option<Lsn>,
-        prev_lsn: Option<Lsn>,
-        full_backup: bool,
-    ) -> Result<Basebackup<'a, W>> {
-        // Compute postgres doesn't have any previous WAL files, but the first
-        // record that it's going to write needs to include the LSN of the
-        // previous record (xl_prev). We include prev_record_lsn in the
-        // "zenith.signal" file, so that postgres can read it during startup.
-        //
-        // We don't keep full history of record boundaries in the page server,
-        // however, only the predecessor of the latest record on each
-        // timeline. So we can only provide prev_record_lsn when you take a
-        // base backup at the end of the timeline, i.e. at last_record_lsn.
-        // Even at the end of the timeline, we sometimes don't have a valid
-        // prev_lsn value; that happens if the timeline was just branched from
-        // an old LSN and it doesn't have any WAL of its own yet. We will set
-        // prev_lsn to Lsn(0) if we cannot provide the correct value.
-        let (backup_prev, backup_lsn) = if let Some(req_lsn) = req_lsn {
-            // Backup was requested at a particular LSN. The caller should've
-            // already checked that it's a valid LSN.
-
-            // If the requested point is the end of the timeline, we can
-            // provide prev_lsn. (get_last_record_rlsn() might return it as
-            // zero, though, if no WAL has been generated on this timeline
-            // yet.)
-            let end_of_timeline = timeline.get_last_record_rlsn();
-            if req_lsn == end_of_timeline.last {
-                (end_of_timeline.prev, req_lsn)
-            } else {
-                (Lsn(0), req_lsn)
-            }
-        } else {
-            // Backup was requested at end of the timeline.
-            let end_of_timeline = timeline.get_last_record_rlsn();
-            (end_of_timeline.prev, end_of_timeline.last)
-        };
-
-        // Consolidate the derived and the provided prev_lsn values
-        let prev_lsn = if let Some(provided_prev_lsn) = prev_lsn {
-            if backup_prev != Lsn(0) {
-                ensure!(backup_prev == provided_prev_lsn)
-            }
-            provided_prev_lsn
-        } else {
-            backup_prev
-        };
-
-        info!(
-            "taking basebackup lsn={}, prev_lsn={} (full_backup={})",
-            backup_lsn, prev_lsn, full_backup
-        );
-
-        Ok(Basebackup {
-            ar: Builder::new(AbortableWrite::new(write)),
-            timeline,
-            lsn: backup_lsn,
-            prev_record_lsn: prev_lsn,
-            full_backup,
-            finished: false,
-        })
-    }
-
-    pub fn send_tarball(mut self) -> anyhow::Result<()> {
+    async fn send_tarball(mut self) -> anyhow::Result<()> {
         // TODO include checksum
 
         // Create pgdata subdirs structure
         for dir in PGDATA_SUBDIRS.iter() {
-            let header = new_tar_header_dir(*dir)?;
-            self.ar.append(&header, &mut io::empty())?;
+            let header = new_tar_header_dir(dir)?;
+            self.ar
+                .append(&header, &mut io::empty())
+                .await
+                .context("could not add directory to basebackup tarball")?;
         }
 
-        // Send empty config files.
+        // Send config files.
         for filepath in PGDATA_SPECIAL_FILES.iter() {
             if *filepath == "pg_hba.conf" {
                 let data = PG_HBA.as_bytes();
                 let header = new_tar_header(filepath, data.len() as u64)?;
-                self.ar.append(&header, data)?;
+                self.ar
+                    .append(&header, data)
+                    .await
+                    .context("could not add config file to basebackup tarball")?;
             } else {
                 let header = new_tar_header(filepath, 0)?;
-                self.ar.append(&header, &mut io::empty())?;
+                self.ar
+                    .append(&header, &mut io::empty())
+                    .await
+                    .context("could not add config file to basebackup tarball")?;
             }
         }
 
@@ -152,24 +171,31 @@ where
             SlruKind::MultiXactOffsets,
             SlruKind::MultiXactMembers,
         ] {
-            for segno in self.timeline.list_slru_segments(kind, self.lsn)? {
-                self.add_slru_segment(kind, segno)?;
+            for segno in
+                with_ondemand_download(|| self.timeline.list_slru_segments(kind, self.lsn)).await?
+            {
+                self.add_slru_segment(kind, segno).await?;
             }
         }
 
         // Create tablespace directories
-        for ((spcnode, dbnode), has_relmap_file) in self.timeline.list_dbdirs(self.lsn)? {
-            self.add_dbdir(spcnode, dbnode, has_relmap_file)?;
+        for ((spcnode, dbnode), has_relmap_file) in
+            with_ondemand_download(|| self.timeline.list_dbdirs(self.lsn)).await?
+        {
+            self.add_dbdir(spcnode, dbnode, has_relmap_file).await?;
 
             // Gather and send relational files in each database if full backup is requested.
             if self.full_backup {
-                for rel in self.timeline.list_rels(spcnode, dbnode, self.lsn)? {
-                    self.add_rel(rel)?;
+                for rel in
+                    with_ondemand_download(|| self.timeline.list_rels(spcnode, dbnode, self.lsn))
+                        .await?
+                {
+                    self.add_rel(rel).await?;
                 }
             }
         }
-        for xid in self.timeline.list_twophase_files(self.lsn)? {
-            self.add_twophase_file(xid)?;
+        for xid in with_ondemand_download(|| self.timeline.list_twophase_files(self.lsn)).await? {
+            self.add_twophase_file(xid).await?;
         }
 
         fail_point!("basebackup-before-control-file", |_| {
@@ -177,42 +203,46 @@ where
         });
 
         // Generate pg_control and bootstrap WAL segment.
-        self.add_pgcontrol_file()?;
-        self.ar.finish()?;
-        self.finished = true;
+        self.add_pgcontrol_file().await?;
+        self.ar.finish().await?;
         debug!("all tarred up!");
         Ok(())
     }
 
-    fn add_rel(&mut self, tag: RelTag) -> anyhow::Result<()> {
-        let nblocks = self.timeline.get_rel_size(tag, self.lsn, false)?;
-
-        // Function that adds relation segment data to archive
-        let mut add_file = |segment_index, data: &Vec<u8>| -> anyhow::Result<()> {
-            let file_name = tag.to_segfile_name(segment_index as u32);
-            let header = new_tar_header(&file_name, data.len() as u64)?;
-            self.ar.append(&header, data.as_slice())?;
-            Ok(())
-        };
+    async fn add_rel(&mut self, tag: RelTag) -> anyhow::Result<()> {
+        let nblocks =
+            with_ondemand_download(|| self.timeline.get_rel_size(tag, self.lsn, false)).await?;
 
         // If the relation is empty, create an empty file
         if nblocks == 0 {
-            add_file(0, &vec![])?;
+            let file_name = tag.to_segfile_name(0);
+            let header = new_tar_header(&file_name, 0)?;
+            self.ar.append(&header, &mut io::empty()).await?;
             return Ok(());
         }
 
         // Add a file for each chunk of blocks (aka segment)
-        let chunks = (0..nblocks).chunks(RELSEG_SIZE as usize);
-        for (seg, blocks) in chunks.into_iter().enumerate() {
+        let mut startblk = 0;
+        let mut seg = 0;
+        while startblk < nblocks {
+            let endblk = std::cmp::min(startblk + RELSEG_SIZE, nblocks);
+
             let mut segment_data: Vec<u8> = vec![];
-            for blknum in blocks {
-                let img = self
-                    .timeline
-                    .get_rel_page_at_lsn(tag, blknum, self.lsn, false)?;
+            for blknum in startblk..endblk {
+                let img = with_ondemand_download(|| {
+                    self.timeline
+                        .get_rel_page_at_lsn(tag, blknum, self.lsn, false)
+                })
+                .await?;
                 segment_data.extend_from_slice(&img[..]);
             }
 
-            add_file(seg, &segment_data)?;
+            let file_name = tag.to_segfile_name(seg as u32);
+            let header = new_tar_header(&file_name, segment_data.len() as u64)?;
+            self.ar.append(&header, segment_data.as_slice()).await?;
+
+            seg += 1;
+            startblk = endblk;
         }
 
         Ok(())
@@ -221,14 +251,18 @@ where
     //
     // Generate SLRU segment files from repository.
     //
-    fn add_slru_segment(&mut self, slru: SlruKind, segno: u32) -> anyhow::Result<()> {
-        let nblocks = self.timeline.get_slru_segment_size(slru, segno, self.lsn)?;
+    async fn add_slru_segment(&mut self, slru: SlruKind, segno: u32) -> anyhow::Result<()> {
+        let nblocks =
+            with_ondemand_download(|| self.timeline.get_slru_segment_size(slru, segno, self.lsn))
+                .await?;
 
         let mut slru_buf: Vec<u8> = Vec::with_capacity(nblocks as usize * BLCKSZ as usize);
         for blknum in 0..nblocks {
-            let img = self
-                .timeline
-                .get_slru_page_at_lsn(slru, segno, blknum, self.lsn)?;
+            let img = with_ondemand_download(|| {
+                self.timeline
+                    .get_slru_page_at_lsn(slru, segno, blknum, self.lsn)
+            })
+            .await?;
 
             if slru == SlruKind::Clog {
                 ensure!(img.len() == BLCKSZ as usize || img.len() == BLCKSZ as usize + 8);
@@ -241,7 +275,7 @@ where
 
         let segname = format!("{}/{:>04X}", slru.to_str(), segno);
         let header = new_tar_header(&segname, slru_buf.len() as u64)?;
-        self.ar.append(&header, slru_buf.as_slice())?;
+        self.ar.append(&header, slru_buf.as_slice()).await?;
 
         trace!("Added to basebackup slru {} relsize {}", segname, nblocks);
         Ok(())
@@ -253,14 +287,16 @@ where
     // Each directory contains a PG_VERSION file, and the default database
     // directories also contain pg_filenode.map files.
     //
-    fn add_dbdir(
+    async fn add_dbdir(
         &mut self,
         spcnode: u32,
         dbnode: u32,
         has_relmap_file: bool,
     ) -> anyhow::Result<()> {
         let relmap_img = if has_relmap_file {
-            let img = self.timeline.get_relmap_file(spcnode, dbnode, self.lsn)?;
+            let img =
+                with_ondemand_download(|| self.timeline.get_relmap_file(spcnode, dbnode, self.lsn))
+                    .await?;
             ensure!(img.len() == 512);
             Some(img)
         } else {
@@ -270,14 +306,14 @@ where
         if spcnode == GLOBALTABLESPACE_OID {
             let pg_version_str = self.timeline.pg_version.to_string();
             let header = new_tar_header("PG_VERSION", pg_version_str.len() as u64)?;
-            self.ar.append(&header, pg_version_str.as_bytes())?;
+            self.ar.append(&header, pg_version_str.as_bytes()).await?;
 
             info!("timeline.pg_version {}", self.timeline.pg_version);
 
             if let Some(img) = relmap_img {
                 // filenode map for global tablespace
                 let header = new_tar_header("global/pg_filenode.map", img.len() as u64)?;
-                self.ar.append(&header, &img[..])?;
+                self.ar.append(&header, &img[..]).await?;
             } else {
                 warn!("global/pg_filenode.map is missing");
             }
@@ -293,9 +329,8 @@ where
             // XLOG_TBLSPC_DROP records. But we probably should just
             // throw an error on CREATE TABLESPACE in the first place.
             if !has_relmap_file
-                && self
-                    .timeline
-                    .list_rels(spcnode, dbnode, self.lsn)?
+                && with_ondemand_download(|| self.timeline.list_rels(spcnode, dbnode, self.lsn))
+                    .await?
                     .is_empty()
             {
                 return Ok(());
@@ -306,18 +341,18 @@ where
             // Append dir path for each database
             let path = format!("base/{}", dbnode);
             let header = new_tar_header_dir(&path)?;
-            self.ar.append(&header, &mut io::empty())?;
+            self.ar.append(&header, &mut io::empty()).await?;
 
             if let Some(img) = relmap_img {
                 let dst_path = format!("base/{}/PG_VERSION", dbnode);
 
                 let pg_version_str = self.timeline.pg_version.to_string();
                 let header = new_tar_header(&dst_path, pg_version_str.len() as u64)?;
-                self.ar.append(&header, pg_version_str.as_bytes())?;
+                self.ar.append(&header, pg_version_str.as_bytes()).await?;
 
                 let relmap_path = format!("base/{}/pg_filenode.map", dbnode);
                 let header = new_tar_header(&relmap_path, img.len() as u64)?;
-                self.ar.append(&header, &img[..])?;
+                self.ar.append(&header, &img[..]).await?;
             }
         };
         Ok(())
@@ -326,8 +361,8 @@ where
     //
     // Extract twophase state files
     //
-    fn add_twophase_file(&mut self, xid: TransactionId) -> anyhow::Result<()> {
-        let img = self.timeline.get_twophase_file(xid, self.lsn)?;
+    async fn add_twophase_file(&mut self, xid: TransactionId) -> anyhow::Result<()> {
+        let img = with_ondemand_download(|| self.timeline.get_twophase_file(xid, self.lsn)).await?;
 
         let mut buf = BytesMut::new();
         buf.extend_from_slice(&img[..]);
@@ -335,7 +370,7 @@ where
         buf.put_u32_le(crc);
         let path = format!("pg_twophase/{:>08X}", xid);
         let header = new_tar_header(&path, buf.len() as u64)?;
-        self.ar.append(&header, &buf[..])?;
+        self.ar.append(&header, &buf[..]).await?;
 
         Ok(())
     }
@@ -344,7 +379,7 @@ where
     // Add generated pg_control file and bootstrap WAL segment.
     // Also send zenith.signal file with extra bootstrap data.
     //
-    fn add_pgcontrol_file(&mut self) -> anyhow::Result<()> {
+    async fn add_pgcontrol_file(&mut self) -> anyhow::Result<()> {
         // add zenith.signal file
         let mut zenith_signal = String::new();
         if self.prev_record_lsn == Lsn(0) {
@@ -356,18 +391,18 @@ where
         } else {
             write!(zenith_signal, "PREV LSN: {}", self.prev_record_lsn)?;
         }
-        self.ar.append(
-            &new_tar_header("zenith.signal", zenith_signal.len() as u64)?,
-            zenith_signal.as_bytes(),
-        )?;
+        self.ar
+            .append(
+                &new_tar_header("zenith.signal", zenith_signal.len() as u64)?,
+                zenith_signal.as_bytes(),
+            )
+            .await?;
 
-        let checkpoint_bytes = self
-            .timeline
-            .get_checkpoint(self.lsn)
+        let checkpoint_bytes = with_ondemand_download(|| self.timeline.get_checkpoint(self.lsn))
+            .await
             .context("failed to get checkpoint bytes")?;
-        let pg_control_bytes = self
-            .timeline
-            .get_control_file(self.lsn)
+        let pg_control_bytes = with_ondemand_download(|| self.timeline.get_control_file(self.lsn))
+            .await
             .context("failed get control bytes")?;
 
         let (pg_control_bytes, system_identifier) = postgres_ffi::generate_pg_control(
@@ -379,7 +414,7 @@ where
 
         //send pg_control
         let header = new_tar_header("global/pg_control", pg_control_bytes.len() as u64)?;
-        self.ar.append(&header, &pg_control_bytes[..])?;
+        self.ar.append(&header, &pg_control_bytes[..]).await?;
 
         //send wal segment
         let segno = self.lsn.segment_number(WAL_SEGMENT_SIZE);
@@ -391,24 +426,11 @@ where
             postgres_ffi::generate_wal_segment(segno, system_identifier, self.timeline.pg_version)
                 .map_err(|e| anyhow!(e).context("Failed generating wal segment"))?;
         ensure!(wal_seg.len() == WAL_SEGMENT_SIZE);
-        self.ar.append(&header, &wal_seg[..])?;
+        self.ar.append(&header, &wal_seg[..]).await?;
         Ok(())
     }
 }
 
-impl<'a, W> Drop for Basebackup<'a, W>
-where
-    W: Write,
-{
-    /// If the basebackup was not finished, prevent the Archive::drop() from
-    /// writing the end-of-archive marker.
-    fn drop(&mut self) {
-        if !self.finished {
-            self.ar.get_mut().abort();
-        }
-    }
-}
-
 //
 // Create new tarball entry header
 //
@@ -444,49 +466,3 @@ fn new_tar_header_dir(path: &str) -> anyhow::Result<Header> {
     header.set_cksum();
     Ok(header)
 }
-
-/// A wrapper that passes through all data to the underlying Write,
-/// until abort() is called.
-///
-/// tar::Builder has an annoying habit of finishing the archive with
-/// a valid tar end-of-archive marker (two 512-byte sectors of zeros),
-/// even if an error occurs and we don't finish building the archive.
-/// We'd rather abort writing the tarball immediately than construct
-/// a seemingly valid but incomplete archive. This wrapper allows us
-/// to swallow the end-of-archive marker that Builder::drop() emits,
-/// without writing it to the underlying sink.
-///
-struct AbortableWrite<W> {
-    w: W,
-    aborted: bool,
-}
-
-impl<W> AbortableWrite<W> {
-    pub fn new(w: W) -> Self {
-        AbortableWrite { w, aborted: false }
-    }
-
-    pub fn abort(&mut self) {
-        self.aborted = true;
-    }
-}
-
-impl<W> Write for AbortableWrite<W>
-where
-    W: Write,
-{
-    fn write(&mut self, data: &[u8]) -> io::Result<usize> {
-        if self.aborted {
-            Ok(data.len())
-        } else {
-            self.w.write(data)
-        }
-    }
-    fn flush(&mut self) -> io::Result<()> {
-        if self.aborted {
-            Ok(())
-        } else {
-            self.w.flush()
-        }
-    }
-}
diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs
index 345f391e61..18ec1ac68b 100644
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -7,19 +7,20 @@ use std::{env, ops::ControlFlow, path::Path, str::FromStr};
 use anyhow::{anyhow, Context};
 use clap::{Arg, ArgAction, Command};
 use fail::FailScenario;
+use remote_storage::GenericRemoteStorage;
 use tracing::*;
 
 use metrics::set_build_info_metric;
 use pageserver::{
     config::{defaults::*, PageServerConf},
-    http, page_cache, page_service, profiling, task_mgr,
+    http, page_cache, page_service, task_mgr,
     task_mgr::TaskKind,
     task_mgr::{
         BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME, WALRECEIVER_RUNTIME,
     },
-    tenant_mgr, virtual_file,
+    tenant::mgr,
+    virtual_file,
 };
-use remote_storage::GenericRemoteStorage;
 use utils::{
     auth::JwtAuth,
     logging,
@@ -39,8 +40,6 @@ const FEATURES: &[&str] = &[
     "testing",
     #[cfg(feature = "fail/failpoints")]
     "fail/failpoints",
-    #[cfg(feature = "profiling")]
-    "profiling",
 ];
 
 fn version() -> String {
@@ -127,7 +126,7 @@ fn initialize_config(
             );
         }
         // Supplement the CLI arguments with the config file
-        let cfg_file_contents = std::fs::read_to_string(&cfg_file_path).with_context(|| {
+        let cfg_file_contents = std::fs::read_to_string(cfg_file_path).with_context(|| {
             format!(
                 "Failed to read pageserver config at '{}'",
                 cfg_file_path.display()
@@ -181,7 +180,7 @@ fn initialize_config(
     if update_config {
         info!("Writing pageserver config to '{}'", cfg_file_path.display());
 
-        std::fs::write(&cfg_file_path, toml.to_string()).with_context(|| {
+        std::fs::write(cfg_file_path, toml.to_string()).with_context(|| {
             format!(
                 "Failed to write pageserver config to '{}'",
                 cfg_file_path.display()
@@ -201,8 +200,12 @@ fn initialize_config(
 }
 
 fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
+    // Initialize logging
     logging::init(conf.log_format)?;
+
+    // Print version to the log, and expose it as a prometheus metric too.
     info!("version: {}", version());
+    set_build_info_metric(GIT_VERSION);
 
     // If any failpoints were set from FAILPOINTS environment variable,
     // print them to the log for debugging purposes
@@ -218,40 +221,36 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
         )
     }
 
+    // Create and lock PID file. This ensures that there cannot be more than one
+    // pageserver process running at the same time.
     let lock_file_path = conf.workdir.join(PID_FILE_NAME);
     let lock_file =
         utils::pid_file::claim_for_current_process(&lock_file_path).context("claim pid file")?;
     info!("Claimed pid file at {lock_file_path:?}");
 
-    // ensure that the lock file is held even if the main thread of the process is panics
-    // we need to release the lock file only when the current process is gone
+    // Ensure that the lock file is held even if the main thread of the process panics.
+    // We need to release the lock file only when the process exits.
     std::mem::forget(lock_file);
 
-    // TODO: Check that it looks like a valid repository before going further
+    // Bind the HTTP and libpq ports early, so that if they are in use by some other
+    // process, we error out early.
+    let http_addr = &conf.listen_http_addr;
+    info!("Starting pageserver http handler on {http_addr}");
+    let http_listener = tcp_listener::bind(http_addr)?;
 
-    // bind sockets before daemonizing so we report errors early and do not return until we are listening
-    info!(
-        "Starting pageserver http handler on {}",
-        conf.listen_http_addr
-    );
-    let http_listener = tcp_listener::bind(conf.listen_http_addr.clone())?;
-
-    info!(
-        "Starting pageserver pg protocol handler on {}",
-        conf.listen_pg_addr
-    );
-    let pageserver_listener = tcp_listener::bind(conf.listen_pg_addr.clone())?;
+    let pg_addr = &conf.listen_pg_addr;
+    info!("Starting pageserver pg protocol handler on {pg_addr}");
+    let pageserver_listener = tcp_listener::bind(pg_addr)?;
 
+    // Install signal handlers
     let signals = signals::install_shutdown_handlers()?;
 
-    // start profiler (if enabled)
-    let profiler_guard = profiling::init_profiler(conf);
-
+    // Launch broker client
     WALRECEIVER_RUNTIME.block_on(pageserver::walreceiver::init_broker_client(conf))?;
 
-    // initialize authentication for incoming connections
+    // Initialize authentication for incoming connections
     let auth = match &conf.auth_type {
-        AuthType::Trust | AuthType::MD5 => None,
+        AuthType::Trust => None,
         AuthType::NeonJWT => {
             // unwrap is ok because check is performed when creating config, so path is set and file exists
             let key_path = conf.auth_validation_public_key_path.as_ref().unwrap();
@@ -260,54 +259,54 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
     };
     info!("Using auth: {:#?}", conf.auth_type);
 
-    match var("ZENITH_AUTH_TOKEN") {
-        Ok(v) => {
+    // TODO: remove ZENITH_AUTH_TOKEN once it's not used anywhere in development/staging/prod configuration.
+    match (var("ZENITH_AUTH_TOKEN"), var("NEON_AUTH_TOKEN")) {
+        (old, Ok(v)) => {
             info!("Loaded JWT token for authentication with Safekeeper");
+            if let Ok(v_old) = old {
+                warn!(
+                    "JWT token for Safekeeper is specified twice, ZENITH_AUTH_TOKEN is deprecated"
+                );
+                if v_old != v {
+                    warn!("JWT token for Safekeeper has two different values, choosing NEON_AUTH_TOKEN");
+                }
+            }
             pageserver::config::SAFEKEEPER_AUTH_TOKEN
                 .set(Arc::new(v))
                 .map_err(|_| anyhow!("Could not initialize SAFEKEEPER_AUTH_TOKEN"))?;
         }
-        Err(VarError::NotPresent) => {
+        (Ok(v), _) => {
+            info!("Loaded JWT token for authentication with Safekeeper");
+            warn!("Please update pageserver configuration: the JWT token should be NEON_AUTH_TOKEN, not ZENITH_AUTH_TOKEN");
+            pageserver::config::SAFEKEEPER_AUTH_TOKEN
+                .set(Arc::new(v))
+                .map_err(|_| anyhow!("Could not initialize SAFEKEEPER_AUTH_TOKEN"))?;
+        }
+        (_, Err(VarError::NotPresent)) => {
             info!("No JWT token for authentication with Safekeeper detected");
         }
-        Err(e) => {
+        (_, Err(e)) => {
             return Err(e).with_context(|| {
-                "Failed to either load to detect non-present ZENITH_AUTH_TOKEN environment variable"
+                "Failed to either load to detect non-present NEON_AUTH_TOKEN environment variable"
             })
         }
     };
 
-    let remote_storage = conf
-        .remote_storage_config
-        .as_ref()
-        .map(GenericRemoteStorage::from_config)
-        .transpose()
-        .context("Failed to init generic remote storage")?;
+    // Set up remote storage client
+    let remote_storage = create_remote_storage_client(conf)?;
 
-    let (init_result_sender, init_result_receiver) =
-        std::sync::mpsc::channel::<anyhow::Result<()>>();
-    let storage_for_spawn = remote_storage.clone();
-    let _handler = BACKGROUND_RUNTIME.spawn(async move {
-        let result = tenant_mgr::init_tenant_mgr(conf, storage_for_spawn).await;
-        init_result_sender.send(result)
-    });
-    match init_result_receiver.recv() {
-        Ok(init_result) => init_result.context("Failed to init tenant_mgr")?,
-        Err(_sender_dropped_err) => {
-            anyhow::bail!("Failed to init tenant_mgr: no init status was returned");
-        }
-    }
+    // Scan the local 'tenants/' directory and start loading the tenants
+    BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(conf, remote_storage.clone()))?;
 
-    // Spawn all HTTP related tasks in the MGMT_REQUEST_RUNTIME.
-    // bind before launching separate thread so the error reported before startup exits
-
-    // Create a Service from the router above to handle incoming requests.
+    // Start up the service to handle HTTP mgmt API request. We created the
+    // listener earlier already.
     {
         let _rt_guard = MGMT_REQUEST_RUNTIME.enter();
 
-        let router = http::make_router(conf, auth.clone(), remote_storage)?;
-        let service =
-            utils::http::RouterService::new(router.build().map_err(|err| anyhow!(err))?).unwrap();
+        let router = http::make_router(conf, auth.clone(), remote_storage)?
+            .build()
+            .map_err(|err| anyhow!(err))?;
+        let service = utils::http::RouterService::new(router).unwrap();
         let server = hyper::Server::from_tcp(http_listener)?
             .serve(service)
             .with_graceful_shutdown(task_mgr::shutdown_watcher());
@@ -324,10 +323,31 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
                 Ok(())
             },
         );
+
+        if let Some(metric_collection_endpoint) = &conf.metric_collection_endpoint {
+            task_mgr::spawn(
+                MGMT_REQUEST_RUNTIME.handle(),
+                TaskKind::MetricsCollection,
+                None,
+                None,
+                "consumption metrics collection",
+                true,
+                async move {
+                    pageserver::consumption_metrics::collect_metrics(
+                        metric_collection_endpoint,
+                        conf.metric_collection_interval,
+                        conf.id,
+                    )
+                    .instrument(info_span!("metrics_collection"))
+                    .await?;
+                    Ok(())
+                },
+            );
+        }
     }
 
     // Spawn a task to listen for libpq connections. It will spawn further tasks
-    // for each connection.
+    // for each connection. We created the listener earlier already.
     task_mgr::spawn(
         COMPUTE_REQUEST_RUNTIME.handle(),
         TaskKind::LibpqEndpointListener,
@@ -340,8 +360,6 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
         },
     );
 
-    set_build_info_metric(GIT_VERSION);
-
     // All started up! Now just sit and wait for shutdown signal.
     signals.handle(|signal| match signal {
         Signal::Quit => {
@@ -349,7 +367,6 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
                 "Got {}. Terminating in immediate shutdown mode",
                 signal.name()
             );
-            profiling::exit_profiler(conf, &profiler_guard);
             std::process::exit(111);
         }
 
@@ -358,13 +375,42 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
                 "Got {}. Terminating gracefully in fast shutdown mode",
                 signal.name()
             );
-            profiling::exit_profiler(conf, &profiler_guard);
             BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(0));
             unreachable!()
         }
     })
 }
 
+fn create_remote_storage_client(
+    conf: &'static PageServerConf,
+) -> anyhow::Result<Option<GenericRemoteStorage>> {
+    let config = if let Some(config) = &conf.remote_storage_config {
+        config
+    } else {
+        // No remote storage configured.
+        return Ok(None);
+    };
+
+    // Create the client
+    let mut remote_storage = GenericRemoteStorage::from_config(config)?;
+
+    // If `test_remote_failures` is non-zero, wrap the client with a
+    // wrapper that simulates failures.
+    if conf.test_remote_failures > 0 {
+        if !cfg!(feature = "testing") {
+            anyhow::bail!("test_remote_failures option is not available because pageserver was compiled without the 'testing' feature");
+        }
+        info!(
+            "Simulating remote failures for first {} attempts of each op",
+            conf.test_remote_failures
+        );
+        remote_storage =
+            GenericRemoteStorage::unreliable_wrapper(remote_storage, conf.test_remote_failures);
+    }
+
+    Ok(Some(remote_storage))
+}
+
 fn cli() -> Command {
     Command::new("Neon page server")
         .about("Materializes WAL stream to pages and serves them to the postgres")
diff --git a/pageserver/src/bin/pageserver_binutils.rs b/pageserver/src/bin/pageserver_binutils.rs
index b1484ac45a..9da173c873 100644
--- a/pageserver/src/bin/pageserver_binutils.rs
+++ b/pageserver/src/bin/pageserver_binutils.rs
@@ -60,7 +60,7 @@ fn main() -> anyhow::Result<()> {
 }
 
 fn read_pg_control_file(control_file_path: &Path) -> anyhow::Result<()> {
-    let control_file = ControlFileData::decode(&std::fs::read(&control_file_path)?)?;
+    let control_file = ControlFileData::decode(&std::fs::read(control_file_path)?)?;
     println!("{control_file:?}");
     let control_file_initdb = Lsn(control_file.checkPoint);
     println!(
@@ -79,7 +79,7 @@ fn print_layerfile(path: &Path) -> anyhow::Result<()> {
 }
 
 fn handle_metadata(path: &Path, arg_matches: &clap::ArgMatches) -> Result<(), anyhow::Error> {
-    let metadata_bytes = std::fs::read(&path)?;
+    let metadata_bytes = std::fs::read(path)?;
     let mut meta = TimelineMetadata::from_bytes(&metadata_bytes)?;
     println!("Current metadata:\n{meta:?}");
     let mut update_meta = false;
@@ -110,7 +110,7 @@ fn handle_metadata(path: &Path, arg_matches: &clap::ArgMatches) -> Result<(), an
 
     if update_meta {
         let metadata_bytes = meta.to_bytes()?;
-        std::fs::write(&path, &metadata_bytes)?;
+        std::fs::write(path, metadata_bytes)?;
     }
 
     Ok(())
diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index 48e9f32276..7b99d98581 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -12,6 +12,7 @@ use utils::crashsafe::path_with_suffix_extension;
 use utils::id::ConnectionId;
 
 use once_cell::sync::OnceCell;
+use reqwest::Url;
 use std::num::NonZeroUsize;
 use std::path::{Path, PathBuf};
 use std::str::FromStr;
@@ -26,14 +27,15 @@ use utils::{
     postgres_backend::AuthType,
 };
 
+use crate::tenant::config::TenantConf;
+use crate::tenant::config::TenantConfOpt;
 use crate::tenant::{TENANT_ATTACHING_MARKER_FILENAME, TIMELINES_SEGMENT_NAME};
-use crate::tenant_config::{TenantConf, TenantConfOpt};
 use crate::{
     IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TIMELINE_UNINIT_MARK_SUFFIX,
 };
 
 pub mod defaults {
-    use crate::tenant_config::defaults::*;
+    use crate::tenant::config::defaults::*;
     use const_format::formatcp;
 
     pub use pageserver_api::{
@@ -55,6 +57,8 @@ pub mod defaults {
     pub const DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES: usize =
         super::ConfigurableSemaphore::DEFAULT_INITIAL.get();
 
+    pub const DEFAULT_METRIC_COLLECTION_INTERVAL: &str = "10 min";
+    pub const DEFAULT_METRIC_COLLECTION_ENDPOINT: Option<reqwest::Url> = None;
     ///
     /// Default built-in configuration file.
     ///
@@ -78,6 +82,8 @@ pub mod defaults {
 
 #concurrent_tenant_size_logical_size_queries = '{DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES}'
 
+#metric_collection_interval = '{DEFAULT_METRIC_COLLECTION_INTERVAL}'
+
 # [tenant_config]
 #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
 #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
@@ -132,16 +138,22 @@ pub struct PageServerConf {
     pub auth_validation_public_key_path: Option<PathBuf>,
     pub remote_storage_config: Option<RemoteStorageConfig>,
 
-    pub profiling: ProfilingConfig,
     pub default_tenant_conf: TenantConf,
 
     /// Storage broker endpoints to connect to.
     pub broker_endpoint: Uri,
+    pub broker_keepalive_interval: Duration,
 
     pub log_format: LogFormat,
 
     /// Number of concurrent [`Tenant::gather_size_inputs`] allowed.
     pub concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore,
+
+    // How often to collect metrics and send them to the metrics endpoint.
+    pub metric_collection_interval: Duration,
+    pub metric_collection_endpoint: Option<Url>,
+
+    pub test_remote_failures: u64,
 }
 
 /// We do not want to store this in a PageServerConf because the latter may be logged
@@ -152,25 +164,6 @@ pub struct PageServerConf {
 /// startup code to the connection code through a dozen layers.
 pub static SAFEKEEPER_AUTH_TOKEN: OnceCell<Arc<String>> = OnceCell::new();
 
-#[derive(Debug, Clone, PartialEq, Eq)]
-pub enum ProfilingConfig {
-    Disabled,
-    PageRequests,
-}
-
-impl FromStr for ProfilingConfig {
-    type Err = anyhow::Error;
-
-    fn from_str(s: &str) -> Result<ProfilingConfig, Self::Err> {
-        let result = match s {
-            "disabled"  => ProfilingConfig::Disabled,
-            "page_requests"  => ProfilingConfig::PageRequests,
-            _ => bail!("invalid value \"{s}\" for profiling option, valid values are \"disabled\" and \"page_requests\""),
-        };
-        Ok(result)
-    }
-}
-
 // use dedicated enum for builder to better indicate the intention
 // and avoid possible confusion with nested options
 pub enum BuilderValue<T> {
@@ -213,12 +206,17 @@ struct PageServerConfigBuilder {
 
     id: BuilderValue<NodeId>,
 
-    profiling: BuilderValue<ProfilingConfig>,
     broker_endpoint: BuilderValue<Uri>,
+    broker_keepalive_interval: BuilderValue<Duration>,
 
     log_format: BuilderValue<LogFormat>,
 
     concurrent_tenant_size_logical_size_queries: BuilderValue<ConfigurableSemaphore>,
+
+    metric_collection_interval: BuilderValue<Duration>,
+    metric_collection_endpoint: BuilderValue<Option<Url>>,
+
+    test_remote_failures: BuilderValue<u64>,
 }
 
 impl Default for PageServerConfigBuilder {
@@ -243,13 +241,23 @@ impl Default for PageServerConfigBuilder {
             auth_validation_public_key_path: Set(None),
             remote_storage_config: Set(None),
             id: NotSet,
-            profiling: Set(ProfilingConfig::Disabled),
             broker_endpoint: Set(storage_broker::DEFAULT_ENDPOINT
                 .parse()
                 .expect("failed to parse default broker endpoint")),
+            broker_keepalive_interval: Set(humantime::parse_duration(
+                storage_broker::DEFAULT_KEEPALIVE_INTERVAL,
+            )
+            .expect("cannot parse default keepalive interval")),
             log_format: Set(LogFormat::from_str(DEFAULT_LOG_FORMAT).unwrap()),
 
             concurrent_tenant_size_logical_size_queries: Set(ConfigurableSemaphore::default()),
+            metric_collection_interval: Set(humantime::parse_duration(
+                DEFAULT_METRIC_COLLECTION_INTERVAL,
+            )
+            .expect("cannot parse default metric collection interval")),
+            metric_collection_endpoint: Set(DEFAULT_METRIC_COLLECTION_ENDPOINT),
+
+            test_remote_failures: Set(0),
         }
     }
 }
@@ -310,12 +318,12 @@ impl PageServerConfigBuilder {
         self.broker_endpoint = BuilderValue::Set(broker_endpoint)
     }
 
-    pub fn id(&mut self, node_id: NodeId) {
-        self.id = BuilderValue::Set(node_id)
+    pub fn broker_keepalive_interval(&mut self, broker_keepalive_interval: Duration) {
+        self.broker_keepalive_interval = BuilderValue::Set(broker_keepalive_interval)
     }
 
-    pub fn profiling(&mut self, profiling: ProfilingConfig) {
-        self.profiling = BuilderValue::Set(profiling)
+    pub fn id(&mut self, node_id: NodeId) {
+        self.id = BuilderValue::Set(node_id)
     }
 
     pub fn log_format(&mut self, log_format: LogFormat) {
@@ -326,6 +334,18 @@ impl PageServerConfigBuilder {
         self.concurrent_tenant_size_logical_size_queries = BuilderValue::Set(u);
     }
 
+    pub fn metric_collection_interval(&mut self, metric_collection_interval: Duration) {
+        self.metric_collection_interval = BuilderValue::Set(metric_collection_interval)
+    }
+
+    pub fn metric_collection_endpoint(&mut self, metric_collection_endpoint: Option<Url>) {
+        self.metric_collection_endpoint = BuilderValue::Set(metric_collection_endpoint)
+    }
+
+    pub fn test_remote_failures(&mut self, fail_first: u64) {
+        self.test_remote_failures = BuilderValue::Set(fail_first);
+    }
+
     pub fn build(self) -> anyhow::Result<PageServerConf> {
         Ok(PageServerConf {
             listen_pg_addr: self
@@ -359,18 +379,29 @@ impl PageServerConfigBuilder {
                 .remote_storage_config
                 .ok_or(anyhow!("missing remote_storage_config"))?,
             id: self.id.ok_or(anyhow!("missing id"))?,
-            profiling: self.profiling.ok_or(anyhow!("missing profiling"))?,
             // TenantConf is handled separately
             default_tenant_conf: TenantConf::default(),
             broker_endpoint: self
                 .broker_endpoint
                 .ok_or(anyhow!("No broker endpoints provided"))?,
+            broker_keepalive_interval: self
+                .broker_keepalive_interval
+                .ok_or(anyhow!("No broker keepalive interval provided"))?,
             log_format: self.log_format.ok_or(anyhow!("missing log_format"))?,
             concurrent_tenant_size_logical_size_queries: self
                 .concurrent_tenant_size_logical_size_queries
                 .ok_or(anyhow!(
                     "missing concurrent_tenant_size_logical_size_queries"
                 ))?,
+            metric_collection_interval: self
+                .metric_collection_interval
+                .ok_or(anyhow!("missing metric_collection_interval"))?,
+            metric_collection_endpoint: self
+                .metric_collection_endpoint
+                .ok_or(anyhow!("missing metric_collection_endpoint"))?,
+            test_remote_failures: self
+                .test_remote_failures
+                .ok_or(anyhow!("missing test_remote_failuers"))?,
         })
     }
 }
@@ -530,8 +561,8 @@ impl PageServerConf {
                     t_conf = Self::parse_toml_tenant_conf(item)?;
                 }
                 "id" => builder.id(NodeId(parse_toml_u64(key, item)?)),
-                "profiling" => builder.profiling(parse_toml_from_str(key, item)?),
                 "broker_endpoint" => builder.broker_endpoint(parse_toml_string(key, item)?.parse().context("failed to parse broker endpoint")?),
+                "broker_keepalive_interval" => builder.broker_keepalive_interval(parse_toml_duration(key, item)?),
                 "log_format" => builder.log_format(
                     LogFormat::from_config(&parse_toml_string(key, item)?)?
                 ),
@@ -541,6 +572,13 @@ impl PageServerConf {
                     let permits = NonZeroUsize::new(permits).context("initial semaphore permits out of range: 0, use other configuration to disable a feature")?;
                     ConfigurableSemaphore::new(permits)
                 }),
+                "metric_collection_interval" => builder.metric_collection_interval(parse_toml_duration(key, item)?),
+                "metric_collection_endpoint" => {
+                    let endpoint = parse_toml_string(key, item)?.parse().context("failed to parse metric_collection_endpoint")?;
+                    builder.metric_collection_endpoint(Some(endpoint));
+                },
+
+                "test_remote_failures" => builder.test_remote_failures(parse_toml_u64(key, item)?),
                 _ => bail!("unrecognized pageserver option '{key}'"),
             }
         }
@@ -656,11 +694,14 @@ impl PageServerConf {
             auth_type: AuthType::Trust,
             auth_validation_public_key_path: None,
             remote_storage_config: None,
-            profiling: ProfilingConfig::Disabled,
-            default_tenant_conf: TenantConf::dummy_conf(),
+            default_tenant_conf: TenantConf::default(),
             broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(),
+            broker_keepalive_interval: Duration::from_secs(5000),
             log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(),
             concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
+            metric_collection_interval: Duration::from_secs(60),
+            metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT,
+            test_remote_failures: 0,
         }
     }
 }
@@ -791,6 +832,8 @@ max_file_descriptors = 333
 initial_superuser_name = 'zzzz'
 id = 10
 
+metric_collection_interval = '222 s'
+metric_collection_endpoint = 'http://localhost:80/metrics'
 log_format = 'json'
 
 "#;
@@ -826,11 +869,18 @@ log_format = 'json'
                 auth_type: AuthType::Trust,
                 auth_validation_public_key_path: None,
                 remote_storage_config: None,
-                profiling: ProfilingConfig::Disabled,
                 default_tenant_conf: TenantConf::default(),
                 broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(),
+                broker_keepalive_interval: humantime::parse_duration(
+                    storage_broker::DEFAULT_KEEPALIVE_INTERVAL
+                )?,
                 log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(),
                 concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
+                metric_collection_interval: humantime::parse_duration(
+                    defaults::DEFAULT_METRIC_COLLECTION_INTERVAL
+                )?,
+                metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT,
+                test_remote_failures: 0,
             },
             "Correct defaults should be used when no config values are provided"
         );
@@ -869,11 +919,14 @@ log_format = 'json'
                 auth_type: AuthType::Trust,
                 auth_validation_public_key_path: None,
                 remote_storage_config: None,
-                profiling: ProfilingConfig::Disabled,
                 default_tenant_conf: TenantConf::default(),
                 broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(),
+                broker_keepalive_interval: Duration::from_secs(5),
                 log_format: LogFormat::Json,
                 concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
+                metric_collection_interval: Duration::from_secs(222),
+                metric_collection_endpoint: Some(Url::parse("http://localhost:80/metrics")?),
+                test_remote_failures: 0,
             },
             "Should be able to parse all basic config values correctly"
         );
diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs
new file mode 100644
index 0000000000..c411a9e025
--- /dev/null
+++ b/pageserver/src/consumption_metrics.rs
@@ -0,0 +1,324 @@
+//!
+//! Periodically collect consumption metrics for all active tenants
+//! and push them to a HTTP endpoint.
+//! Cache metrics to send only the updated ones.
+//!
+
+use anyhow;
+use tracing::*;
+use utils::id::NodeId;
+use utils::id::TimelineId;
+
+use crate::task_mgr;
+use crate::tenant::mgr;
+use pageserver_api::models::TenantState;
+use utils::id::TenantId;
+
+use serde::{Deserialize, Serialize};
+use serde_with::{serde_as, DisplayFromStr};
+use std::collections::HashMap;
+use std::fmt;
+use std::str::FromStr;
+use std::time::Duration;
+
+use chrono::{DateTime, Utc};
+use rand::Rng;
+use reqwest::Url;
+
+/// ConsumptionMetric struct that defines the format for one metric entry
+/// i.e.
+///
+/// ```json
+/// {
+/// "metric": "remote_storage_size",
+/// "type": "absolute",
+/// "tenant_id": "5d07d9ce9237c4cd845ea7918c0afa7d",
+/// "timeline_id": "a03ebb4f5922a1c56ff7485cc8854143",
+/// "time": "2022-12-28T11:07:19.317310284Z",
+/// "idempotency_key": "2022-12-28 11:07:19.317310324 UTC-1-4019",
+/// "value": 12345454,
+/// }
+/// ```
+#[serde_as]
+#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]
+pub struct ConsumptionMetric {
+    pub metric: ConsumptionMetricKind,
+    #[serde(rename = "type")]
+    pub metric_type: &'static str,
+    #[serde_as(as = "DisplayFromStr")]
+    pub tenant_id: TenantId,
+    #[serde_as(as = "Option<DisplayFromStr>")]
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub timeline_id: Option<TimelineId>,
+    pub time: DateTime<Utc>,
+    pub idempotency_key: String,
+    pub value: u64,
+}
+
+impl ConsumptionMetric {
+    pub fn new_absolute<R: Rng + ?Sized>(
+        metric: ConsumptionMetricKind,
+        tenant_id: TenantId,
+        timeline_id: Option<TimelineId>,
+        value: u64,
+        node_id: NodeId,
+        rng: &mut R,
+    ) -> Self {
+        Self {
+            metric,
+            metric_type: "absolute",
+            tenant_id,
+            timeline_id,
+            time: Utc::now(),
+            // key that allows metric collector to distinguish unique events
+            idempotency_key: format!("{}-{}-{:04}", Utc::now(), node_id, rng.gen_range(0..=9999)),
+            value,
+        }
+    }
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Ord, PartialOrd, Serialize, Deserialize)]
+#[serde(rename_all = "snake_case")]
+pub enum ConsumptionMetricKind {
+    /// Amount of WAL produced , by a timeline, i.e. last_record_lsn
+    /// This is an absolute, per-timeline metric.
+    WrittenSize,
+    /// Size of all tenant branches including WAL
+    /// This is an absolute, per-tenant metric.
+    /// This is the same metric that tenant/tenant_id/size endpoint returns.
+    SyntheticStorageSize,
+    /// Size of all the layer files in the tenant's directory on disk on the pageserver.
+    /// This is an absolute, per-tenant metric.
+    /// See also prometheus metric RESIDENT_PHYSICAL_SIZE.
+    ResidentSize,
+    /// Size of the remote storage (S3) directory.
+    /// This is an absolute, per-tenant metric.
+    RemoteStorageSize,
+    /// Logical size of the data in the timeline
+    /// This is an absolute, per-timeline metric
+    TimelineLogicalSize,
+}
+
+impl FromStr for ConsumptionMetricKind {
+    type Err = anyhow::Error;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        match s {
+            "written_size" => Ok(Self::WrittenSize),
+            "synthetic_storage_size" => Ok(Self::SyntheticStorageSize),
+            "resident_size" => Ok(Self::ResidentSize),
+            "remote_storage_size" => Ok(Self::RemoteStorageSize),
+            "timeline_logical_size" => Ok(Self::TimelineLogicalSize),
+            _ => anyhow::bail!("invalid value \"{s}\" for metric type"),
+        }
+    }
+}
+
+impl fmt::Display for ConsumptionMetricKind {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.write_str(match self {
+            ConsumptionMetricKind::WrittenSize => "written_size",
+            ConsumptionMetricKind::SyntheticStorageSize => "synthetic_storage_size",
+            ConsumptionMetricKind::ResidentSize => "resident_size",
+            ConsumptionMetricKind::RemoteStorageSize => "remote_storage_size",
+            ConsumptionMetricKind::TimelineLogicalSize => "timeline_logical_size",
+        })
+    }
+}
+
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+pub struct ConsumptionMetricsKey {
+    tenant_id: TenantId,
+    timeline_id: Option<TimelineId>,
+    metric: ConsumptionMetricKind,
+}
+
+#[derive(serde::Serialize)]
+struct EventChunk<'a> {
+    events: &'a [ConsumptionMetric],
+}
+
+/// Main thread that serves metrics collection
+pub async fn collect_metrics(
+    metric_collection_endpoint: &Url,
+    metric_collection_interval: Duration,
+    node_id: NodeId,
+) -> anyhow::Result<()> {
+    let mut ticker = tokio::time::interval(metric_collection_interval);
+
+    info!("starting collect_metrics");
+
+    // define client here to reuse it for all requests
+    let client = reqwest::Client::new();
+    let mut cached_metrics: HashMap<ConsumptionMetricsKey, u64> = HashMap::new();
+
+    loop {
+        tokio::select! {
+            _ = task_mgr::shutdown_watcher() => {
+                info!("collect_metrics received cancellation request");
+                return Ok(());
+            },
+            _ = ticker.tick() => {
+                collect_metrics_task(&client, &mut cached_metrics, metric_collection_endpoint, node_id).await?;
+            }
+        }
+    }
+}
+
+/// One iteration of metrics collection
+///
+/// Gather per-tenant and per-timeline metrics and send them to the `metric_collection_endpoint`.
+/// Cache metrics to avoid sending the same metrics multiple times.
+pub async fn collect_metrics_task(
+    client: &reqwest::Client,
+    cached_metrics: &mut HashMap<ConsumptionMetricsKey, u64>,
+    metric_collection_endpoint: &reqwest::Url,
+    node_id: NodeId,
+) -> anyhow::Result<()> {
+    let mut current_metrics: Vec<(ConsumptionMetricsKey, u64)> = Vec::new();
+    trace!(
+        "starting collect_metrics_task. metric_collection_endpoint: {}",
+        metric_collection_endpoint
+    );
+
+    // get list of tenants
+    let tenants = mgr::list_tenants().await;
+
+    // iterate through list of Active tenants and collect metrics
+    for (tenant_id, tenant_state) in tenants {
+        if tenant_state != TenantState::Active {
+            continue;
+        }
+
+        let tenant = mgr::get_tenant(tenant_id, true).await?;
+
+        let mut tenant_resident_size = 0;
+
+        // iterate through list of timelines in tenant
+        for timeline in tenant.list_timelines().iter() {
+            // collect per-timeline metrics only for active timelines
+            if timeline.is_active() {
+                let timeline_written_size = u64::from(timeline.get_last_record_lsn());
+
+                current_metrics.push((
+                    ConsumptionMetricsKey {
+                        tenant_id,
+                        timeline_id: Some(timeline.timeline_id),
+                        metric: ConsumptionMetricKind::WrittenSize,
+                    },
+                    timeline_written_size,
+                ));
+
+                let (timeline_logical_size, is_exact) = timeline.get_current_logical_size()?;
+                // Only send timeline logical size when it is fully calculated.
+                if is_exact {
+                    current_metrics.push((
+                        ConsumptionMetricsKey {
+                            tenant_id,
+                            timeline_id: Some(timeline.timeline_id),
+                            metric: ConsumptionMetricKind::TimelineLogicalSize,
+                        },
+                        timeline_logical_size,
+                    ));
+                }
+            }
+
+            let timeline_resident_size = timeline.get_resident_physical_size();
+            tenant_resident_size += timeline_resident_size;
+        }
+
+        let tenant_remote_size = tenant.get_remote_size().await?;
+        debug!(
+            "collected current metrics for tenant: {}: state={:?} resident_size={} remote_size={}",
+            tenant_id, tenant_state, tenant_resident_size, tenant_remote_size
+        );
+
+        current_metrics.push((
+            ConsumptionMetricsKey {
+                tenant_id,
+                timeline_id: None,
+                metric: ConsumptionMetricKind::ResidentSize,
+            },
+            tenant_resident_size,
+        ));
+
+        current_metrics.push((
+            ConsumptionMetricsKey {
+                tenant_id,
+                timeline_id: None,
+                metric: ConsumptionMetricKind::RemoteStorageSize,
+            },
+            tenant_remote_size,
+        ));
+
+        // TODO add SyntheticStorageSize metric
+    }
+
+    // Filter metrics
+    current_metrics.retain(|(curr_key, curr_val)| match cached_metrics.get(curr_key) {
+        Some(val) => val != curr_val,
+        None => true,
+    });
+
+    if current_metrics.is_empty() {
+        trace!("no new metrics to send");
+        return Ok(());
+    }
+
+    // Send metrics.
+    // Split into chunks of 1000 metrics to avoid exceeding the max request size
+    const CHUNK_SIZE: usize = 1000;
+    let chunks = current_metrics.chunks(CHUNK_SIZE);
+
+    let mut chunk_to_send: Vec<ConsumptionMetric> = Vec::with_capacity(1000);
+
+    for chunk in chunks {
+        chunk_to_send.clear();
+
+        // this code block is needed to convince compiler
+        // that rng is not reused aroung await point
+        {
+            // enrich metrics with timestamp and metric_kind before sending
+            let mut rng = rand::thread_rng();
+            chunk_to_send.extend(chunk.iter().map(|(curr_key, curr_val)| {
+                ConsumptionMetric::new_absolute(
+                    curr_key.metric,
+                    curr_key.tenant_id,
+                    curr_key.timeline_id,
+                    *curr_val,
+                    node_id,
+                    &mut rng,
+                )
+            }));
+        }
+
+        let chunk_json = serde_json::value::to_raw_value(&EventChunk {
+            events: &chunk_to_send,
+        })
+        .expect("ConsumptionMetric should not fail serialization");
+
+        let res = client
+            .post(metric_collection_endpoint.clone())
+            .json(&chunk_json)
+            .send()
+            .await;
+
+        match res {
+            Ok(res) => {
+                if res.status().is_success() {
+                    // update cached metrics after they were sent successfully
+                    for (curr_key, curr_val) in chunk.iter() {
+                        cached_metrics.insert(curr_key.clone(), *curr_val);
+                    }
+                } else {
+                    error!("metrics endpoint refused the sent metrics: {:?}", res);
+                }
+            }
+            Err(err) => {
+                error!("failed to send metrics: {:?}", err);
+            }
+        }
+    }
+
+    Ok(())
+}
diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml
index b372410c0d..f9b8a81dad 100644
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -77,16 +77,6 @@ paths:
         schema:
           type: string
           format: hex
-      - name: include-non-incremental-logical-size
-        in: query
-        schema:
-          type: string
-          description: Controls calculation of current_logical_size_non_incremental
-      - name: include-non-incremental-physical-size
-        in: query
-        schema:
-          type: string
-          description: Controls calculation of current_physical_size_non_incremental
     get:
       description: Get timelines for tenant
       responses:
@@ -139,17 +129,6 @@ paths:
           format: hex
     get:
       description: Get info about the timeline
-      parameters:
-        - name: include-non-incremental-logical-size
-          in: query
-          schema:
-            type: string
-          description: Controls calculation of current_logical_size_non_incremental
-        - name: include-non-incremental-physical-size
-          in: query
-          schema:
-            type: string
-            description: Controls calculation of current_physical_size_non_incremental
       responses:
         "200":
           description: TimelineInfo
@@ -779,10 +758,6 @@ components:
           type: integer
         current_physical_size:
           type: integer
-        current_logical_size_non_incremental:
-          type: integer
-        current_physical_size_non_incremental:
-          type: integer
         wal_source_connstr:
           type: string
         last_received_msg_lsn:
@@ -795,37 +770,6 @@ components:
         latest_gc_cutoff_lsn:
           type: string
           format: hex
-
-        # These 'local' and 'remote' fields just duplicate some of the fields
-        # above. They are kept for backwards-compatibility. They can be removed,
-        # when the control plane has been updated to look at the above fields
-        # directly.
-        local:
-          $ref: "#/components/schemas/LocalTimelineInfo"
-        remote:
-          $ref: "#/components/schemas/RemoteTimelineInfo"
-
-    LocalTimelineInfo:
-      type: object
-      properties:
-        ancestor_timeline_id:
-          type: string
-          format: hex
-        ancestor_lsn:
-          type: string
-          format: hex
-        current_logical_size:
-          type: integer
-        current_physical_size:
-          type: integer
-    RemoteTimelineInfo:
-      type: object
-      required:
-        - remote_consistent_lsn
-      properties:
-        remote_consistent_lsn:
-          type: string
-          format: hex
     Error:
       type: object
       required:
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 0ef555c4aa..1c5eacd362 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -4,16 +4,17 @@ use anyhow::{anyhow, Context, Result};
 use hyper::StatusCode;
 use hyper::{Body, Request, Response, Uri};
 use remote_storage::GenericRemoteStorage;
+use tokio_util::sync::CancellationToken;
 use tracing::*;
 
 use super::models::{
-    LocalTimelineInfo, RemoteTimelineInfo, StatusResponse, TenantConfigRequest,
-    TenantCreateRequest, TenantCreateResponse, TenantInfo, TimelineCreateRequest, TimelineInfo,
+    StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TenantInfo,
+    TimelineCreateRequest, TimelineInfo,
 };
 use crate::pgdatadir_mapping::LsnForTimestamp;
-use crate::tenant::Timeline;
-use crate::tenant_config::TenantConfOpt;
-use crate::{config::PageServerConf, tenant_mgr};
+use crate::tenant::config::TenantConfOpt;
+use crate::tenant::{with_ondemand_download, Timeline};
+use crate::{config::PageServerConf, tenant::mgr};
 use utils::{
     auth::JwtAuth,
     http::{
@@ -30,8 +31,6 @@ use utils::{
 // Imports only used for testing APIs
 #[cfg(feature = "testing")]
 use super::models::{ConfigureFailpointsRequest, TimelineGcRequest};
-#[cfg(feature = "testing")]
-use crate::CheckpointConfig;
 
 struct State {
     conf: &'static PageServerConf,
@@ -79,19 +78,23 @@ fn check_permission(request: &Request<Body>, tenant_id: Option<TenantId>) -> Res
 }
 
 // Helper function to construct a TimelineInfo struct for a timeline
-fn build_timeline_info(
+async fn build_timeline_info(
     timeline: &Arc<Timeline>,
     include_non_incremental_logical_size: bool,
-    include_non_incremental_physical_size: bool,
 ) -> anyhow::Result<TimelineInfo> {
     let mut info = build_timeline_info_common(timeline)?;
     if include_non_incremental_logical_size {
-        info.current_logical_size_non_incremental =
-            Some(timeline.get_current_logical_size_non_incremental(info.last_record_lsn)?);
-    }
-    if include_non_incremental_physical_size {
-        info.current_physical_size_non_incremental =
-            Some(timeline.get_physical_size_non_incremental()?)
+        // XXX we should be using spawn_ondemand_logical_size_calculation here.
+        // Otherwise, if someone deletes the timeline / detaches the tenant while
+        // we're executing this function, we will outlive the timeline on-disk state.
+        info.current_logical_size_non_incremental = Some(
+            timeline
+                .get_current_logical_size_non_incremental(
+                    info.last_record_lsn,
+                    CancellationToken::new(),
+                )
+                .await?,
+        );
     }
     Ok(info)
 }
@@ -117,13 +120,13 @@ fn build_timeline_info_common(timeline: &Arc<Timeline>) -> anyhow::Result<Timeli
         lsn @ Lsn(_) => Some(lsn),
     };
     let current_logical_size = match timeline.get_current_logical_size() {
-        Ok(size) => Some(size),
+        Ok((size, _)) => Some(size),
         Err(err) => {
             error!("Timeline info creation failed to get current logical size: {err:?}");
             None
         }
     };
-    let current_physical_size = Some(timeline.get_physical_size());
+    let current_physical_size = Some(timeline.layer_size_sum().approximate_is_ok());
     let state = timeline.current_state();
     let remote_consistent_lsn = timeline.get_remote_consistent_lsn().unwrap_or(Lsn(0));
 
@@ -140,25 +143,13 @@ fn build_timeline_info_common(timeline: &Arc<Timeline>) -> anyhow::Result<Timeli
         current_logical_size,
         current_physical_size,
         current_logical_size_non_incremental: None,
-        current_physical_size_non_incremental: None,
+        timeline_dir_layer_file_size_sum: None,
         wal_source_connstr,
         last_received_msg_lsn,
         last_received_msg_ts,
         pg_version: timeline.pg_version,
 
         state,
-
-        // Duplicate some fields in 'local' and 'remote' fields, for backwards-compatility
-        // with the control plane.
-        local: LocalTimelineInfo {
-            ancestor_timeline_id,
-            ancestor_lsn,
-            current_logical_size,
-            current_physical_size,
-        },
-        remote: RemoteTimelineInfo {
-            remote_consistent_lsn: Some(remote_consistent_lsn),
-        },
     };
     Ok(info)
 }
@@ -179,7 +170,7 @@ async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<
         .new_timeline_id
         .unwrap_or_else(TimelineId::generate);
 
-    let tenant = tenant_mgr::get_tenant(tenant_id, true)
+    let tenant = mgr::get_tenant(tenant_id, true)
         .await
         .map_err(ApiError::NotFound)?;
     match tenant.create_timeline(
@@ -205,29 +196,26 @@ async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>,
     let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
     let include_non_incremental_logical_size =
         query_param_present(&request, "include-non-incremental-logical-size");
-    let include_non_incremental_physical_size =
-        query_param_present(&request, "include-non-incremental-physical-size");
     check_permission(&request, Some(tenant_id))?;
 
     let response_data = async {
-        let tenant = tenant_mgr::get_tenant(tenant_id, true)
+        let tenant = mgr::get_tenant(tenant_id, true)
             .await
             .map_err(ApiError::NotFound)?;
         let timelines = tenant.list_timelines();
 
         let mut response_data = Vec::with_capacity(timelines.len());
         for timeline in timelines {
-            let timeline_info = build_timeline_info(
-                &timeline,
-                include_non_incremental_logical_size,
-                include_non_incremental_physical_size,
-            )
-            .context("Failed to convert tenant timeline {timeline_id} into the local one: {e:?}")
-            .map_err(ApiError::InternalServerError)?;
+            let timeline_info =
+                build_timeline_info(&timeline, include_non_incremental_logical_size)
+                    .await
+                    .context(
+                        "Failed to convert tenant timeline {timeline_id} into the local one: {e:?}",
+                    )
+                    .map_err(ApiError::InternalServerError)?;
 
             response_data.push(timeline_info);
         }
-
         Ok(response_data)
     }
     .instrument(info_span!("timeline_list", tenant = %tenant_id))
@@ -271,12 +259,10 @@ async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body
     let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
     let include_non_incremental_logical_size =
         query_param_present(&request, "include-non-incremental-logical-size");
-    let include_non_incremental_physical_size =
-        query_param_present(&request, "include-non-incremental-physical-size");
     check_permission(&request, Some(tenant_id))?;
 
     let timeline_info = async {
-        let tenant = tenant_mgr::get_tenant(tenant_id, true)
+        let tenant = mgr::get_tenant(tenant_id, true)
             .await
             .map_err(ApiError::NotFound)?;
 
@@ -284,13 +270,10 @@ async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body
             .get_timeline(timeline_id, false)
             .map_err(ApiError::NotFound)?;
 
-        let timeline_info = build_timeline_info(
-            &timeline,
-            include_non_incremental_logical_size,
-            include_non_incremental_physical_size,
-        )
-        .context("Failed to get local timeline info: {e:#}")
-        .map_err(ApiError::InternalServerError)?;
+        let timeline_info = build_timeline_info(&timeline, include_non_incremental_logical_size)
+            .await
+            .context("Failed to get local timeline info: {e:#}")
+            .map_err(ApiError::InternalServerError)?;
 
         Ok::<_, ApiError>(timeline_info)
     }
@@ -311,14 +294,15 @@ async fn get_lsn_by_timestamp_handler(request: Request<Body>) -> Result<Response
         .map_err(ApiError::BadRequest)?;
     let timestamp_pg = postgres_ffi::to_pg_timestamp(timestamp);
 
-    let timeline = tenant_mgr::get_tenant(tenant_id, true)
+    let timeline = mgr::get_tenant(tenant_id, true)
         .await
         .and_then(|tenant| tenant.get_timeline(timeline_id, true))
         .map_err(ApiError::NotFound)?;
-    let result = match timeline
-        .find_lsn_for_timestamp(timestamp_pg)
-        .map_err(ApiError::InternalServerError)?
-    {
+    let result = with_ondemand_download(|| timeline.find_lsn_for_timestamp(timestamp_pg))
+        .await
+        .map_err(ApiError::InternalServerError)?;
+
+    let result = match result {
         LsnForTimestamp::Present(lsn) => format!("{lsn}"),
         LsnForTimestamp::Future(_lsn) => "future".into(),
         LsnForTimestamp::Past(_lsn) => "past".into(),
@@ -338,7 +322,7 @@ async fn tenant_attach_handler(request: Request<Body>) -> Result<Response<Body>,
 
     if let Some(remote_storage) = &state.remote_storage {
         // FIXME: distinguish between "Tenant already exists" and other errors
-        tenant_mgr::attach_tenant(state.conf, tenant_id, remote_storage.clone())
+        mgr::attach_tenant(state.conf, tenant_id, remote_storage.clone())
             .instrument(info_span!("tenant_attach", tenant = %tenant_id))
             .await
             .map_err(ApiError::InternalServerError)?;
@@ -356,7 +340,7 @@ async fn timeline_delete_handler(request: Request<Body>) -> Result<Response<Body
     let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
     check_permission(&request, Some(tenant_id))?;
 
-    tenant_mgr::delete_timeline(tenant_id, timeline_id)
+    mgr::delete_timeline(tenant_id, timeline_id)
         .instrument(info_span!("timeline_delete", tenant = %tenant_id, timeline = %timeline_id))
         .await
         // FIXME: Errors from `delete_timeline` can occur for a number of reasons, incuding both
@@ -373,7 +357,7 @@ async fn tenant_detach_handler(request: Request<Body>) -> Result<Response<Body>,
 
     let state = get_state(&request);
     let conf = state.conf;
-    tenant_mgr::detach_tenant(conf, tenant_id)
+    mgr::detach_tenant(conf, tenant_id)
         .instrument(info_span!("tenant_detach", tenant = %tenant_id))
         .await
         // FIXME: Errors from `detach_tenant` can be caused by both both user and internal errors.
@@ -388,7 +372,7 @@ async fn tenant_load_handler(request: Request<Body>) -> Result<Response<Body>, A
     check_permission(&request, Some(tenant_id))?;
 
     let state = get_state(&request);
-    tenant_mgr::load_tenant(state.conf, tenant_id, state.remote_storage.clone())
+    mgr::load_tenant(state.conf, tenant_id, state.remote_storage.clone())
         .instrument(info_span!("load", tenant = %tenant_id))
         .await
         .map_err(ApiError::InternalServerError)?;
@@ -402,7 +386,7 @@ async fn tenant_ignore_handler(request: Request<Body>) -> Result<Response<Body>,
 
     let state = get_state(&request);
     let conf = state.conf;
-    tenant_mgr::ignore_tenant(conf, tenant_id)
+    mgr::ignore_tenant(conf, tenant_id)
         .instrument(info_span!("ignore_tenant", tenant = %tenant_id))
         .await
         // FIXME: Errors from `ignore_tenant` can be caused by both both user and internal errors.
@@ -415,7 +399,7 @@ async fn tenant_ignore_handler(request: Request<Body>) -> Result<Response<Body>,
 async fn tenant_list_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permission(&request, None)?;
 
-    let response_data = tenant_mgr::list_tenants()
+    let response_data = mgr::list_tenants()
         .instrument(info_span!("tenant_list"))
         .await
         .iter()
@@ -435,12 +419,12 @@ async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiErro
     check_permission(&request, Some(tenant_id))?;
 
     let tenant_info = async {
-        let tenant = tenant_mgr::get_tenant(tenant_id, false).await?;
+        let tenant = mgr::get_tenant(tenant_id, false).await?;
 
         // Calculate total physical size of all timelines
         let mut current_physical_size = 0;
         for timeline in tenant.list_timelines().iter() {
-            current_physical_size += timeline.get_physical_size();
+            current_physical_size += timeline.layer_size_sum().approximate_is_ok();
         }
 
         let state = tenant.current_state();
@@ -462,7 +446,7 @@ async fn tenant_size_handler(request: Request<Body>) -> Result<Response<Body>, A
     let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
     check_permission(&request, Some(tenant_id))?;
 
-    let tenant = tenant_mgr::get_tenant(tenant_id, true)
+    let tenant = mgr::get_tenant(tenant_id, true)
         .await
         .map_err(ApiError::InternalServerError)?;
 
@@ -583,7 +567,7 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
 
     let state = get_state(&request);
 
-    let new_tenant = tenant_mgr::create_tenant(
+    let new_tenant = mgr::create_tenant(
         state.conf,
         tenant_conf,
         target_tenant_id,
@@ -685,7 +669,7 @@ async fn tenant_config_handler(mut request: Request<Body>) -> Result<Response<Bo
     }
 
     let state = get_state(&request);
-    tenant_mgr::update_tenant_config(state.conf, tenant_conf, tenant_id)
+    mgr::update_tenant_config(state.conf, tenant_conf, tenant_id)
         .instrument(info_span!("tenant_config", tenant = ?tenant_id))
         .await
         // FIXME: `update_tenant_config` can fail because of both user and internal errors.
@@ -737,7 +721,7 @@ async fn timeline_gc_handler(mut request: Request<Body>) -> Result<Response<Body
 
     let gc_req: TimelineGcRequest = json_request(&mut request).await?;
 
-    let wait_task_done = tenant_mgr::immediate_gc(tenant_id, timeline_id, gc_req).await?;
+    let wait_task_done = mgr::immediate_gc(tenant_id, timeline_id, gc_req).await?;
     let gc_result = wait_task_done
         .await
         .context("wait for gc task")
@@ -754,17 +738,17 @@ async fn timeline_compact_handler(request: Request<Body>) -> Result<Response<Bod
     let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
     check_permission(&request, Some(tenant_id))?;
 
-    let tenant = tenant_mgr::get_tenant(tenant_id, true)
-        .await
-        .map_err(ApiError::NotFound)?;
-    let timeline = tenant
-        .get_timeline(timeline_id, true)
-        .map_err(ApiError::NotFound)?;
-    timeline
-        .compact()
+    let result_receiver = mgr::immediate_compact(tenant_id, timeline_id)
         .await
+        .context("spawn compaction task")
         .map_err(ApiError::InternalServerError)?;
 
+    let result: anyhow::Result<()> = result_receiver
+        .await
+        .context("receive compaction result")
+        .map_err(ApiError::InternalServerError)?;
+    result.map_err(ApiError::InternalServerError)?;
+
     json_response(StatusCode::OK, ())
 }
 
@@ -775,20 +759,63 @@ async fn timeline_checkpoint_handler(request: Request<Body>) -> Result<Response<
     let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
     check_permission(&request, Some(tenant_id))?;
 
-    let tenant = tenant_mgr::get_tenant(tenant_id, true)
+    let tenant = mgr::get_tenant(tenant_id, true)
         .await
         .map_err(ApiError::NotFound)?;
     let timeline = tenant
         .get_timeline(timeline_id, true)
         .map_err(ApiError::NotFound)?;
     timeline
-        .checkpoint(CheckpointConfig::Forced)
+        .freeze_and_flush()
+        .await
+        .map_err(ApiError::InternalServerError)?;
+    timeline
+        .compact()
         .await
         .map_err(ApiError::InternalServerError)?;
 
     json_response(StatusCode::OK, ())
 }
 
+async fn timeline_download_remote_layers_handler_post(
+    request: Request<Body>,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
+    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
+    check_permission(&request, Some(tenant_id))?;
+
+    let tenant = mgr::get_tenant(tenant_id, true)
+        .await
+        .map_err(ApiError::NotFound)?;
+    let timeline = tenant
+        .get_timeline(timeline_id, true)
+        .map_err(ApiError::NotFound)?;
+    match timeline.spawn_download_all_remote_layers().await {
+        Ok(st) => json_response(StatusCode::ACCEPTED, st),
+        Err(st) => json_response(StatusCode::CONFLICT, st),
+    }
+}
+
+async fn timeline_download_remote_layers_handler_get(
+    request: Request<Body>,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
+    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
+    check_permission(&request, Some(tenant_id))?;
+
+    let tenant = mgr::get_tenant(tenant_id, true)
+        .await
+        .map_err(ApiError::NotFound)?;
+    let timeline = tenant
+        .get_timeline(timeline_id, true)
+        .map_err(ApiError::NotFound)?;
+    let info = timeline
+        .get_download_all_remote_layers_task_info()
+        .context("task never started since last pageserver process start")
+        .map_err(ApiError::NotFound)?;
+    json_response(StatusCode::OK, info)
+}
+
 async fn handler_404(_: Request<Body>) -> Result<Response<Body>, ApiError> {
     json_response(
         StatusCode::NOT_FOUND,
@@ -873,6 +900,14 @@ pub fn make_router(
             "/v1/tenant/:tenant_id/timeline/:timeline_id/checkpoint",
             testing_api!("run timeline checkpoint", timeline_checkpoint_handler),
         )
+        .post(
+            "/v1/tenant/:tenant_id/timeline/:timeline_id/download_remote_layers",
+            timeline_download_remote_layers_handler_post,
+        )
+        .get(
+            "/v1/tenant/:tenant_id/timeline/:timeline_id/download_remote_layers",
+            timeline_download_remote_layers_handler_get,
+        )
         .delete(
             "/v1/tenant/:tenant_id/timeline/:timeline_id",
             timeline_delete_handler,
diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs
index 642e41765b..ca1514dd00 100644
--- a/pageserver/src/import_datadir.rs
+++ b/pageserver/src/import_datadir.rs
@@ -2,12 +2,13 @@
 //! Import data and WAL from a PostgreSQL data directory and WAL segments into
 //! a neon Timeline.
 //!
-use std::fs::File;
-use std::io::{Read, Seek, SeekFrom};
 use std::path::{Path, PathBuf};
 
 use anyhow::{bail, ensure, Context, Result};
 use bytes::Bytes;
+use futures::StreamExt;
+use tokio::io::{AsyncRead, AsyncReadExt};
+use tokio_tar::Archive;
 use tracing::*;
 use walkdir::WalkDir;
 
@@ -42,7 +43,7 @@ pub fn get_lsn_from_controlfile(path: &Path) -> Result<Lsn> {
 /// This is currently only used to import a cluster freshly created by initdb.
 /// The code that deals with the checkpoint would not work right if the
 /// cluster was not shut down cleanly.
-pub fn import_timeline_from_postgres_datadir(
+pub async fn import_timeline_from_postgres_datadir(
     tline: &Timeline,
     pgdata_path: &Path,
     pgdata_lsn: Lsn,
@@ -65,9 +66,11 @@ pub fn import_timeline_from_postgres_datadir(
             let absolute_path = entry.path();
             let relative_path = absolute_path.strip_prefix(pgdata_path)?;
 
-            let file = File::open(absolute_path)?;
+            let mut file = tokio::fs::File::open(absolute_path).await?;
             let len = metadata.len() as usize;
-            if let Some(control_file) = import_file(&mut modification, relative_path, file, len)? {
+            if let Some(control_file) =
+                import_file(&mut modification, relative_path, &mut file, len).await?
+            {
                 pg_control = Some(control_file);
             }
             modification.flush()?;
@@ -96,18 +99,19 @@ pub fn import_timeline_from_postgres_datadir(
         tline,
         Lsn(pg_control.checkPointCopy.redo),
         pgdata_lsn,
-    )?;
+    )
+    .await?;
 
     Ok(())
 }
 
 // subroutine of import_timeline_from_postgres_datadir(), to load one relation file.
-fn import_rel<Reader: Read>(
-    modification: &mut DatadirModification,
+async fn import_rel(
+    modification: &mut DatadirModification<'_>,
     path: &Path,
     spcoid: Oid,
     dboid: Oid,
-    mut reader: Reader,
+    reader: &mut (impl AsyncRead + Send + Sync + Unpin),
     len: usize,
 ) -> anyhow::Result<()> {
     // Does it look like a relation file?
@@ -148,7 +152,7 @@ fn import_rel<Reader: Read>(
     }
 
     loop {
-        let r = reader.read_exact(&mut buf);
+        let r = reader.read_exact(&mut buf).await;
         match r {
             Ok(_) => {
                 modification.put_rel_page_image(rel, blknum, Bytes::copy_from_slice(&buf))?;
@@ -181,19 +185,19 @@ fn import_rel<Reader: Read>(
 
 /// Import an SLRU segment file
 ///
-fn import_slru<Reader: Read>(
-    modification: &mut DatadirModification,
+async fn import_slru(
+    modification: &mut DatadirModification<'_>,
     slru: SlruKind,
     path: &Path,
-    mut reader: Reader,
+    reader: &mut (impl AsyncRead + Send + Sync + Unpin),
     len: usize,
-) -> Result<()> {
-    trace!("importing slru file {}", path.display());
+) -> anyhow::Result<()> {
+    info!("importing slru file {path:?}");
 
     let mut buf: [u8; 8192] = [0u8; 8192];
     let filename = &path
         .file_name()
-        .expect("missing slru filename")
+        .with_context(|| format!("missing slru filename for path {path:?}"))?
         .to_string_lossy();
     let segno = u32::from_str_radix(filename, 16)?;
 
@@ -206,7 +210,7 @@ fn import_slru<Reader: Read>(
 
     let mut rpageno = 0;
     loop {
-        let r = reader.read_exact(&mut buf);
+        let r = reader.read_exact(&mut buf).await;
         match r {
             Ok(_) => {
                 modification.put_slru_page_image(
@@ -237,14 +241,20 @@ fn import_slru<Reader: Read>(
 
 /// Scan PostgreSQL WAL files in given directory and load all records between
 /// 'startpoint' and 'endpoint' into the repository.
-fn import_wal(walpath: &Path, tline: &Timeline, startpoint: Lsn, endpoint: Lsn) -> Result<()> {
+async fn import_wal(
+    walpath: &Path,
+    tline: &Timeline,
+    startpoint: Lsn,
+    endpoint: Lsn,
+) -> anyhow::Result<()> {
+    use std::io::Read;
     let mut waldecoder = WalStreamDecoder::new(startpoint, tline.pg_version);
 
     let mut segno = startpoint.segment_number(WAL_SEGMENT_SIZE);
     let mut offset = startpoint.segment_offset(WAL_SEGMENT_SIZE);
     let mut last_lsn = startpoint;
 
-    let mut walingest = WalIngest::new(tline, startpoint)?;
+    let mut walingest = WalIngest::new(tline, startpoint).await?;
 
     while last_lsn <= endpoint {
         // FIXME: assume postgresql tli 1 for now
@@ -260,14 +270,15 @@ fn import_wal(walpath: &Path, tline: &Timeline, startpoint: Lsn, endpoint: Lsn)
         }
 
         // Slurp the WAL file
-        let mut file = File::open(&path)?;
+        let mut file = std::fs::File::open(&path)?;
 
         if offset > 0 {
-            file.seek(SeekFrom::Start(offset as u64))?;
+            use std::io::Seek;
+            file.seek(std::io::SeekFrom::Start(offset as u64))?;
         }
 
         let nread = file.read_to_end(&mut buf)?;
-        if nread != WAL_SEGMENT_SIZE - offset as usize {
+        if nread != WAL_SEGMENT_SIZE - offset {
             // Maybe allow this for .partial files?
             error!("read only {} bytes from WAL file", nread);
         }
@@ -279,7 +290,9 @@ fn import_wal(walpath: &Path, tline: &Timeline, startpoint: Lsn, endpoint: Lsn)
         let mut decoded = DecodedWALRecord::default();
         while last_lsn <= endpoint {
             if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
-                walingest.ingest_record(recdata, lsn, &mut modification, &mut decoded)?;
+                walingest
+                    .ingest_record(recdata, lsn, &mut modification, &mut decoded)
+                    .await?;
                 last_lsn = lsn;
 
                 nrecords += 1;
@@ -303,9 +316,9 @@ fn import_wal(walpath: &Path, tline: &Timeline, startpoint: Lsn, endpoint: Lsn)
     Ok(())
 }
 
-pub fn import_basebackup_from_tar<Reader: Read>(
+pub async fn import_basebackup_from_tar(
     tline: &Timeline,
-    reader: Reader,
+    reader: &mut (impl AsyncRead + Send + Sync + Unpin),
     base_lsn: Lsn,
 ) -> Result<()> {
     info!("importing base at {base_lsn}");
@@ -315,21 +328,24 @@ pub fn import_basebackup_from_tar<Reader: Read>(
     let mut pg_control: Option<ControlFileData> = None;
 
     // Import base
-    for base_tar_entry in tar::Archive::new(reader).entries()? {
-        let entry = base_tar_entry?;
+    let mut entries = Archive::new(reader).entries()?;
+    while let Some(base_tar_entry) = entries.next().await {
+        let mut entry = base_tar_entry?;
         let header = entry.header();
         let len = header.entry_size()? as usize;
         let file_path = header.path()?.into_owned();
 
         match header.entry_type() {
-            tar::EntryType::Regular => {
-                if let Some(res) = import_file(&mut modification, file_path.as_ref(), entry, len)? {
+            tokio_tar::EntryType::Regular => {
+                if let Some(res) =
+                    import_file(&mut modification, file_path.as_ref(), &mut entry, len).await?
+                {
                     // We found the pg_control file.
                     pg_control = Some(res);
                 }
                 modification.flush()?;
             }
-            tar::EntryType::Directory => {
+            tokio_tar::EntryType::Directory => {
                 debug!("directory {:?}", file_path);
             }
             _ => {
@@ -349,9 +365,9 @@ pub fn import_basebackup_from_tar<Reader: Read>(
     Ok(())
 }
 
-pub fn import_wal_from_tar<Reader: Read>(
+pub async fn import_wal_from_tar(
     tline: &Timeline,
-    reader: Reader,
+    reader: &mut (impl AsyncRead + Send + Sync + Unpin),
     start_lsn: Lsn,
     end_lsn: Lsn,
 ) -> Result<()> {
@@ -360,20 +376,23 @@ pub fn import_wal_from_tar<Reader: Read>(
     let mut segno = start_lsn.segment_number(WAL_SEGMENT_SIZE);
     let mut offset = start_lsn.segment_offset(WAL_SEGMENT_SIZE);
     let mut last_lsn = start_lsn;
-    let mut walingest = WalIngest::new(tline, start_lsn)?;
+    let mut walingest = WalIngest::new(tline, start_lsn).await?;
 
     // Ingest wal until end_lsn
     info!("importing wal until {}", end_lsn);
-    let mut pg_wal_tar = tar::Archive::new(reader);
-    let mut pg_wal_entries_iter = pg_wal_tar.entries()?;
+    let mut pg_wal_tar = Archive::new(reader);
+    let mut pg_wal_entries = pg_wal_tar.entries()?;
     while last_lsn <= end_lsn {
         let bytes = {
-            let entry = pg_wal_entries_iter.next().expect("expected more wal")?;
+            let mut entry = pg_wal_entries
+                .next()
+                .await
+                .ok_or_else(|| anyhow::anyhow!("expected more wal"))??;
             let header = entry.header();
             let file_path = header.path()?.into_owned();
 
             match header.entry_type() {
-                tar::EntryType::Regular => {
+                tokio_tar::EntryType::Regular => {
                     // FIXME: assume postgresql tli 1 for now
                     let expected_filename = XLogFileName(1, segno, WAL_SEGMENT_SIZE);
                     let file_name = file_path
@@ -383,9 +402,9 @@ pub fn import_wal_from_tar<Reader: Read>(
                     ensure!(expected_filename == file_name);
 
                     debug!("processing wal file {:?}", file_path);
-                    read_all_bytes(entry)?
+                    read_all_bytes(&mut entry).await?
                 }
-                tar::EntryType::Directory => {
+                tokio_tar::EntryType::Directory => {
                     debug!("directory {:?}", file_path);
                     continue;
                 }
@@ -405,7 +424,9 @@ pub fn import_wal_from_tar<Reader: Read>(
         let mut decoded = DecodedWALRecord::default();
         while last_lsn <= end_lsn {
             if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
-                walingest.ingest_record(recdata, lsn, &mut modification, &mut decoded)?;
+                walingest
+                    .ingest_record(recdata, lsn, &mut modification, &mut decoded)
+                    .await?;
                 last_lsn = lsn;
 
                 debug!("imported record at {} (end {})", lsn, end_lsn);
@@ -424,7 +445,7 @@ pub fn import_wal_from_tar<Reader: Read>(
     }
 
     // Log any extra unused files
-    for e in &mut pg_wal_entries_iter {
+    while let Some(e) = pg_wal_entries.next().await {
         let entry = e?;
         let header = entry.header();
         let file_path = header.path()?.into_owned();
@@ -434,24 +455,30 @@ pub fn import_wal_from_tar<Reader: Read>(
     Ok(())
 }
 
-fn import_file<Reader: Read>(
-    modification: &mut DatadirModification,
+async fn import_file(
+    modification: &mut DatadirModification<'_>,
     file_path: &Path,
-    reader: Reader,
+    reader: &mut (impl AsyncRead + Send + Sync + Unpin),
     len: usize,
 ) -> Result<Option<ControlFileData>> {
+    let file_name = match file_path.file_name() {
+        Some(name) => name.to_string_lossy(),
+        None => return Ok(None),
+    };
+
+    if file_name.starts_with('.') {
+        // tar archives on macOs, created without COPYFILE_DISABLE=1 env var
+        // will contain "fork files", skip them.
+        return Ok(None);
+    }
+
     if file_path.starts_with("global") {
         let spcnode = postgres_ffi::pg_constants::GLOBALTABLESPACE_OID;
         let dbnode = 0;
 
-        match file_path
-            .file_name()
-            .expect("missing filename")
-            .to_string_lossy()
-            .as_ref()
-        {
+        match file_name.as_ref() {
             "pg_control" => {
-                let bytes = read_all_bytes(reader)?;
+                let bytes = read_all_bytes(reader).await?;
 
                 // Extract the checkpoint record and import it separately.
                 let pg_control = ControlFileData::decode(&bytes[..])?;
@@ -464,7 +491,7 @@ fn import_file<Reader: Read>(
                 return Ok(Some(pg_control));
             }
             "pg_filenode.map" => {
-                let bytes = read_all_bytes(reader)?;
+                let bytes = read_all_bytes(reader).await?;
                 modification.put_relmap_file(spcnode, dbnode, bytes)?;
                 debug!("imported relmap file")
             }
@@ -472,7 +499,7 @@ fn import_file<Reader: Read>(
                 debug!("ignored PG_VERSION file");
             }
             _ => {
-                import_rel(modification, file_path, spcnode, dbnode, reader, len)?;
+                import_rel(modification, file_path, spcnode, dbnode, reader, len).await?;
                 debug!("imported rel creation");
             }
         }
@@ -485,14 +512,9 @@ fn import_file<Reader: Read>(
             .to_string_lossy()
             .parse()?;
 
-        match file_path
-            .file_name()
-            .expect("missing base filename")
-            .to_string_lossy()
-            .as_ref()
-        {
+        match file_name.as_ref() {
             "pg_filenode.map" => {
-                let bytes = read_all_bytes(reader)?;
+                let bytes = read_all_bytes(reader).await?;
                 modification.put_relmap_file(spcnode, dbnode, bytes)?;
                 debug!("imported relmap file")
             }
@@ -500,40 +522,36 @@ fn import_file<Reader: Read>(
                 debug!("ignored PG_VERSION file");
             }
             _ => {
-                import_rel(modification, file_path, spcnode, dbnode, reader, len)?;
+                import_rel(modification, file_path, spcnode, dbnode, reader, len).await?;
                 debug!("imported rel creation");
             }
         }
     } else if file_path.starts_with("pg_xact") {
         let slru = SlruKind::Clog;
 
-        import_slru(modification, slru, file_path, reader, len)?;
+        import_slru(modification, slru, file_path, reader, len).await?;
         debug!("imported clog slru");
     } else if file_path.starts_with("pg_multixact/offsets") {
         let slru = SlruKind::MultiXactOffsets;
 
-        import_slru(modification, slru, file_path, reader, len)?;
+        import_slru(modification, slru, file_path, reader, len).await?;
         debug!("imported multixact offsets slru");
     } else if file_path.starts_with("pg_multixact/members") {
         let slru = SlruKind::MultiXactMembers;
 
-        import_slru(modification, slru, file_path, reader, len)?;
+        import_slru(modification, slru, file_path, reader, len).await?;
         debug!("imported multixact members slru");
     } else if file_path.starts_with("pg_twophase") {
-        let file_name = &file_path
-            .file_name()
-            .expect("missing twophase filename")
-            .to_string_lossy();
-        let xid = u32::from_str_radix(file_name, 16)?;
+        let xid = u32::from_str_radix(file_name.as_ref(), 16)?;
 
-        let bytes = read_all_bytes(reader)?;
+        let bytes = read_all_bytes(reader).await?;
         modification.put_twophase_file(xid, Bytes::copy_from_slice(&bytes[..]))?;
         debug!("imported twophase file");
     } else if file_path.starts_with("pg_wal") {
         debug!("found wal file in base section. ignore it");
     } else if file_path.starts_with("zenith.signal") {
         // Parse zenith signal file to set correct previous LSN
-        let bytes = read_all_bytes(reader)?;
+        let bytes = read_all_bytes(reader).await?;
         // zenith.signal format is "PREV LSN: prev_lsn"
         // TODO write serialization and deserialization in the same place.
         let zenith_signal = std::str::from_utf8(&bytes)?.trim();
@@ -570,8 +588,8 @@ fn import_file<Reader: Read>(
     Ok(None)
 }
 
-fn read_all_bytes<Reader: Read>(mut reader: Reader) -> Result<Bytes> {
+async fn read_all_bytes(reader: &mut (impl AsyncRead + Send + Sync + Unpin)) -> Result<Bytes> {
     let mut buf: Vec<u8> = vec![];
-    reader.read_to_end(&mut buf)?;
+    reader.read_to_end(&mut buf).await?;
     Ok(Bytes::copy_from_slice(&buf[..]))
 }
diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs
index eafcaa88d9..91cde477ad 100644
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -1,6 +1,7 @@
 mod auth;
 pub mod basebackup;
 pub mod config;
+pub mod consumption_metrics;
 pub mod http;
 pub mod import_datadir;
 pub mod keyspace;
@@ -8,15 +9,9 @@ pub(crate) mod metrics;
 pub mod page_cache;
 pub mod page_service;
 pub mod pgdatadir_mapping;
-pub mod profiling;
 pub mod repository;
-pub mod storage_sync2;
-pub use storage_sync2 as storage_sync;
 pub mod task_mgr;
 pub mod tenant;
-pub mod tenant_config;
-pub mod tenant_mgr;
-pub mod tenant_tasks;
 pub mod trace;
 pub mod virtual_file;
 pub mod walingest;
@@ -26,9 +21,8 @@ pub mod walredo;
 
 use std::path::Path;
 
-use tracing::info;
-
 use crate::task_mgr::TaskKind;
+use tracing::info;
 
 /// Current storage format version
 ///
@@ -47,15 +41,6 @@ pub const DELTA_FILE_MAGIC: u16 = 0x5A61;
 
 static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]);
 
-/// Config for the Repository checkpointer
-#[derive(Debug, Clone, Copy)]
-pub enum CheckpointConfig {
-    // Flush all in-memory data
-    Flush,
-    // Flush all in-memory data and reconstruct all page images
-    Forced,
-}
-
 pub async fn shutdown_pageserver(exit_code: i32) {
     // Shut down the libpq endpoint task. This prevents new connections from
     // being accepted.
@@ -66,7 +51,7 @@ pub async fn shutdown_pageserver(exit_code: i32) {
 
     // Shut down all the tenants. This flushes everything to disk and kills
     // the checkpoint and GC tasks.
-    tenant_mgr::shutdown_all_tenants().await;
+    tenant::mgr::shutdown_all_tenants().await;
 
     // Stop syncing with remote storage.
     //
@@ -99,7 +84,7 @@ async fn exponential_backoff(n: u32, base_increment: f64, max_seconds: f64) {
     }
 }
 
-fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_seconds: f64) -> f64 {
+pub fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_seconds: f64) -> f64 {
     if n == 0 {
         0.0
     } else {
diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index 454ff01f0e..b61e64048b 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -84,13 +84,20 @@ static LAST_RECORD_LSN: Lazy<IntGaugeVec> = Lazy::new(|| {
     .expect("failed to define a metric")
 });
 
-// Metrics for determining timeline's physical size.
-// A layered timeline's physical is defined as the total size of
-// (delta/image) layer files on disk.
-static CURRENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
+static RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
     register_uint_gauge_vec!(
-        "pageserver_current_physical_size",
-        "Current physical size grouped by timeline",
+        "pageserver_resident_physical_size",
+        "The size of the layer files present in the pageserver's filesystem.",
+        &["tenant_id", "timeline_id"]
+    )
+    .expect("failed to define a metric")
+});
+
+static REMOTE_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
+    register_uint_gauge_vec!(
+        "pageserver_remote_physical_size",
+        "The size of the layer files present in the remote storage that are listed in the the remote index_part.json.",
+        // Corollary: If any files are missing from the index part, they won't be included here.
         &["tenant_id", "timeline_id"]
     )
     .expect("failed to define a metric")
@@ -136,8 +143,9 @@ const STORAGE_IO_TIME_BUCKETS: &[f64] = &[
     1.0,      // 1 sec
 ];
 
-const STORAGE_IO_TIME_OPERATIONS: &[&str] =
-    &["open", "close", "read", "write", "seek", "fsync", "gc"];
+const STORAGE_IO_TIME_OPERATIONS: &[&str] = &[
+    "open", "close", "read", "write", "seek", "fsync", "gc", "metadata",
+];
 
 const STORAGE_IO_SIZE_OPERATIONS: &[&str] = &["read", "write"];
 
@@ -201,23 +209,42 @@ pub static NUM_ONDISK_LAYERS: Lazy<IntGauge> = Lazy::new(|| {
 
 // remote storage metrics
 
-pub static REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS: Lazy<IntGaugeVec> = Lazy::new(|| {
+/// NB: increment _after_ recording the current value into [`REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST`].
+static REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE: Lazy<IntGaugeVec> = Lazy::new(|| {
     register_int_gauge_vec!(
-        "pageserver_remote_upload_queue_unfinished_tasks",
-        "Number of tasks in the upload queue that are not finished yet.",
+        "pageserver_remote_timeline_client_calls_unfinished",
+        "Number of ongoing calls to remote timeline client. \
+         Used to populate pageserver_remote_timeline_client_calls_started. \
+         This metric is not useful for sampling from Prometheus, but useful in tests.",
         &["tenant_id", "timeline_id", "file_kind", "op_kind"],
     )
     .expect("failed to define a metric")
 });
 
-#[derive(Debug, Clone, Copy)]
+static REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST: Lazy<HistogramVec> = Lazy::new(|| {
+    register_histogram_vec!(
+        "pageserver_remote_timeline_client_calls_started",
+        "When calling a remote timeline client method, we record the current value \
+         of the calls_unfinished gauge in this histogram. Plot the histogram \
+         over time in a heatmap to visualize how many operations were ongoing \
+         at a given instant. It gives you a better idea of the queue depth \
+         than plotting the gauge directly, since operations may complete faster \
+         than the sampling interval.",
+        &["tenant_id", "timeline_id", "file_kind", "op_kind"],
+        // The calls_unfinished gauge is an integer gauge, hence we have integer buckets.
+        vec![0.0, 1.0, 2.0, 4.0, 6.0, 8.0, 10.0, 15.0, 20.0, 40.0, 60.0, 80.0, 100.0, 500.0],
+    )
+    .expect("failed to define a metric")
+});
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 pub enum RemoteOpKind {
     Upload,
     Download,
     Delete,
 }
 impl RemoteOpKind {
-    pub fn as_str(&self) -> &str {
+    pub fn as_str(&self) -> &'static str {
         match self {
             Self::Upload => "upload",
             Self::Download => "download",
@@ -226,13 +253,13 @@ impl RemoteOpKind {
     }
 }
 
-#[derive(Debug, Clone, Copy)]
+#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)]
 pub enum RemoteOpFileKind {
     Layer,
     Index,
 }
 impl RemoteOpFileKind {
-    pub fn as_str(&self) -> &str {
+    pub fn as_str(&self) -> &'static str {
         match self {
             Self::Layer => "layer",
             Self::Index => "index",
@@ -240,15 +267,12 @@ impl RemoteOpFileKind {
     }
 }
 
-pub static REMOTE_OPERATION_KINDS: &[&str] = &["upload", "download", "delete"];
-pub static REMOTE_OPERATION_FILE_KINDS: &[&str] = &["layer", "index"];
-pub static REMOTE_OPERATION_STATUSES: &[&str] = &["success", "failure"];
-
 pub static REMOTE_OPERATION_TIME: Lazy<HistogramVec> = Lazy::new(|| {
     register_histogram_vec!(
         "pageserver_remote_operation_seconds",
         "Time spent on remote storage operations. \
-        Grouped by tenant, timeline, operation_kind and status",
+        Grouped by tenant, timeline, operation_kind and status. \
+        Does not account for time spent waiting in remote timeline client's queues.",
         &["tenant_id", "timeline_id", "file_kind", "op_kind", "status"]
     )
     .expect("failed to define a metric")
@@ -365,7 +389,7 @@ pub struct TimelineMetrics {
     pub load_layer_map_histo: Histogram,
     pub last_record_gauge: IntGauge,
     pub wait_lsn_time_histo: Histogram,
-    pub current_physical_size_gauge: UIntGauge,
+    pub resident_physical_size_gauge: UIntGauge,
     /// copy of LayeredTimeline.current_logical_size
     pub current_logical_size_gauge: UIntGauge,
     pub num_persistent_files_created: IntCounter,
@@ -406,7 +430,7 @@ impl TimelineMetrics {
         let wait_lsn_time_histo = WAIT_LSN_TIME
             .get_metric_with_label_values(&[&tenant_id, &timeline_id])
             .unwrap();
-        let current_physical_size_gauge = CURRENT_PHYSICAL_SIZE
+        let resident_physical_size_gauge = RESIDENT_PHYSICAL_SIZE
             .get_metric_with_label_values(&[&tenant_id, &timeline_id])
             .unwrap();
         let current_logical_size_gauge = CURRENT_LOGICAL_SIZE
@@ -432,7 +456,7 @@ impl TimelineMetrics {
             load_layer_map_histo,
             last_record_gauge,
             wait_lsn_time_histo,
-            current_physical_size_gauge,
+            resident_physical_size_gauge,
             current_logical_size_gauge,
             num_persistent_files_created,
             persistent_bytes_written,
@@ -448,7 +472,7 @@ impl Drop for TimelineMetrics {
         let _ = MATERIALIZED_PAGE_CACHE_HIT.remove_label_values(&[tenant_id, timeline_id]);
         let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, timeline_id]);
         let _ = WAIT_LSN_TIME.remove_label_values(&[tenant_id, timeline_id]);
-        let _ = CURRENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
+        let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
         let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
         let _ = NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, timeline_id]);
         let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, timeline_id]);
@@ -467,21 +491,6 @@ impl Drop for TimelineMetrics {
         for op in SMGR_QUERY_TIME_OPERATIONS {
             let _ = SMGR_QUERY_TIME.remove_label_values(&[op, tenant_id, timeline_id]);
         }
-
-        let _ = REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS.remove_label_values(&[tenant_id, timeline_id]);
-        for file_kind in REMOTE_OPERATION_FILE_KINDS {
-            for op in REMOTE_OPERATION_KINDS {
-                for status in REMOTE_OPERATION_STATUSES {
-                    let _ = REMOTE_OPERATION_TIME.remove_label_values(&[
-                        tenant_id,
-                        timeline_id,
-                        file_kind,
-                        op,
-                        status,
-                    ]);
-                }
-            }
-        }
     }
 }
 
@@ -491,10 +500,198 @@ pub fn remove_tenant_metrics(tenant_id: &TenantId) {
 
 use futures::Future;
 use pin_project_lite::pin_project;
+use std::collections::HashMap;
 use std::pin::Pin;
+use std::sync::{Arc, Mutex};
 use std::task::{Context, Poll};
 use std::time::Instant;
 
+pub struct RemoteTimelineClientMetrics {
+    tenant_id: String,
+    timeline_id: String,
+    remote_physical_size_gauge: Mutex<Option<UIntGauge>>,
+    remote_operation_time: Mutex<HashMap<(&'static str, &'static str, &'static str), Histogram>>,
+    calls_unfinished_gauge: Mutex<HashMap<(&'static str, &'static str), IntGauge>>,
+    calls_started_hist: Mutex<HashMap<(&'static str, &'static str), Histogram>>,
+}
+
+impl RemoteTimelineClientMetrics {
+    pub fn new(tenant_id: &TenantId, timeline_id: &TimelineId) -> Self {
+        RemoteTimelineClientMetrics {
+            tenant_id: tenant_id.to_string(),
+            timeline_id: timeline_id.to_string(),
+            remote_operation_time: Mutex::new(HashMap::default()),
+            calls_unfinished_gauge: Mutex::new(HashMap::default()),
+            calls_started_hist: Mutex::new(HashMap::default()),
+            remote_physical_size_gauge: Mutex::new(None),
+        }
+    }
+    pub fn remote_physical_size_gauge(&self) -> UIntGauge {
+        let mut guard = self.remote_physical_size_gauge.lock().unwrap();
+        guard
+            .get_or_insert_with(|| {
+                REMOTE_PHYSICAL_SIZE
+                    .get_metric_with_label_values(&[
+                        &self.tenant_id.to_string(),
+                        &self.timeline_id.to_string(),
+                    ])
+                    .unwrap()
+            })
+            .clone()
+    }
+    pub fn remote_operation_time(
+        &self,
+        file_kind: &RemoteOpFileKind,
+        op_kind: &RemoteOpKind,
+        status: &'static str,
+    ) -> Histogram {
+        // XXX would be nice to have an upgradable RwLock
+        let mut guard = self.remote_operation_time.lock().unwrap();
+        let key = (file_kind.as_str(), op_kind.as_str(), status);
+        let metric = guard.entry(key).or_insert_with(move || {
+            REMOTE_OPERATION_TIME
+                .get_metric_with_label_values(&[
+                    &self.tenant_id.to_string(),
+                    &self.timeline_id.to_string(),
+                    key.0,
+                    key.1,
+                    key.2,
+                ])
+                .unwrap()
+        });
+        metric.clone()
+    }
+    fn calls_unfinished_gauge(
+        &self,
+        file_kind: &RemoteOpFileKind,
+        op_kind: &RemoteOpKind,
+    ) -> IntGauge {
+        // XXX would be nice to have an upgradable RwLock
+        let mut guard = self.calls_unfinished_gauge.lock().unwrap();
+        let key = (file_kind.as_str(), op_kind.as_str());
+        let metric = guard.entry(key).or_insert_with(move || {
+            REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE
+                .get_metric_with_label_values(&[
+                    &self.tenant_id.to_string(),
+                    &self.timeline_id.to_string(),
+                    key.0,
+                    key.1,
+                ])
+                .unwrap()
+        });
+        metric.clone()
+    }
+
+    fn calls_started_hist(
+        &self,
+        file_kind: &RemoteOpFileKind,
+        op_kind: &RemoteOpKind,
+    ) -> Histogram {
+        // XXX would be nice to have an upgradable RwLock
+        let mut guard = self.calls_started_hist.lock().unwrap();
+        let key = (file_kind.as_str(), op_kind.as_str());
+        let metric = guard.entry(key).or_insert_with(move || {
+            REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST
+                .get_metric_with_label_values(&[
+                    &self.tenant_id.to_string(),
+                    &self.timeline_id.to_string(),
+                    key.0,
+                    key.1,
+                ])
+                .unwrap()
+        });
+        metric.clone()
+    }
+}
+
+/// See [`RemoteTimelineClientMetrics::call_begin`].
+#[must_use]
+pub(crate) struct RemoteTimelineClientCallMetricGuard(Option<IntGauge>);
+
+impl RemoteTimelineClientCallMetricGuard {
+    /// Consume this guard object without decrementing the metric.
+    /// The caller vouches to do this manually, so that the prior increment of the gauge will cancel out.
+    pub fn will_decrement_manually(mut self) {
+        self.0 = None; // prevent drop() from decrementing
+    }
+}
+
+impl Drop for RemoteTimelineClientCallMetricGuard {
+    fn drop(&mut self) {
+        if let RemoteTimelineClientCallMetricGuard(Some(guard)) = self {
+            guard.dec();
+        }
+    }
+}
+
+impl RemoteTimelineClientMetrics {
+    /// Increment the metrics that track ongoing calls to the remote timeline client instance.
+    ///
+    /// Drop the returned guard object once the operation is finished to decrement the values.
+    /// Or, use [`RemoteTimelineClientCallMetricGuard::will_decrement_manually`] and [`call_end`] if that
+    /// is more suitable.
+    /// Never do both.
+    pub(crate) fn call_begin(
+        &self,
+        file_kind: &RemoteOpFileKind,
+        op_kind: &RemoteOpKind,
+    ) -> RemoteTimelineClientCallMetricGuard {
+        let unfinished_metric = self.calls_unfinished_gauge(file_kind, op_kind);
+        self.calls_started_hist(file_kind, op_kind)
+            .observe(unfinished_metric.get() as f64);
+        unfinished_metric.inc();
+        RemoteTimelineClientCallMetricGuard(Some(unfinished_metric))
+    }
+
+    /// Manually decrement the metric instead of using the guard object.
+    /// Using the guard object is generally preferable.
+    /// See [`call_begin`] for more context.
+    pub(crate) fn call_end(&self, file_kind: &RemoteOpFileKind, op_kind: &RemoteOpKind) {
+        let unfinished_metric = self.calls_unfinished_gauge(file_kind, op_kind);
+        debug_assert!(
+            unfinished_metric.get() > 0,
+            "begin and end should cancel out"
+        );
+        unfinished_metric.dec();
+    }
+}
+
+impl Drop for RemoteTimelineClientMetrics {
+    fn drop(&mut self) {
+        let RemoteTimelineClientMetrics {
+            tenant_id,
+            timeline_id,
+            remote_physical_size_gauge,
+            remote_operation_time,
+            calls_unfinished_gauge,
+            calls_started_hist,
+        } = self;
+        for ((a, b, c), _) in remote_operation_time.get_mut().unwrap().drain() {
+            let _ = REMOTE_OPERATION_TIME.remove_label_values(&[tenant_id, timeline_id, a, b, c]);
+        }
+        for ((a, b), _) in calls_unfinished_gauge.get_mut().unwrap().drain() {
+            let _ = REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE.remove_label_values(&[
+                tenant_id,
+                timeline_id,
+                a,
+                b,
+            ]);
+        }
+        for ((a, b), _) in calls_started_hist.get_mut().unwrap().drain() {
+            let _ = REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST.remove_label_values(&[
+                tenant_id,
+                timeline_id,
+                a,
+                b,
+            ]);
+        }
+        {
+            let _ = remote_physical_size_gauge; // use to avoid 'unused' warning in desctructuring above
+            let _ = REMOTE_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
+        }
+    }
+}
+
 /// Wrapper future that measures the time spent by a remote storage operation,
 /// and records the time and success/failure as a prometheus metric.
 pub trait MeasureRemoteOp: Sized {
@@ -504,6 +701,7 @@ pub trait MeasureRemoteOp: Sized {
         timeline_id: TimelineId,
         file_kind: RemoteOpFileKind,
         op: RemoteOpKind,
+        metrics: Arc<RemoteTimelineClientMetrics>,
     ) -> MeasuredRemoteOp<Self> {
         let start = Instant::now();
         MeasuredRemoteOp {
@@ -513,6 +711,7 @@ pub trait MeasureRemoteOp: Sized {
             file_kind,
             op,
             start,
+            metrics,
         }
     }
 }
@@ -529,6 +728,7 @@ pin_project! {
         file_kind: RemoteOpFileKind,
         op: RemoteOpKind,
         start: Instant,
+        metrics: Arc<RemoteTimelineClientMetrics>,
     }
 }
 
@@ -541,15 +741,8 @@ impl<F: Future<Output = Result<O, E>>, O, E> Future for MeasuredRemoteOp<F> {
         if let Poll::Ready(ref res) = poll_result {
             let duration = this.start.elapsed();
             let status = if res.is_ok() { &"success" } else { &"failure" };
-            REMOTE_OPERATION_TIME
-                .get_metric_with_label_values(&[
-                    &this.tenant_id.to_string(),
-                    &this.timeline_id.to_string(),
-                    this.file_kind.as_str(),
-                    this.op.as_str(),
-                    status,
-                ])
-                .unwrap()
+            this.metrics
+                .remote_operation_time(this.file_kind, this.op, status)
                 .observe(duration.as_secs_f64());
         }
         poll_result
diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index 036fb14e9b..b266a07337 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -9,7 +9,7 @@
 //  custom protocol.
 //
 
-use anyhow::{bail, ensure, Context, Result};
+use anyhow::Context;
 use bytes::Buf;
 use bytes::Bytes;
 use futures::{Stream, StreamExt};
@@ -19,6 +19,8 @@ use pageserver_api::models::{
     PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetPageResponse,
     PagestreamNblocksRequest, PagestreamNblocksResponse,
 };
+use pq_proto::ConnectionError;
+use pq_proto::FeStartupPacket;
 use pq_proto::{BeMessage, FeMessage, RowDescriptor};
 use std::io;
 use std::net::TcpListener;
@@ -26,11 +28,9 @@ use std::str;
 use std::str::FromStr;
 use std::sync::Arc;
 use std::time::Duration;
-use tokio::pin;
-use tokio_util::io::StreamReader;
-use tokio_util::io::SyncIoBridge;
 use tracing::*;
 use utils::id::ConnectionId;
+use utils::postgres_backend_async::QueryError;
 use utils::{
     auth::{Claims, JwtAuth, Scope},
     id::{TenantId, TimelineId},
@@ -42,16 +42,14 @@ use utils::{
 
 use crate::auth::check_permission;
 use crate::basebackup;
-use crate::config::{PageServerConf, ProfilingConfig};
+use crate::config::PageServerConf;
 use crate::import_datadir::import_wal_from_tar;
 use crate::metrics::{LIVE_CONNECTIONS_COUNT, SMGR_QUERY_TIME};
-use crate::profiling::profpoint_start;
 use crate::task_mgr;
 use crate::task_mgr::TaskKind;
+use crate::tenant::mgr;
 use crate::tenant::{Tenant, Timeline};
-use crate::tenant_mgr;
 use crate::trace::Tracer;
-use crate::CheckpointConfig;
 
 use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
 use postgres_ffi::BLCKSZ;
@@ -65,8 +63,8 @@ fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream<Item = io::Result<Byt
                 _ = task_mgr::shutdown_watcher() => {
                     // We were requested to shut down.
                     let msg = format!("pageserver is shutting down");
-                    let _ = pgb.write_message(&BeMessage::ErrorResponse(&msg));
-                    Err(anyhow::anyhow!(msg))
+                    let _ = pgb.write_message(&BeMessage::ErrorResponse(&msg, None));
+                    Err(QueryError::Other(anyhow::anyhow!(msg)))
                 }
 
                 msg = pgb.read_message() => { msg }
@@ -79,14 +77,15 @@ fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream<Item = io::Result<Byt
                         FeMessage::CopyDone => { break },
                         FeMessage::Sync => continue,
                         FeMessage::Terminate => {
-                            let msg = format!("client terminated connection with Terminate message during COPY");
-                            pgb.write_message(&BeMessage::ErrorResponse(&msg))?;
+                            let msg = "client terminated connection with Terminate message during COPY";
+                            let query_error_error = QueryError::Disconnected(ConnectionError::Socket(io::Error::new(io::ErrorKind::ConnectionReset, msg)));
+                            pgb.write_message(&BeMessage::ErrorResponse(msg, Some(query_error_error.pg_error_code())))?;
                             Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
                             break;
                         }
                         m => {
-                            let msg = format!("unexpected message {:?}", m);
-                            pgb.write_message(&BeMessage::ErrorResponse(&msg))?;
+                            let msg = format!("unexpected message {m:?}");
+                            pgb.write_message(&BeMessage::ErrorResponse(&msg, None))?;
                             Err(io::Error::new(io::ErrorKind::Other, msg))?;
                             break;
                         }
@@ -96,12 +95,16 @@ fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream<Item = io::Result<Byt
                 }
                 Ok(None) => {
                     let msg = "client closed connection during COPY";
-                    pgb.write_message(&BeMessage::ErrorResponse(msg))?;
+                    let query_error_error = QueryError::Disconnected(ConnectionError::Socket(io::Error::new(io::ErrorKind::ConnectionReset, msg)));
+                    pgb.write_message(&BeMessage::ErrorResponse(msg, Some(query_error_error.pg_error_code())))?;
                     pgb.flush().await?;
                     Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
                 }
-                Err(e) => {
-                    Err(io::Error::new(io::ErrorKind::Other, e))?;
+                Err(QueryError::Disconnected(ConnectionError::Socket(io_error))) => {
+                    Err(io_error)?;
+                }
+                Err(other) => {
+                    Err(io::Error::new(io::ErrorKind::Other, other))?;
                 }
             };
         }
@@ -199,23 +202,19 @@ async fn page_service_conn_main(
             // we've been requested to shut down
             Ok(())
         }
-        Err(err) => {
-            let root_cause_io_err_kind = err
-                .root_cause()
-                .downcast_ref::<io::Error>()
-                .map(|e| e.kind());
-
+        Err(QueryError::Disconnected(ConnectionError::Socket(io_error))) => {
             // `ConnectionReset` error happens when the Postgres client closes the connection.
             // As this disconnection happens quite often and is expected,
             // we decided to downgrade the logging level to `INFO`.
             // See: https://github.com/neondatabase/neon/issues/1683.
-            if root_cause_io_err_kind == Some(io::ErrorKind::ConnectionReset) {
+            if io_error.kind() == io::ErrorKind::ConnectionReset {
                 info!("Postgres client disconnected");
                 Ok(())
             } else {
-                Err(err)
+                Err(io_error).context("Postgres connection error")
             }
         }
+        other => other.context("Postgres query error"),
     }
 }
 
@@ -254,7 +253,7 @@ impl PageRequestMetrics {
 
 #[derive(Debug)]
 struct PageServerHandler {
-    conf: &'static PageServerConf,
+    _conf: &'static PageServerConf,
     auth: Option<Arc<JwtAuth>>,
     claims: Option<Claims>,
 }
@@ -262,7 +261,7 @@ struct PageServerHandler {
 impl PageServerHandler {
     pub fn new(conf: &'static PageServerConf, auth: Option<Arc<JwtAuth>>) -> Self {
         PageServerHandler {
-            conf,
+            _conf: conf,
             auth,
             claims: None,
         }
@@ -317,7 +316,7 @@ impl PageServerHandler {
                 Some(FeMessage::CopyData(bytes)) => bytes,
                 Some(FeMessage::Terminate) => break,
                 Some(m) => {
-                    bail!("unexpected message: {m:?} during COPY");
+                    anyhow::bail!("unexpected message: {m:?} during COPY");
                 }
                 None => break, // client disconnected
             };
@@ -374,7 +373,7 @@ impl PageServerHandler {
         base_lsn: Lsn,
         _end_lsn: Lsn,
         pg_version: u32,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), QueryError> {
         task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
         // Create empty timeline
         info!("creating new timeline");
@@ -396,9 +395,7 @@ impl PageServerHandler {
         pgb.write_message(&BeMessage::CopyInResponse)?;
         pgb.flush().await?;
 
-        let copyin_stream = copyin_stream(pgb);
-        pin!(copyin_stream);
-
+        let mut copyin_stream = Box::pin(copyin_stream(pgb));
         timeline
             .import_basebackup_from_tar(&mut copyin_stream, base_lsn)
             .await?;
@@ -430,11 +427,16 @@ impl PageServerHandler {
         timeline_id: TimelineId,
         start_lsn: Lsn,
         end_lsn: Lsn,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), QueryError> {
         task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
 
         let timeline = get_active_timeline_with_timeout(tenant_id, timeline_id).await?;
-        ensure!(timeline.get_last_record_lsn() == start_lsn);
+        let last_record_lsn = timeline.get_last_record_lsn();
+        if last_record_lsn != start_lsn {
+            return Err(QueryError::Other(
+                anyhow::anyhow!("Cannot import WAL from Lsn {start_lsn} because timeline does not start from the same lsn: {last_record_lsn}"))
+            );
+        }
 
         // TODO leave clean state on error. For now you can use detach to clean
         // up broken state from a failed import.
@@ -444,10 +446,8 @@ impl PageServerHandler {
         pgb.write_message(&BeMessage::CopyInResponse)?;
         pgb.flush().await?;
         let mut copyin_stream = Box::pin(copyin_stream(pgb));
-        let reader = SyncIoBridge::new(StreamReader::new(&mut copyin_stream));
-        tokio::task::block_in_place(|| {
-            import_wal_from_tar(&*timeline, reader, start_lsn, end_lsn)
-        })?;
+        let mut reader = tokio_util::io::StreamReader::new(&mut copyin_stream);
+        import_wal_from_tar(&timeline, &mut reader, start_lsn, end_lsn).await?;
         info!("wal import complete");
 
         // Drain the rest of the Copy data
@@ -460,13 +460,17 @@ impl PageServerHandler {
         }
 
         // TODO Does it make sense to overshoot?
-        ensure!(timeline.get_last_record_lsn() >= end_lsn);
+        if timeline.get_last_record_lsn() < end_lsn {
+            return Err(QueryError::Other(
+                anyhow::anyhow!("Cannot import WAL from Lsn {start_lsn} because timeline does not start from the same lsn: {last_record_lsn}"))
+            );
+        }
 
         // Flush data to disk, then upload to s3. No need for a forced checkpoint.
         // We only want to persist the data, and it doesn't matter if it's in the
         // shape of deltas or images.
         info!("flushing layers");
-        timeline.checkpoint(CheckpointConfig::Flush).await?;
+        timeline.freeze_and_flush().await?;
 
         info!("done");
         Ok(())
@@ -489,7 +493,7 @@ impl PageServerHandler {
         mut lsn: Lsn,
         latest: bool,
         latest_gc_cutoff_lsn: &RcuReadGuard<Lsn>,
-    ) -> Result<Lsn> {
+    ) -> anyhow::Result<Lsn> {
         if latest {
             // Latest page version was requested. If LSN is given, it is a hint
             // to the page server that there have been no modifications to the
@@ -520,11 +524,11 @@ impl PageServerHandler {
             }
         } else {
             if lsn == Lsn(0) {
-                bail!("invalid LSN(0) in request");
+                anyhow::bail!("invalid LSN(0) in request");
             }
             timeline.wait_lsn(lsn).await?;
         }
-        ensure!(
+        anyhow::ensure!(
             lsn >= **latest_gc_cutoff_lsn,
             "tried to request a page version that was garbage collected. requested at {} gc cutoff {}",
             lsn, **latest_gc_cutoff_lsn
@@ -537,12 +541,15 @@ impl PageServerHandler {
         &self,
         timeline: &Timeline,
         req: &PagestreamExistsRequest,
-    ) -> Result<PagestreamBeMessage> {
+    ) -> anyhow::Result<PagestreamBeMessage> {
         let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
         let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)
             .await?;
 
-        let exists = timeline.get_rel_exists(req.rel, lsn, req.latest)?;
+        let exists = crate::tenant::with_ondemand_download(|| {
+            timeline.get_rel_exists(req.rel, lsn, req.latest)
+        })
+        .await?;
 
         Ok(PagestreamBeMessage::Exists(PagestreamExistsResponse {
             exists,
@@ -554,12 +561,15 @@ impl PageServerHandler {
         &self,
         timeline: &Timeline,
         req: &PagestreamNblocksRequest,
-    ) -> Result<PagestreamBeMessage> {
+    ) -> anyhow::Result<PagestreamBeMessage> {
         let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
         let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)
             .await?;
 
-        let n_blocks = timeline.get_rel_size(req.rel, lsn, req.latest)?;
+        let n_blocks = crate::tenant::with_ondemand_download(|| {
+            timeline.get_rel_size(req.rel, lsn, req.latest)
+        })
+        .await?;
 
         Ok(PagestreamBeMessage::Nblocks(PagestreamNblocksResponse {
             n_blocks,
@@ -571,14 +581,15 @@ impl PageServerHandler {
         &self,
         timeline: &Timeline,
         req: &PagestreamDbSizeRequest,
-    ) -> Result<PagestreamBeMessage> {
+    ) -> anyhow::Result<PagestreamBeMessage> {
         let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
         let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)
             .await?;
 
-        let total_blocks =
-            timeline.get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, lsn, req.latest)?;
-
+        let total_blocks = crate::tenant::with_ondemand_download(|| {
+            timeline.get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, lsn, req.latest)
+        })
+        .await?;
         let db_size = total_blocks as i64 * BLCKSZ as i64;
 
         Ok(PagestreamBeMessage::DbSize(PagestreamDbSizeResponse {
@@ -591,7 +602,7 @@ impl PageServerHandler {
         &self,
         timeline: &Timeline,
         req: &PagestreamGetPageRequest,
-    ) -> Result<PagestreamBeMessage> {
+    ) -> anyhow::Result<PagestreamBeMessage> {
         let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
         let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)
             .await?;
@@ -604,11 +615,10 @@ impl PageServerHandler {
         }
         */
 
-        // FIXME: this profiling now happens at different place than it used to. The
-        // current profiling is based on a thread-local variable, so it doesn't work
-        // across awaits
-        let _profiling_guard = profpoint_start(self.conf, ProfilingConfig::PageRequests);
-        let page = timeline.get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest)?;
+        let page = crate::tenant::with_ondemand_download(|| {
+            timeline.get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest)
+        })
+        .await?;
 
         Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse {
             page,
@@ -642,16 +652,12 @@ impl PageServerHandler {
         pgb.flush().await?;
 
         /* Send a tarball of the latest layer on the timeline */
-        let mut writer = CopyDataSink {
-            pgb,
-            rt: tokio::runtime::Handle::current(),
-        };
-        tokio::task::block_in_place(|| {
-            let basebackup =
-                basebackup::Basebackup::new(&mut writer, &timeline, lsn, prev_lsn, full_backup)?;
-            tracing::Span::current().record("lsn", &basebackup.lsn.to_string().as_str());
-            basebackup.send_tarball()
-        })?;
+        {
+            let mut writer = pgb.copyout_writer();
+            basebackup::send_basebackup_tarball(&mut writer, &timeline, lsn, prev_lsn, full_backup)
+                .await?;
+        }
+
         pgb.write_message(&BeMessage::CopyDone)?;
         pgb.flush().await?;
         info!("basebackup complete");
@@ -661,7 +667,7 @@ impl PageServerHandler {
 
     // when accessing management api supply None as an argument
     // when using to authorize tenant pass corresponding tenant id
-    fn check_permission(&self, tenant_id: Option<TenantId>) -> Result<()> {
+    fn check_permission(&self, tenant_id: Option<TenantId>) -> anyhow::Result<()> {
         if self.auth.is_none() {
             // auth is set to Trust, nothing to check so just return ok
             return Ok(());
@@ -683,20 +689,19 @@ impl postgres_backend_async::Handler for PageServerHandler {
         &mut self,
         _pgb: &mut PostgresBackend,
         jwt_response: &[u8],
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), QueryError> {
         // this unwrap is never triggered, because check_auth_jwt only called when auth_type is NeonJWT
         // which requires auth to be present
         let data = self
             .auth
             .as_ref()
             .unwrap()
-            .decode(str::from_utf8(jwt_response)?)?;
+            .decode(str::from_utf8(jwt_response).context("jwt response is not UTF-8")?)?;
 
-        if matches!(data.claims.scope, Scope::Tenant) {
-            ensure!(
-                data.claims.tenant_id.is_some(),
+        if matches!(data.claims.scope, Scope::Tenant) && data.claims.tenant_id.is_none() {
+            return Err(QueryError::Other(anyhow::anyhow!(
                 "jwt token scope is Tenant, but tenant id is missing"
-            )
+            )));
         }
 
         info!(
@@ -708,22 +713,33 @@ impl postgres_backend_async::Handler for PageServerHandler {
         Ok(())
     }
 
+    fn startup(
+        &mut self,
+        _pgb: &mut PostgresBackend,
+        _sm: &FeStartupPacket,
+    ) -> Result<(), QueryError> {
+        Ok(())
+    }
+
     async fn process_query(
         &mut self,
         pgb: &mut PostgresBackend,
         query_string: &str,
-    ) -> anyhow::Result<()> {
-        debug!("process query {:?}", query_string);
+    ) -> Result<(), QueryError> {
+        debug!("process query {query_string:?}");
 
         if query_string.starts_with("pagestream ") {
             let (_, params_raw) = query_string.split_at("pagestream ".len());
             let params = params_raw.split(' ').collect::<Vec<_>>();
-            ensure!(
-                params.len() == 2,
-                "invalid param number for pagestream command"
-            );
-            let tenant_id = TenantId::from_str(params[0])?;
-            let timeline_id = TimelineId::from_str(params[1])?;
+            if params.len() != 2 {
+                return Err(QueryError::Other(anyhow::anyhow!(
+                    "invalid param number for pagestream command"
+                )));
+            }
+            let tenant_id = TenantId::from_str(params[0])
+                .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
+            let timeline_id = TimelineId::from_str(params[1])
+                .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
 
             self.check_permission(Some(tenant_id))?;
 
@@ -733,18 +749,24 @@ impl postgres_backend_async::Handler for PageServerHandler {
             let (_, params_raw) = query_string.split_at("basebackup ".len());
             let params = params_raw.split_whitespace().collect::<Vec<_>>();
 
-            ensure!(
-                params.len() >= 2,
-                "invalid param number for basebackup command"
-            );
+            if params.len() < 2 {
+                return Err(QueryError::Other(anyhow::anyhow!(
+                    "invalid param number for basebackup command"
+                )));
+            }
 
-            let tenant_id = TenantId::from_str(params[0])?;
-            let timeline_id = TimelineId::from_str(params[1])?;
+            let tenant_id = TenantId::from_str(params[0])
+                .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
+            let timeline_id = TimelineId::from_str(params[1])
+                .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
 
             self.check_permission(Some(tenant_id))?;
 
             let lsn = if params.len() == 3 {
-                Some(Lsn::from_str(params[2])?)
+                Some(
+                    Lsn::from_str(params[2])
+                        .with_context(|| format!("Failed to parse Lsn from {}", params[2]))?,
+                )
             } else {
                 None
             };
@@ -759,13 +781,16 @@ impl postgres_backend_async::Handler for PageServerHandler {
             let (_, params_raw) = query_string.split_at("get_last_record_rlsn ".len());
             let params = params_raw.split_whitespace().collect::<Vec<_>>();
 
-            ensure!(
-                params.len() == 2,
-                "invalid param number for get_last_record_rlsn command"
-            );
+            if params.len() != 2 {
+                return Err(QueryError::Other(anyhow::anyhow!(
+                    "invalid param number for get_last_record_rlsn command"
+                )));
+            }
 
-            let tenant_id = TenantId::from_str(params[0])?;
-            let timeline_id = TimelineId::from_str(params[1])?;
+            let tenant_id = TenantId::from_str(params[0])
+                .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
+            let timeline_id = TimelineId::from_str(params[1])
+                .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
 
             self.check_permission(Some(tenant_id))?;
             let timeline = get_active_timeline_with_timeout(tenant_id, timeline_id).await?;
@@ -787,22 +812,31 @@ impl postgres_backend_async::Handler for PageServerHandler {
             let (_, params_raw) = query_string.split_at("fullbackup ".len());
             let params = params_raw.split_whitespace().collect::<Vec<_>>();
 
-            ensure!(
-                params.len() >= 2,
-                "invalid param number for fullbackup command"
-            );
+            if params.len() < 2 {
+                return Err(QueryError::Other(anyhow::anyhow!(
+                    "invalid param number for fullbackup command"
+                )));
+            }
 
-            let tenant_id = TenantId::from_str(params[0])?;
-            let timeline_id = TimelineId::from_str(params[1])?;
+            let tenant_id = TenantId::from_str(params[0])
+                .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
+            let timeline_id = TimelineId::from_str(params[1])
+                .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
 
             // The caller is responsible for providing correct lsn and prev_lsn.
             let lsn = if params.len() > 2 {
-                Some(Lsn::from_str(params[2])?)
+                Some(
+                    Lsn::from_str(params[2])
+                        .with_context(|| format!("Failed to parse Lsn from {}", params[2]))?,
+                )
             } else {
                 None
             };
             let prev_lsn = if params.len() > 3 {
-                Some(Lsn::from_str(params[3])?)
+                Some(
+                    Lsn::from_str(params[3])
+                        .with_context(|| format!("Failed to parse Lsn from {}", params[3]))?,
+                )
             } else {
                 None
             };
@@ -827,12 +861,21 @@ impl postgres_backend_async::Handler for PageServerHandler {
             //     -c "import basebackup $TENANT $TIMELINE $START_LSN $END_LSN $PG_VERSION"
             let (_, params_raw) = query_string.split_at("import basebackup ".len());
             let params = params_raw.split_whitespace().collect::<Vec<_>>();
-            ensure!(params.len() == 5);
-            let tenant_id = TenantId::from_str(params[0])?;
-            let timeline_id = TimelineId::from_str(params[1])?;
-            let base_lsn = Lsn::from_str(params[2])?;
-            let end_lsn = Lsn::from_str(params[3])?;
-            let pg_version = u32::from_str(params[4])?;
+            if params.len() != 5 {
+                return Err(QueryError::Other(anyhow::anyhow!(
+                    "invalid param number for import basebackup command"
+                )));
+            }
+            let tenant_id = TenantId::from_str(params[0])
+                .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
+            let timeline_id = TimelineId::from_str(params[1])
+                .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
+            let base_lsn = Lsn::from_str(params[2])
+                .with_context(|| format!("Failed to parse Lsn from {}", params[2]))?;
+            let end_lsn = Lsn::from_str(params[3])
+                .with_context(|| format!("Failed to parse Lsn from {}", params[3]))?;
+            let pg_version = u32::from_str(params[4])
+                .with_context(|| format!("Failed to parse pg_version from {}", params[4]))?;
 
             self.check_permission(Some(tenant_id))?;
 
@@ -850,7 +893,10 @@ impl postgres_backend_async::Handler for PageServerHandler {
                 Ok(()) => pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?,
                 Err(e) => {
                     error!("error importing base backup between {base_lsn} and {end_lsn}: {e:?}");
-                    pgb.write_message(&BeMessage::ErrorResponse(&e.to_string()))?
+                    pgb.write_message(&BeMessage::ErrorResponse(
+                        &e.to_string(),
+                        Some(e.pg_error_code()),
+                    ))?
                 }
             };
         } else if query_string.starts_with("import wal ") {
@@ -860,11 +906,19 @@ impl postgres_backend_async::Handler for PageServerHandler {
             // caller should poll the http api to check when that is done.
             let (_, params_raw) = query_string.split_at("import wal ".len());
             let params = params_raw.split_whitespace().collect::<Vec<_>>();
-            ensure!(params.len() == 4);
-            let tenant_id = TenantId::from_str(params[0])?;
-            let timeline_id = TimelineId::from_str(params[1])?;
-            let start_lsn = Lsn::from_str(params[2])?;
-            let end_lsn = Lsn::from_str(params[3])?;
+            if params.len() != 4 {
+                return Err(QueryError::Other(anyhow::anyhow!(
+                    "invalid param number for import wal command"
+                )));
+            }
+            let tenant_id = TenantId::from_str(params[0])
+                .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
+            let timeline_id = TimelineId::from_str(params[1])
+                .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
+            let start_lsn = Lsn::from_str(params[2])
+                .with_context(|| format!("Failed to parse Lsn from {}", params[2]))?;
+            let end_lsn = Lsn::from_str(params[3])
+                .with_context(|| format!("Failed to parse Lsn from {}", params[3]))?;
 
             self.check_permission(Some(tenant_id))?;
 
@@ -875,7 +929,10 @@ impl postgres_backend_async::Handler for PageServerHandler {
                 Ok(()) => pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?,
                 Err(e) => {
                     error!("error importing WAL between {start_lsn} and {end_lsn}: {e:?}");
-                    pgb.write_message(&BeMessage::ErrorResponse(&e.to_string()))?
+                    pgb.write_message(&BeMessage::ErrorResponse(
+                        &e.to_string(),
+                        Some(e.pg_error_code()),
+                    ))?
                 }
             };
         } else if query_string.to_ascii_lowercase().starts_with("set ") {
@@ -886,8 +943,13 @@ impl postgres_backend_async::Handler for PageServerHandler {
             // show <tenant_id>
             let (_, params_raw) = query_string.split_at("show ".len());
             let params = params_raw.split(' ').collect::<Vec<_>>();
-            ensure!(params.len() == 1, "invalid param number for config command");
-            let tenant_id = TenantId::from_str(params[0])?;
+            if params.len() != 1 {
+                return Err(QueryError::Other(anyhow::anyhow!(
+                    "invalid param number for config command"
+                )));
+            }
+            let tenant_id = TenantId::from_str(params[0])
+                .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
 
             self.check_permission(Some(tenant_id))?;
 
@@ -928,7 +990,9 @@ impl postgres_backend_async::Handler for PageServerHandler {
             ]))?
             .write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
         } else {
-            bail!("unknown command");
+            return Err(QueryError::Other(anyhow::anyhow!(
+                "unknown command {query_string}"
+            )));
         }
 
         Ok(())
@@ -940,8 +1004,8 @@ impl postgres_backend_async::Handler for PageServerHandler {
 /// If the tenant is Loading, waits for it to become Active, for up to 30 s. That
 /// ensures that queries don't fail immediately after pageserver startup, because
 /// all tenants are still loading.
-async fn get_active_tenant_with_timeout(tenant_id: TenantId) -> Result<Arc<Tenant>> {
-    let tenant = tenant_mgr::get_tenant(tenant_id, false).await?;
+async fn get_active_tenant_with_timeout(tenant_id: TenantId) -> anyhow::Result<Arc<Tenant>> {
+    let tenant = mgr::get_tenant(tenant_id, false).await?;
     match tokio::time::timeout(Duration::from_secs(30), tenant.wait_to_become_active()).await {
         Ok(wait_result) => wait_result
             // no .context(), the error message is good enough and some tests depend on it
@@ -954,37 +1018,8 @@ async fn get_active_tenant_with_timeout(tenant_id: TenantId) -> Result<Arc<Tenan
 async fn get_active_timeline_with_timeout(
     tenant_id: TenantId,
     timeline_id: TimelineId,
-) -> Result<Arc<Timeline>> {
+) -> anyhow::Result<Arc<Timeline>> {
     get_active_tenant_with_timeout(tenant_id)
         .await
         .and_then(|tenant| tenant.get_timeline(timeline_id, true))
 }
-
-///
-/// A std::io::Write implementation that wraps all data written to it in CopyData
-/// messages.
-///
-struct CopyDataSink<'a> {
-    pgb: &'a mut PostgresBackend,
-    rt: tokio::runtime::Handle,
-}
-
-impl<'a> io::Write for CopyDataSink<'a> {
-    fn write(&mut self, data: &[u8]) -> io::Result<usize> {
-        // CopyData
-        // FIXME: if the input is large, we should split it into multiple messages.
-        // Not sure what the threshold should be, but the ultimate hard limit is that
-        // the length cannot exceed u32.
-        // FIXME: flush isn't really required, but makes it easier
-        // to view in wireshark
-        self.pgb.write_message(&BeMessage::CopyData(data))?;
-        self.rt.block_on(self.pgb.flush())?;
-        trace!("CopyData sent for {} bytes!", data.len());
-
-        Ok(data.len())
-    }
-    fn flush(&mut self) -> io::Result<()> {
-        // no-op
-        Ok(())
-    }
-}
diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs
index 0e334a63df..82b1576145 100644
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -6,11 +6,12 @@
 //! walingest.rs handles a few things like implicit relation creation and extension.
 //! Clarify that)
 //!
+use super::tenant::PageReconstructResult;
 use crate::keyspace::{KeySpace, KeySpaceAccum};
-use crate::repository::*;
-use crate::tenant::Timeline;
+use crate::tenant::{with_ondemand_download, Timeline};
 use crate::walrecord::NeonWalRecord;
-use anyhow::{bail, ensure, Result};
+use crate::{repository::*, try_no_ondemand_download};
+use anyhow::Context;
 use bytes::{Buf, Bytes};
 use pageserver_api::reltag::{RelTag, SlruKind};
 use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
@@ -19,6 +20,7 @@ use postgres_ffi::{Oid, TimestampTz, TransactionId};
 use serde::{Deserialize, Serialize};
 use std::collections::{hash_map, HashMap, HashSet};
 use std::ops::Range;
+use tokio_util::sync::CancellationToken;
 use tracing::{debug, trace, warn};
 use utils::{bin_ser::BeSer, lsn::Lsn};
 
@@ -33,6 +35,14 @@ pub enum LsnForTimestamp {
     NoData(Lsn),
 }
 
+#[derive(Debug, thiserror::Error)]
+pub enum CalculateLogicalSizeError {
+    #[error("cancelled")]
+    Cancelled,
+    #[error(transparent)]
+    Other(#[from] anyhow::Error),
+}
+
 ///
 /// This impl provides all the functionality to store PostgreSQL relations, SLRUs,
 /// and other special kinds of files, in a versioned key-value store. The
@@ -88,16 +98,18 @@ impl Timeline {
         blknum: BlockNumber,
         lsn: Lsn,
         latest: bool,
-    ) -> Result<Bytes> {
-        ensure!(tag.relnode != 0, "invalid relnode");
+    ) -> PageReconstructResult<Bytes> {
+        if tag.relnode == 0 {
+            return PageReconstructResult::from(anyhow::anyhow!("invalid relnode"));
+        }
 
-        let nblocks = self.get_rel_size(tag, lsn, latest)?;
+        let nblocks = try_no_ondemand_download!(self.get_rel_size(tag, lsn, latest));
         if blknum >= nblocks {
             debug!(
                 "read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page",
                 tag, blknum, lsn, nblocks
             );
-            return Ok(ZERO_PAGE.clone());
+            return PageReconstructResult::Success(ZERO_PAGE.clone());
         }
 
         let key = rel_block_to_key(tag, blknum);
@@ -105,38 +117,51 @@ impl Timeline {
     }
 
     // Get size of a database in blocks
-    pub fn get_db_size(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn, latest: bool) -> Result<usize> {
+    pub fn get_db_size(
+        &self,
+        spcnode: Oid,
+        dbnode: Oid,
+        lsn: Lsn,
+        latest: bool,
+    ) -> PageReconstructResult<usize> {
         let mut total_blocks = 0;
 
-        let rels = self.list_rels(spcnode, dbnode, lsn)?;
+        let rels = try_no_ondemand_download!(self.list_rels(spcnode, dbnode, lsn));
 
         for rel in rels {
-            let n_blocks = self.get_rel_size(rel, lsn, latest)?;
+            let n_blocks = try_no_ondemand_download!(self.get_rel_size(rel, lsn, latest));
             total_blocks += n_blocks as usize;
         }
-        Ok(total_blocks)
+        PageReconstructResult::Success(total_blocks)
     }
 
     /// Get size of a relation file
-    pub fn get_rel_size(&self, tag: RelTag, lsn: Lsn, latest: bool) -> Result<BlockNumber> {
-        ensure!(tag.relnode != 0, "invalid relnode");
+    pub fn get_rel_size(
+        &self,
+        tag: RelTag,
+        lsn: Lsn,
+        latest: bool,
+    ) -> PageReconstructResult<BlockNumber> {
+        if tag.relnode == 0 {
+            return PageReconstructResult::from(anyhow::anyhow!("invalid relnode"));
+        }
 
         if let Some(nblocks) = self.get_cached_rel_size(&tag, lsn) {
-            return Ok(nblocks);
+            return PageReconstructResult::Success(nblocks);
         }
 
         if (tag.forknum == FSM_FORKNUM || tag.forknum == VISIBILITYMAP_FORKNUM)
-            && !self.get_rel_exists(tag, lsn, latest)?
+            && !try_no_ondemand_download!(self.get_rel_exists(tag, lsn, latest))
         {
             // FIXME: Postgres sometimes calls smgrcreate() to create
             // FSM, and smgrnblocks() on it immediately afterwards,
             // without extending it.  Tolerate that by claiming that
             // any non-existent FSM fork has size 0.
-            return Ok(0);
+            return PageReconstructResult::Success(0);
         }
 
         let key = rel_size_to_key(tag);
-        let mut buf = self.get(key, lsn)?;
+        let mut buf = try_no_ondemand_download!(self.get(key, lsn));
         let nblocks = buf.get_u32_le();
 
         if latest {
@@ -149,43 +174,62 @@ impl Timeline {
             // associated with most recent value of LSN.
             self.update_cached_rel_size(tag, lsn, nblocks);
         }
-        Ok(nblocks)
+        PageReconstructResult::Success(nblocks)
     }
 
     /// Does relation exist?
-    pub fn get_rel_exists(&self, tag: RelTag, lsn: Lsn, _latest: bool) -> Result<bool> {
-        ensure!(tag.relnode != 0, "invalid relnode");
+    pub fn get_rel_exists(
+        &self,
+        tag: RelTag,
+        lsn: Lsn,
+        _latest: bool,
+    ) -> PageReconstructResult<bool> {
+        if tag.relnode == 0 {
+            return PageReconstructResult::from(anyhow::anyhow!("invalid relnode"));
+        }
 
         // first try to lookup relation in cache
         if let Some(_nblocks) = self.get_cached_rel_size(&tag, lsn) {
-            return Ok(true);
+            return PageReconstructResult::Success(true);
         }
         // fetch directory listing
         let key = rel_dir_to_key(tag.spcnode, tag.dbnode);
-        let buf = self.get(key, lsn)?;
-        let dir = RelDirectory::des(&buf)?;
+        let buf = try_no_ondemand_download!(self.get(key, lsn));
 
-        let exists = dir.rels.get(&(tag.relnode, tag.forknum)).is_some();
-
-        Ok(exists)
+        match RelDirectory::des(&buf).context("deserialization failure") {
+            Ok(dir) => {
+                let exists = dir.rels.get(&(tag.relnode, tag.forknum)).is_some();
+                PageReconstructResult::Success(exists)
+            }
+            Err(e) => PageReconstructResult::from(e),
+        }
     }
 
     /// Get a list of all existing relations in given tablespace and database.
-    pub fn list_rels(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> Result<HashSet<RelTag>> {
+    pub fn list_rels(
+        &self,
+        spcnode: Oid,
+        dbnode: Oid,
+        lsn: Lsn,
+    ) -> PageReconstructResult<HashSet<RelTag>> {
         // fetch directory listing
         let key = rel_dir_to_key(spcnode, dbnode);
-        let buf = self.get(key, lsn)?;
-        let dir = RelDirectory::des(&buf)?;
+        let buf = try_no_ondemand_download!(self.get(key, lsn));
 
-        let rels: HashSet<RelTag> =
-            HashSet::from_iter(dir.rels.iter().map(|(relnode, forknum)| RelTag {
-                spcnode,
-                dbnode,
-                relnode: *relnode,
-                forknum: *forknum,
-            }));
+        match RelDirectory::des(&buf).context("deserialization failure") {
+            Ok(dir) => {
+                let rels: HashSet<RelTag> =
+                    HashSet::from_iter(dir.rels.iter().map(|(relnode, forknum)| RelTag {
+                        spcnode,
+                        dbnode,
+                        relnode: *relnode,
+                        forknum: *forknum,
+                    }));
 
-        Ok(rels)
+                PageReconstructResult::Success(rels)
+            }
+            Err(e) => PageReconstructResult::from(e),
+        }
     }
 
     /// Look up given SLRU page version.
@@ -195,7 +239,7 @@ impl Timeline {
         segno: u32,
         blknum: BlockNumber,
         lsn: Lsn,
-    ) -> Result<Bytes> {
+    ) -> PageReconstructResult<Bytes> {
         let key = slru_block_to_key(kind, segno, blknum);
         self.get(key, lsn)
     }
@@ -206,21 +250,30 @@ impl Timeline {
         kind: SlruKind,
         segno: u32,
         lsn: Lsn,
-    ) -> Result<BlockNumber> {
+    ) -> PageReconstructResult<BlockNumber> {
         let key = slru_segment_size_to_key(kind, segno);
-        let mut buf = self.get(key, lsn)?;
-        Ok(buf.get_u32_le())
+        let mut buf = try_no_ondemand_download!(self.get(key, lsn));
+        PageReconstructResult::Success(buf.get_u32_le())
     }
 
     /// Get size of an SLRU segment
-    pub fn get_slru_segment_exists(&self, kind: SlruKind, segno: u32, lsn: Lsn) -> Result<bool> {
+    pub fn get_slru_segment_exists(
+        &self,
+        kind: SlruKind,
+        segno: u32,
+        lsn: Lsn,
+    ) -> PageReconstructResult<bool> {
         // fetch directory listing
         let key = slru_dir_to_key(kind);
-        let buf = self.get(key, lsn)?;
-        let dir = SlruSegmentDirectory::des(&buf)?;
+        let buf = try_no_ondemand_download!(self.get(key, lsn));
 
-        let exists = dir.segments.get(&segno).is_some();
-        Ok(exists)
+        match SlruSegmentDirectory::des(&buf).context("deserialization failure") {
+            Ok(dir) => {
+                let exists = dir.segments.get(&segno).is_some();
+                PageReconstructResult::Success(exists)
+            }
+            Err(e) => PageReconstructResult::from(e),
+        }
     }
 
     /// Locate LSN, such that all transactions that committed before
@@ -230,7 +283,10 @@ impl Timeline {
     /// so it's not well defined which LSN you get if there were multiple commits
     /// "in flight" at that point in time.
     ///
-    pub fn find_lsn_for_timestamp(&self, search_timestamp: TimestampTz) -> Result<LsnForTimestamp> {
+    pub fn find_lsn_for_timestamp(
+        &self,
+        search_timestamp: TimestampTz,
+    ) -> PageReconstructResult<LsnForTimestamp> {
         let gc_cutoff_lsn_guard = self.get_latest_gc_cutoff_lsn();
         let min_lsn = *gc_cutoff_lsn_guard;
         let max_lsn = self.get_last_record_lsn();
@@ -246,12 +302,12 @@ impl Timeline {
             // cannot overflow, high and low are both smaller than u64::MAX / 2
             let mid = (high + low) / 2;
 
-            let cmp = self.is_latest_commit_timestamp_ge_than(
+            let cmp = try_no_ondemand_download!(self.is_latest_commit_timestamp_ge_than(
                 search_timestamp,
                 Lsn(mid * 8),
                 &mut found_smaller,
                 &mut found_larger,
-            )?;
+            ));
 
             if cmp {
                 high = mid;
@@ -263,15 +319,15 @@ impl Timeline {
             (false, false) => {
                 // This can happen if no commit records have been processed yet, e.g.
                 // just after importing a cluster.
-                Ok(LsnForTimestamp::NoData(max_lsn))
+                PageReconstructResult::Success(LsnForTimestamp::NoData(max_lsn))
             }
             (true, false) => {
                 // Didn't find any commit timestamps larger than the request
-                Ok(LsnForTimestamp::Future(max_lsn))
+                PageReconstructResult::Success(LsnForTimestamp::Future(max_lsn))
             }
             (false, true) => {
                 // Didn't find any commit timestamps smaller than the request
-                Ok(LsnForTimestamp::Past(max_lsn))
+                PageReconstructResult::Success(LsnForTimestamp::Past(max_lsn))
             }
             (true, true) => {
                 // low is the LSN of the first commit record *after* the search_timestamp,
@@ -281,7 +337,7 @@ impl Timeline {
                 // Otherwise, if you restore to the returned LSN, the database will
                 // include physical changes from later commits that will be marked
                 // as aborted, and will need to be vacuumed away.
-                Ok(LsnForTimestamp::Present(Lsn((low - 1) * 8)))
+                PageReconstructResult::Success(LsnForTimestamp::Present(Lsn((low - 1) * 8)))
             }
         }
     }
@@ -299,12 +355,20 @@ impl Timeline {
         probe_lsn: Lsn,
         found_smaller: &mut bool,
         found_larger: &mut bool,
-    ) -> Result<bool> {
-        for segno in self.list_slru_segments(SlruKind::Clog, probe_lsn)? {
-            let nblocks = self.get_slru_segment_size(SlruKind::Clog, segno, probe_lsn)?;
+    ) -> PageReconstructResult<bool> {
+        for segno in try_no_ondemand_download!(self.list_slru_segments(SlruKind::Clog, probe_lsn)) {
+            let nblocks = try_no_ondemand_download!(self.get_slru_segment_size(
+                SlruKind::Clog,
+                segno,
+                probe_lsn
+            ));
             for blknum in (0..nblocks).rev() {
-                let clog_page =
-                    self.get_slru_page_at_lsn(SlruKind::Clog, segno, blknum, probe_lsn)?;
+                let clog_page = try_no_ondemand_download!(self.get_slru_page_at_lsn(
+                    SlruKind::Clog,
+                    segno,
+                    blknum,
+                    probe_lsn
+                ));
 
                 if clog_page.len() == BLCKSZ as usize + 8 {
                     let mut timestamp_bytes = [0u8; 8];
@@ -313,61 +377,75 @@ impl Timeline {
 
                     if timestamp >= search_timestamp {
                         *found_larger = true;
-                        return Ok(true);
+                        return PageReconstructResult::Success(true);
                     } else {
                         *found_smaller = true;
                     }
                 }
             }
         }
-        Ok(false)
+        PageReconstructResult::Success(false)
     }
 
     /// Get a list of SLRU segments
-    pub fn list_slru_segments(&self, kind: SlruKind, lsn: Lsn) -> Result<HashSet<u32>> {
+    pub fn list_slru_segments(
+        &self,
+        kind: SlruKind,
+        lsn: Lsn,
+    ) -> PageReconstructResult<HashSet<u32>> {
         // fetch directory entry
         let key = slru_dir_to_key(kind);
 
-        let buf = self.get(key, lsn)?;
-        let dir = SlruSegmentDirectory::des(&buf)?;
-
-        Ok(dir.segments)
+        let buf = try_no_ondemand_download!(self.get(key, lsn));
+        match SlruSegmentDirectory::des(&buf).context("deserialization failure") {
+            Ok(dir) => PageReconstructResult::Success(dir.segments),
+            Err(e) => PageReconstructResult::from(e),
+        }
     }
 
-    pub fn get_relmap_file(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> Result<Bytes> {
+    pub fn get_relmap_file(
+        &self,
+        spcnode: Oid,
+        dbnode: Oid,
+        lsn: Lsn,
+    ) -> PageReconstructResult<Bytes> {
         let key = relmap_file_key(spcnode, dbnode);
 
-        let buf = self.get(key, lsn)?;
-        Ok(buf)
+        let buf = try_no_ondemand_download!(self.get(key, lsn));
+        PageReconstructResult::Success(buf)
     }
 
-    pub fn list_dbdirs(&self, lsn: Lsn) -> Result<HashMap<(Oid, Oid), bool>> {
+    pub fn list_dbdirs(&self, lsn: Lsn) -> PageReconstructResult<HashMap<(Oid, Oid), bool>> {
         // fetch directory entry
-        let buf = self.get(DBDIR_KEY, lsn)?;
-        let dir = DbDirectory::des(&buf)?;
+        let buf = try_no_ondemand_download!(self.get(DBDIR_KEY, lsn));
 
-        Ok(dir.dbdirs)
+        match DbDirectory::des(&buf).context("deserialization failure") {
+            Ok(dir) => PageReconstructResult::Success(dir.dbdirs),
+            Err(e) => PageReconstructResult::from(e),
+        }
     }
 
-    pub fn get_twophase_file(&self, xid: TransactionId, lsn: Lsn) -> Result<Bytes> {
+    pub fn get_twophase_file(&self, xid: TransactionId, lsn: Lsn) -> PageReconstructResult<Bytes> {
         let key = twophase_file_key(xid);
-        let buf = self.get(key, lsn)?;
-        Ok(buf)
+        let buf = try_no_ondemand_download!(self.get(key, lsn));
+        PageReconstructResult::Success(buf)
     }
 
-    pub fn list_twophase_files(&self, lsn: Lsn) -> Result<HashSet<TransactionId>> {
+    pub fn list_twophase_files(&self, lsn: Lsn) -> PageReconstructResult<HashSet<TransactionId>> {
         // fetch directory entry
-        let buf = self.get(TWOPHASEDIR_KEY, lsn)?;
-        let dir = TwoPhaseDirectory::des(&buf)?;
+        let buf = try_no_ondemand_download!(self.get(TWOPHASEDIR_KEY, lsn));
 
-        Ok(dir.xids)
+        match TwoPhaseDirectory::des(&buf).context("deserialization failure") {
+            Ok(dir) => PageReconstructResult::Success(dir.xids),
+            Err(e) => PageReconstructResult::from(e),
+        }
     }
 
-    pub fn get_control_file(&self, lsn: Lsn) -> Result<Bytes> {
+    pub fn get_control_file(&self, lsn: Lsn) -> PageReconstructResult<Bytes> {
         self.get(CONTROLFILE_KEY, lsn)
     }
 
-    pub fn get_checkpoint(&self, lsn: Lsn) -> Result<Bytes> {
+    pub fn get_checkpoint(&self, lsn: Lsn) -> PageReconstructResult<Bytes> {
         self.get(CHECKPOINT_KEY, lsn)
     }
 
@@ -376,16 +454,26 @@ impl Timeline {
     ///
     /// Only relation blocks are counted currently. That excludes metadata,
     /// SLRUs, twophase files etc.
-    pub fn get_current_logical_size_non_incremental(&self, lsn: Lsn) -> Result<u64> {
+    pub async fn get_current_logical_size_non_incremental(
+        &self,
+        lsn: Lsn,
+        cancel: CancellationToken,
+    ) -> Result<u64, CalculateLogicalSizeError> {
         // Fetch list of database dirs and iterate them
-        let buf = self.get(DBDIR_KEY, lsn)?;
-        let dbdir = DbDirectory::des(&buf)?;
+        let buf = self.get_download(DBDIR_KEY, lsn).await?;
+        let dbdir = DbDirectory::des(&buf).context("deserialize db directory")?;
 
         let mut total_size: u64 = 0;
         for (spcnode, dbnode) in dbdir.dbdirs.keys() {
-            for rel in self.list_rels(*spcnode, *dbnode, lsn)? {
+            for rel in
+                crate::tenant::with_ondemand_download(|| self.list_rels(*spcnode, *dbnode, lsn))
+                    .await?
+            {
+                if cancel.is_cancelled() {
+                    return Err(CalculateLogicalSizeError::Cancelled);
+                }
                 let relsize_key = rel_size_to_key(rel);
-                let mut buf = self.get(relsize_key, lsn)?;
+                let mut buf = self.get_download(relsize_key, lsn).await?;
                 let relsize = buf.get_u32_le();
 
                 total_size += relsize as u64;
@@ -398,7 +486,7 @@ impl Timeline {
     /// Get a KeySpace that covers all the Keys that are in use at the given LSN.
     /// Anything that's not listed maybe removed from the underlying storage (from
     /// that LSN forwards).
-    pub fn collect_keyspace(&self, lsn: Lsn) -> Result<KeySpace> {
+    pub async fn collect_keyspace(&self, lsn: Lsn) -> anyhow::Result<KeySpace> {
         // Iterate through key ranges, greedily packing them into partitions
         let mut result = KeySpaceAccum::new();
 
@@ -406,8 +494,8 @@ impl Timeline {
         result.add_key(DBDIR_KEY);
 
         // Fetch list of database dirs and iterate them
-        let buf = self.get(DBDIR_KEY, lsn)?;
-        let dbdir = DbDirectory::des(&buf)?;
+        let buf = self.get_download(DBDIR_KEY, lsn).await?;
+        let dbdir = DbDirectory::des(&buf).context("deserialization failure")?;
 
         let mut dbs: Vec<(Oid, Oid)> = dbdir.dbdirs.keys().cloned().collect();
         dbs.sort_unstable();
@@ -415,15 +503,15 @@ impl Timeline {
             result.add_key(relmap_file_key(spcnode, dbnode));
             result.add_key(rel_dir_to_key(spcnode, dbnode));
 
-            let mut rels: Vec<RelTag> = self
-                .list_rels(spcnode, dbnode, lsn)?
-                .iter()
-                .cloned()
-                .collect();
+            let mut rels: Vec<RelTag> =
+                with_ondemand_download(|| self.list_rels(spcnode, dbnode, lsn))
+                    .await?
+                    .into_iter()
+                    .collect();
             rels.sort_unstable();
             for rel in rels {
                 let relsize_key = rel_size_to_key(rel);
-                let mut buf = self.get(relsize_key, lsn)?;
+                let mut buf = self.get_download(relsize_key, lsn).await?;
                 let relsize = buf.get_u32_le();
 
                 result.add_range(rel_block_to_key(rel, 0)..rel_block_to_key(rel, relsize));
@@ -439,13 +527,13 @@ impl Timeline {
         ] {
             let slrudir_key = slru_dir_to_key(kind);
             result.add_key(slrudir_key);
-            let buf = self.get(slrudir_key, lsn)?;
-            let dir = SlruSegmentDirectory::des(&buf)?;
+            let buf = self.get_download(slrudir_key, lsn).await?;
+            let dir = SlruSegmentDirectory::des(&buf).context("deserialization failure")?;
             let mut segments: Vec<u32> = dir.segments.iter().cloned().collect();
             segments.sort_unstable();
             for segno in segments {
                 let segsize_key = slru_segment_size_to_key(kind, segno);
-                let mut buf = self.get(segsize_key, lsn)?;
+                let mut buf = self.get_download(segsize_key, lsn).await?;
                 let segsize = buf.get_u32_le();
 
                 result.add_range(
@@ -457,8 +545,8 @@ impl Timeline {
 
         // Then pg_twophase
         result.add_key(TWOPHASEDIR_KEY);
-        let buf = self.get(TWOPHASEDIR_KEY, lsn)?;
-        let twophase_dir = TwoPhaseDirectory::des(&buf)?;
+        let buf = self.get_download(TWOPHASEDIR_KEY, lsn).await?;
+        let twophase_dir = TwoPhaseDirectory::des(&buf).context("deserialization failure")?;
         let mut xids: Vec<TransactionId> = twophase_dir.xids.iter().cloned().collect();
         xids.sort_unstable();
         for xid in xids {
@@ -537,7 +625,7 @@ impl<'a> DatadirModification<'a> {
     ///
     /// This inserts the directory metadata entries that are assumed to
     /// always exist.
-    pub fn init_empty(&mut self) -> Result<()> {
+    pub fn init_empty(&mut self) -> anyhow::Result<()> {
         let buf = DbDirectory::ser(&DbDirectory {
             dbdirs: HashMap::new(),
         })?;
@@ -570,8 +658,8 @@ impl<'a> DatadirModification<'a> {
         rel: RelTag,
         blknum: BlockNumber,
         rec: NeonWalRecord,
-    ) -> Result<()> {
-        ensure!(rel.relnode != 0, "invalid relnode");
+    ) -> anyhow::Result<()> {
+        anyhow::ensure!(rel.relnode != 0, "invalid relnode");
         self.put(rel_block_to_key(rel, blknum), Value::WalRecord(rec));
         Ok(())
     }
@@ -583,7 +671,7 @@ impl<'a> DatadirModification<'a> {
         segno: u32,
         blknum: BlockNumber,
         rec: NeonWalRecord,
-    ) -> Result<()> {
+    ) -> anyhow::Result<()> {
         self.put(
             slru_block_to_key(kind, segno, blknum),
             Value::WalRecord(rec),
@@ -597,8 +685,8 @@ impl<'a> DatadirModification<'a> {
         rel: RelTag,
         blknum: BlockNumber,
         img: Bytes,
-    ) -> Result<()> {
-        ensure!(rel.relnode != 0, "invalid relnode");
+    ) -> anyhow::Result<()> {
+        anyhow::ensure!(rel.relnode != 0, "invalid relnode");
         self.put(rel_block_to_key(rel, blknum), Value::Image(img));
         Ok(())
     }
@@ -609,26 +697,26 @@ impl<'a> DatadirModification<'a> {
         segno: u32,
         blknum: BlockNumber,
         img: Bytes,
-    ) -> Result<()> {
+    ) -> anyhow::Result<()> {
         self.put(slru_block_to_key(kind, segno, blknum), Value::Image(img));
         Ok(())
     }
 
     /// Store a relmapper file (pg_filenode.map) in the repository
-    pub fn put_relmap_file(&mut self, spcnode: Oid, dbnode: Oid, img: Bytes) -> Result<()> {
+    pub fn put_relmap_file(&mut self, spcnode: Oid, dbnode: Oid, img: Bytes) -> anyhow::Result<()> {
         // Add it to the directory (if it doesn't exist already)
-        let buf = self.get(DBDIR_KEY)?;
+        let buf = self.get(DBDIR_KEY).no_ondemand_download()?;
         let mut dbdir = DbDirectory::des(&buf)?;
 
         let r = dbdir.dbdirs.insert((spcnode, dbnode), true);
-        if r == None || r == Some(false) {
+        if r.is_none() || r == Some(false) {
             // The dbdir entry didn't exist, or it contained a
             // 'false'. The 'insert' call already updated it with
             // 'true', now write the updated 'dbdirs' map back.
             let buf = DbDirectory::ser(&dbdir)?;
             self.put(DBDIR_KEY, Value::Image(buf.into()));
         }
-        if r == None {
+        if r.is_none() {
             // Create RelDirectory
             let buf = RelDirectory::ser(&RelDirectory {
                 rels: HashSet::new(),
@@ -643,12 +731,12 @@ impl<'a> DatadirModification<'a> {
         Ok(())
     }
 
-    pub fn put_twophase_file(&mut self, xid: TransactionId, img: Bytes) -> Result<()> {
+    pub fn put_twophase_file(&mut self, xid: TransactionId, img: Bytes) -> anyhow::Result<()> {
         // Add it to the directory entry
-        let buf = self.get(TWOPHASEDIR_KEY)?;
+        let buf = self.get(TWOPHASEDIR_KEY).no_ondemand_download()?;
         let mut dir = TwoPhaseDirectory::des(&buf)?;
         if !dir.xids.insert(xid) {
-            bail!("twophase file for xid {} already exists", xid);
+            anyhow::bail!("twophase file for xid {} already exists", xid);
         }
         self.put(
             TWOPHASEDIR_KEY,
@@ -659,23 +747,26 @@ impl<'a> DatadirModification<'a> {
         Ok(())
     }
 
-    pub fn put_control_file(&mut self, img: Bytes) -> Result<()> {
+    pub fn put_control_file(&mut self, img: Bytes) -> anyhow::Result<()> {
         self.put(CONTROLFILE_KEY, Value::Image(img));
         Ok(())
     }
 
-    pub fn put_checkpoint(&mut self, img: Bytes) -> Result<()> {
+    pub fn put_checkpoint(&mut self, img: Bytes) -> anyhow::Result<()> {
         self.put(CHECKPOINT_KEY, Value::Image(img));
         Ok(())
     }
 
-    pub fn drop_dbdir(&mut self, spcnode: Oid, dbnode: Oid) -> Result<()> {
+    pub fn drop_dbdir(&mut self, spcnode: Oid, dbnode: Oid) -> anyhow::Result<()> {
         let req_lsn = self.tline.get_last_record_lsn();
 
-        let total_blocks = self.tline.get_db_size(spcnode, dbnode, req_lsn, true)?;
+        let total_blocks = self
+            .tline
+            .get_db_size(spcnode, dbnode, req_lsn, true)
+            .no_ondemand_download()?;
 
         // Remove entry from dbdir
-        let buf = self.get(DBDIR_KEY)?;
+        let buf = self.get(DBDIR_KEY).no_ondemand_download()?;
         let mut dir = DbDirectory::des(&buf)?;
         if dir.dbdirs.remove(&(spcnode, dbnode)).is_some() {
             let buf = DbDirectory::ser(&dir)?;
@@ -698,11 +789,11 @@ impl<'a> DatadirModification<'a> {
     /// Create a relation fork.
     ///
     /// 'nblocks' is the initial size.
-    pub fn put_rel_creation(&mut self, rel: RelTag, nblocks: BlockNumber) -> Result<()> {
-        ensure!(rel.relnode != 0, "invalid relnode");
+    pub fn put_rel_creation(&mut self, rel: RelTag, nblocks: BlockNumber) -> anyhow::Result<()> {
+        anyhow::ensure!(rel.relnode != 0, "invalid relnode");
         // It's possible that this is the first rel for this db in this
         // tablespace.  Create the reldir entry for it if so.
-        let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY)?)?;
+        let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY).no_ondemand_download()?)?;
         let rel_dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode);
         let mut rel_dir = if dbdir.dbdirs.get(&(rel.spcnode, rel.dbnode)).is_none() {
             // Didn't exist. Update dbdir
@@ -714,12 +805,12 @@ impl<'a> DatadirModification<'a> {
             RelDirectory::default()
         } else {
             // reldir already exists, fetch it
-            RelDirectory::des(&self.get(rel_dir_key)?)?
+            RelDirectory::des(&self.get(rel_dir_key).no_ondemand_download()?)?
         };
 
         // Add the new relation to the rel directory entry, and write it back
         if !rel_dir.rels.insert((rel.relnode, rel.forknum)) {
-            bail!("rel {} already exists", rel);
+            anyhow::bail!("rel {rel} already exists");
         }
         self.put(
             rel_dir_key,
@@ -742,13 +833,17 @@ impl<'a> DatadirModification<'a> {
     }
 
     /// Truncate relation
-    pub fn put_rel_truncation(&mut self, rel: RelTag, nblocks: BlockNumber) -> Result<()> {
-        ensure!(rel.relnode != 0, "invalid relnode");
+    pub fn put_rel_truncation(&mut self, rel: RelTag, nblocks: BlockNumber) -> anyhow::Result<()> {
+        anyhow::ensure!(rel.relnode != 0, "invalid relnode");
         let last_lsn = self.tline.get_last_record_lsn();
-        if self.tline.get_rel_exists(rel, last_lsn, true)? {
+        if self
+            .tline
+            .get_rel_exists(rel, last_lsn, true)
+            .no_ondemand_download()?
+        {
             let size_key = rel_size_to_key(rel);
             // Fetch the old size first
-            let old_size = self.get(size_key)?.get_u32_le();
+            let old_size = self.get(size_key).no_ondemand_download()?.get_u32_le();
 
             // Update the entry with the new size.
             let buf = nblocks.to_le_bytes();
@@ -768,12 +863,12 @@ impl<'a> DatadirModification<'a> {
 
     /// Extend relation
     /// If new size is smaller, do nothing.
-    pub fn put_rel_extend(&mut self, rel: RelTag, nblocks: BlockNumber) -> Result<()> {
-        ensure!(rel.relnode != 0, "invalid relnode");
+    pub fn put_rel_extend(&mut self, rel: RelTag, nblocks: BlockNumber) -> anyhow::Result<()> {
+        anyhow::ensure!(rel.relnode != 0, "invalid relnode");
 
         // Put size
         let size_key = rel_size_to_key(rel);
-        let old_size = self.get(size_key)?.get_u32_le();
+        let old_size = self.get(size_key).no_ondemand_download()?.get_u32_le();
 
         // only extend relation here. never decrease the size
         if nblocks > old_size {
@@ -789,12 +884,12 @@ impl<'a> DatadirModification<'a> {
     }
 
     /// Drop a relation.
-    pub fn put_rel_drop(&mut self, rel: RelTag) -> Result<()> {
-        ensure!(rel.relnode != 0, "invalid relnode");
+    pub fn put_rel_drop(&mut self, rel: RelTag) -> anyhow::Result<()> {
+        anyhow::ensure!(rel.relnode != 0, "invalid relnode");
 
         // Remove it from the directory entry
         let dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode);
-        let buf = self.get(dir_key)?;
+        let buf = self.get(dir_key).no_ondemand_download()?;
         let mut dir = RelDirectory::des(&buf)?;
 
         if dir.rels.remove(&(rel.relnode, rel.forknum)) {
@@ -805,7 +900,7 @@ impl<'a> DatadirModification<'a> {
 
         // update logical size
         let size_key = rel_size_to_key(rel);
-        let old_size = self.get(size_key)?.get_u32_le();
+        let old_size = self.get(size_key).no_ondemand_download()?.get_u32_le();
         self.pending_nblocks -= old_size as i64;
 
         // Remove enty from relation size cache
@@ -822,14 +917,14 @@ impl<'a> DatadirModification<'a> {
         kind: SlruKind,
         segno: u32,
         nblocks: BlockNumber,
-    ) -> Result<()> {
+    ) -> anyhow::Result<()> {
         // Add it to the directory entry
         let dir_key = slru_dir_to_key(kind);
-        let buf = self.get(dir_key)?;
+        let buf = self.get(dir_key).no_ondemand_download()?;
         let mut dir = SlruSegmentDirectory::des(&buf)?;
 
         if !dir.segments.insert(segno) {
-            bail!("slru segment {:?}/{} already exists", kind, segno);
+            anyhow::bail!("slru segment {kind:?}/{segno} already exists");
         }
         self.put(
             dir_key,
@@ -852,7 +947,7 @@ impl<'a> DatadirModification<'a> {
         kind: SlruKind,
         segno: u32,
         nblocks: BlockNumber,
-    ) -> Result<()> {
+    ) -> anyhow::Result<()> {
         // Put size
         let size_key = slru_segment_size_to_key(kind, segno);
         let buf = nblocks.to_le_bytes();
@@ -861,10 +956,10 @@ impl<'a> DatadirModification<'a> {
     }
 
     /// This method is used for marking truncated SLRU files
-    pub fn drop_slru_segment(&mut self, kind: SlruKind, segno: u32) -> Result<()> {
+    pub fn drop_slru_segment(&mut self, kind: SlruKind, segno: u32) -> anyhow::Result<()> {
         // Remove it from the directory entry
         let dir_key = slru_dir_to_key(kind);
-        let buf = self.get(dir_key)?;
+        let buf = self.get(dir_key).no_ondemand_download()?;
         let mut dir = SlruSegmentDirectory::des(&buf)?;
 
         if !dir.segments.remove(&segno) {
@@ -882,15 +977,15 @@ impl<'a> DatadirModification<'a> {
     }
 
     /// Drop a relmapper file (pg_filenode.map)
-    pub fn drop_relmap_file(&mut self, _spcnode: Oid, _dbnode: Oid) -> Result<()> {
+    pub fn drop_relmap_file(&mut self, _spcnode: Oid, _dbnode: Oid) -> anyhow::Result<()> {
         // TODO
         Ok(())
     }
 
     /// This method is used for marking truncated SLRU files
-    pub fn drop_twophase_file(&mut self, xid: TransactionId) -> Result<()> {
+    pub fn drop_twophase_file(&mut self, xid: TransactionId) -> anyhow::Result<()> {
         // Remove it from the directory entry
-        let buf = self.get(TWOPHASEDIR_KEY)?;
+        let buf = self.get(TWOPHASEDIR_KEY).no_ondemand_download()?;
         let mut dir = TwoPhaseDirectory::des(&buf)?;
 
         if !dir.xids.remove(&xid) {
@@ -925,7 +1020,7 @@ impl<'a> DatadirModification<'a> {
     /// retains all the metadata, but data pages are flushed. That's again OK
     /// for bulk import, where you are just loading data pages and won't try to
     /// modify the same pages twice.
-    pub fn flush(&mut self) -> Result<()> {
+    pub fn flush(&mut self) -> anyhow::Result<()> {
         // Unless we have accumulated a decent amount of changes, it's not worth it
         // to scan through the pending_updates list.
         let pending_nblocks = self.pending_nblocks;
@@ -936,7 +1031,7 @@ impl<'a> DatadirModification<'a> {
         let writer = self.tline.writer();
 
         // Flush relation and  SLRU data blocks, keep metadata.
-        let mut result: Result<()> = Ok(());
+        let mut result: anyhow::Result<()> = Ok(());
         self.pending_updates.retain(|&key, value| {
             if result.is_ok() && (is_rel_block_key(key) || is_slru_block_key(key)) {
                 result = writer.put(key, self.lsn, value);
@@ -984,7 +1079,7 @@ impl<'a> DatadirModification<'a> {
 
     // Internal helper functions to batch the modifications
 
-    fn get(&self, key: Key) -> Result<Bytes> {
+    fn get(&self, key: Key) -> PageReconstructResult<Bytes> {
         // Have we already updated the same key? Read the pending updated
         // version in that case.
         //
@@ -992,14 +1087,14 @@ impl<'a> DatadirModification<'a> {
         // value that has been removed, deletion only avoids leaking storage.
         if let Some(value) = self.pending_updates.get(&key) {
             if let Value::Image(img) = value {
-                Ok(img.clone())
+                PageReconstructResult::Success(img.clone())
             } else {
                 // Currently, we never need to read back a WAL record that we
                 // inserted in the same "transaction". All the metadata updates
                 // work directly with Images, and we never need to read actual
                 // data pages. We could handle this if we had to, by calling
                 // the walredo manager, but let's keep it simple for now.
-                bail!("unexpected pending WAL record");
+                PageReconstructResult::from(anyhow::anyhow!("unexpected pending WAL record"))
             }
         } else {
             let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn);
@@ -1327,7 +1422,7 @@ fn twophase_key_range(xid: TransactionId) -> Range<Key> {
         field2: 0,
         field3: 0,
         field4: 0,
-        field5: if overflowed { 1 } else { 0 },
+        field5: u8::from(overflowed),
         field6: next_xid,
     }
 }
@@ -1354,7 +1449,7 @@ const CHECKPOINT_KEY: Key = Key {
 // Reverse mappings for a few Keys.
 // These are needed by WAL redo manager.
 
-pub fn key_to_rel_block(key: Key) -> Result<(RelTag, BlockNumber)> {
+pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> {
     Ok(match key.field1 {
         0x00 => (
             RelTag {
@@ -1365,7 +1460,7 @@ pub fn key_to_rel_block(key: Key) -> Result<(RelTag, BlockNumber)> {
             },
             key.field6,
         ),
-        _ => bail!("unexpected value kind 0x{:02x}", key.field1),
+        _ => anyhow::bail!("unexpected value kind 0x{:02x}", key.field1),
     })
 }
 
@@ -1384,21 +1479,21 @@ pub fn is_rel_vm_block_key(key: Key) -> bool {
         && key.field6 != 0xffffffff
 }
 
-pub fn key_to_slru_block(key: Key) -> Result<(SlruKind, u32, BlockNumber)> {
+pub fn key_to_slru_block(key: Key) -> anyhow::Result<(SlruKind, u32, BlockNumber)> {
     Ok(match key.field1 {
         0x01 => {
             let kind = match key.field2 {
                 0x00 => SlruKind::Clog,
                 0x01 => SlruKind::MultiXactMembers,
                 0x02 => SlruKind::MultiXactOffsets,
-                _ => bail!("unrecognized slru kind 0x{:02x}", key.field2),
+                _ => anyhow::bail!("unrecognized slru kind 0x{:02x}", key.field2),
             };
             let segno = key.field4;
             let blknum = key.field6;
 
             (kind, segno, blknum)
         }
-        _ => bail!("unexpected value kind 0x{:02x}", key.field1),
+        _ => anyhow::bail!("unexpected value kind 0x{:02x}", key.field1),
     })
 }
 
@@ -1413,7 +1508,7 @@ pub fn create_test_timeline(
     tenant: &crate::tenant::Tenant,
     timeline_id: utils::id::TimelineId,
     pg_version: u32,
-) -> Result<std::sync::Arc<Timeline>> {
+) -> anyhow::Result<std::sync::Arc<Timeline>> {
     let tline = tenant
         .create_empty_timeline(timeline_id, Lsn(8), pg_version)?
         .initialize()?;
diff --git a/pageserver/src/profiling.rs b/pageserver/src/profiling.rs
deleted file mode 100644
index ad896cfa30..0000000000
--- a/pageserver/src/profiling.rs
+++ /dev/null
@@ -1,107 +0,0 @@
-//!
-//! Support for profiling
-//!
-//! This relies on a modified version of the 'pprof-rs' crate. That's not very
-//! nice, so to avoid a hard dependency on that, this is an optional feature.
-//!
-use crate::config::{PageServerConf, ProfilingConfig};
-
-/// The actual implementation is in the `profiling_impl` submodule. If the profiling
-/// feature is not enabled, it's just a dummy implementation that panics if you
-/// try to enabled profiling in the configuration.
-pub use profiling_impl::*;
-
-#[cfg(feature = "profiling")]
-mod profiling_impl {
-    use super::*;
-    use pprof;
-    use std::marker::PhantomData;
-
-    /// Start profiling the current thread. Returns a guard object;
-    /// the profiling continues until the guard is dropped.
-    ///
-    /// Note: profiling is not re-entrant. If you call 'profpoint_start' while
-    /// profiling is already started, nothing happens, and the profiling will be
-    /// stopped when either guard object is dropped.
-    #[inline]
-    pub fn profpoint_start(
-        conf: &crate::config::PageServerConf,
-        point: ProfilingConfig,
-    ) -> Option<ProfilingGuard> {
-        if conf.profiling == point {
-            pprof::start_profiling();
-            Some(ProfilingGuard(PhantomData))
-        } else {
-            None
-        }
-    }
-
-    /// A hack to remove Send and Sync from the ProfilingGuard. Because the
-    /// profiling is attached to current thread.
-    ////
-    /// See comments in https://github.com/rust-lang/rust/issues/68318
-    type PhantomUnsend = std::marker::PhantomData<*mut u8>;
-
-    pub struct ProfilingGuard(PhantomUnsend);
-
-    impl Drop for ProfilingGuard {
-        fn drop(&mut self) {
-            pprof::stop_profiling();
-        }
-    }
-
-    /// Initialize the profiler. This must be called before any 'profpoint_start' calls.
-    pub fn init_profiler(conf: &PageServerConf) -> Option<pprof::ProfilerGuard> {
-        if conf.profiling != ProfilingConfig::Disabled {
-            Some(pprof::ProfilerGuardBuilder::default().build().unwrap())
-        } else {
-            None
-        }
-    }
-
-    /// Exit the profiler. Writes the flamegraph to current workdir.
-    pub fn exit_profiler(_conf: &PageServerConf, profiler_guard: &Option<pprof::ProfilerGuard>) {
-        // Write out the flamegraph
-        if let Some(profiler_guard) = profiler_guard {
-            if let Ok(report) = profiler_guard.report().build() {
-                // this gets written under the workdir
-                let file = std::fs::File::create("flamegraph.svg").unwrap();
-                let mut options = pprof::flamegraph::Options::default();
-                options.image_width = Some(2500);
-                report.flamegraph_with_options(file, &mut options).unwrap();
-            }
-        }
-    }
-}
-
-/// Dummy implementation when compiling without profiling feature or for non-linux OSes.
-#[cfg(not(feature = "profiling"))]
-mod profiling_impl {
-    use super::*;
-
-    pub struct DummyProfilerGuard;
-
-    impl Drop for DummyProfilerGuard {
-        fn drop(&mut self) {
-            // do nothing, this exists to calm Clippy down
-        }
-    }
-
-    pub fn profpoint_start(
-        _conf: &PageServerConf,
-        _point: ProfilingConfig,
-    ) -> Option<DummyProfilerGuard> {
-        None
-    }
-
-    pub fn init_profiler(conf: &PageServerConf) -> Option<DummyProfilerGuard> {
-        if conf.profiling != ProfilingConfig::Disabled {
-            // shouldn't happen, we don't allow profiling in the config if the support
-            // for it is disabled.
-            panic!("profiling enabled but the binary was compiled without profiling support");
-        }
-        None
-    }
-
-    pub fn exit_profiler(_conf: &PageServerConf, _guard: &Option<DummyProfilerGuard>) {}
-}
diff --git a/pageserver/src/storage_sync2/download.rs b/pageserver/src/storage_sync2/download.rs
deleted file mode 100644
index 0d25d88a97..0000000000
--- a/pageserver/src/storage_sync2/download.rs
+++ /dev/null
@@ -1,232 +0,0 @@
-//! Helper functions to download files from remote storage with a RemoteStorage
-use std::collections::HashSet;
-use std::path::Path;
-
-use anyhow::{bail, Context};
-use futures::stream::{FuturesUnordered, StreamExt};
-use tokio::fs;
-use tokio::io::AsyncWriteExt;
-use tracing::{debug, info_span, Instrument};
-
-use crate::config::PageServerConf;
-use crate::storage_sync::index::LayerFileMetadata;
-use crate::tenant::filename::LayerFileName;
-use remote_storage::{DownloadError, GenericRemoteStorage};
-use utils::crashsafe::path_with_suffix_extension;
-use utils::id::{TenantId, TimelineId};
-
-use super::index::{IndexPart, IndexPartUnclean};
-
-async fn fsync_path(path: impl AsRef<std::path::Path>) -> Result<(), std::io::Error> {
-    fs::File::open(path).await?.sync_all().await
-}
-
-///
-/// If 'metadata' is given, we will validate that the downloaded file's size matches that
-/// in the metadata. (In the future, we might do more cross-checks, like CRC validation)
-///
-/// Returns the size of the downloaded file.
-pub async fn download_layer_file<'a>(
-    conf: &'static PageServerConf,
-    storage: &'a GenericRemoteStorage,
-    tenant_id: TenantId,
-    timeline_id: TimelineId,
-    layer_file_name: &'a LayerFileName,
-    layer_metadata: &'a LayerFileMetadata,
-) -> anyhow::Result<u64> {
-    let timeline_path = conf.timeline_path(&timeline_id, &tenant_id);
-
-    let local_path = timeline_path.join(layer_file_name.file_name());
-
-    let remote_path = conf.remote_path(&local_path)?;
-
-    // Perform a rename inspired by durable_rename from file_utils.c.
-    // The sequence:
-    //     write(tmp)
-    //     fsync(tmp)
-    //     rename(tmp, new)
-    //     fsync(new)
-    //     fsync(parent)
-    // For more context about durable_rename check this email from postgres mailing list:
-    // https://www.postgresql.org/message-id/56583BDD.9060302@2ndquadrant.com
-    // If pageserver crashes the temp file will be deleted on startup and re-downloaded.
-    let temp_file_path = path_with_suffix_extension(&local_path, TEMP_DOWNLOAD_EXTENSION);
-
-    // TODO: this doesn't use the cached fd for some reason?
-    let mut destination_file = fs::File::create(&temp_file_path).await.with_context(|| {
-        format!(
-            "Failed to create a destination file for layer '{}'",
-            temp_file_path.display()
-        )
-    })?;
-    let mut download = storage.download(&remote_path).await.with_context(|| {
-        format!(
-            "Failed to open a download stream for layer with remote storage path '{remote_path:?}'"
-        )
-    })?;
-    let bytes_amount = tokio::io::copy(&mut download.download_stream, &mut destination_file).await.with_context(|| {
-        format!("Failed to download layer with remote storage path '{remote_path:?}' into file {temp_file_path:?}")
-    })?;
-
-    // Tokio doc here: https://docs.rs/tokio/1.17.0/tokio/fs/struct.File.html states that:
-    // A file will not be closed immediately when it goes out of scope if there are any IO operations
-    // that have not yet completed. To ensure that a file is closed immediately when it is dropped,
-    // you should call flush before dropping it.
-    //
-    // From the tokio code I see that it waits for pending operations to complete. There shouldt be any because
-    // we assume that `destination_file` file is fully written. I e there is no pending .write(...).await operations.
-    // But for additional safety lets check/wait for any pending operations.
-    destination_file.flush().await.with_context(|| {
-        format!(
-            "failed to flush source file at {}",
-            temp_file_path.display()
-        )
-    })?;
-
-    match layer_metadata.file_size() {
-        Some(expected) if expected != bytes_amount => {
-            anyhow::bail!(
-                "According to layer file metadata should had downloaded {expected} bytes but downloaded {bytes_amount} bytes into file '{}'",
-                temp_file_path.display()
-            );
-        }
-        Some(_) | None => {
-            // matches, or upgrading from an earlier IndexPart version
-        }
-    }
-
-    // not using sync_data because it can lose file size update
-    destination_file.sync_all().await.with_context(|| {
-        format!(
-            "failed to fsync source file at {}",
-            temp_file_path.display()
-        )
-    })?;
-    drop(destination_file);
-
-    fail::fail_point!("remote-storage-download-pre-rename", |_| {
-        bail!("remote-storage-download-pre-rename failpoint triggered")
-    });
-
-    fs::rename(&temp_file_path, &local_path).await?;
-
-    fsync_path(&local_path)
-        .await
-        .with_context(|| format!("Could not fsync layer file {}", local_path.display(),))?;
-
-    tracing::info!("download complete: {}", local_path.display());
-
-    Ok(bytes_amount)
-}
-
-const TEMP_DOWNLOAD_EXTENSION: &str = "temp_download";
-
-pub fn is_temp_download_file(path: &Path) -> bool {
-    let extension = path.extension().map(|pname| {
-        pname
-            .to_str()
-            .expect("paths passed to this function must be valid Rust strings")
-    });
-    match extension {
-        Some(TEMP_DOWNLOAD_EXTENSION) => true,
-        Some(_) => false,
-        None => false,
-    }
-}
-
-/// List timelines of given tenant in remote storage
-pub async fn list_remote_timelines<'a>(
-    storage: &'a GenericRemoteStorage,
-    conf: &'static PageServerConf,
-    tenant_id: TenantId,
-) -> anyhow::Result<Vec<(TimelineId, IndexPart)>> {
-    let tenant_path = conf.timelines_path(&tenant_id);
-    let tenant_storage_path = conf.remote_path(&tenant_path)?;
-
-    let timelines = storage
-        .list_prefixes(Some(&tenant_storage_path))
-        .await
-        .with_context(|| {
-            format!(
-                "Failed to list tenant storage path {tenant_storage_path:?} to get remote timelines to download"
-            )
-        })?;
-
-    if timelines.is_empty() {
-        anyhow::bail!("no timelines found on the remote storage")
-    }
-
-    let mut timeline_ids = HashSet::new();
-    let mut part_downloads = FuturesUnordered::new();
-
-    for timeline_remote_storage_key in timelines {
-        let object_name = timeline_remote_storage_key.object_name().ok_or_else(|| {
-            anyhow::anyhow!("failed to get timeline id for remote tenant {tenant_id}")
-        })?;
-
-        let timeline_id: TimelineId = object_name.parse().with_context(|| {
-            format!("failed to parse object name into timeline id '{object_name}'")
-        })?;
-
-        // list_prefixes returns all files with the prefix. If we haven't seen this timeline ID
-        // yet, launch a download task for it.
-        if !timeline_ids.contains(&timeline_id) {
-            timeline_ids.insert(timeline_id);
-            let storage_clone = storage.clone();
-            part_downloads.push(async move {
-                (
-                    timeline_id,
-                    download_index_part(conf, &storage_clone, tenant_id, timeline_id)
-                        .instrument(info_span!("download_index_part", timeline=%timeline_id))
-                        .await,
-                )
-            });
-        }
-    }
-
-    // Wait for all the download tasks to complete.
-    let mut timeline_parts = Vec::new();
-    while let Some((timeline_id, part_upload_result)) = part_downloads.next().await {
-        let index_part = part_upload_result
-            .with_context(|| format!("Failed to fetch index part for timeline {timeline_id}"))?;
-
-        debug!("Successfully fetched index part for timeline {timeline_id}");
-        timeline_parts.push((timeline_id, index_part));
-    }
-    Ok(timeline_parts)
-}
-
-pub async fn download_index_part(
-    conf: &'static PageServerConf,
-    storage: &GenericRemoteStorage,
-    tenant_id: TenantId,
-    timeline_id: TimelineId,
-) -> Result<IndexPart, DownloadError> {
-    let index_part_path = conf
-        .metadata_path(timeline_id, tenant_id)
-        .with_file_name(IndexPart::FILE_NAME);
-    let part_storage_path = conf
-        .remote_path(&index_part_path)
-        .map_err(DownloadError::BadInput)?;
-
-    let mut index_part_download = storage.download(&part_storage_path).await?;
-
-    let mut index_part_bytes = Vec::new();
-    tokio::io::copy(
-        &mut index_part_download.download_stream,
-        &mut index_part_bytes,
-    )
-    .await
-    .with_context(|| format!("Failed to download an index part into file {index_part_path:?}"))
-    .map_err(DownloadError::Other)?;
-
-    let index_part: IndexPartUnclean = serde_json::from_slice(&index_part_bytes)
-        .with_context(|| {
-            format!("Failed to deserialize index part file into file {index_part_path:?}")
-        })
-        .map_err(DownloadError::Other)?;
-
-    let index_part = index_part.remove_unclean_layer_file_names();
-
-    Ok(index_part)
-}
diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs
index 3325ce01d4..a1b3ad26b0 100644
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -25,7 +25,6 @@
 //! the current task has been requested to shut down. You can use that with
 //! Tokio select!().
 //!
-//!
 //! TODO: This would be a good place to also handle panics in a somewhat sane way.
 //! Depending on what task panics, we might want to kill the whole server, or
 //! only a single tenant or timeline.
@@ -36,6 +35,7 @@
 #![allow(clippy::declare_interior_mutable_const)]
 
 use std::collections::HashMap;
+use std::fmt;
 use std::future::Future;
 use std::panic::AssertUnwindSafe;
 use std::sync::atomic::{AtomicU64, Ordering};
@@ -43,9 +43,9 @@ use std::sync::{Arc, Mutex};
 
 use futures::FutureExt;
 use tokio::runtime::Runtime;
-use tokio::sync::watch;
 use tokio::task::JoinHandle;
 use tokio::task_local;
+use tokio_util::sync::CancellationToken;
 
 use tracing::{debug, error, info, warn};
 
@@ -135,8 +135,15 @@ pub static BACKGROUND_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
         .expect("Failed to create background op runtime")
 });
 
+#[derive(Debug, Clone, Copy)]
 pub struct PageserverTaskId(u64);
 
+impl fmt::Display for PageserverTaskId {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        self.0.fmt(f)
+    }
+}
+
 /// Each task that we track is associated with a "task ID". It's just an
 /// increasing number that we assign. Note that it is different from tokio::task::Id.
 static NEXT_TASK_ID: AtomicU64 = AtomicU64::new(1);
@@ -146,11 +153,10 @@ static TASKS: Lazy<Mutex<HashMap<u64, Arc<PageServerTask>>>> =
     Lazy::new(|| Mutex::new(HashMap::new()));
 
 task_local! {
-    // There is a Tokio watch channel for each task, which can be used to signal the
-    // task that it needs to shut down. This task local variable holds the receiving
-    // end of the channel. The sender is kept in the global registry, so that anyone
-    // can send the signal to request task shutdown.
-    static SHUTDOWN_RX: watch::Receiver<bool>;
+    // This is a cancellation token which will be cancelled when a task needs to shut down. The
+    // root token is kept in the global registry, so that anyone can send the signal to request
+    // task shutdown.
+    static SHUTDOWN_TOKEN: CancellationToken;
 
     // Each task holds reference to its own PageServerTask here.
     static CURRENT_TASK: Arc<PageServerTask>;
@@ -200,11 +206,20 @@ pub enum TaskKind {
     // Task that uploads a file to remote storage
     RemoteUploadTask,
 
+    // Task that downloads a file from remote storage
+    RemoteDownloadTask,
+
     // task that handles the initial downloading of all tenants
     InitialLoad,
 
     // task that handles attaching a tenant
     Attach,
+
+    // task that handhes metrics collection
+    MetricsCollection,
+
+    // task that drives downloading layers
+    DownloadAllRemoteLayers,
 }
 
 #[derive(Default)]
@@ -226,8 +241,8 @@ struct PageServerTask {
 
     name: String,
 
-    // To request task shutdown, send 'true' to the channel to notify the task.
-    shutdown_tx: watch::Sender<bool>,
+    // To request task shutdown, just cancel this token.
+    cancel: CancellationToken,
 
     mutable: Mutex<MutableTaskState>,
 }
@@ -247,13 +262,13 @@ pub fn spawn<F>(
 where
     F: Future<Output = anyhow::Result<()>> + Send + 'static,
 {
-    let (shutdown_tx, shutdown_rx) = watch::channel(false);
+    let cancel = CancellationToken::new();
     let task_id = NEXT_TASK_ID.fetch_add(1, Ordering::Relaxed);
     let task = Arc::new(PageServerTask {
         task_id: PageserverTaskId(task_id),
         kind,
         name: name.to_string(),
-        shutdown_tx,
+        cancel: cancel.clone(),
         mutable: Mutex::new(MutableTaskState {
             tenant_id,
             timeline_id,
@@ -271,7 +286,7 @@ where
         task_name,
         task_id,
         task_cloned,
-        shutdown_rx,
+        cancel,
         shutdown_process_on_error,
         future,
     ));
@@ -288,7 +303,7 @@ async fn task_wrapper<F>(
     task_name: String,
     task_id: u64,
     task: Arc<PageServerTask>,
-    shutdown_rx: watch::Receiver<bool>,
+    shutdown_token: CancellationToken,
     shutdown_process_on_error: bool,
     future: F,
 ) where
@@ -296,9 +311,9 @@ async fn task_wrapper<F>(
 {
     debug!("Starting task '{}'", task_name);
 
-    let result = SHUTDOWN_RX
+    let result = SHUTDOWN_TOKEN
         .scope(
-            shutdown_rx,
+            shutdown_token,
             CURRENT_TASK.scope(task, {
                 // We use AssertUnwindSafe here so that the payload function
                 // doesn't need to be UnwindSafe. We don't do anything after the
@@ -408,7 +423,7 @@ pub async fn shutdown_tasks(
                 && (tenant_id.is_none() || task_mut.tenant_id == tenant_id)
                 && (timeline_id.is_none() || task_mut.timeline_id == timeline_id)
             {
-                let _ = task.shutdown_tx.send_replace(true);
+                task.cancel.cancel();
                 victim_tasks.push(Arc::clone(task));
             }
         }
@@ -436,24 +451,35 @@ pub fn current_task_kind() -> Option<TaskKind> {
     CURRENT_TASK.try_with(|ct| ct.kind).ok()
 }
 
+pub fn current_task_id() -> Option<PageserverTaskId> {
+    CURRENT_TASK.try_with(|ct| ct.task_id).ok()
+}
+
 /// A Future that can be used to check if the current task has been requested to
 /// shut down.
 pub async fn shutdown_watcher() {
-    let mut shutdown_rx = SHUTDOWN_RX
-        .try_with(|rx| rx.clone())
+    let token = SHUTDOWN_TOKEN
+        .try_with(|t| t.clone())
         .expect("shutdown_requested() called in an unexpected task or thread");
 
-    while !*shutdown_rx.borrow() {
-        if shutdown_rx.changed().await.is_err() {
-            break;
-        }
-    }
+    token.cancelled().await;
+}
+
+/// Clone the current task's cancellation token, which can be moved across tasks.
+///
+/// When the task which is currently executing is shutdown, the cancellation token will be
+/// cancelled. It can however be moved to other tasks, such as `tokio::task::spawn_blocking` or
+/// `tokio::task::JoinSet::spawn`.
+pub fn shutdown_token() -> CancellationToken {
+    SHUTDOWN_TOKEN
+        .try_with(|t| t.clone())
+        .expect("shutdown_token() called in an unexpected task or thread")
 }
 
 /// Has the current task been requested to shut down?
 pub fn is_shutdown_requested() -> bool {
-    if let Ok(shutdown_rx) = SHUTDOWN_RX.try_with(|rx| rx.clone()) {
-        *shutdown_rx.borrow()
+    if let Ok(cancel) = SHUTDOWN_TOKEN.try_with(|t| t.clone()) {
+        cancel.is_cancelled()
     } else {
         if !cfg!(test) {
             warn!("is_shutdown_requested() called in an unexpected task or thread");
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 4fcb1e3ba3..d74f263f08 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -13,13 +13,13 @@
 
 use anyhow::{bail, Context};
 use bytes::Bytes;
+use futures::FutureExt;
 use futures::Stream;
 use pageserver_api::models::TimelineState;
 use remote_storage::DownloadError;
 use remote_storage::GenericRemoteStorage;
 use tokio::sync::watch;
-use tokio_util::io::StreamReader;
-use tokio_util::io::SyncIoBridge;
+use tokio::task::JoinSet;
 use tracing::*;
 use utils::crashsafe::path_with_suffix_extension;
 
@@ -36,7 +36,6 @@ use std::io::Write;
 use std::ops::Bound::Included;
 use std::path::Path;
 use std::path::PathBuf;
-use std::pin::Pin;
 use std::process::Command;
 use std::process::Stdio;
 use std::sync::Arc;
@@ -45,24 +44,25 @@ use std::sync::{Mutex, RwLock};
 use std::time::{Duration, Instant};
 
 use self::metadata::TimelineMetadata;
+use self::remote_timeline_client::RemoteTimelineClient;
 use crate::config::PageServerConf;
 use crate::import_datadir;
 use crate::is_uninit_mark;
 use crate::metrics::{remove_tenant_metrics, STORAGE_TIME};
 use crate::repository::GcResult;
-use crate::storage_sync::create_remote_timeline_client;
-use crate::storage_sync::index::IndexPart;
-use crate::storage_sync::list_remote_timelines;
-use crate::storage_sync::RemoteTimelineClient;
 use crate::task_mgr;
 use crate::task_mgr::TaskKind;
+use crate::tenant::config::TenantConfOpt;
 use crate::tenant::metadata::load_metadata;
+use crate::tenant::remote_timeline_client::index::IndexPart;
+use crate::tenant::storage_layer::DeltaLayer;
+use crate::tenant::storage_layer::ImageLayer;
 use crate::tenant::storage_layer::Layer;
-use crate::tenant_config::TenantConfOpt;
+
 use crate::virtual_file::VirtualFile;
 use crate::walredo::PostgresRedoManager;
 use crate::walredo::WalRedoManager;
-use crate::{CheckpointConfig, TEMP_FILE_SUFFIX};
+use crate::TEMP_FILE_SUFFIX;
 pub use pageserver_api::models::TenantState;
 
 use toml_edit;
@@ -74,23 +74,25 @@ use utils::{
 
 mod blob_io;
 pub mod block_io;
-mod delta_layer;
 mod disk_btree;
 pub(crate) mod ephemeral_file;
-pub mod filename;
-mod image_layer;
-mod inmemory_layer;
 pub mod layer_map;
 
 pub mod metadata;
 mod par_fsync;
+mod remote_timeline_client;
 pub mod storage_layer;
 
+pub mod config;
+pub mod mgr;
+pub mod tasks;
+pub mod upload_queue;
+
 mod timeline;
 
 pub mod size;
 
-pub use timeline::Timeline;
+pub use timeline::{with_ondemand_download, PageReconstructError, PageReconstructResult, Timeline};
 
 // re-export this function so that page_cache.rs can use it.
 pub use crate::tenant::ephemeral_file::writeback as writeback_ephemeral_file;
@@ -125,11 +127,11 @@ pub struct Tenant {
     timelines: Mutex<HashMap<TimelineId, Arc<Timeline>>>,
     // This mutex prevents creation of new timelines during GC.
     // Adding yet another mutex (in addition to `timelines`) is needed because holding
-    // `timelines` mutex during all GC iteration (especially with enforced checkpoint)
+    // `timelines` mutex during all GC iteration
     // may block for a long time `get_timeline`, `get_timelines_state`,... and other operations
     // with timelines, which in turn may cause dropping replication connection, expiration of wait_for_lsn
     // timeout...
-    gc_cs: Mutex<()>,
+    gc_cs: tokio::sync::Mutex<()>,
     walredo_mgr: Arc<dyn WalRedoManager + Send + Sync>,
 
     // provides access to timeline data sitting in the remote storage
@@ -233,23 +235,17 @@ impl UninitializedTimeline<'_> {
     /// Prepares timeline data by loading it from the basebackup archive.
     pub async fn import_basebackup_from_tar(
         self,
-        mut copyin_stream: &mut Pin<&mut impl Stream<Item = io::Result<Bytes>>>,
+        copyin_stream: &mut (impl Stream<Item = io::Result<Bytes>> + Sync + Send + Unpin),
         base_lsn: Lsn,
     ) -> anyhow::Result<Arc<Timeline>> {
         let raw_timeline = self.raw_timeline()?;
 
-        // import_basebackup_from_tar() is not async, mainly because the Tar crate
-        // it uses is not async. So we need to jump through some hoops:
-        // - convert the input from client connection to a synchronous Read
-        // - use block_in_place()
-        let reader = SyncIoBridge::new(StreamReader::new(&mut copyin_stream));
+        let mut reader = tokio_util::io::StreamReader::new(copyin_stream);
+        import_datadir::import_basebackup_from_tar(raw_timeline, &mut reader, base_lsn)
+            .await
+            .context("Failed to import basebackup")?;
 
-        tokio::task::block_in_place(|| {
-            import_datadir::import_basebackup_from_tar(raw_timeline, reader, base_lsn)
-                .context("Failed to import basebackup")
-        })?;
-
-        // Flush loop needs to be spawned in order for checkpoint to be able to flush.
+        // Flush loop needs to be spawned in order to be able to flush.
         // We want to run proper checkpoint before we mark timeline as available to outside world
         // Thus spawning flush loop manually and skipping flush_loop setup in initialize_with_lock
         raw_timeline.maybe_spawn_flush_loop();
@@ -259,9 +255,9 @@ impl UninitializedTimeline<'_> {
         });
 
         raw_timeline
-            .checkpoint(CheckpointConfig::Flush)
+            .freeze_and_flush()
             .await
-            .context("Failed to checkpoint after basebackup import")?;
+            .context("Failed to flush after basebackup import")?;
 
         let timeline = self.initialize()?;
 
@@ -336,7 +332,7 @@ impl TimelineUninitMark {
         let uninit_mark_parent = uninit_mark_file
             .parent()
             .with_context(|| format!("Uninit mark file {uninit_mark_file:?} has no parent"))?;
-        ignore_absent_files(|| fs::remove_file(&uninit_mark_file)).with_context(|| {
+        ignore_absent_files(|| fs::remove_file(uninit_mark_file)).with_context(|| {
             format!("Failed to remove uninit mark file at path {uninit_mark_file:?}")
         })?;
         crashsafe::fsync(uninit_mark_parent).context("Failed to fsync uninit mark parent")?;
@@ -371,7 +367,7 @@ impl Drop for TimelineUninitMark {
 
 // We should not blindly overwrite local metadata with remote one.
 // For example, consider the following case:
-//     Checkpoint comes, we update local metadata and start upload task but after that
+//     Image layer is flushed to disk as a new delta layer, we update local metadata and start upload task but after that
 //     pageserver crashes. During startup we'll load new metadata, and then reset it
 //     to the state of remote one. But current layermap will have layers from the old
 //     metadata which is inconsistent.
@@ -480,7 +476,7 @@ impl Tenant {
             let timeline = UninitializedTimeline {
                 owning_tenant: self,
                 timeline_id,
-                raw_timeline: Some((Arc::new(dummy_timeline), TimelineUninitMark::dummy())),
+                raw_timeline: Some((dummy_timeline, TimelineUninitMark::dummy())),
             };
             // Do not start walreceiver here. We do need loaded layer map for reconcile_with_remote
             // But we shouldnt start walreceiver before we have all the data locally, because working walreceiver
@@ -510,7 +506,7 @@ impl Tenant {
                         )
                         })?;
                     broken_timeline.set_state(TimelineState::Broken);
-                    timelines_accessor.insert(timeline_id, Arc::new(broken_timeline));
+                    timelines_accessor.insert(timeline_id, broken_timeline);
                     Err(e)
                 }
             }
@@ -600,7 +596,7 @@ impl Tenant {
                 match tenant_clone.attach().await {
                     Ok(_) => {}
                     Err(e) => {
-                        tenant_clone.set_broken();
+                        tenant_clone.set_broken(&e.to_string());
                         error!("error attaching tenant: {:?}", e);
                     }
                 }
@@ -645,22 +641,62 @@ impl Tenant {
             .as_ref()
             .ok_or_else(|| anyhow::anyhow!("cannot attach without remote storage"))?;
 
-        let remote_timelines =
-            list_remote_timelines(remote_storage, self.conf, self.tenant_id).await?;
+        let remote_timeline_ids = remote_timeline_client::list_remote_timelines(
+            remote_storage,
+            self.conf,
+            self.tenant_id,
+        )
+        .await?;
 
-        info!("found {} timelines", remote_timelines.len());
+        info!("found {} timelines", remote_timeline_ids.len());
 
-        let mut timeline_ancestors: HashMap<TimelineId, TimelineMetadata> = HashMap::new();
-        let mut index_parts: HashMap<TimelineId, IndexPart> = HashMap::new();
-        for (timeline_id, index_part) in remote_timelines {
-            let remote_metadata = index_part.parse_metadata().with_context(|| {
-                format!(
-                    "Failed to parse metadata file from remote storage for tenant {} timeline {}",
-                    self.tenant_id, timeline_id
-                )
-            })?;
+        // Download & parse index parts
+        let mut part_downloads = JoinSet::new();
+        for timeline_id in remote_timeline_ids {
+            let client = RemoteTimelineClient::new(
+                remote_storage.clone(),
+                self.conf,
+                self.tenant_id,
+                timeline_id,
+            );
+            part_downloads.spawn(
+                async move {
+                    debug!("starting index part download");
+
+                    let index_part = client
+                        .download_index_file()
+                        .await
+                        .context("download index file")?;
+
+                    let remote_metadata = index_part.parse_metadata().context("parse metadata")?;
+
+                    debug!("finished index part download");
+
+                    Result::<_, anyhow::Error>::Ok((
+                        timeline_id,
+                        client,
+                        index_part,
+                        remote_metadata,
+                    ))
+                }
+                .map(move |res| {
+                    res.with_context(|| format!("download index part for timeline {timeline_id}"))
+                })
+                .instrument(info_span!("download_index_part", timeline=%timeline_id)),
+            );
+        }
+        // Wait for all the download tasks to complete & collect results.
+        let mut remote_clients = HashMap::new();
+        let mut index_parts = HashMap::new();
+        let mut timeline_ancestors = HashMap::new();
+        while let Some(result) = part_downloads.join_next().await {
+            // NB: we already added timeline_id as context to the error
+            let result: Result<_, anyhow::Error> = result.context("joinset task join")?;
+            let (timeline_id, client, index_part, remote_metadata) = result?;
+            debug!("successfully downloaded index part for timeline {timeline_id}");
             timeline_ancestors.insert(timeline_id, remote_metadata);
             index_parts.insert(timeline_id, index_part);
+            remote_clients.insert(timeline_id, client);
         }
 
         // For every timeline, download the metadata file, scan the local directory,
@@ -673,7 +709,7 @@ impl Tenant {
                 timeline_id,
                 index_parts.remove(&timeline_id).unwrap(),
                 remote_metadata,
-                remote_storage.clone(),
+                remote_clients.remove(&timeline_id).unwrap(),
             )
             .await
             .with_context(|| {
@@ -700,22 +736,35 @@ impl Tenant {
         Ok(())
     }
 
-    #[instrument(skip(self, index_part, remote_metadata, remote_storage), fields(timeline_id=%timeline_id))]
+    /// get size of all remote timelines
+    ///
+    /// This function relies on the index_part instead of listing the remote storage
+    ///
+    pub async fn get_remote_size(&self) -> anyhow::Result<u64> {
+        let mut size = 0;
+
+        for timeline in self.list_timelines().iter() {
+            if let Some(remote_client) = &timeline.remote_client {
+                size += remote_client.get_remote_physical_size();
+            }
+        }
+
+        Ok(size)
+    }
+
+    #[instrument(skip_all, fields(timeline_id=%timeline_id))]
     async fn load_remote_timeline(
         &self,
         timeline_id: TimelineId,
         index_part: IndexPart,
         remote_metadata: TimelineMetadata,
-        remote_storage: GenericRemoteStorage,
+        remote_client: RemoteTimelineClient,
     ) -> anyhow::Result<()> {
         info!("downloading index file for timeline {}", timeline_id);
         tokio::fs::create_dir_all(self.conf.timeline_path(&timeline_id, &self.tenant_id))
             .await
             .context("Failed to create new timeline directory")?;
 
-        let remote_client =
-            create_remote_timeline_client(remote_storage, self.conf, self.tenant_id, timeline_id)?;
-
         let ancestor = if let Some(ancestor_id) = remote_metadata.ancestor_timeline() {
             let timelines = self.timelines.lock().unwrap();
             Some(Arc::clone(timelines.get(&ancestor_id).ok_or_else(
@@ -811,7 +860,7 @@ impl Tenant {
                 match tenant_clone.load().await {
                     Ok(()) => {}
                     Err(err) => {
-                        tenant_clone.set_broken();
+                        tenant_clone.set_broken(&err.to_string());
                         error!("could not load tenant {tenant_id}: {err:?}");
                     }
                 }
@@ -972,18 +1021,14 @@ impl Tenant {
             None
         };
 
-        let remote_client = self
-            .remote_storage
-            .as_ref()
-            .map(|remote_storage| {
-                create_remote_timeline_client(
-                    remote_storage.clone(),
-                    self.conf,
-                    self.tenant_id,
-                    timeline_id,
-                )
-            })
-            .transpose()?;
+        let remote_client = self.remote_storage.as_ref().map(|remote_storage| {
+            RemoteTimelineClient::new(
+                remote_storage.clone(),
+                self.conf,
+                self.tenant_id,
+                timeline_id,
+            )
+        });
 
         let remote_startup_data = match &remote_client {
             Some(remote_client) => match remote_client.download_index_file().await {
@@ -1142,7 +1187,8 @@ impl Tenant {
                     ancestor_timeline.wait_lsn(*lsn).await?;
                 }
 
-                self.branch_timeline(ancestor_timeline_id, new_timeline_id, ancestor_start_lsn)?
+                self.branch_timeline(ancestor_timeline_id, new_timeline_id, ancestor_start_lsn)
+                    .await?
             }
             None => self.bootstrap_timeline(new_timeline_id, pg_version).await?,
         };
@@ -1154,17 +1200,20 @@ impl Tenant {
     /// this function is periodically called by gc task.
     /// also it can be explicitly requested through page server api 'do_gc' command.
     ///
-    /// 'target_timeline_id' specifies the timeline to GC, or None for all.
-    /// `horizon` specifies delta from last lsn to preserve all object versions (pitr interval).
-    /// `checkpoint_before_gc` parameter is used to force compaction of storage before GC
-    /// to make tests more deterministic.
-    /// TODO Do we still need it or we can call checkpoint explicitly in tests where needed?
+    /// `target_timeline_id` specifies the timeline to GC, or None for all.
+    ///
+    /// The `horizon` an `pitr` parameters determine how much WAL history needs to be retained.
+    /// Also known as the retention period, or the GC cutoff point. `horizon` specifies
+    /// the amount of history, as LSN difference from current latest LSN on each timeline.
+    /// `pitr` specifies the same as a time difference from the current time. The effective
+    /// GC cutoff point is determined conservatively by either `horizon` and `pitr`, whichever
+    /// requires more history to be retained.
+    //
     pub async fn gc_iteration(
         &self,
         target_timeline_id: Option<TimelineId>,
         horizon: u64,
         pitr: Duration,
-        checkpoint_before_gc: bool,
     ) -> anyhow::Result<GcResult> {
         anyhow::ensure!(
             self.is_active(),
@@ -1179,7 +1228,7 @@ impl Tenant {
             let _timer = STORAGE_TIME
                 .with_label_values(&["gc", &self.tenant_id.to_string(), &timeline_str])
                 .start_timer();
-            self.gc_iteration_internal(target_timeline_id, horizon, pitr, checkpoint_before_gc)
+            self.gc_iteration_internal(target_timeline_id, horizon, pitr)
                 .await
         }
     }
@@ -1222,24 +1271,21 @@ impl Tenant {
     ///
     /// Used at graceful shutdown.
     ///
-    pub async fn checkpoint(&self) -> anyhow::Result<()> {
+    pub async fn freeze_and_flush(&self) -> anyhow::Result<()> {
         // Scan through the hashmap and collect a list of all the timelines,
         // while holding the lock. Then drop the lock and actually perform the
-        // checkpoints. We don't want to block everything else while the
-        // checkpoint runs.
-        let timelines_to_checkpoint = {
+        // flushing. We don't want to block everything else while the
+        // flushing is performed.
+        let timelines_to_flush = {
             let timelines = self.timelines.lock().unwrap();
             timelines
                 .iter()
-                .map(|(id, timeline)| (*id, Arc::clone(timeline)))
+                .map(|(_id, timeline)| Arc::clone(timeline))
                 .collect::<Vec<_>>()
         };
 
-        for (id, timeline) in &timelines_to_checkpoint {
-            timeline
-                .checkpoint(CheckpointConfig::Flush)
-                .instrument(info_span!("checkpoint", timeline = %id, tenant = %self.tenant_id))
-                .await?;
+        for timeline in &timelines_to_flush {
+            timeline.freeze_and_flush().await?;
         }
 
         Ok(())
@@ -1274,26 +1320,62 @@ impl Tenant {
             timeline
         };
 
-        info!("waiting for layer_removal_cs.lock()");
-        // No timeout here, GC & Compaction should be responsive to the `TimelineState::Stopping` change.
-        let layer_removal_guard = timeline.layer_removal_cs.lock().await;
-        info!("got layer_removal_cs.lock(), deleting layer files");
+        // Now that the Timeline is in Stopping state, request all the related tasks to
+        // shut down.
+        //
+        // NB: If you call delete_timeline multiple times concurrently, they will
+        // all go through the motions here. Make sure the code here is idempotent,
+        // and don't error out if some of the shutdown tasks have already been
+        // completed!
 
-        // NB: storage_sync upload tasks that reference these layers have been cancelled
-        //     by the caller.
+        // Stop the walreceiver first.
+        debug!("waiting for wal receiver to shutdown");
+        task_mgr::shutdown_tasks(
+            Some(TaskKind::WalReceiverManager),
+            Some(self.tenant_id),
+            Some(timeline_id),
+        )
+        .await;
+        debug!("wal receiver shutdown confirmed");
 
-        let local_timeline_directory = self.conf.timeline_path(&timeline_id, &self.tenant_id);
-        // XXX make this atomic so that, if we crash-mid-way, the timeline won't be picked up
-        // with some layers missing.
-        std::fs::remove_dir_all(&local_timeline_directory).with_context(|| {
-            format!(
-                "Failed to remove local timeline directory '{}'",
-                local_timeline_directory.display()
-            )
-        })?;
-        info!("finished deleting layer files, releasing layer_removal_cs.lock()");
+        info!("waiting for timeline tasks to shutdown");
+        task_mgr::shutdown_tasks(None, Some(self.tenant_id), Some(timeline_id)).await;
 
-        drop(layer_removal_guard);
+        {
+            // Grab the layer_removal_cs lock, and actually perform the deletion.
+            //
+            // This lock prevents multiple concurrent delete_timeline calls from
+            // stepping on each other's toes, while deleting the files. It also
+            // prevents GC or compaction from running at the same time.
+            //
+            // Note that there are still other race conditions between
+            // GC, compaction and timeline deletion. GC task doesn't
+            // register itself properly with the timeline it's
+            // operating on. See
+            // https://github.com/neondatabase/neon/issues/2671
+            //
+            // No timeout here, GC & Compaction should be responsive to the
+            // `TimelineState::Stopping` change.
+            info!("waiting for layer_removal_cs.lock()");
+            let layer_removal_guard = timeline.layer_removal_cs.lock().await;
+            info!("got layer_removal_cs.lock(), deleting layer files");
+
+            // NB: storage_sync upload tasks that reference these layers have been cancelled
+            //     by the caller.
+
+            let local_timeline_directory = self.conf.timeline_path(&timeline_id, &self.tenant_id);
+            // XXX make this atomic so that, if we crash-mid-way, the timeline won't be picked up
+            // with some layers missing.
+            std::fs::remove_dir_all(&local_timeline_directory).with_context(|| {
+                format!(
+                    "Failed to remove local timeline directory '{}'",
+                    local_timeline_directory.display()
+                )
+            })?;
+
+            info!("finished deleting layer files, releasing layer_removal_cs.lock()");
+            drop(layer_removal_guard);
+        }
 
         // Remove the timeline from the map.
         let mut timelines = self.timelines.lock().unwrap();
@@ -1371,7 +1453,7 @@ impl Tenant {
 
                     // Spawn gc and compaction loops. The loops will shut themselves
                     // down when they notice that the tenant is inactive.
-                    crate::tenant_tasks::start_background_loops(self.tenant_id);
+                    tasks::start_background_loops(self.tenant_id);
 
                     for timeline in not_broken_timelines {
                         timeline.set_state(TimelineState::Active);
@@ -1414,7 +1496,7 @@ impl Tenant {
         });
     }
 
-    pub fn set_broken(&self) {
+    pub fn set_broken(&self, reason: &str) {
         self.state.send_modify(|current_state| {
             match *current_state {
                 TenantState::Active => {
@@ -1423,18 +1505,22 @@ impl Tenant {
                     // activated should never be marked as broken. We cope with it the best
                     // we can, but it shouldn't happen.
                     *current_state = TenantState::Broken;
-                    warn!("Changing Active tenant to Broken state");
+                    warn!("Changing Active tenant to Broken state, reason: {}", reason);
                 }
                 TenantState::Broken => {
                     // This shouldn't happen either
-                    warn!("Tenant is already broken");
+                    warn!("Tenant is already in Broken state");
                 }
                 TenantState::Stopping => {
                     // This shouldn't happen either
                     *current_state = TenantState::Broken;
-                    warn!("Marking Stopping tenant as Broken");
+                    warn!(
+                        "Marking Stopping tenant as Broken state, reason: {}",
+                        reason
+                    );
                 }
                 TenantState::Loading | TenantState::Attaching => {
+                    info!("Setting tenant as Broken state, reason: {}", reason);
                     *current_state = TenantState::Broken;
                 }
             }
@@ -1595,7 +1681,7 @@ impl Tenant {
         new_metadata: TimelineMetadata,
         ancestor: Option<Arc<Timeline>>,
         remote_client: Option<RemoteTimelineClient>,
-    ) -> anyhow::Result<Timeline> {
+    ) -> anyhow::Result<Arc<Timeline>> {
         if let Some(ancestor_timeline_id) = new_metadata.ancestor_timeline() {
             anyhow::ensure!(
                 ancestor.is_some(),
@@ -1631,7 +1717,7 @@ impl Tenant {
             conf,
             tenant_conf: Arc::new(RwLock::new(tenant_conf)),
             timelines: Mutex::new(HashMap::new()),
-            gc_cs: Mutex::new(()),
+            gc_cs: tokio::sync::Mutex::new(()),
             walredo_mgr,
             remote_storage,
             state,
@@ -1778,16 +1864,22 @@ impl Tenant {
         target_timeline_id: Option<TimelineId>,
         horizon: u64,
         pitr: Duration,
-        checkpoint_before_gc: bool,
     ) -> anyhow::Result<GcResult> {
         let mut totals: GcResult = Default::default();
         let now = Instant::now();
 
-        let gc_timelines = self.refresh_gc_info_internal(target_timeline_id, horizon, pitr)?;
+        let gc_timelines = self
+            .refresh_gc_info_internal(target_timeline_id, horizon, pitr)
+            .await?;
 
         utils::failpoint_sleep_millis_async!("gc_iteration_internal_after_getting_gc_timelines");
 
-        info!("starting on {} timelines", gc_timelines.len());
+        // If there is nothing to GC, we don't want any messages in the INFO log.
+        if !gc_timelines.is_empty() {
+            info!("{} timelines need GC", gc_timelines.len());
+        } else {
+            debug!("{} timelines need GC", gc_timelines.len());
+        }
 
         // Perform GC for each timeline.
         //
@@ -1805,18 +1897,6 @@ impl Tenant {
                 // made.
                 break;
             }
-
-            // If requested, force flush all in-memory layers to disk first,
-            // so that they too can be garbage collected. That's
-            // used in tests, so we want as deterministic results as possible.
-            if checkpoint_before_gc {
-                timeline.checkpoint(CheckpointConfig::Forced).await?;
-                info!(
-                    "timeline {} checkpoint_before_gc done",
-                    timeline.timeline_id
-                );
-            }
-
             let result = timeline.gc().await?;
             totals += result;
         }
@@ -1830,7 +1910,7 @@ impl Tenant {
     /// [`Tenant::get_gc_horizon`].
     ///
     /// This is usually executed as part of periodic gc, but can now be triggered more often.
-    pub fn refresh_gc_info(&self) -> anyhow::Result<Vec<Arc<Timeline>>> {
+    pub async fn refresh_gc_info(&self) -> anyhow::Result<Vec<Arc<Timeline>>> {
         // since this method can now be called at different rates than the configured gc loop, it
         // might be that these configuration values get applied faster than what it was previously,
         // since these were only read from the gc task.
@@ -1841,54 +1921,60 @@ impl Tenant {
         let target_timeline_id = None;
 
         self.refresh_gc_info_internal(target_timeline_id, horizon, pitr)
+            .await
     }
 
-    fn refresh_gc_info_internal(
+    async fn refresh_gc_info_internal(
         &self,
         target_timeline_id: Option<TimelineId>,
         horizon: u64,
         pitr: Duration,
     ) -> anyhow::Result<Vec<Arc<Timeline>>> {
         // grab mutex to prevent new timelines from being created here.
-        let gc_cs = self.gc_cs.lock().unwrap();
-
-        let timelines = self.timelines.lock().unwrap();
+        let gc_cs = self.gc_cs.lock().await;
 
         // Scan all timelines. For each timeline, remember the timeline ID and
         // the branch point where it was created.
-        let mut all_branchpoints: BTreeSet<(TimelineId, Lsn)> = BTreeSet::new();
-        let timeline_ids = {
-            if let Some(target_timeline_id) = target_timeline_id.as_ref() {
-                if timelines.get(target_timeline_id).is_none() {
-                    bail!("gc target timeline does not exist")
-                }
-            };
+        let (all_branchpoints, timeline_ids): (BTreeSet<(TimelineId, Lsn)>, _) = {
+            let timelines = self.timelines.lock().unwrap();
+            let mut all_branchpoints = BTreeSet::new();
+            let timeline_ids = {
+                if let Some(target_timeline_id) = target_timeline_id.as_ref() {
+                    if timelines.get(target_timeline_id).is_none() {
+                        bail!("gc target timeline does not exist")
+                    }
+                };
 
-            timelines
-                .iter()
-                .map(|(timeline_id, timeline_entry)| {
-                    if let Some(ancestor_timeline_id) = &timeline_entry.get_ancestor_timeline_id() {
-                        // If target_timeline is specified, we only need to know branchpoints of its children
-                        if let Some(timeline_id) = target_timeline_id {
-                            if ancestor_timeline_id == &timeline_id {
+                timelines
+                    .iter()
+                    .map(|(timeline_id, timeline_entry)| {
+                        if let Some(ancestor_timeline_id) =
+                            &timeline_entry.get_ancestor_timeline_id()
+                        {
+                            // If target_timeline is specified, we only need to know branchpoints of its children
+                            if let Some(timeline_id) = target_timeline_id {
+                                if ancestor_timeline_id == &timeline_id {
+                                    all_branchpoints.insert((
+                                        *ancestor_timeline_id,
+                                        timeline_entry.get_ancestor_lsn(),
+                                    ));
+                                }
+                            }
+                            // Collect branchpoints for all timelines
+                            else {
                                 all_branchpoints.insert((
                                     *ancestor_timeline_id,
                                     timeline_entry.get_ancestor_lsn(),
                                 ));
                             }
                         }
-                        // Collect branchpoints for all timelines
-                        else {
-                            all_branchpoints
-                                .insert((*ancestor_timeline_id, timeline_entry.get_ancestor_lsn()));
-                        }
-                    }
 
-                    *timeline_id
-                })
-                .collect::<Vec<_>>()
+                        *timeline_id
+                    })
+                    .collect::<Vec<_>>()
+            };
+            (all_branchpoints, timeline_ids)
         };
-        drop(timelines);
 
         // Ok, we now know all the branch points.
         // Update the GC information for each timeline.
@@ -1914,7 +2000,7 @@ impl Tenant {
                     ))
                     .map(|&x| x.1)
                     .collect();
-                timeline.update_gc_info(branchpoints, cutoff, pitr)?;
+                timeline.update_gc_info(branchpoints, cutoff, pitr).await?;
 
                 gc_timelines.push(timeline);
             }
@@ -1924,7 +2010,7 @@ impl Tenant {
     }
 
     /// Branch an existing timeline
-    fn branch_timeline(
+    async fn branch_timeline(
         &self,
         src: TimelineId,
         dst: TimelineId,
@@ -1933,10 +2019,11 @@ impl Tenant {
         // We need to hold this lock to prevent GC from starting at the same time. GC scans the directory to learn
         // about timelines, so otherwise a race condition is possible, where we create new timeline and GC
         // concurrently removes data that is needed by the new timeline.
-        let _gc_cs = self.gc_cs.lock().unwrap();
-        let timelines = self.timelines.lock().unwrap();
-        let timeline_uninit_mark = self.create_timeline_uninit_mark(dst, &timelines)?;
-        drop(timelines);
+        let _gc_cs = self.gc_cs.lock().await;
+        let timeline_uninit_mark = {
+            let timelines = self.timelines.lock().unwrap();
+            self.create_timeline_uninit_mark(dst, &timelines)?
+        };
 
         // In order for the branch creation task to not wait for GC/compaction,
         // we need to make sure that the starting LSN of the child branch is not out of scope midway by
@@ -2083,13 +2170,12 @@ impl Tenant {
         let tenant_id = raw_timeline.owning_tenant.tenant_id;
         let unfinished_timeline = raw_timeline.raw_timeline()?;
 
-        tokio::task::block_in_place(|| {
-            import_datadir::import_timeline_from_postgres_datadir(
-                unfinished_timeline,
-                pgdata_path,
-                pgdata_lsn,
-            )
-        })
+        import_datadir::import_timeline_from_postgres_datadir(
+            unfinished_timeline,
+            pgdata_path,
+            pgdata_lsn,
+        )
+        .await
         .with_context(|| {
             format!("Failed to import pgdatadir for timeline {tenant_id}/{timeline_id}")
         })?;
@@ -2105,8 +2191,13 @@ impl Tenant {
         });
 
         unfinished_timeline
-            .checkpoint(CheckpointConfig::Flush).await
-            .with_context(|| format!("Failed to checkpoint after pgdatadir import for timeline {tenant_id}/{timeline_id}"))?;
+            .freeze_and_flush()
+            .await
+            .with_context(|| {
+                format!(
+                    "Failed to flush after pgdatadir import for timeline {tenant_id}/{timeline_id}"
+                )
+            })?;
 
         let timeline = {
             let mut timelines = self.timelines.lock().unwrap();
@@ -2135,12 +2226,12 @@ impl Tenant {
         let tenant_id = self.tenant_id;
 
         let remote_client = if let Some(remote_storage) = self.remote_storage.as_ref() {
-            let remote_client = create_remote_timeline_client(
+            let remote_client = RemoteTimelineClient::new(
                 remote_storage.clone(),
                 self.conf,
                 tenant_id,
                 new_timeline_id,
-            )?;
+            );
             remote_client.init_upload_queue_for_empty_remote(&new_metadata)?;
             Some(remote_client)
         } else {
@@ -2165,7 +2256,7 @@ impl Tenant {
                 Ok(UninitializedTimeline {
                     owning_tenant: self,
                     timeline_id: new_timeline_id,
-                    raw_timeline: Some((Arc::new(new_timeline), uninit_mark)),
+                    raw_timeline: Some((new_timeline, uninit_mark)),
                 })
             }
             Err(e) => {
@@ -2183,7 +2274,7 @@ impl Tenant {
         new_metadata: TimelineMetadata,
         ancestor: Option<Arc<Timeline>>,
         remote_client: Option<RemoteTimelineClient>,
-    ) -> anyhow::Result<Timeline> {
+    ) -> anyhow::Result<Arc<Timeline>> {
         let timeline_data = self
             .create_timeline_data(
                 new_timeline_id,
@@ -2266,12 +2357,12 @@ impl Tenant {
         // See more for on the issue #2748 condenced out of the initial PR review.
         let mut shared_cache = self.cached_logical_sizes.lock().await;
 
-        size::gather_inputs(self, logical_sizes_at_once, &mut *shared_cache).await
+        size::gather_inputs(self, logical_sizes_at_once, &mut shared_cache).await
     }
 }
 
 fn remove_timeline_and_uninit_mark(timeline_dir: &Path, uninit_mark: &Path) -> anyhow::Result<()> {
-    fs::remove_dir_all(&timeline_dir)
+    fs::remove_dir_all(timeline_dir)
         .or_else(|e| {
             if e.kind() == std::io::ErrorKind::NotFound {
                 // we can leave the uninit mark without a timeline dir,
@@ -2287,7 +2378,7 @@ fn remove_timeline_and_uninit_mark(timeline_dir: &Path, uninit_mark: &Path) -> a
                 timeline_dir.display()
             )
         })?;
-    fs::remove_file(&uninit_mark).with_context(|| {
+    fs::remove_file(uninit_mark).with_context(|| {
         format!(
             "Failed to remove timeline uninit mark file {}",
             uninit_mark.display()
@@ -2387,7 +2478,7 @@ fn try_create_target_tenant_dir(
         anyhow::bail!("failpoint tenant-creation-before-tmp-rename");
     });
 
-    fs::rename(&temporary_tenant_dir, target_tenant_directory).with_context(|| {
+    fs::rename(temporary_tenant_dir, target_tenant_directory).with_context(|| {
         format!(
             "failed to move tenant {} temporary directory {} into the permanent one {}",
             tenant_id,
@@ -2441,9 +2532,9 @@ fn run_initdb(
     );
 
     let initdb_output = Command::new(&initdb_bin_path)
-        .args(&["-D", &initdb_target_dir.to_string_lossy()])
-        .args(&["-U", &conf.superuser])
-        .args(&["-E", "utf8"])
+        .args(["-D", &initdb_target_dir.to_string_lossy()])
+        .args(["-U", &conf.superuser])
+        .args(["-E", "utf8"])
         .arg("--no-instructions")
         // This is only used for a temporary installation that is deleted shortly after,
         // so no need to fsync it
@@ -2486,12 +2577,8 @@ pub fn dump_layerfile_from_path(path: &Path, verbose: bool) -> anyhow::Result<()
     file.read_exact_at(&mut header_buf, 0)?;
 
     match u16::from_be_bytes(header_buf) {
-        crate::IMAGE_FILE_MAGIC => {
-            image_layer::ImageLayer::new_for_path(path, file)?.dump(verbose)?
-        }
-        crate::DELTA_FILE_MAGIC => {
-            delta_layer::DeltaLayer::new_for_path(path, file)?.dump(verbose)?
-        }
+        crate::IMAGE_FILE_MAGIC => ImageLayer::new_for_path(path, file)?.dump(verbose)?,
+        crate::DELTA_FILE_MAGIC => DeltaLayer::new_for_path(path, file)?.dump(verbose)?,
         magic => bail!("unrecognized magic identifier: {:?}", magic),
     }
 
@@ -2528,7 +2615,7 @@ pub mod harness {
     };
 
     use super::*;
-    use crate::tenant_config::{TenantConf, TenantConfOpt};
+    use crate::tenant::config::{TenantConf, TenantConfOpt};
     use hex_literal::hex;
     use utils::id::{TenantId, TimelineId};
 
@@ -2605,9 +2692,11 @@ pub mod harness {
 
             // Disable automatic GC and compaction to make the unit tests more deterministic.
             // The tests perform them manually if needed.
-            let mut tenant_conf = TenantConf::dummy_conf();
-            tenant_conf.gc_period = Duration::ZERO;
-            tenant_conf.compaction_period = Duration::ZERO;
+            let tenant_conf = TenantConf {
+                gc_period: Duration::ZERO,
+                compaction_period: Duration::ZERO,
+                ..TenantConf::default()
+            };
 
             let tenant_id = TenantId::generate();
             fs::create_dir_all(conf.tenant_path(&tenant_id))?;
@@ -2726,9 +2815,18 @@ mod tests {
         writer.finish_write(Lsn(0x20));
         drop(writer);
 
-        assert_eq!(tline.get(*TEST_KEY, Lsn(0x10))?, TEST_IMG("foo at 0x10"));
-        assert_eq!(tline.get(*TEST_KEY, Lsn(0x1f))?, TEST_IMG("foo at 0x10"));
-        assert_eq!(tline.get(*TEST_KEY, Lsn(0x20))?, TEST_IMG("foo at 0x20"));
+        assert_eq!(
+            tline.get(*TEST_KEY, Lsn(0x10)).no_ondemand_download()?,
+            TEST_IMG("foo at 0x10")
+        );
+        assert_eq!(
+            tline.get(*TEST_KEY, Lsn(0x1f)).no_ondemand_download()?,
+            TEST_IMG("foo at 0x10")
+        );
+        assert_eq!(
+            tline.get(*TEST_KEY, Lsn(0x20)).no_ondemand_download()?,
+            TEST_IMG("foo at 0x20")
+        );
 
         Ok(())
     }
@@ -2793,7 +2891,9 @@ mod tests {
         //assert_current_logical_size(&tline, Lsn(0x40));
 
         // Branch the history, modify relation differently on the new timeline
-        tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x30)))?;
+        tenant
+            .branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x30)))
+            .await?;
         let newtline = tenant
             .get_timeline(NEW_TIMELINE_ID, true)
             .expect("Should have a local timeline");
@@ -2803,15 +2903,15 @@ mod tests {
 
         // Check page contents on both branches
         assert_eq!(
-            from_utf8(&tline.get(TEST_KEY_A, Lsn(0x40))?)?,
+            from_utf8(&tline.get(TEST_KEY_A, Lsn(0x40)).no_ondemand_download()?)?,
             "foo at 0x40"
         );
         assert_eq!(
-            from_utf8(&newtline.get(TEST_KEY_A, Lsn(0x40))?)?,
+            from_utf8(&newtline.get(TEST_KEY_A, Lsn(0x40)).no_ondemand_download()?)?,
             "bar at 0x40"
         );
         assert_eq!(
-            from_utf8(&newtline.get(TEST_KEY_B, Lsn(0x40))?)?,
+            from_utf8(&newtline.get(TEST_KEY_B, Lsn(0x40)).no_ondemand_download()?)?,
             "foobar at 0x20"
         );
 
@@ -2841,7 +2941,7 @@ mod tests {
             writer.finish_write(lsn);
             lsn += 0x10;
         }
-        tline.checkpoint(CheckpointConfig::Forced).await?;
+        tline.freeze_and_flush().await?;
         {
             let writer = tline.writer();
             writer.put(
@@ -2858,7 +2958,7 @@ mod tests {
             )?;
             writer.finish_write(lsn);
         }
-        tline.checkpoint(CheckpointConfig::Forced).await
+        tline.freeze_and_flush().await
     }
 
     #[tokio::test]
@@ -2873,15 +2973,18 @@ mod tests {
         make_some_layers(tline.as_ref(), Lsn(0x20)).await?;
 
         // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50
-        // FIXME: this doesn't actually remove any layer currently, given how the checkpointing
+        // FIXME: this doesn't actually remove any layer currently, given how the flushing
         // and compaction works. But it does set the 'cutoff' point so that the cross check
         // below should fail.
         tenant
-            .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)
+            .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO)
             .await?;
 
         // try to branch at lsn 25, should fail because we already garbage collected the data
-        match tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25))) {
+        match tenant
+            .branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25)))
+            .await
+        {
             Ok(_) => panic!("branching should have failed"),
             Err(err) => {
                 assert!(err.to_string().contains("invalid branch start lsn"));
@@ -2906,7 +3009,10 @@ mod tests {
             .create_empty_timeline(TIMELINE_ID, Lsn(0x50), DEFAULT_PG_VERSION)?
             .initialize()?;
         // try to branch at lsn 0x25, should fail because initdb lsn is 0x50
-        match tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25))) {
+        match tenant
+            .branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25)))
+            .await
+        {
             Ok(_) => panic!("branching should have failed"),
             Err(err) => {
                 assert!(&err.to_string().contains("invalid branch start lsn"));
@@ -2933,7 +3039,7 @@ mod tests {
         let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?;
         make_some_layers(tline.as_ref(), Lsn(0x20)).await?;
 
-        repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?;
+        repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO)?;
         let latest_gc_cutoff_lsn = tline.get_latest_gc_cutoff_lsn();
         assert!(*latest_gc_cutoff_lsn > Lsn(0x25));
         match tline.get(*TEST_KEY, Lsn(0x25)) {
@@ -2954,15 +3060,20 @@ mod tests {
             .initialize()?;
         make_some_layers(tline.as_ref(), Lsn(0x20)).await?;
 
-        tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?;
+        tenant
+            .branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))
+            .await?;
         let newtline = tenant
             .get_timeline(NEW_TIMELINE_ID, true)
             .expect("Should have a local timeline");
         // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50
         tenant
-            .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)
+            .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO)
             .await?;
-        assert!(newtline.get(*TEST_KEY, Lsn(0x25)).is_ok());
+        assert!(newtline
+            .get(*TEST_KEY, Lsn(0x25))
+            .no_ondemand_download()
+            .is_ok());
 
         Ok(())
     }
@@ -2976,7 +3087,9 @@ mod tests {
             .initialize()?;
         make_some_layers(tline.as_ref(), Lsn(0x20)).await?;
 
-        tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?;
+        tenant
+            .branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))
+            .await?;
         let newtline = tenant
             .get_timeline(NEW_TIMELINE_ID, true)
             .expect("Should have a local timeline");
@@ -2985,12 +3098,12 @@ mod tests {
 
         // run gc on parent
         tenant
-            .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)
+            .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO)
             .await?;
 
         // Check that the data is still accessible on the branch.
         assert_eq!(
-            newtline.get(*TEST_KEY, Lsn(0x50))?,
+            newtline.get(*TEST_KEY, Lsn(0x50)).no_ondemand_download()?,
             TEST_IMG(&format!("foo at {}", Lsn(0x40)))
         );
 
@@ -3007,7 +3120,6 @@ mod tests {
                 .create_empty_timeline(TIMELINE_ID, Lsn(0x8000), DEFAULT_PG_VERSION)?
                 .initialize()?;
             make_some_layers(tline.as_ref(), Lsn(0x8000)).await?;
-            tline.checkpoint(CheckpointConfig::Forced).await?;
         }
 
         let tenant = harness.load().await;
@@ -3030,16 +3142,16 @@ mod tests {
                 .initialize()?;
 
             make_some_layers(tline.as_ref(), Lsn(0x20)).await?;
-            tline.checkpoint(CheckpointConfig::Forced).await?;
 
-            tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?;
+            tenant
+                .branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))
+                .await?;
 
             let newtline = tenant
                 .get_timeline(NEW_TIMELINE_ID, true)
                 .expect("Should have a local timeline");
 
             make_some_layers(newtline.as_ref(), Lsn(0x60)).await?;
-            tline.checkpoint(CheckpointConfig::Forced).await?;
         }
 
         // check that both of them are initially unloaded
@@ -3111,7 +3223,7 @@ mod tests {
         writer.finish_write(Lsn(0x10));
         drop(writer);
 
-        tline.checkpoint(CheckpointConfig::Forced).await?;
+        tline.freeze_and_flush().await?;
         tline.compact().await?;
 
         let writer = tline.writer();
@@ -3119,7 +3231,7 @@ mod tests {
         writer.finish_write(Lsn(0x20));
         drop(writer);
 
-        tline.checkpoint(CheckpointConfig::Forced).await?;
+        tline.freeze_and_flush().await?;
         tline.compact().await?;
 
         let writer = tline.writer();
@@ -3127,7 +3239,7 @@ mod tests {
         writer.finish_write(Lsn(0x30));
         drop(writer);
 
-        tline.checkpoint(CheckpointConfig::Forced).await?;
+        tline.freeze_and_flush().await?;
         tline.compact().await?;
 
         let writer = tline.writer();
@@ -3135,21 +3247,36 @@ mod tests {
         writer.finish_write(Lsn(0x40));
         drop(writer);
 
-        tline.checkpoint(CheckpointConfig::Forced).await?;
+        tline.freeze_and_flush().await?;
         tline.compact().await?;
 
-        assert_eq!(tline.get(*TEST_KEY, Lsn(0x10))?, TEST_IMG("foo at 0x10"));
-        assert_eq!(tline.get(*TEST_KEY, Lsn(0x1f))?, TEST_IMG("foo at 0x10"));
-        assert_eq!(tline.get(*TEST_KEY, Lsn(0x20))?, TEST_IMG("foo at 0x20"));
-        assert_eq!(tline.get(*TEST_KEY, Lsn(0x30))?, TEST_IMG("foo at 0x30"));
-        assert_eq!(tline.get(*TEST_KEY, Lsn(0x40))?, TEST_IMG("foo at 0x40"));
+        assert_eq!(
+            tline.get(*TEST_KEY, Lsn(0x10)).no_ondemand_download()?,
+            TEST_IMG("foo at 0x10")
+        );
+        assert_eq!(
+            tline.get(*TEST_KEY, Lsn(0x1f)).no_ondemand_download()?,
+            TEST_IMG("foo at 0x10")
+        );
+        assert_eq!(
+            tline.get(*TEST_KEY, Lsn(0x20)).no_ondemand_download()?,
+            TEST_IMG("foo at 0x20")
+        );
+        assert_eq!(
+            tline.get(*TEST_KEY, Lsn(0x30)).no_ondemand_download()?,
+            TEST_IMG("foo at 0x30")
+        );
+        assert_eq!(
+            tline.get(*TEST_KEY, Lsn(0x40)).no_ondemand_download()?,
+            TEST_IMG("foo at 0x40")
+        );
 
         Ok(())
     }
 
     //
-    // Insert 1000 key-value pairs with increasing keys, checkpoint,
-    // repeat 50 times.
+    // Insert 1000 key-value pairs with increasing keys, flush, compact, GC.
+    // Repeat 50 times.
     //
     #[tokio::test]
     async fn test_bulk_insert() -> anyhow::Result<()> {
@@ -3184,8 +3311,10 @@ mod tests {
 
             let cutoff = tline.get_last_record_lsn();
 
-            tline.update_gc_info(Vec::new(), cutoff, Duration::ZERO)?;
-            tline.checkpoint(CheckpointConfig::Forced).await?;
+            tline
+                .update_gc_info(Vec::new(), cutoff, Duration::ZERO)
+                .await?;
+            tline.freeze_and_flush().await?;
             tline.compact().await?;
             tline.gc().await?;
         }
@@ -3248,16 +3377,17 @@ mod tests {
             for (blknum, last_lsn) in updated.iter().enumerate() {
                 test_key.field6 = blknum as u32;
                 assert_eq!(
-                    tline.get(test_key, lsn)?,
+                    tline.get(test_key, lsn).no_ondemand_download()?,
                     TEST_IMG(&format!("{} at {}", blknum, last_lsn))
                 );
             }
 
-            // Perform a cycle of checkpoint, compaction, and GC
-            println!("checkpointing {}", lsn);
+            // Perform a cycle of flush, compact, and GC
             let cutoff = tline.get_last_record_lsn();
-            tline.update_gc_info(Vec::new(), cutoff, Duration::ZERO)?;
-            tline.checkpoint(CheckpointConfig::Forced).await?;
+            tline
+                .update_gc_info(Vec::new(), cutoff, Duration::ZERO)
+                .await?;
+            tline.freeze_and_flush().await?;
             tline.compact().await?;
             tline.gc().await?;
         }
@@ -3305,7 +3435,9 @@ mod tests {
         let mut tline_id = TIMELINE_ID;
         for _ in 0..50 {
             let new_tline_id = TimelineId::generate();
-            tenant.branch_timeline(tline_id, new_tline_id, Some(lsn))?;
+            tenant
+                .branch_timeline(tline_id, new_tline_id, Some(lsn))
+                .await?;
             tline = tenant
                 .get_timeline(new_tline_id, true)
                 .expect("Should have the branched timeline");
@@ -3331,16 +3463,17 @@ mod tests {
             for (blknum, last_lsn) in updated.iter().enumerate() {
                 test_key.field6 = blknum as u32;
                 assert_eq!(
-                    tline.get(test_key, lsn)?,
+                    tline.get(test_key, lsn).no_ondemand_download()?,
                     TEST_IMG(&format!("{} at {}", blknum, last_lsn))
                 );
             }
 
-            // Perform a cycle of checkpoint, compaction, and GC
-            println!("checkpointing {}", lsn);
+            // Perform a cycle of flush, compact, and GC
             let cutoff = tline.get_last_record_lsn();
-            tline.update_gc_info(Vec::new(), cutoff, Duration::ZERO)?;
-            tline.checkpoint(CheckpointConfig::Forced).await?;
+            tline
+                .update_gc_info(Vec::new(), cutoff, Duration::ZERO)
+                .await?;
+            tline.freeze_and_flush().await?;
             tline.compact().await?;
             tline.gc().await?;
         }
@@ -3370,7 +3503,9 @@ mod tests {
         #[allow(clippy::needless_range_loop)]
         for idx in 0..NUM_TLINES {
             let new_tline_id = TimelineId::generate();
-            tenant.branch_timeline(tline_id, new_tline_id, Some(lsn))?;
+            tenant
+                .branch_timeline(tline_id, new_tline_id, Some(lsn))
+                .await?;
             tline = tenant
                 .get_timeline(new_tline_id, true)
                 .expect("Should have the branched timeline");
@@ -3403,7 +3538,7 @@ mod tests {
                 println!("checking [{idx}][{blknum}] at {lsn}");
                 test_key.field6 = blknum as u32;
                 assert_eq!(
-                    tline.get(test_key, *lsn)?,
+                    tline.get(test_key, *lsn).no_ondemand_download()?,
                     TEST_IMG(&format!("{idx} {blknum} at {lsn}"))
                 );
             }
diff --git a/pageserver/src/tenant_config.rs b/pageserver/src/tenant/config.rs
similarity index 89%
rename from pageserver/src/tenant_config.rs
rename to pageserver/src/tenant/config.rs
index 1204d1abd8..c95a98fbc7 100644
--- a/pageserver/src/tenant_config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -30,7 +30,7 @@ pub mod defaults {
     pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024;
     pub const DEFAULT_GC_PERIOD: &str = "100 s";
     pub const DEFAULT_IMAGE_CREATION_THRESHOLD: usize = 3;
-    pub const DEFAULT_PITR_INTERVAL: &str = "30 days";
+    pub const DEFAULT_PITR_INTERVAL: &str = "7 days";
     pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "2 seconds";
     pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "3 seconds";
     pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 10 * 1024 * 1024;
@@ -191,11 +191,10 @@ impl TenantConfOpt {
     }
 }
 
-impl TenantConf {
-    pub fn default() -> TenantConf {
+impl Default for TenantConf {
+    fn default() -> Self {
         use defaults::*;
-
-        TenantConf {
+        Self {
             checkpoint_distance: DEFAULT_CHECKPOINT_DISTANCE,
             checkpoint_timeout: humantime::parse_duration(DEFAULT_CHECKPOINT_TIMEOUT)
                 .expect("cannot parse default checkpoint timeout"),
@@ -220,29 +219,4 @@ impl TenantConf {
             trace_read_requests: false,
         }
     }
-
-    pub fn dummy_conf() -> Self {
-        TenantConf {
-            checkpoint_distance: defaults::DEFAULT_CHECKPOINT_DISTANCE,
-            checkpoint_timeout: Duration::from_secs(600),
-            compaction_target_size: 4 * 1024 * 1024,
-            compaction_period: Duration::from_secs(10),
-            compaction_threshold: defaults::DEFAULT_COMPACTION_THRESHOLD,
-            gc_horizon: defaults::DEFAULT_GC_HORIZON,
-            gc_period: Duration::from_secs(10),
-            image_creation_threshold: defaults::DEFAULT_IMAGE_CREATION_THRESHOLD,
-            pitr_interval: Duration::from_secs(60 * 60),
-            walreceiver_connect_timeout: humantime::parse_duration(
-                defaults::DEFAULT_WALRECEIVER_CONNECT_TIMEOUT,
-            )
-            .unwrap(),
-            lagging_wal_timeout: humantime::parse_duration(
-                defaults::DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT,
-            )
-            .unwrap(),
-            max_lsn_wal_lag: NonZeroU64::new(defaults::DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG)
-                .unwrap(),
-            trace_read_requests: false,
-        }
-    }
 }
diff --git a/pageserver/src/tenant/disk_btree.rs b/pageserver/src/tenant/disk_btree.rs
index 33255dbd82..88dff32b76 100644
--- a/pageserver/src/tenant/disk_btree.rs
+++ b/pageserver/src/tenant/disk_btree.rs
@@ -139,7 +139,7 @@ impl<'a, const L: usize> OnDiskNode<'a, L> {
         off += keys_len as u64;
 
         let values_off = off as usize;
-        let values_len = num_children as usize * VALUE_SZ as usize;
+        let values_len = num_children as usize * VALUE_SZ;
         //off += values_len as u64;
 
         let prefix = &buf[prefix_off..prefix_off + prefix_len as usize];
@@ -177,7 +177,7 @@ impl<'a, const L: usize> OnDiskNode<'a, L> {
         while low < high {
             let mid = low + size / 2;
 
-            let key_off = mid as usize * self.suffix_len as usize;
+            let key_off = mid * self.suffix_len as usize;
             let suffix = &self.keys[key_off..key_off + self.suffix_len as usize];
             // Does this match?
             keybuf[self.prefix_len as usize..].copy_from_slice(suffix);
@@ -328,7 +328,7 @@ where
             while idx < node.num_children as usize {
                 let suffix = &node.keys[key_off..key_off + suffix_len];
                 keybuf[prefix_len..].copy_from_slice(suffix);
-                let value = node.value(idx as usize);
+                let value = node.value(idx);
                 #[allow(clippy::collapsible_if)]
                 if node.level == 0 {
                     // leaf
@@ -368,7 +368,7 @@ where
                 key_off -= suffix_len;
                 let suffix = &node.keys[key_off..key_off + suffix_len];
                 keybuf[prefix_len..].copy_from_slice(suffix);
-                let value = node.value(idx as usize);
+                let value = node.value(idx);
                 #[allow(clippy::collapsible_if)]
                 if node.level == 0 {
                     // leaf
@@ -629,7 +629,7 @@ impl<const L: usize> BuildNode<L> {
         self.keys.extend(&key[self.prefix.len()..]);
         self.values.extend(value.0);
 
-        assert!(self.keys.len() == self.num_children as usize * self.suffix_len as usize);
+        assert!(self.keys.len() == self.num_children as usize * self.suffix_len);
         assert!(self.values.len() == self.num_children as usize * VALUE_SZ);
 
         self.size += self.suffix_len + VALUE_SZ;
@@ -674,7 +674,7 @@ impl<const L: usize> BuildNode<L> {
         self.size -= prefix_len * self.num_children as usize;
         self.size += prefix_len;
 
-        assert!(self.keys.len() == self.num_children as usize * self.suffix_len as usize);
+        assert!(self.keys.len() == self.num_children as usize * self.suffix_len);
         assert!(self.values.len() == self.num_children as usize * VALUE_SZ);
 
         true
@@ -684,7 +684,7 @@ impl<const L: usize> BuildNode<L> {
     /// Serialize the node to on-disk format.
     ///
     fn pack(&self) -> Bytes {
-        assert!(self.keys.len() == self.num_children as usize * self.suffix_len as usize);
+        assert!(self.keys.len() == self.num_children as usize * self.suffix_len);
         assert!(self.values.len() == self.num_children as usize * VALUE_SZ);
         assert!(self.num_children > 0);
 
@@ -940,7 +940,7 @@ mod tests {
             let t = -(f64::ln(u));
             let key_int = (t * 1000000.0) as u128;
 
-            all_data.insert(key_int as u128, idx as u64);
+            all_data.insert(key_int, idx as u64);
         }
 
         // Build a tree from it
diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs
index 0774fa42a6..c433e65ad2 100644
--- a/pageserver/src/tenant/ephemeral_file.rs
+++ b/pageserver/src/tenant/ephemeral_file.rs
@@ -91,7 +91,7 @@ impl EphemeralFile {
                 break;
             }
 
-            off += n as usize;
+            off += n;
         }
         Ok(())
     }
diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs
index 19252ecf6e..44bed5959f 100644
--- a/pageserver/src/tenant/layer_map.rs
+++ b/pageserver/src/tenant/layer_map.rs
@@ -12,7 +12,6 @@
 
 use crate::metrics::NUM_ONDISK_LAYERS;
 use crate::repository::Key;
-use crate::tenant::inmemory_layer::InMemoryLayer;
 use crate::tenant::storage_layer::{range_eq, range_overlaps};
 use amplify_num::i256;
 use anyhow::Result;
@@ -27,7 +26,7 @@ use std::sync::Arc;
 use tracing::*;
 use utils::lsn::Lsn;
 
-use super::storage_layer::Layer;
+use super::storage_layer::{InMemoryLayer, Layer};
 
 ///
 /// LayerMap tracks what layers exist on a timeline.
@@ -261,8 +260,10 @@ where
     /// contain the version, even if it's missing from the returned
     /// layer.
     ///
-    pub fn search(&self, key: Key, end_lsn: Lsn) -> Result<Option<SearchResult<L>>> {
-        // linear search
+    /// NOTE: This only searches the 'historic' layers, *not* the
+    /// 'open' and 'frozen' layers!
+    ///
+    pub fn search(&self, key: Key, end_lsn: Lsn) -> Option<SearchResult<L>> {
         // Find the latest image layer that covers the given key
         let mut latest_img: Option<Arc<L>> = None;
         let mut latest_img_lsn: Option<Lsn> = None;
@@ -286,10 +287,10 @@ where
             assert!(img_lsn < end_lsn);
             if Lsn(img_lsn.0 + 1) == end_lsn {
                 // found exact match
-                return Ok(Some(SearchResult {
+                return Some(SearchResult {
                     layer: Arc::clone(l),
                     lsn_floor: img_lsn,
-                }));
+                });
             }
             if img_lsn > latest_img_lsn.unwrap_or(Lsn(0)) {
                 latest_img = Some(Arc::clone(l));
@@ -327,14 +328,16 @@ where
                 latest_delta.replace(Arc::clone(l));
                 break;
             }
-            // this layer's end LSN is smaller than the requested point. If there's
-            // nothing newer, this is what we need to return. Remember this.
-            if let Some(old_candidate) = &latest_delta {
-                if l.get_lsn_range().end > old_candidate.get_lsn_range().end {
+            if l.get_lsn_range().end > latest_img_lsn.unwrap_or(Lsn(0)) {
+                // this layer's end LSN is smaller than the requested point. If there's
+                // nothing newer, this is what we need to return. Remember this.
+                if let Some(old_candidate) = &latest_delta {
+                    if l.get_lsn_range().end > old_candidate.get_lsn_range().end {
+                        latest_delta.replace(Arc::clone(l));
+                    }
+                } else {
                     latest_delta.replace(Arc::clone(l));
                 }
-            } else {
-                latest_delta.replace(Arc::clone(l));
             }
         }
         if let Some(l) = latest_delta {
@@ -346,19 +349,19 @@ where
                 Lsn(latest_img_lsn.unwrap_or(Lsn(0)).0 + 1),
                 l.get_lsn_range().start,
             );
-            Ok(Some(SearchResult {
+            Some(SearchResult {
                 lsn_floor,
                 layer: l,
-            }))
+            })
         } else if let Some(l) = latest_img {
             trace!("found img layer and no deltas for request on {key} at {end_lsn}");
-            Ok(Some(SearchResult {
+            Some(SearchResult {
                 lsn_floor: latest_img_lsn.unwrap(),
                 layer: l,
-            }))
+            })
         } else {
             trace!("no layer found for request on {key} at {end_lsn}");
-            Ok(None)
+            None
         }
     }
 
diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs
index f3a0a5171a..297cccbe30 100644
--- a/pageserver/src/tenant/metadata.rs
+++ b/pageserver/src/tenant/metadata.rs
@@ -255,8 +255,7 @@ pub fn save_metadata(
     // fsync the parent directory to ensure the directory entry is durable
     if first_save {
         let timeline_dir = File::open(
-            &path
-                .parent()
+            path.parent()
                 .expect("Metadata should always have a parent dir"),
         )?;
         timeline_dir.sync_all()?;
diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant/mgr.rs
similarity index 91%
rename from pageserver/src/tenant_mgr.rs
rename to pageserver/src/tenant/mgr.rs
index f4f1eba717..dce7cd8bae 100644
--- a/pageserver/src/tenant_mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -17,8 +17,8 @@ use utils::crashsafe;
 
 use crate::config::PageServerConf;
 use crate::task_mgr::{self, TaskKind};
+use crate::tenant::config::TenantConfOpt;
 use crate::tenant::{Tenant, TenantState};
-use crate::tenant_config::TenantConfOpt;
 use crate::IGNORED_TENANT_FILE_NAME;
 
 use utils::fs_ext::PathExt;
@@ -196,7 +196,7 @@ pub async fn shutdown_all_tenants() {
         let tenant_id = tenant.tenant_id();
         debug!("shutdown tenant {tenant_id}");
 
-        if let Err(err) = tenant.checkpoint().await {
+        if let Err(err) = tenant.freeze_and_flush().await {
             error!("Could not checkpoint tenant {tenant_id} during shutdown: {err:?}");
         }
     }
@@ -216,8 +216,7 @@ pub async fn create_tenant(
         hash_map::Entry::Vacant(v) => {
             // Hold the write_tenants() lock, since all of this is local IO.
             // If this section ever becomes contentious, introduce a new `TenantState::Creating`.
-            let tenant_directory =
-                super::tenant::create_tenant_files(conf, tenant_conf, tenant_id)?;
+            let tenant_directory = super::create_tenant_files(conf, tenant_conf, tenant_id)?;
             let created_tenant =
                 schedule_local_tenant_processing(conf, &tenant_directory, remote_storage)?;
             let crated_tenant_id = created_tenant.tenant_id();
@@ -262,27 +261,6 @@ pub async fn get_tenant(tenant_id: TenantId, active_only: bool) -> anyhow::Resul
 }
 
 pub async fn delete_timeline(tenant_id: TenantId, timeline_id: TimelineId) -> anyhow::Result<()> {
-    // Start with the shutdown of timeline tasks (this shuts down the walreceiver)
-    // It is important that we do not take locks here, and do not check whether the timeline exists
-    // because if we hold tenants_state::write_tenants() while awaiting for the tasks to join
-    // we cannot create new timelines and tenants, and that can take quite some time,
-    // it can even become stuck due to a bug making whole pageserver unavailable for some operations
-    // so this is the way how we deal with concurrent delete requests: shutdown everythig, wait for confirmation
-    // and then try to actually remove timeline from inmemory state and this is the point when concurrent requests
-    // will synchronize and either fail with the not found error or succeed
-
-    debug!("waiting for wal receiver to shutdown");
-    task_mgr::shutdown_tasks(
-        Some(TaskKind::WalReceiverManager),
-        Some(tenant_id),
-        Some(timeline_id),
-    )
-    .await;
-    debug!("wal receiver shutdown confirmed");
-
-    info!("waiting for timeline tasks to shutdown");
-    task_mgr::shutdown_tasks(None, Some(tenant_id), Some(timeline_id)).await;
-    info!("timeline task shutdown completed");
     match get_tenant(tenant_id, true).await {
         Ok(tenant) => {
             tenant.delete_timeline(timeline_id).await?;
@@ -452,7 +430,7 @@ where
         Err(e) => {
             let tenants_accessor = TENANTS.read().await;
             match tenants_accessor.get(&tenant_id) {
-                Some(tenant) => tenant.set_broken(),
+                Some(tenant) => tenant.set_broken(&e.to_string()),
                 None => warn!("Tenant {tenant_id} got removed from memory"),
             }
             Err(e)
@@ -496,7 +474,7 @@ pub async fn immediate_gc(
         async move {
             fail::fail_point!("immediate_gc_task_pre");
             let result = tenant
-                .gc_iteration(Some(timeline_id), gc_horizon, pitr, true)
+                .gc_iteration(Some(timeline_id), gc_horizon, pitr)
                 .instrument(info_span!("manual_gc", tenant = %tenant_id, timeline = %timeline_id))
                 .await;
                 // FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it
@@ -514,3 +492,53 @@ pub async fn immediate_gc(
 
     Ok(wait_task_done)
 }
+
+#[cfg(feature = "testing")]
+pub async fn immediate_compact(
+    tenant_id: TenantId,
+    timeline_id: TimelineId,
+) -> Result<tokio::sync::oneshot::Receiver<anyhow::Result<()>>, ApiError> {
+    let guard = TENANTS.read().await;
+
+    let tenant = guard
+        .get(&tenant_id)
+        .map(Arc::clone)
+        .with_context(|| format!("Tenant {tenant_id} not found"))
+        .map_err(ApiError::NotFound)?;
+
+    let timeline = tenant
+        .get_timeline(timeline_id, true)
+        .map_err(ApiError::NotFound)?;
+
+    // Run in task_mgr to avoid race with detach operation
+    let (task_done, wait_task_done) = tokio::sync::oneshot::channel();
+    task_mgr::spawn(
+        &tokio::runtime::Handle::current(),
+        TaskKind::Compaction,
+        Some(tenant_id),
+        Some(timeline_id),
+        &format!(
+            "timeline_compact_handler compaction run for tenant {tenant_id} timeline {timeline_id}"
+        ),
+        false,
+        async move {
+            let result = timeline
+                .compact()
+                .instrument(
+                    info_span!("manual_compact", tenant = %tenant_id, timeline = %timeline_id),
+                )
+                .await;
+
+            match task_done.send(result) {
+                Ok(_) => (),
+                Err(result) => error!("failed to send compaction result: {result:?}"),
+            }
+            Ok(())
+        },
+    );
+
+    // drop the guard until after we've spawned the task so that timeline shutdown will wait for the task
+    drop(guard);
+
+    Ok(wait_task_done)
+}
diff --git a/pageserver/src/storage_sync2.rs b/pageserver/src/tenant/remote_timeline_client.rs
similarity index 76%
rename from pageserver/src/storage_sync2.rs
rename to pageserver/src/tenant/remote_timeline_client.rs
index 55dbeaff73..1db69d8b73 100644
--- a/pageserver/src/storage_sync2.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -32,7 +32,8 @@
 //! the corresponding remote operation with the timeline's [`RemoteTimelineClient`]:
 //!
 //! - [`RemoteTimelineClient::schedule_layer_file_upload`]  when we've created a new layer file.
-//! - [`RemoteTimelineClient::schedule_index_upload`] when we've updated the timeline metadata file.
+//! - [`RemoteTimelineClient::schedule_index_upload_for_metadata_update`] when we've updated the timeline metadata file.
+//! - [`RemoteTimelineClient::schedule_index_upload_for_file_changes`] to upload an updated index file, after we've scheduled file uploads
 //! - [`RemoteTimelineClient::schedule_layer_file_deletion`] when we've deleted one or more layer files.
 //!
 //! Internally, these functions create [`UploadOp`]s and put them in a queue.
@@ -57,7 +58,7 @@
 //! To have a consistent remote structure, it's important that uploads and
 //! deletions are performed in the right order. For example, the index file
 //! contains a list of layer files, so it must not be uploaded until all the
-//! layer files that are in its list have been succesfully uploaded.
+//! layer files that are in its list have been successfully uploaded.
 //!
 //! The contract between client and its user is that the user is responsible of
 //! scheduling operations in an order that keeps the remote consistent as
@@ -139,7 +140,7 @@
 //! Note that if we crash during file deletion between the index update
 //! that removes the file from the list of files, and deleting the remote file,
 //! the file is leaked in the remote storage. Similarly, if a new file is created
-//! and uploaded, but the pageserver dies permantently before updating the
+//! and uploaded, but the pageserver dies permanently before updating the
 //! remote index file, the new file is leaked in remote storage. We accept and
 //! tolerate that for now.
 //! Note further that we cannot easily fix this by scheduling deletes for every
@@ -147,31 +148,43 @@
 //! following two cases:
 //! - (1) We had the file locally, deleted it locally, scheduled a remote delete,
 //!   but crashed before it finished remotely.
-//! - (2) We never had the file locally because we were still in tenant attach
-//!   when we crashed. (Similar case for on-demand download in the future.)
+//! - (2) We never had the file locally because we haven't on-demand downloaded
+//!   it yet.
 //!
-//! # Downloads (= Tenant Attach)
+//! # Downloads
 //!
 //! In addition to the upload queue, [`RemoteTimelineClient`] has functions for
-//! downloading files from the remote storage. Downloads are performed immediately,
-//! independently of the uploads.
+//! downloading files from the remote storage. Downloads are performed immediately
+//! against the `RemoteStorage`, independently of the upload queue.
 //!
 //! When we attach a tenant, we perform the following steps:
 //! - create `Tenant` object in `TenantState::Attaching` state
-//! - List timelines that are present in remote storage, and download their remote [`IndexPart`]s
-//! - For each timeline, create `Timeline` struct and a `RemoteTimelineClient`, and initialize the client's upload queue with its `IndexPart`
-//! - eagerly download all the remote layers using the client's download APIs
-//! - transition tenant from `TenantState::Attaching` to `TenantState::Active` state.
+//! - List timelines that are present in remote storage, and for each:
+//!   - download their remote [`IndexPart`]s
+//!   - create `Timeline` struct and a `RemoteTimelineClient`
+//!   - initialize the client's upload queue with its `IndexPart`
+//!   - create [`RemoteLayer`] instances for layers that are referenced by `IndexPart`
+//!     but not present locally
+//!   - schedule uploads for layers that are only present locally.
+//!   - if the remote `IndexPart`'s metadata was newer than the metadata in
+//!     the local filesystem, write the remote metadata to the local filesystem
+//! - After the above is done for each timeline, open the tenant for business by
+//!   transitioning it from `TenantState::Attaching` to `TenantState::Active` state.
+//!   This starts the timelines' WAL-receivers and the tenant's GC & Compaction loops.
 //!
-//! Most of the above happens in [`Timeline::reconcile_with_remote`].
+//! Most of the above steps happen in [`Timeline::reconcile_with_remote`] or its callers.
 //! We keep track of the fact that a client is in `Attaching` state in a marker
-//! file on the local disk.
-//! However, the distinction is moot for storage sync since we call
-//! `reconcile_with_remote` for tenants both with and without the marker file.
-//!
-//! In the future, downloading will be done on-demand and `reconcile_with_remote`
-//! will only be responsible for re-scheduling upload ops after a crash of an
-//! `Active` tenant.
+//! file on the local disk. This is critical because, when we restart the pageserver,
+//! we do not want to do the `List timelines` step for each tenant that has already
+//! been successfully attached (for performance & cost reasons).
+//! Instead, for a tenant without the attach marker file, we assume that the
+//! local state is in sync or ahead of the remote state. This includes the list
+//! of all of the tenant's timelines, which is particularly critical to be up-to-date:
+//! if there's a timeline on the remote that the pageserver doesn't know about,
+//! the GC will not consider its branch point, leading to data loss.
+//! So, for a tenant with the attach marker file, we know that we do not yet have
+//! persisted all the remote timeline's metadata files locally. To exclude the
+//! risk above, we re-run the procedure for such tenants
 //!
 //! # Operating Without Remote Storage
 //!
@@ -194,39 +207,51 @@ mod upload;
 // re-export these
 pub use download::{is_temp_download_file, list_remote_timelines};
 
-use std::collections::{HashMap, VecDeque};
-use std::fmt::Debug;
-use std::ops::DerefMut;
 use std::sync::atomic::{AtomicU32, Ordering};
 use std::sync::{Arc, Mutex};
 
 use anyhow::ensure;
 use remote_storage::{DownloadError, GenericRemoteStorage};
+use std::ops::DerefMut;
 use tokio::runtime::Runtime;
 use tracing::{debug, info, warn};
 use tracing::{info_span, Instrument};
-
 use utils::lsn::Lsn;
 
-use self::index::IndexPart;
-
-use crate::metrics::MeasureRemoteOp;
 use crate::metrics::RemoteOpFileKind;
 use crate::metrics::RemoteOpKind;
-use crate::metrics::REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS;
-use crate::tenant::filename::LayerFileName;
+use crate::metrics::{MeasureRemoteOp, RemoteTimelineClientMetrics};
+use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
 use crate::{
     config::PageServerConf,
-    storage_sync::index::LayerFileMetadata,
     task_mgr,
     task_mgr::TaskKind,
     task_mgr::BACKGROUND_RUNTIME,
     tenant::metadata::TimelineMetadata,
+    tenant::upload_queue::{
+        UploadOp, UploadQueue, UploadQueueInitialized, UploadQueueStopped, UploadTask,
+    },
     {exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS},
 };
 
 use utils::id::{TenantId, TimelineId};
 
+use self::index::IndexPart;
+
+use super::storage_layer::LayerFileName;
+
+// Occasional network issues and such can cause remote operations to fail, and
+// that's expected. If a download fails, we log it at info-level, and retry.
+// But after FAILED_DOWNLOAD_WARN_THRESHOLD retries, we start to log it at WARN
+// level instead, as repeated failures can mean a more serious problem. If it
+// fails more than FAILED_DOWNLOAD_RETRIES times, we give up
+const FAILED_DOWNLOAD_WARN_THRESHOLD: u32 = 3;
+const FAILED_DOWNLOAD_RETRIES: u32 = 10;
+
+// Similarly log failed uploads and deletions at WARN level, after this many
+// retries. Uploads and deletions are retried forever, though.
+const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3;
+
 /// A client for accessing a timeline's data in remote storage.
 ///
 /// This takes care of managing the number of connections, and balancing them
@@ -256,209 +281,42 @@ pub struct RemoteTimelineClient {
 
     upload_queue: Mutex<UploadQueue>,
 
+    metrics: Arc<RemoteTimelineClientMetrics>,
+
     storage_impl: GenericRemoteStorage,
 }
 
-// clippy warns that Uninitialized is much smaller than Initialized, which wastes
-// memory for Uninitialized variants. Doesn't matter in practice, there are not
-// that many upload queues in a running pageserver, and most of them are initialized
-// anyway.
-#[allow(clippy::large_enum_variant)]
-enum UploadQueue {
-    Uninitialized,
-    Initialized(UploadQueueInitialized),
-    Stopped(UploadQueueStopped),
-}
-
-impl UploadQueue {
-    fn as_str(&self) -> &'static str {
-        match self {
-            UploadQueue::Uninitialized => "Uninitialized",
-            UploadQueue::Initialized(_) => "Initialized",
-            UploadQueue::Stopped(_) => "Stopped",
-        }
-    }
-}
-
-/// This keeps track of queued and in-progress tasks.
-struct UploadQueueInitialized {
-    /// Counter to assign task IDs
-    task_counter: u64,
-
-    /// All layer files stored in the remote storage, taking into account all
-    /// in-progress and queued operations
-    latest_files: HashMap<LayerFileName, LayerFileMetadata>,
-
-    /// Metadata stored in the remote storage, taking into account all
-    /// in-progress and queued operations.
-    /// DANGER: do not return to outside world, e.g., safekeepers.
-    latest_metadata: TimelineMetadata,
-
-    /// `disk_consistent_lsn` from the last metadata file that was successfully
-    /// uploaded. `Lsn(0)` if nothing was uploaded yet.
-    /// Unlike `latest_files` or `latest_metadata`, this value is never ahead.
-    /// Safekeeper can rely on it to make decisions for WAL storage.
-    last_uploaded_consistent_lsn: Lsn,
-
-    // Breakdown of different kinds of tasks currently in-progress
-    num_inprogress_layer_uploads: usize,
-    num_inprogress_metadata_uploads: usize,
-    num_inprogress_deletions: usize,
-
-    /// Tasks that are currently in-progress. In-progress means that a tokio Task
-    /// has been launched for it. An in-progress task can be busy uploading, but it can
-    /// also be waiting on the `concurrency_limiter` Semaphore in S3Bucket, or it can
-    /// be waiting for retry in `exponential_backoff`.
-    inprogress_tasks: HashMap<u64, Arc<UploadTask>>,
-
-    /// Queued operations that have not been launched yet. They might depend on previous
-    /// tasks to finish. For example, metadata upload cannot be performed before all
-    /// preceding layer file uploads have completed.
-    queued_operations: VecDeque<UploadOp>,
-}
-
-struct UploadQueueStopped {
-    last_uploaded_consistent_lsn: Lsn,
-}
-
-impl UploadQueue {
-    fn initialize_empty_remote(
-        &mut self,
-        metadata: &TimelineMetadata,
-    ) -> anyhow::Result<&mut UploadQueueInitialized> {
-        match self {
-            UploadQueue::Uninitialized => (),
-            UploadQueue::Initialized(_) | UploadQueue::Stopped(_) => {
-                anyhow::bail!("already initialized, state {}", self.as_str())
-            }
-        }
-
-        info!("initializing upload queue for empty remote");
-
-        let state = UploadQueueInitialized {
-            // As described in the doc comment, it's ok for `latest_files` and `latest_metadata` to be ahead.
-            latest_files: HashMap::new(),
-            latest_metadata: metadata.clone(),
-            // We haven't uploaded anything yet, so, `last_uploaded_consistent_lsn` must be 0 to prevent
-            // safekeepers from garbage-collecting anything.
-            last_uploaded_consistent_lsn: Lsn(0),
-            // what follows are boring default initializations
-            task_counter: 0,
-            num_inprogress_layer_uploads: 0,
-            num_inprogress_metadata_uploads: 0,
-            num_inprogress_deletions: 0,
-            inprogress_tasks: HashMap::new(),
-            queued_operations: VecDeque::new(),
-        };
-
-        *self = UploadQueue::Initialized(state);
-        Ok(self.initialized_mut().expect("we just set it"))
-    }
-
-    fn initialize_with_current_remote_index_part(
-        &mut self,
-        index_part: &IndexPart,
-    ) -> anyhow::Result<&mut UploadQueueInitialized> {
-        match self {
-            UploadQueue::Uninitialized => (),
-            UploadQueue::Initialized(_) | UploadQueue::Stopped(_) => {
-                anyhow::bail!("already initialized, state {}", self.as_str())
-            }
-        }
-
-        let mut files = HashMap::with_capacity(index_part.timeline_layers.len());
-        for layer_name in &index_part.timeline_layers {
-            let layer_metadata = index_part
-                .layer_metadata
-                .get(layer_name)
-                .map(LayerFileMetadata::from)
-                .unwrap_or(LayerFileMetadata::MISSING);
-            files.insert(layer_name.to_owned(), layer_metadata);
-        }
-
-        let index_part_metadata = index_part.parse_metadata()?;
-        info!(
-            "initializing upload queue with remote index_part.disk_consistent_lsn: {}",
-            index_part_metadata.disk_consistent_lsn()
-        );
-
-        let state = UploadQueueInitialized {
-            latest_files: files,
-            latest_metadata: index_part_metadata.clone(),
-            last_uploaded_consistent_lsn: index_part_metadata.disk_consistent_lsn(),
-            // what follows are boring default initializations
-            task_counter: 0,
-            num_inprogress_layer_uploads: 0,
-            num_inprogress_metadata_uploads: 0,
-            num_inprogress_deletions: 0,
-            inprogress_tasks: HashMap::new(),
-            queued_operations: VecDeque::new(),
-        };
-
-        *self = UploadQueue::Initialized(state);
-        Ok(self.initialized_mut().expect("we just set it"))
-    }
-
-    fn initialized_mut(&mut self) -> anyhow::Result<&mut UploadQueueInitialized> {
-        match self {
-            UploadQueue::Uninitialized | UploadQueue::Stopped(_) => {
-                anyhow::bail!("queue is in state {}", self.as_str())
-            }
-            UploadQueue::Initialized(x) => Ok(x),
-        }
-    }
-}
-
-/// An in-progress upload or delete task.
-#[derive(Debug)]
-struct UploadTask {
-    /// Unique ID of this task. Used as the key in `inprogress_tasks` above.
-    task_id: u64,
-    retries: AtomicU32,
-
-    op: UploadOp,
-}
-
-#[derive(Debug)]
-enum UploadOp {
-    /// Upload a layer file
-    UploadLayer(LayerFileName, LayerFileMetadata),
-
-    /// Upload the metadata file
-    UploadMetadata(IndexPart, Lsn),
-
-    /// Delete a file.
-    Delete(RemoteOpFileKind, LayerFileName),
-
-    /// Barrier. When the barrier operation is reached,
-    Barrier(tokio::sync::watch::Sender<()>),
-}
-
-impl std::fmt::Display for UploadOp {
-    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
-        match self {
-            UploadOp::UploadLayer(path, metadata) => {
-                write!(
-                    f,
-                    "UploadLayer({}, size={:?})",
-                    path.file_name(),
-                    metadata.file_size()
-                )
-            }
-            UploadOp::UploadMetadata(_, lsn) => write!(f, "UploadMetadata(lsn: {})", lsn),
-            UploadOp::Delete(_, path) => write!(f, "Delete({})", path.file_name()),
-            UploadOp::Barrier(_) => write!(f, "Barrier"),
-        }
-    }
-}
-
 impl RemoteTimelineClient {
+    ///
+    /// Create a remote storage client for given timeline
+    ///
+    /// Note: the caller must initialize the upload queue before any uploads can be scheduled,
+    /// by calling init_upload_queue.
+    ///
+    pub fn new(
+        remote_storage: GenericRemoteStorage,
+        conf: &'static PageServerConf,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+    ) -> RemoteTimelineClient {
+        RemoteTimelineClient {
+            conf,
+            runtime: &BACKGROUND_RUNTIME,
+            tenant_id,
+            timeline_id,
+            storage_impl: remote_storage,
+            upload_queue: Mutex::new(UploadQueue::Uninitialized),
+            metrics: Arc::new(RemoteTimelineClientMetrics::new(&tenant_id, &timeline_id)),
+        }
+    }
+
     /// Initialize the upload queue for a remote storage that already received
     /// an index file upload, i.e., it's not empty.
     /// The given `index_part` must be the one on the remote.
     pub fn init_upload_queue(&self, index_part: &IndexPart) -> anyhow::Result<()> {
         let mut upload_queue = self.upload_queue.lock().unwrap();
         upload_queue.initialize_with_current_remote_index_part(index_part)?;
+        self.update_remote_physical_size_gauge(Some(index_part));
         Ok(())
     }
 
@@ -470,6 +328,7 @@ impl RemoteTimelineClient {
     ) -> anyhow::Result<()> {
         let mut upload_queue = self.upload_queue.lock().unwrap();
         upload_queue.initialize_empty_remote(local_metadata)?;
+        self.update_remote_physical_size_gauge(None);
         Ok(())
     }
 
@@ -481,6 +340,24 @@ impl RemoteTimelineClient {
         }
     }
 
+    fn update_remote_physical_size_gauge(&self, current_remote_index_part: Option<&IndexPart>) {
+        let size: u64 = if let Some(current_remote_index_part) = current_remote_index_part {
+            current_remote_index_part
+                .layer_metadata
+                .values()
+                // If we don't have the file size for the layer, don't account for it in the metric.
+                .map(|ilmd| ilmd.file_size.unwrap_or(0))
+                .sum()
+        } else {
+            0
+        };
+        self.metrics.remote_physical_size_gauge().set(size);
+    }
+
+    pub fn get_remote_physical_size(&self) -> u64 {
+        self.metrics.remote_physical_size_gauge().get()
+    }
+
     //
     // Download operations.
     //
@@ -490,6 +367,10 @@ impl RemoteTimelineClient {
 
     /// Download index file
     pub async fn download_index_file(&self) -> Result<IndexPart, DownloadError> {
+        let _unfinished_gauge_guard = self
+            .metrics
+            .call_begin(&RemoteOpFileKind::Index, &RemoteOpKind::Download);
+
         download::download_index_part(
             self.conf,
             &self.storage_impl,
@@ -501,6 +382,7 @@ impl RemoteTimelineClient {
             self.timeline_id,
             RemoteOpFileKind::Index,
             RemoteOpKind::Download,
+            Arc::clone(&self.metrics),
         )
         .await
     }
@@ -515,21 +397,27 @@ impl RemoteTimelineClient {
         layer_file_name: &LayerFileName,
         layer_metadata: &LayerFileMetadata,
     ) -> anyhow::Result<u64> {
-        let downloaded_size = download::download_layer_file(
-            self.conf,
-            &self.storage_impl,
-            self.tenant_id,
-            self.timeline_id,
-            layer_file_name,
-            layer_metadata,
-        )
-        .measure_remote_op(
-            self.tenant_id,
-            self.timeline_id,
-            RemoteOpFileKind::Layer,
-            RemoteOpKind::Download,
-        )
-        .await?;
+        let downloaded_size = {
+            let _unfinished_gauge_guard = self
+                .metrics
+                .call_begin(&RemoteOpFileKind::Layer, &RemoteOpKind::Download);
+            download::download_layer_file(
+                self.conf,
+                &self.storage_impl,
+                self.tenant_id,
+                self.timeline_id,
+                layer_file_name,
+                layer_metadata,
+            )
+            .measure_remote_op(
+                self.tenant_id,
+                self.timeline_id,
+                RemoteOpFileKind::Layer,
+                RemoteOpKind::Download,
+                Arc::clone(&self.metrics),
+            )
+            .await?
+        };
 
         // Update the metadata for given layer file. The remote index file
         // might be missing some information for the file; this allows us
@@ -539,7 +427,17 @@ impl RemoteTimelineClient {
             let mut guard = self.upload_queue.lock().unwrap();
             let upload_queue = guard.initialized_mut()?;
             if let Some(upgraded) = upload_queue.latest_files.get_mut(layer_file_name) {
-                upgraded.merge(&new_metadata);
+                if upgraded.merge(&new_metadata) {
+                    upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
+                }
+                // If we don't do an index file upload inbetween here and restart,
+                // the value will go back down after pageserver restart, since we will
+                // have lost this data point.
+                // But, we upload index part fairly frequently, and restart pageserver rarely.
+                // So, by accounting eagerly, we present a most-of-the-time-more-accurate value sooner.
+                self.metrics
+                    .remote_physical_size_gauge()
+                    .add(downloaded_size);
             } else {
                 // The file should exist, since we just downloaded it.
                 warn!(
@@ -556,14 +454,20 @@ impl RemoteTimelineClient {
     //
 
     ///
-    /// Launch an index-file upload operation in the background.
+    /// Launch an index-file upload operation in the background, with
+    /// updated metadata.
     ///
     /// The upload will be added to the queue immediately, but it
     /// won't be performed until all previosuly scheduled layer file
     /// upload operations have completed successfully.  This is to
     /// ensure that when the index file claims that layers X, Y and Z
-    /// exist in remote storage, they really do.
-    pub fn schedule_index_upload(
+    /// exist in remote storage, they really do. To wait for the upload
+    /// to complete, use `wait_completion`.
+    ///
+    /// If there were any changes to the list of files, i.e. if any
+    /// layer file uploads were scheduled, since the last index file
+    /// upload, those will be included too.
+    pub fn schedule_index_upload_for_metadata_update(
         self: &Arc<Self>,
         metadata: &TimelineMetadata,
     ) -> anyhow::Result<()> {
@@ -574,26 +478,60 @@ impl RemoteTimelineClient {
         // ahead of what's _actually_ on the remote during index upload.
         upload_queue.latest_metadata = metadata.clone();
 
+        let metadata_bytes = upload_queue.latest_metadata.to_bytes()?;
+        self.schedule_index_upload(upload_queue, metadata_bytes);
+
+        Ok(())
+    }
+
+    ///
+    /// Launch an index-file upload operation in the background, if necessary.
+    ///
+    /// Use this function to schedule the update of the index file after
+    /// scheduling file uploads or deletions. If no file uploads or deletions
+    /// have been scheduled since the last index file upload, this does
+    /// nothing.
+    ///
+    /// Like schedule_index_upload_for_metadata_update(), this merely adds
+    /// the upload to the upload queue and returns quickly.
+    pub fn schedule_index_upload_for_file_changes(self: &Arc<Self>) -> anyhow::Result<()> {
+        let mut guard = self.upload_queue.lock().unwrap();
+        let upload_queue = guard.initialized_mut()?;
+
+        if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 {
+            let metadata_bytes = upload_queue.latest_metadata.to_bytes()?;
+            self.schedule_index_upload(upload_queue, metadata_bytes);
+        }
+
+        Ok(())
+    }
+
+    /// Launch an index-file upload operation in the background (internal function)
+    fn schedule_index_upload(
+        self: &Arc<Self>,
+        upload_queue: &mut UploadQueueInitialized,
+        metadata_bytes: Vec<u8>,
+    ) {
+        info!(
+            "scheduling metadata upload with {} files ({} changed)",
+            upload_queue.latest_files.len(),
+            upload_queue.latest_files_changes_since_metadata_upload_scheduled,
+        );
+
         let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn();
 
         let index_part = IndexPart::new(
             upload_queue.latest_files.clone(),
             disk_consistent_lsn,
-            upload_queue.latest_metadata.to_bytes()?,
+            metadata_bytes,
         );
         let op = UploadOp::UploadMetadata(index_part, disk_consistent_lsn);
-        self.update_upload_queue_unfinished_metric(1, &op);
+        self.calls_unfinished_metric_begin(&op);
         upload_queue.queued_operations.push_back(op);
-
-        info!(
-            "scheduled metadata upload with {} files",
-            upload_queue.latest_files.len()
-        );
+        upload_queue.latest_files_changes_since_metadata_upload_scheduled = 0;
 
         // Launch the task immediately, if possible
         self.launch_queued_tasks(upload_queue);
-
-        Ok(())
     }
 
     ///
@@ -617,9 +555,10 @@ impl RemoteTimelineClient {
         upload_queue
             .latest_files
             .insert(layer_file_name.clone(), layer_metadata.clone());
+        upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
 
         let op = UploadOp::UploadLayer(layer_file_name.clone(), layer_metadata.clone());
-        self.update_upload_queue_unfinished_metric(1, &op);
+        self.calls_unfinished_metric_begin(&op);
         upload_queue.queued_operations.push_back(op);
 
         info!(
@@ -635,8 +574,11 @@ impl RemoteTimelineClient {
     ///
     /// Launch a delete operation in the background.
     ///
-    /// The deletion won't actually be performed, until all preceding
-    /// upload operations have completed succesfully.
+    /// Note: This schedules an index file upload before the deletions.  The
+    /// deletion won't actually be performed, until any previously scheduled
+    /// upload operations, and the index file upload, have completed
+    /// succesfully.
+    ///
     pub fn schedule_layer_file_deletion(
         self: &Arc<Self>,
         names: &[LayerFileName],
@@ -647,7 +589,6 @@ impl RemoteTimelineClient {
         // Deleting layers doesn't affect the values stored in TimelineMetadata,
         // so we don't need update it. Just serialize it.
         let metadata_bytes = upload_queue.latest_metadata.to_bytes()?;
-        let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn();
 
         // Update the remote index file, removing the to-be-deleted files from the index,
         // before deleting the actual files.
@@ -659,21 +600,17 @@ impl RemoteTimelineClient {
         let no_bail_here = || {
             for name in names {
                 upload_queue.latest_files.remove(name);
+                upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
             }
 
-            let index_part = IndexPart::new(
-                upload_queue.latest_files.clone(),
-                disk_consistent_lsn,
-                metadata_bytes,
-            );
-            let op = UploadOp::UploadMetadata(index_part, disk_consistent_lsn);
-            self.update_upload_queue_unfinished_metric(1, &op);
-            upload_queue.queued_operations.push_back(op);
+            if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 {
+                self.schedule_index_upload(upload_queue, metadata_bytes);
+            }
 
             // schedule the actual deletions
             for name in names {
                 let op = UploadOp::Delete(RemoteOpFileKind::Layer, name.clone());
-                self.update_upload_queue_unfinished_metric(1, &op);
+                self.calls_unfinished_metric_begin(&op);
                 upload_queue.queued_operations.push_back(op);
                 info!("scheduled layer file deletion {}", name.file_name());
             }
@@ -825,7 +762,7 @@ impl RemoteTimelineClient {
             // upload finishes or times out soon enough.
             if task_mgr::is_shutdown_requested() {
                 info!("upload task cancelled by shutdown request");
-                self.update_upload_queue_unfinished_metric(-1, &task.op);
+                self.calls_unfinished_metric_end(&task.op);
                 self.stop();
                 return;
             }
@@ -847,11 +784,12 @@ impl RemoteTimelineClient {
                         self.timeline_id,
                         RemoteOpFileKind::Layer,
                         RemoteOpKind::Upload,
+                        Arc::clone(&self.metrics),
                     )
                     .await
                 }
                 UploadOp::UploadMetadata(ref index_part, _lsn) => {
-                    upload::upload_index_part(
+                    let res = upload::upload_index_part(
                         self.conf,
                         &self.storage_impl,
                         self.tenant_id,
@@ -863,8 +801,13 @@ impl RemoteTimelineClient {
                         self.timeline_id,
                         RemoteOpFileKind::Index,
                         RemoteOpKind::Upload,
+                        Arc::clone(&self.metrics),
                     )
-                    .await
+                    .await;
+                    if res.is_ok() {
+                        self.update_remote_physical_size_gauge(Some(index_part));
+                    }
+                    res
                 }
                 UploadOp::Delete(metric_file_kind, ref layer_file_name) => {
                     let path = &self
@@ -877,6 +820,7 @@ impl RemoteTimelineClient {
                             self.timeline_id,
                             *metric_file_kind,
                             RemoteOpKind::Delete,
+                            Arc::clone(&self.metrics),
                         )
                         .await
                 }
@@ -895,12 +839,14 @@ impl RemoteTimelineClient {
                 Err(e) => {
                     let retries = task.retries.fetch_add(1, Ordering::SeqCst);
 
-                    // uploads may fail due to rate limts (IAM, S3) or spurious network and external errors
-                    // such issues are relatively regular, so don't use WARN or ERROR to avoid alerting
-                    // people and tests until the retries are definitely causing delays.
-                    if retries < 3 {
+                    // Uploads can fail due to rate limits (IAM, S3), spurious network problems,
+                    // or other external reasons. Such issues are relatively regular, so log them
+                    // at info level at first, and only WARN if the operation fails repeatedly.
+                    //
+                    // (See similar logic for downloads in `download::download_retry`)
+                    if retries < FAILED_UPLOAD_WARN_THRESHOLD {
                         info!(
-                            "failed to perform remote task {}, will retry (attempt {}): {:?}",
+                            "failed to perform remote task {}, will retry (attempt {}): {:#}",
                             task.op, retries, e
                         );
                     } else {
@@ -964,28 +910,40 @@ impl RemoteTimelineClient {
             // Launch any queued tasks that were unblocked by this one.
             self.launch_queued_tasks(upload_queue);
         }
-        self.update_upload_queue_unfinished_metric(-1, &task.op);
+        self.calls_unfinished_metric_end(&task.op);
     }
 
-    fn update_upload_queue_unfinished_metric(&self, delta: i64, op: &UploadOp) {
-        let (file_kind, op_kind) = match op {
+    fn calls_unfinished_metric_impl(
+        &self,
+        op: &UploadOp,
+    ) -> Option<(RemoteOpFileKind, RemoteOpKind)> {
+        let res = match op {
             UploadOp::UploadLayer(_, _) => (RemoteOpFileKind::Layer, RemoteOpKind::Upload),
             UploadOp::UploadMetadata(_, _) => (RemoteOpFileKind::Index, RemoteOpKind::Upload),
             UploadOp::Delete(file_kind, _) => (*file_kind, RemoteOpKind::Delete),
             UploadOp::Barrier(_) => {
                 // we do not account these
-                return;
+                return None;
             }
         };
-        REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS
-            .get_metric_with_label_values(&[
-                &self.tenant_id.to_string(),
-                &self.timeline_id.to_string(),
-                file_kind.as_str(),
-                op_kind.as_str(),
-            ])
-            .unwrap()
-            .add(delta)
+        Some(res)
+    }
+
+    fn calls_unfinished_metric_begin(&self, op: &UploadOp) {
+        let (file_kind, op_kind) = match self.calls_unfinished_metric_impl(op) {
+            Some(x) => x,
+            None => return,
+        };
+        let guard = self.metrics.call_begin(&file_kind, &op_kind);
+        guard.will_decrement_manually(); // in unfinished_ops_metric_end()
+    }
+
+    fn calls_unfinished_metric_end(&self, op: &UploadOp) {
+        let (file_kind, op_kind) = match self.calls_unfinished_metric_impl(op) {
+            Some(x) => x,
+            None => return,
+        };
+        self.metrics.call_end(&file_kind, &op_kind);
     }
 
     fn stop(&self) {
@@ -1036,7 +994,7 @@ impl RemoteTimelineClient {
 
                 // Tear down queued ops
                 for op in qi.queued_operations.into_iter() {
-                    self.update_upload_queue_unfinished_metric(-1, &op);
+                    self.calls_unfinished_metric_end(&op);
                     // Dropping UploadOp::Barrier() here will make wait_completion() return with an Err()
                     // which is exactly what we want to happen.
                     drop(op);
@@ -1049,28 +1007,6 @@ impl RemoteTimelineClient {
     }
 }
 
-///
-/// Create a remote storage client for given timeline
-///
-/// Note: the caller must initialize the upload queue before any uploads can be scheduled,
-/// by calling init_upload_queue.
-///
-pub fn create_remote_timeline_client(
-    remote_storage: GenericRemoteStorage,
-    conf: &'static PageServerConf,
-    tenant_id: TenantId,
-    timeline_id: TimelineId,
-) -> anyhow::Result<RemoteTimelineClient> {
-    Ok(RemoteTimelineClient {
-        conf,
-        runtime: &BACKGROUND_RUNTIME,
-        tenant_id,
-        timeline_id,
-        storage_impl: remote_storage,
-        upload_queue: Mutex::new(UploadQueue::Uninitialized),
-    })
-}
-
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -1180,6 +1116,10 @@ mod tests {
             timeline_id: TIMELINE_ID,
             storage_impl,
             upload_queue: Mutex::new(UploadQueue::Uninitialized),
+            metrics: Arc::new(RemoteTimelineClientMetrics::new(
+                &harness.tenant_id,
+                &TIMELINE_ID,
+            )),
         });
 
         let remote_timeline_dir =
@@ -1211,15 +1151,19 @@ mod tests {
             assert!(upload_queue.queued_operations.is_empty());
             assert!(upload_queue.inprogress_tasks.len() == 2);
             assert!(upload_queue.num_inprogress_layer_uploads == 2);
+
+            // also check that `latest_file_changes` was updated
+            assert!(upload_queue.latest_files_changes_since_metadata_upload_scheduled == 2);
         }
 
         // Schedule upload of index. Check that it is queued
         let metadata = dummy_metadata(Lsn(0x20));
-        client.schedule_index_upload(&metadata)?;
+        client.schedule_index_upload_for_metadata_update(&metadata)?;
         {
             let mut guard = client.upload_queue.lock().unwrap();
             let upload_queue = guard.initialized_mut().unwrap();
             assert!(upload_queue.queued_operations.len() == 1);
+            assert!(upload_queue.latest_files_changes_since_metadata_upload_scheduled == 0);
         }
 
         // Wait for the uploads to finish
@@ -1255,6 +1199,7 @@ mod tests {
             assert!(upload_queue.inprogress_tasks.len() == 1);
             assert!(upload_queue.num_inprogress_layer_uploads == 1);
             assert!(upload_queue.num_inprogress_deletions == 0);
+            assert!(upload_queue.latest_files_changes_since_metadata_upload_scheduled == 0);
         }
         assert_remote_files(&["foo", "bar", "index_part.json"], &remote_timeline_dir);
 
diff --git a/pageserver/src/storage_sync2/delete.rs b/pageserver/src/tenant/remote_timeline_client/delete.rs
similarity index 100%
rename from pageserver/src/storage_sync2/delete.rs
rename to pageserver/src/tenant/remote_timeline_client/delete.rs
diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs
new file mode 100644
index 0000000000..2e79698087
--- /dev/null
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -0,0 +1,316 @@
+//! Helper functions to download files from remote storage with a RemoteStorage
+//!
+//! The functions in this module retry failed operations automatically, according
+//! to the FAILED_DOWNLOAD_RETRIES constant.
+
+use std::collections::HashSet;
+use std::future::Future;
+use std::path::Path;
+
+use anyhow::{anyhow, Context};
+use tokio::fs;
+use tokio::io::AsyncWriteExt;
+use tracing::{error, info, warn};
+
+use crate::config::PageServerConf;
+use crate::tenant::storage_layer::LayerFileName;
+use crate::{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS};
+use remote_storage::{DownloadError, GenericRemoteStorage};
+use utils::crashsafe::path_with_suffix_extension;
+use utils::id::{TenantId, TimelineId};
+
+use super::index::{IndexPart, IndexPartUnclean, LayerFileMetadata};
+use super::{FAILED_DOWNLOAD_RETRIES, FAILED_DOWNLOAD_WARN_THRESHOLD};
+
+async fn fsync_path(path: impl AsRef<std::path::Path>) -> Result<(), std::io::Error> {
+    fs::File::open(path).await?.sync_all().await
+}
+
+///
+/// If 'metadata' is given, we will validate that the downloaded file's size matches that
+/// in the metadata. (In the future, we might do more cross-checks, like CRC validation)
+///
+/// Returns the size of the downloaded file.
+pub async fn download_layer_file<'a>(
+    conf: &'static PageServerConf,
+    storage: &'a GenericRemoteStorage,
+    tenant_id: TenantId,
+    timeline_id: TimelineId,
+    layer_file_name: &'a LayerFileName,
+    layer_metadata: &'a LayerFileMetadata,
+) -> Result<u64, DownloadError> {
+    let timeline_path = conf.timeline_path(&timeline_id, &tenant_id);
+
+    let local_path = timeline_path.join(layer_file_name.file_name());
+
+    let remote_path = conf
+        .remote_path(&local_path)
+        .map_err(DownloadError::Other)?;
+
+    // Perform a rename inspired by durable_rename from file_utils.c.
+    // The sequence:
+    //     write(tmp)
+    //     fsync(tmp)
+    //     rename(tmp, new)
+    //     fsync(new)
+    //     fsync(parent)
+    // For more context about durable_rename check this email from postgres mailing list:
+    // https://www.postgresql.org/message-id/56583BDD.9060302@2ndquadrant.com
+    // If pageserver crashes the temp file will be deleted on startup and re-downloaded.
+    let temp_file_path = path_with_suffix_extension(&local_path, TEMP_DOWNLOAD_EXTENSION);
+
+    let (mut destination_file, bytes_amount) = download_retry(
+        || async {
+            // TODO: this doesn't use the cached fd for some reason?
+            let mut destination_file = fs::File::create(&temp_file_path).await.with_context(|| {
+                format!(
+                    "Failed to create a destination file for layer '{}'",
+                    temp_file_path.display()
+                )
+            })
+            .map_err(DownloadError::Other)?;
+            let mut download = storage.download(&remote_path).await.with_context(|| {
+                format!(
+                    "Failed to open a download stream for layer with remote storage path '{remote_path:?}'"
+                )
+            })
+            .map_err(DownloadError::Other)?;
+            let bytes_amount = tokio::io::copy(&mut download.download_stream, &mut destination_file).await.with_context(|| {
+                format!("Failed to download layer with remote storage path '{remote_path:?}' into file {temp_file_path:?}")
+            })
+            .map_err(DownloadError::Other)?;
+            Ok((destination_file, bytes_amount))
+        },
+        &format!("download {remote_path:?}"),
+    ).await?;
+
+    // Tokio doc here: https://docs.rs/tokio/1.17.0/tokio/fs/struct.File.html states that:
+    // A file will not be closed immediately when it goes out of scope if there are any IO operations
+    // that have not yet completed. To ensure that a file is closed immediately when it is dropped,
+    // you should call flush before dropping it.
+    //
+    // From the tokio code I see that it waits for pending operations to complete. There shouldt be any because
+    // we assume that `destination_file` file is fully written. I e there is no pending .write(...).await operations.
+    // But for additional safety lets check/wait for any pending operations.
+    destination_file
+        .flush()
+        .await
+        .with_context(|| {
+            format!(
+                "failed to flush source file at {}",
+                temp_file_path.display()
+            )
+        })
+        .map_err(DownloadError::Other)?;
+
+    match layer_metadata.file_size() {
+        Some(expected) if expected != bytes_amount => {
+            return Err(DownloadError::Other(anyhow!(
+                "According to layer file metadata should have downloaded {expected} bytes but downloaded {bytes_amount} bytes into file '{}'",
+                temp_file_path.display()
+            )));
+        }
+        Some(_) | None => {
+            // matches, or upgrading from an earlier IndexPart version
+        }
+    }
+
+    // not using sync_data because it can lose file size update
+    destination_file
+        .sync_all()
+        .await
+        .with_context(|| {
+            format!(
+                "failed to fsync source file at {}",
+                temp_file_path.display()
+            )
+        })
+        .map_err(DownloadError::Other)?;
+    drop(destination_file);
+
+    fail::fail_point!("remote-storage-download-pre-rename", |_| {
+        Err(DownloadError::Other(anyhow!(
+            "remote-storage-download-pre-rename failpoint triggered"
+        )))
+    });
+
+    fs::rename(&temp_file_path, &local_path)
+        .await
+        .with_context(|| {
+            format!(
+                "Could not rename download layer file to {}",
+                local_path.display(),
+            )
+        })
+        .map_err(DownloadError::Other)?;
+
+    fsync_path(&local_path)
+        .await
+        .with_context(|| format!("Could not fsync layer file {}", local_path.display(),))
+        .map_err(DownloadError::Other)?;
+
+    tracing::info!("download complete: {}", local_path.display());
+
+    Ok(bytes_amount)
+}
+
+const TEMP_DOWNLOAD_EXTENSION: &str = "temp_download";
+
+pub fn is_temp_download_file(path: &Path) -> bool {
+    let extension = path.extension().map(|pname| {
+        pname
+            .to_str()
+            .expect("paths passed to this function must be valid Rust strings")
+    });
+    match extension {
+        Some(TEMP_DOWNLOAD_EXTENSION) => true,
+        Some(_) => false,
+        None => false,
+    }
+}
+
+/// List timelines of given tenant in remote storage
+pub async fn list_remote_timelines<'a>(
+    storage: &'a GenericRemoteStorage,
+    conf: &'static PageServerConf,
+    tenant_id: TenantId,
+) -> anyhow::Result<HashSet<TimelineId>> {
+    let tenant_path = conf.timelines_path(&tenant_id);
+    let tenant_storage_path = conf.remote_path(&tenant_path)?;
+
+    fail::fail_point!("storage-sync-list-remote-timelines", |_| {
+        anyhow::bail!("storage-sync-list-remote-timelines");
+    });
+
+    let timelines = download_retry(
+        || storage.list_prefixes(Some(&tenant_storage_path)),
+        &format!("list prefixes for {tenant_path:?}"),
+    )
+    .await?;
+
+    if timelines.is_empty() {
+        anyhow::bail!("no timelines found on the remote storage")
+    }
+
+    let mut timeline_ids = HashSet::new();
+
+    for timeline_remote_storage_key in timelines {
+        let object_name = timeline_remote_storage_key.object_name().ok_or_else(|| {
+            anyhow::anyhow!("failed to get timeline id for remote tenant {tenant_id}")
+        })?;
+
+        let timeline_id: TimelineId = object_name.parse().with_context(|| {
+            format!("failed to parse object name into timeline id '{object_name}'")
+        })?;
+
+        // list_prefixes is assumed to return unique names. Ensure this here.
+        // NB: it's safer to bail out than warn-log this because the pageserver
+        //     needs to absolutely know about _all_ timelines that exist, so that
+        //     GC knows all the branchpoints. If we skipped over a timeline instead,
+        //     GC could delete a layer that's still needed by that timeline.
+        anyhow::ensure!(
+            !timeline_ids.contains(&timeline_id),
+            "list_prefixes contains duplicate timeline id {timeline_id}"
+        );
+        timeline_ids.insert(timeline_id);
+    }
+
+    Ok(timeline_ids)
+}
+
+pub(super) async fn download_index_part(
+    conf: &'static PageServerConf,
+    storage: &GenericRemoteStorage,
+    tenant_id: TenantId,
+    timeline_id: TimelineId,
+) -> Result<IndexPart, DownloadError> {
+    let index_part_path = conf
+        .metadata_path(timeline_id, tenant_id)
+        .with_file_name(IndexPart::FILE_NAME);
+    let part_storage_path = conf
+        .remote_path(&index_part_path)
+        .map_err(DownloadError::BadInput)?;
+
+    let index_part_bytes = download_retry(
+        || async {
+            let mut index_part_download = storage.download(&part_storage_path).await?;
+
+            let mut index_part_bytes = Vec::new();
+            tokio::io::copy(
+                &mut index_part_download.download_stream,
+                &mut index_part_bytes,
+            )
+            .await
+            .with_context(|| {
+                format!("Failed to download an index part into file {index_part_path:?}")
+            })
+            .map_err(DownloadError::Other)?;
+            Ok(index_part_bytes)
+        },
+        &format!("download {part_storage_path:?}"),
+    )
+    .await?;
+
+    let index_part: IndexPartUnclean = serde_json::from_slice(&index_part_bytes)
+        .with_context(|| {
+            format!("Failed to deserialize index part file into file {index_part_path:?}")
+        })
+        .map_err(DownloadError::Other)?;
+
+    let index_part = index_part.remove_unclean_layer_file_names();
+
+    Ok(index_part)
+}
+
+///
+/// Helper function to handle retries for a download operation.
+///
+/// Remote operations can fail due to rate limits (IAM, S3), spurious network
+/// problems, or other external reasons. Retry FAILED_DOWNLOAD_RETRIES times,
+/// with backoff.
+///
+/// (See similar logic for uploads in `perform_upload_task`)
+async fn download_retry<T, O, F>(mut op: O, description: &str) -> Result<T, DownloadError>
+where
+    O: FnMut() -> F,
+    F: Future<Output = Result<T, DownloadError>>,
+{
+    let mut attempts = 0;
+    loop {
+        let result = op().await;
+        match result {
+            Ok(_) => {
+                if attempts > 0 {
+                    info!("{description} succeeded after {attempts} retries");
+                }
+                return result;
+            }
+
+            // These are "permanent" errors that should not be retried.
+            Err(DownloadError::BadInput(_)) | Err(DownloadError::NotFound) => {
+                return result;
+            }
+            // Assume that any other failure might be transient, and the operation might
+            // succeed if we just keep trying.
+            Err(DownloadError::Other(err)) if attempts < FAILED_DOWNLOAD_WARN_THRESHOLD => {
+                info!("{description} failed, will retry (attempt {attempts}): {err:#}");
+            }
+            Err(DownloadError::Other(err)) if attempts < FAILED_DOWNLOAD_RETRIES => {
+                warn!("{description} failed, will retry (attempt {attempts}): {err:#}");
+            }
+            Err(DownloadError::Other(ref err)) => {
+                // Operation failed FAILED_DOWNLOAD_RETRIES times. Time to give up.
+                error!("{description} still failed after {attempts} retries, giving up: {err:?}");
+                return result;
+            }
+        }
+        // sleep and retry
+        exponential_backoff(
+            attempts,
+            DEFAULT_BASE_BACKOFF_SECONDS,
+            DEFAULT_MAX_BACKOFF_SECONDS,
+        )
+        .await;
+        attempts += 1;
+    }
+}
diff --git a/pageserver/src/storage_sync2/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs
similarity index 94%
rename from pageserver/src/storage_sync2/index.rs
rename to pageserver/src/tenant/remote_timeline_client/index.rs
index 82487339ee..c199b7e10b 100644
--- a/pageserver/src/storage_sync2/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -8,7 +8,7 @@ use serde::{Deserialize, Serialize};
 use serde_with::{serde_as, DisplayFromStr};
 use tracing::warn;
 
-use crate::tenant::{filename::LayerFileName, metadata::TimelineMetadata};
+use crate::tenant::{metadata::TimelineMetadata, storage_layer::LayerFileName};
 
 use utils::lsn::Lsn;
 
@@ -48,9 +48,17 @@ impl LayerFileMetadata {
     /// Metadata has holes due to version upgrades. This method is called to upgrade self with the
     /// other value.
     ///
-    /// This is called on the possibly outdated version.
-    pub fn merge(&mut self, other: &Self) {
-        self.file_size = other.file_size.or(self.file_size);
+    /// This is called on the possibly outdated version. Returns true if any changes
+    /// were made.
+    pub fn merge(&mut self, other: &Self) -> bool {
+        let mut changed = false;
+
+        if self.file_size != other.file_size {
+            self.file_size = other.file_size.or(self.file_size);
+            changed = true;
+        }
+
+        changed
     }
 }
 
@@ -75,11 +83,6 @@ where
     /// Additional metadata can might exist in `layer_metadata`.
     pub timeline_layers: HashSet<L>,
 
-    /// FIXME: unused field. This should be removed, but that changes the on-disk format,
-    /// so we need to make sure we're backwards-` (and maybe forwards-) compatible
-    /// First pass is to move it to Optional and the next would be its removal
-    missing_layers: Option<HashSet<L>>,
-
     /// Per layer file name metadata, which can be present for a present or missing layer file.
     ///
     /// Older versions of `IndexPart` will not have this property or have only a part of metadata
@@ -159,8 +162,6 @@ impl IndexPartUnclean {
         let IndexPartUnclean {
             version,
             timeline_layers,
-            // this is an unused field, ignore it on cleaning
-            missing_layers: _,
             layer_metadata,
             disk_consistent_lsn,
             metadata_bytes,
@@ -181,7 +182,6 @@ impl IndexPartUnclean {
                     }
                 })
                 .collect(),
-            missing_layers: None,
             layer_metadata: layer_metadata
                 .into_iter()
                 .filter_map(|(l, m)| l.into_clean().map(|l| (l, m)))
@@ -217,7 +217,6 @@ impl IndexPart {
         Self {
             version: Self::LATEST_VERSION,
             timeline_layers,
-            missing_layers: Some(HashSet::new()),
             layer_metadata,
             disk_consistent_lsn,
             metadata_bytes,
@@ -232,7 +231,7 @@ impl IndexPart {
 /// Serialized form of [`LayerFileMetadata`].
 #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, Default)]
 pub struct IndexLayerMetadata {
-    file_size: Option<u64>,
+    pub(super) file_size: Option<u64>,
 }
 
 impl From<&'_ LayerFileMetadata> for IndexLayerMetadata {
@@ -251,7 +250,6 @@ mod tests {
     fn v0_indexpart_is_parsed() {
         let example = r#"{
             "timeline_layers":["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"],
-            "missing_layers":["LAYER_FILE_NAME::test/not_a_real_layer_but_adding_coverage"],
             "disk_consistent_lsn":"0/16960E8",
             "metadata_bytes":[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
         }"#;
@@ -259,7 +257,6 @@ mod tests {
         let expected = IndexPart {
             version: 0,
             timeline_layers: HashSet::from(["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap()]),
-            missing_layers: None, // disabled fields should not carry unused values further
             layer_metadata: HashMap::default(),
             disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
             metadata_bytes: [113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
@@ -275,7 +272,6 @@ mod tests {
         let example = r#"{
             "version":1,
             "timeline_layers":["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"],
-            "missing_layers":["LAYER_FILE_NAME::test/not_a_real_layer_but_adding_coverage"],
             "layer_metadata":{
                 "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 },
                 "LAYER_FILE_NAME::test/not_a_real_layer_but_adding_coverage": { "file_size": 9007199254741001 }
@@ -288,7 +284,6 @@ mod tests {
             // note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
             version: 1,
             timeline_layers: HashSet::from(["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap()]),
-            missing_layers: None,
             layer_metadata: HashMap::from([
                 ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
                     file_size: Some(25600000),
@@ -314,6 +309,7 @@ mod tests {
         let example = r#"{
             "version":1,
             "timeline_layers":["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"],
+            "missing_layers":["This shouldn't fail deserialization"],
             "layer_metadata":{
                 "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 },
                 "LAYER_FILE_NAME::test/not_a_real_layer_but_adding_coverage": { "file_size": 9007199254741001 }
@@ -338,7 +334,6 @@ mod tests {
             ]),
             disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
             metadata_bytes: [112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
-            missing_layers: None,
         };
 
         let part = serde_json::from_str::<IndexPartUnclean>(example).unwrap();
diff --git a/pageserver/src/storage_sync2/upload.rs b/pageserver/src/tenant/remote_timeline_client/upload.rs
similarity index 96%
rename from pageserver/src/storage_sync2/upload.rs
rename to pageserver/src/tenant/remote_timeline_client/upload.rs
index 57a524a22d..5082fa1634 100644
--- a/pageserver/src/storage_sync2/upload.rs
+++ b/pageserver/src/tenant/remote_timeline_client/upload.rs
@@ -5,12 +5,12 @@ use fail::fail_point;
 use std::path::Path;
 use tokio::fs;
 
-use super::index::IndexPart;
-use crate::config::PageServerConf;
-use crate::storage_sync::LayerFileMetadata;
+use crate::{config::PageServerConf, tenant::remote_timeline_client::index::IndexPart};
 use remote_storage::GenericRemoteStorage;
 use utils::id::{TenantId, TimelineId};
 
+use super::index::LayerFileMetadata;
+
 /// Serializes and uploads the given index part data to the remote storage.
 pub(super) async fn upload_index_part<'a>(
     conf: &'static PageServerConf,
diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs
index 24d9b2a10e..aa11985cbe 100644
--- a/pageserver/src/tenant/size.rs
+++ b/pageserver/src/tenant/size.rs
@@ -3,8 +3,11 @@ use std::collections::{HashMap, HashSet};
 use std::sync::Arc;
 
 use anyhow::Context;
+use tokio::sync::oneshot::error::RecvError;
 use tokio::sync::Semaphore;
 
+use crate::pgdatadir_mapping::CalculateLogicalSizeError;
+
 use super::Tenant;
 use utils::id::TimelineId;
 use utils::lsn::Lsn;
@@ -67,6 +70,7 @@ pub(super) async fn gather_inputs(
 
     let timelines = tenant
         .refresh_gc_info()
+        .await
         .context("Failed to refresh gc_info before gathering inputs")?;
 
     if timelines.is_empty() {
@@ -93,8 +97,6 @@ pub(super) async fn gather_inputs(
     // used to determine the `retention_period` for the size model
     let mut max_cutoff_distance = None;
 
-    // this will probably conflict with on-demand downloaded layers, or at least force them all
-    // to be downloaded
     for timeline in timelines {
         let last_record_lsn = timeline.get_last_record_lsn();
 
@@ -212,11 +214,30 @@ pub(super) async fn gather_inputs(
     let mut have_any_error = false;
 
     while let Some(res) = joinset.join_next().await {
-        // each of these come with Result<Result<_, JoinError>, JoinError>
+        // each of these come with Result<anyhow::Result<_>, JoinError>
         // because of spawn + spawn_blocking
-        let res = res.and_then(|inner| inner);
         match res {
-            Ok(TimelineAtLsnSizeResult(timeline, lsn, Ok(size))) => {
+            Err(join_error) if join_error.is_cancelled() => {
+                unreachable!("we are not cancelling any of the futures, nor should be");
+            }
+            Err(join_error) => {
+                // cannot really do anything, as this panic is likely a bug
+                error!("task that calls spawn_ondemand_logical_size_calculation panicked: {join_error:#}");
+                have_any_error = true;
+            }
+            Ok(Err(recv_result_error)) => {
+                // cannot really do anything, as this panic is likely a bug
+                error!("failed to receive logical size query result: {recv_result_error:#}");
+                have_any_error = true;
+            }
+            Ok(Ok(TimelineAtLsnSizeResult(timeline, lsn, Err(error)))) => {
+                warn!(
+                    timeline_id=%timeline.timeline_id,
+                    "failed to calculate logical size at {lsn}: {error:#}"
+                );
+                have_any_error = true;
+            }
+            Ok(Ok(TimelineAtLsnSizeResult(timeline, lsn, Ok(size)))) => {
                 debug!(timeline_id=%timeline.timeline_id, %lsn, size, "size calculated");
 
                 logical_size_cache.insert((timeline.timeline_id, lsn), size);
@@ -228,21 +249,6 @@ pub(super) async fn gather_inputs(
                     command: Command::Update(size),
                 });
             }
-            Ok(TimelineAtLsnSizeResult(timeline, lsn, Err(error))) => {
-                warn!(
-                    timeline_id=%timeline.timeline_id,
-                    "failed to calculate logical size at {lsn}: {error:#}"
-                );
-                have_any_error = true;
-            }
-            Err(join_error) if join_error.is_cancelled() => {
-                unreachable!("we are not cancelling any of the futures, nor should be");
-            }
-            Err(join_error) => {
-                // cannot really do anything, as this panic is likely a bug
-                error!("logical size query panicked: {join_error:#}");
-                have_any_error = true;
-            }
         }
     }
 
@@ -351,7 +357,7 @@ enum LsnKind {
 struct TimelineAtLsnSizeResult(
     Arc<crate::tenant::Timeline>,
     utils::lsn::Lsn,
-    anyhow::Result<u64>,
+    Result<u64, CalculateLogicalSizeError>,
 );
 
 #[instrument(skip_all, fields(timeline_id=%timeline.timeline_id, lsn=%lsn))]
@@ -359,17 +365,15 @@ async fn calculate_logical_size(
     limit: Arc<tokio::sync::Semaphore>,
     timeline: Arc<crate::tenant::Timeline>,
     lsn: utils::lsn::Lsn,
-) -> Result<TimelineAtLsnSizeResult, tokio::task::JoinError> {
-    let permit = tokio::sync::Semaphore::acquire_owned(limit)
+) -> Result<TimelineAtLsnSizeResult, RecvError> {
+    let _permit = tokio::sync::Semaphore::acquire_owned(limit)
         .await
         .expect("global semaphore should not had been closed");
 
-    tokio::task::spawn_blocking(move || {
-        let _permit = permit;
-        let size_res = timeline.calculate_logical_size(lsn);
-        TimelineAtLsnSizeResult(timeline, lsn, size_res)
-    })
-    .await
+    let size_res = timeline
+        .spawn_ondemand_logical_size_calculation(lsn)
+        .await?;
+    Ok(TimelineAtLsnSizeResult(timeline, lsn, size_res))
 }
 
 #[test]
diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs
index 3ad62587d3..d87a248bdf 100644
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -1,6 +1,10 @@
-//!
 //! Common traits and structs for layers
-//!
+
+mod delta_layer;
+mod filename;
+mod image_layer;
+mod inmemory_layer;
+mod remote_layer;
 
 use crate::repository::{Key, Value};
 use crate::walrecord::NeonWalRecord;
@@ -8,13 +12,19 @@ use anyhow::Result;
 use bytes::Bytes;
 use std::ops::Range;
 use std::path::PathBuf;
+use std::sync::Arc;
 
 use utils::{
     id::{TenantId, TimelineId},
     lsn::Lsn,
 };
 
-use super::filename::LayerFileName;
+pub use delta_layer::{DeltaLayer, DeltaLayerWriter};
+pub use filename::{DeltaFileName, ImageFileName, LayerFileName, PathOrConf};
+pub use image_layer::{ImageLayer, ImageLayerWriter};
+pub use inmemory_layer::InMemoryLayer;
+pub use remote_layer::RemoteLayer;
+
 pub fn range_overlaps<T>(a: &Range<T>, b: &Range<T>) -> bool
 where
     T: PartialOrd<T>,
@@ -116,6 +126,12 @@ pub trait Layer: Send + Sync {
     fn dump(&self, verbose: bool) -> Result<()>;
 }
 
+/// Returned by [`Layer::iter`]
+pub type LayerIter<'i> = Box<dyn Iterator<Item = Result<(Key, Lsn, Value)>> + 'i>;
+
+/// Returned by [`Layer::key_iter`]
+pub type LayerKeyIter<'i> = Box<dyn Iterator<Item = (Key, Lsn, u64)> + 'i>;
+
 /// A Layer contains all data in a "rectangle" consisting of a range of keys and
 /// range of LSNs.
 ///
@@ -141,17 +157,42 @@ pub trait PersistentLayer: Layer {
     fn filename(&self) -> LayerFileName;
 
     // Path to the layer file in the local filesystem.
-    fn local_path(&self) -> PathBuf;
+    // `None` for `RemoteLayer`.
+    fn local_path(&self) -> Option<PathBuf>;
 
     /// Iterate through all keys and values stored in the layer
-    fn iter(&self) -> Box<dyn Iterator<Item = Result<(Key, Lsn, Value)>> + '_>;
+    fn iter(&self) -> Result<LayerIter<'_>>;
 
     /// Iterate through all keys stored in the layer. Returns key, lsn and value size
     /// It is used only for compaction and so is currently implemented only for DeltaLayer
-    fn key_iter(&self) -> Box<dyn Iterator<Item = (Key, Lsn, u64)> + '_> {
+    fn key_iter(&self) -> Result<LayerKeyIter<'_>> {
         panic!("Not implemented")
     }
 
     /// Permanently remove this layer from disk.
     fn delete(&self) -> Result<()>;
+
+    fn downcast_remote_layer(self: Arc<Self>) -> Option<std::sync::Arc<RemoteLayer>> {
+        None
+    }
+
+    fn is_remote_layer(&self) -> bool {
+        false
+    }
+
+    /// Returns None if the layer file size is not known.
+    ///
+    /// Should not change over the lifetime of the layer object because
+    /// current_physical_size is computed as the som of this value.
+    fn file_size(&self) -> Option<u64>;
+}
+
+pub fn downcast_remote_layer(
+    layer: &Arc<dyn PersistentLayer>,
+) -> Option<std::sync::Arc<RemoteLayer>> {
+    if layer.is_remote_layer() {
+        Arc::clone(layer).downcast_remote_layer()
+    } else {
+        None
+    }
 }
diff --git a/pageserver/src/tenant/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs
similarity index 96%
rename from pageserver/src/tenant/delta_layer.rs
rename to pageserver/src/tenant/storage_layer/delta_layer.rs
index d8aaa3e8b9..302ba2dc78 100644
--- a/pageserver/src/tenant/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -29,7 +29,6 @@ use crate::repository::{Key, Value, KEY_SIZE};
 use crate::tenant::blob_io::{BlobCursor, BlobWriter, WriteBlobWriter};
 use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockReader, FileBlockReader};
 use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
-use crate::tenant::filename::{DeltaFileName, PathOrConf};
 use crate::tenant::storage_layer::{
     PersistentLayer, ValueReconstructResult, ValueReconstructState,
 };
@@ -39,7 +38,7 @@ use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
 use anyhow::{bail, ensure, Context, Result};
 use rand::{distributions::Alphanumeric, Rng};
 use serde::{Deserialize, Serialize};
-use std::fs;
+use std::fs::{self, File};
 use std::io::{BufWriter, Write};
 use std::io::{Seek, SeekFrom};
 use std::ops::Range;
@@ -54,8 +53,7 @@ use utils::{
     lsn::Lsn,
 };
 
-use super::filename::LayerFileName;
-use super::storage_layer::Layer;
+use super::{DeltaFileName, Layer, LayerFileName, LayerIter, LayerKeyIter, PathOrConf};
 
 ///
 /// Header stored in the beginning of the file
@@ -183,6 +181,8 @@ pub struct DeltaLayer {
     pub key_range: Range<Key>,
     pub lsn_range: Range<Lsn>,
 
+    pub file_size: u64,
+
     inner: RwLock<DeltaLayerInner>,
 }
 
@@ -387,32 +387,23 @@ impl PersistentLayer for DeltaLayer {
         self.layer_name().into()
     }
 
-    fn local_path(&self) -> PathBuf {
-        self.path()
+    fn local_path(&self) -> Option<PathBuf> {
+        Some(self.path())
     }
 
-    fn iter<'a>(&'a self) -> Box<dyn Iterator<Item = anyhow::Result<(Key, Lsn, Value)>> + 'a> {
-        let inner = match self.load() {
-            Ok(inner) => inner,
-            Err(e) => panic!("Failed to load a delta layer: {e:?}"),
-        };
-
-        match DeltaValueIter::new(inner) {
+    fn iter(&self) -> Result<LayerIter<'_>> {
+        let inner = self.load().context("load delta layer")?;
+        Ok(match DeltaValueIter::new(inner) {
             Ok(iter) => Box::new(iter),
             Err(err) => Box::new(std::iter::once(Err(err))),
-        }
+        })
     }
 
-    fn key_iter<'a>(&'a self) -> Box<dyn Iterator<Item = (Key, Lsn, u64)> + 'a> {
-        let inner = match self.load() {
-            Ok(inner) => inner,
-            Err(e) => panic!("Failed to load a delta layer: {e:?}"),
-        };
-
-        match DeltaKeyIter::new(inner) {
-            Ok(iter) => Box::new(iter),
-            Err(e) => panic!("Layer index is corrupted: {e:?}"),
-        }
+    fn key_iter(&self) -> Result<LayerKeyIter<'_>> {
+        let inner = self.load()?;
+        Ok(Box::new(
+            DeltaKeyIter::new(inner).context("Layer index is corrupted")?,
+        ))
     }
 
     fn delete(&self) -> Result<()> {
@@ -420,6 +411,10 @@ impl PersistentLayer for DeltaLayer {
         fs::remove_file(self.path())?;
         Ok(())
     }
+
+    fn file_size(&self) -> Option<u64> {
+        Some(self.file_size)
+    }
 }
 
 impl DeltaLayer {
@@ -544,6 +539,7 @@ impl DeltaLayer {
         timeline_id: TimelineId,
         tenant_id: TenantId,
         filename: &DeltaFileName,
+        file_size: u64,
     ) -> DeltaLayer {
         DeltaLayer {
             path_or_conf: PathOrConf::Conf(conf),
@@ -551,6 +547,7 @@ impl DeltaLayer {
             tenant_id,
             key_range: filename.key_range.clone(),
             lsn_range: filename.lsn_range.clone(),
+            file_size,
             inner: RwLock::new(DeltaLayerInner {
                 loaded: false,
                 file: None,
@@ -563,21 +560,23 @@ impl DeltaLayer {
     /// Create a DeltaLayer struct representing an existing file on disk.
     ///
     /// This variant is only used for debugging purposes, by the 'pageserver_binutils' binary.
-    pub fn new_for_path<F>(path: &Path, file: F) -> Result<Self>
-    where
-        F: FileExt,
-    {
+    pub fn new_for_path(path: &Path, file: File) -> Result<Self> {
         let mut summary_buf = Vec::new();
         summary_buf.resize(PAGE_SZ, 0);
         file.read_exact_at(&mut summary_buf, 0)?;
         let summary = Summary::des_prefix(&summary_buf)?;
 
+        let metadata = file
+            .metadata()
+            .context("get file metadata to determine size")?;
+
         Ok(DeltaLayer {
             path_or_conf: PathOrConf::Path(path.to_path_buf()),
             timeline_id: summary.timeline_id,
             tenant_id: summary.tenant_id,
             key_range: summary.key_range,
             lsn_range: summary.lsn_range,
+            file_size: metadata.len(),
             inner: RwLock::new(DeltaLayerInner {
                 loaded: false,
                 file: None,
@@ -734,6 +733,10 @@ impl DeltaLayerWriterInner {
         file.seek(SeekFrom::Start(0))?;
         Summary::ser_into(&summary, &mut file)?;
 
+        let metadata = file
+            .metadata()
+            .context("get file metadata to determine size")?;
+
         // Note: Because we opened the file in write-only mode, we cannot
         // reuse the same VirtualFile for reading later. That's why we don't
         // set inner.file here. The first read will have to re-open it.
@@ -743,6 +746,7 @@ impl DeltaLayerWriterInner {
             timeline_id: self.timeline_id,
             key_range: self.key_start..key_end,
             lsn_range: self.lsn_range.clone(),
+            file_size: metadata.len(),
             inner: RwLock::new(DeltaLayerInner {
                 loaded: false,
                 file: None,
diff --git a/pageserver/src/tenant/filename.rs b/pageserver/src/tenant/storage_layer/filename.rs
similarity index 100%
rename from pageserver/src/tenant/filename.rs
rename to pageserver/src/tenant/storage_layer/filename.rs
diff --git a/pageserver/src/tenant/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs
similarity index 95%
rename from pageserver/src/tenant/image_layer.rs
rename to pageserver/src/tenant/storage_layer/image_layer.rs
index e08e938a4f..9a26fce73b 100644
--- a/pageserver/src/tenant/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -21,11 +21,10 @@
 //! actual page images are stored in the "values" part.
 use crate::config::PageServerConf;
 use crate::page_cache::PAGE_SZ;
-use crate::repository::{Key, Value, KEY_SIZE};
+use crate::repository::{Key, KEY_SIZE};
 use crate::tenant::blob_io::{BlobCursor, BlobWriter, WriteBlobWriter};
 use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader};
 use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
-use crate::tenant::filename::{ImageFileName, PathOrConf};
 use crate::tenant::storage_layer::{
     PersistentLayer, ValueReconstructResult, ValueReconstructState,
 };
@@ -36,10 +35,11 @@ use bytes::Bytes;
 use hex;
 use rand::{distributions::Alphanumeric, Rng};
 use serde::{Deserialize, Serialize};
-use std::fs;
+use std::fs::{self, File};
 use std::io::Write;
 use std::io::{Seek, SeekFrom};
 use std::ops::Range;
+use std::os::unix::prelude::FileExt;
 use std::path::{Path, PathBuf};
 use std::sync::{RwLock, RwLockReadGuard};
 use tracing::*;
@@ -50,8 +50,8 @@ use utils::{
     lsn::Lsn,
 };
 
-use super::filename::LayerFileName;
-use super::storage_layer::Layer;
+use super::filename::{ImageFileName, LayerFileName, PathOrConf};
+use super::{Layer, LayerIter};
 
 ///
 /// Header stored in the beginning of the file
@@ -105,6 +105,7 @@ pub struct ImageLayer {
     pub tenant_id: TenantId,
     pub timeline_id: TimelineId,
     pub key_range: Range<Key>,
+    pub file_size: u64,
 
     // This entry contains an image of all pages as of this LSN
     pub lsn: Lsn,
@@ -208,8 +209,8 @@ impl PersistentLayer for ImageLayer {
         self.layer_name().into()
     }
 
-    fn local_path(&self) -> PathBuf {
-        self.path()
+    fn local_path(&self) -> Option<PathBuf> {
+        Some(self.path())
     }
 
     fn get_tenant_id(&self) -> TenantId {
@@ -219,7 +220,7 @@ impl PersistentLayer for ImageLayer {
     fn get_timeline_id(&self) -> TimelineId {
         self.timeline_id
     }
-    fn iter(&self) -> Box<dyn Iterator<Item = Result<(Key, Lsn, Value)>>> {
+    fn iter(&self) -> Result<LayerIter<'_>> {
         unimplemented!();
     }
 
@@ -228,6 +229,10 @@ impl PersistentLayer for ImageLayer {
         fs::remove_file(self.path())?;
         Ok(())
     }
+
+    fn file_size(&self) -> Option<u64> {
+        Some(self.file_size)
+    }
 }
 
 impl ImageLayer {
@@ -344,6 +349,7 @@ impl ImageLayer {
         timeline_id: TimelineId,
         tenant_id: TenantId,
         filename: &ImageFileName,
+        file_size: u64,
     ) -> ImageLayer {
         ImageLayer {
             path_or_conf: PathOrConf::Conf(conf),
@@ -351,6 +357,7 @@ impl ImageLayer {
             tenant_id,
             key_range: filename.key_range.clone(),
             lsn: filename.lsn,
+            file_size,
             inner: RwLock::new(ImageLayerInner {
                 loaded: false,
                 file: None,
@@ -363,21 +370,21 @@ impl ImageLayer {
     /// Create an ImageLayer struct representing an existing file on disk.
     ///
     /// This variant is only used for debugging purposes, by the 'pageserver_binutils' binary.
-    pub fn new_for_path<F>(path: &Path, file: F) -> Result<ImageLayer>
-    where
-        F: std::os::unix::prelude::FileExt,
-    {
+    pub fn new_for_path(path: &Path, file: File) -> Result<ImageLayer> {
         let mut summary_buf = Vec::new();
         summary_buf.resize(PAGE_SZ, 0);
         file.read_exact_at(&mut summary_buf, 0)?;
         let summary = Summary::des_prefix(&summary_buf)?;
-
+        let metadata = file
+            .metadata()
+            .context("get file metadata to determine size")?;
         Ok(ImageLayer {
             path_or_conf: PathOrConf::Path(path.to_path_buf()),
             timeline_id: summary.timeline_id,
             tenant_id: summary.tenant_id,
             key_range: summary.key_range,
             lsn: summary.lsn,
+            file_size: metadata.len(),
             inner: RwLock::new(ImageLayerInner {
                 file: None,
                 loaded: false,
@@ -523,6 +530,10 @@ impl ImageLayerWriterInner {
         file.seek(SeekFrom::Start(0))?;
         Summary::ser_into(&summary, &mut file)?;
 
+        let metadata = file
+            .metadata()
+            .context("get metadata to determine file size")?;
+
         // Note: Because we open the file in write-only mode, we cannot
         // reuse the same VirtualFile for reading later. That's why we don't
         // set inner.file here. The first read will have to re-open it.
@@ -532,6 +543,7 @@ impl ImageLayerWriterInner {
             tenant_id: self.tenant_id,
             key_range: self.key_range.clone(),
             lsn: self.lsn,
+            file_size: metadata.len(),
             inner: RwLock::new(ImageLayerInner {
                 loaded: false,
                 file: None,
@@ -556,7 +568,7 @@ impl ImageLayerWriterInner {
                 lsn: self.lsn,
             },
         );
-        std::fs::rename(self.path, &final_path)?;
+        std::fs::rename(self.path, final_path)?;
 
         trace!("created image layer {}", layer.path().display());
 
diff --git a/pageserver/src/tenant/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
similarity index 99%
rename from pageserver/src/tenant/inmemory_layer.rs
rename to pageserver/src/tenant/storage_layer/inmemory_layer.rs
index 8f64281cb1..93356a9d8c 100644
--- a/pageserver/src/tenant/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -8,7 +8,6 @@ use crate::config::PageServerConf;
 use crate::repository::{Key, Value};
 use crate::tenant::blob_io::{BlobCursor, BlobWriter};
 use crate::tenant::block_io::BlockReader;
-use crate::tenant::delta_layer::{DeltaLayer, DeltaLayerWriter};
 use crate::tenant::ephemeral_file::EphemeralFile;
 use crate::tenant::storage_layer::{ValueReconstructResult, ValueReconstructState};
 use crate::walrecord;
@@ -28,7 +27,7 @@ use std::fmt::Write as _;
 use std::ops::Range;
 use std::sync::RwLock;
 
-use super::storage_layer::Layer;
+use super::{DeltaLayer, DeltaLayerWriter, Layer};
 
 thread_local! {
     /// A buffer for serializing object during [`InMemoryLayer::put_value`].
@@ -97,6 +96,7 @@ impl Layer for InMemoryLayer {
         };
         self.start_lsn..end_lsn
     }
+
     fn is_incremental(&self) -> bool {
         // in-memory layer is always considered incremental.
         true
diff --git a/pageserver/src/tenant/storage_layer/remote_layer.rs b/pageserver/src/tenant/storage_layer/remote_layer.rs
new file mode 100644
index 0000000000..33474bb4a2
--- /dev/null
+++ b/pageserver/src/tenant/storage_layer/remote_layer.rs
@@ -0,0 +1,210 @@
+//! A RemoteLayer is an in-memory placeholder for a layer file that exists
+//! in remote storage.
+//!
+use crate::config::PageServerConf;
+use crate::repository::Key;
+use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
+use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
+use anyhow::{bail, Result};
+use std::ops::Range;
+use std::path::PathBuf;
+use std::sync::Arc;
+
+use utils::{
+    id::{TenantId, TimelineId},
+    lsn::Lsn,
+};
+
+use super::filename::{DeltaFileName, ImageFileName, LayerFileName};
+use super::image_layer::ImageLayer;
+use super::{DeltaLayer, LayerIter, LayerKeyIter, PersistentLayer};
+
+#[derive(Debug)]
+pub struct RemoteLayer {
+    tenantid: TenantId,
+    timelineid: TimelineId,
+    key_range: Range<Key>,
+    lsn_range: Range<Lsn>,
+
+    pub file_name: LayerFileName,
+
+    pub layer_metadata: LayerFileMetadata,
+
+    is_delta: bool,
+
+    is_incremental: bool,
+
+    pub(crate) ongoing_download: Arc<tokio::sync::Semaphore>,
+}
+
+impl Layer for RemoteLayer {
+    fn get_key_range(&self) -> Range<Key> {
+        self.key_range.clone()
+    }
+
+    fn get_lsn_range(&self) -> Range<Lsn> {
+        self.lsn_range.clone()
+    }
+
+    fn get_value_reconstruct_data(
+        &self,
+        _key: Key,
+        _lsn_range: Range<Lsn>,
+        _reconstruct_state: &mut ValueReconstructState,
+    ) -> Result<ValueReconstructResult> {
+        bail!(
+            "layer {} needs to be downloaded",
+            self.filename().file_name()
+        );
+    }
+
+    fn is_incremental(&self) -> bool {
+        self.is_incremental
+    }
+
+    /// debugging function to print out the contents of the layer
+    fn dump(&self, _verbose: bool) -> Result<()> {
+        println!(
+            "----- remote layer for ten {} tli {} keys {}-{} lsn {}-{} ----",
+            self.tenantid,
+            self.timelineid,
+            self.key_range.start,
+            self.key_range.end,
+            self.lsn_range.start,
+            self.lsn_range.end
+        );
+
+        Ok(())
+    }
+
+    fn short_id(&self) -> String {
+        self.filename().file_name()
+    }
+}
+
+impl PersistentLayer for RemoteLayer {
+    fn get_tenant_id(&self) -> TenantId {
+        self.tenantid
+    }
+
+    fn get_timeline_id(&self) -> TimelineId {
+        self.timelineid
+    }
+
+    fn filename(&self) -> LayerFileName {
+        if self.is_delta {
+            DeltaFileName {
+                key_range: self.key_range.clone(),
+                lsn_range: self.lsn_range.clone(),
+            }
+            .into()
+        } else {
+            ImageFileName {
+                key_range: self.key_range.clone(),
+                lsn: self.lsn_range.start,
+            }
+            .into()
+        }
+    }
+
+    fn local_path(&self) -> Option<PathBuf> {
+        None
+    }
+
+    fn iter(&self) -> Result<LayerIter<'_>> {
+        bail!("cannot iterate a remote layer");
+    }
+
+    fn key_iter(&self) -> Result<LayerKeyIter<'_>> {
+        bail!("cannot iterate a remote layer");
+    }
+
+    fn delete(&self) -> Result<()> {
+        Ok(())
+    }
+
+    fn downcast_remote_layer<'a>(self: Arc<Self>) -> Option<std::sync::Arc<RemoteLayer>> {
+        Some(self)
+    }
+
+    fn is_remote_layer(&self) -> bool {
+        true
+    }
+
+    fn file_size(&self) -> Option<u64> {
+        self.layer_metadata.file_size()
+    }
+}
+
+impl RemoteLayer {
+    pub fn new_img(
+        tenantid: TenantId,
+        timelineid: TimelineId,
+        fname: &ImageFileName,
+        layer_metadata: &LayerFileMetadata,
+    ) -> RemoteLayer {
+        RemoteLayer {
+            tenantid,
+            timelineid,
+            key_range: fname.key_range.clone(),
+            lsn_range: fname.lsn..(fname.lsn + 1),
+            is_delta: false,
+            is_incremental: false,
+            file_name: fname.to_owned().into(),
+            layer_metadata: layer_metadata.clone(),
+            ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)),
+        }
+    }
+
+    pub fn new_delta(
+        tenantid: TenantId,
+        timelineid: TimelineId,
+        fname: &DeltaFileName,
+        layer_metadata: &LayerFileMetadata,
+    ) -> RemoteLayer {
+        RemoteLayer {
+            tenantid,
+            timelineid,
+            key_range: fname.key_range.clone(),
+            lsn_range: fname.lsn_range.clone(),
+            is_delta: true,
+            is_incremental: true,
+            file_name: fname.to_owned().into(),
+            layer_metadata: layer_metadata.clone(),
+            ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)),
+        }
+    }
+
+    /// Create a Layer struct representing this layer, after it has been downloaded.
+    pub fn create_downloaded_layer(
+        &self,
+        conf: &'static PageServerConf,
+        file_size: u64,
+    ) -> Arc<dyn PersistentLayer> {
+        if self.is_delta {
+            let fname = DeltaFileName {
+                key_range: self.key_range.clone(),
+                lsn_range: self.lsn_range.clone(),
+            };
+            Arc::new(DeltaLayer::new(
+                conf,
+                self.timelineid,
+                self.tenantid,
+                &fname,
+                file_size,
+            ))
+        } else {
+            let fname = ImageFileName {
+                key_range: self.key_range.clone(),
+                lsn: self.lsn_range.start,
+            };
+            Arc::new(ImageLayer::new(
+                conf,
+                self.timelineid,
+                self.tenantid,
+                &fname,
+                file_size,
+            ))
+        }
+    }
+}
diff --git a/pageserver/src/tenant_tasks.rs b/pageserver/src/tenant/tasks.rs
similarity index 97%
rename from pageserver/src/tenant_tasks.rs
rename to pageserver/src/tenant/tasks.rs
index d3aec933c2..8397d26e5d 100644
--- a/pageserver/src/tenant_tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -8,8 +8,8 @@ use std::time::Duration;
 use crate::metrics::TENANT_TASK_EVENTS;
 use crate::task_mgr;
 use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME};
+use crate::tenant::mgr;
 use crate::tenant::{Tenant, TenantState};
-use crate::tenant_mgr;
 use tracing::*;
 use utils::id::TenantId;
 
@@ -127,7 +127,7 @@ async fn gc_loop(tenant_id: TenantId) {
             } else {
                 // Run gc
                 if gc_horizon > 0 {
-                    if let Err(e) = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), false).await
+                    if let Err(e) = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval()).await
                     {
                         sleep_duration = wait_duration;
                         error!("Gc failed, retrying in {:?}: {e:?}", sleep_duration);
@@ -155,7 +155,7 @@ async fn wait_for_active_tenant(
     wait: Duration,
 ) -> ControlFlow<(), Arc<Tenant>> {
     let tenant = loop {
-        match tenant_mgr::get_tenant(tenant_id, false).await {
+        match mgr::get_tenant(tenant_id, false).await {
             Ok(tenant) => break tenant,
             Err(e) => {
                 error!("Failed to get a tenant {tenant_id}: {e:#}");
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index cd045d1081..0d8a5fc800 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -3,11 +3,15 @@
 use anyhow::{anyhow, bail, ensure, Context};
 use bytes::Bytes;
 use fail::fail_point;
+use futures::stream::FuturesUnordered;
+use futures::StreamExt;
 use itertools::Itertools;
 use once_cell::sync::OnceCell;
-use pageserver_api::models::TimelineState;
-use tokio::sync::watch;
-use tokio::task::spawn_blocking;
+use pageserver_api::models::{
+    DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskState, TimelineState,
+};
+use tokio::sync::{oneshot, watch, Semaphore, TryAcquireError};
+use tokio_util::sync::CancellationToken;
 use tracing::*;
 
 use std::cmp::{max, min, Ordering};
@@ -15,18 +19,17 @@ use std::collections::HashMap;
 use std::fs;
 use std::ops::{Deref, Range};
 use std::path::{Path, PathBuf};
-use std::sync::atomic::{AtomicBool, AtomicI64, Ordering as AtomicOrdering};
-use std::sync::{Arc, Mutex, MutexGuard, RwLock};
+use std::sync::atomic::{AtomicI64, Ordering as AtomicOrdering};
+use std::sync::{Arc, Mutex, MutexGuard, RwLock, Weak};
 use std::time::{Duration, Instant, SystemTime};
 
-use crate::storage_sync::index::IndexPart;
-use crate::storage_sync::RemoteTimelineClient;
+use crate::tenant::remote_timeline_client::{self, index::LayerFileMetadata};
+use crate::tenant::storage_layer::{
+    DeltaFileName, DeltaLayerWriter, ImageFileName, ImageLayerWriter, InMemoryLayer, LayerFileName,
+    RemoteLayer,
+};
 use crate::tenant::{
-    delta_layer::{DeltaLayer, DeltaLayerWriter},
     ephemeral_file::is_ephemeral_file,
-    filename::{DeltaFileName, ImageFileName},
-    image_layer::{ImageLayer, ImageLayerWriter},
-    inmemory_layer::InMemoryLayer,
     layer_map::{LayerMap, SearchResult},
     metadata::{save_metadata, TimelineMetadata},
     par_fsync,
@@ -36,10 +39,10 @@ use crate::tenant::{
 use crate::config::PageServerConf;
 use crate::keyspace::{KeyPartitioning, KeySpace};
 use crate::metrics::TimelineMetrics;
-use crate::pgdatadir_mapping::BlockNumber;
 use crate::pgdatadir_mapping::LsnForTimestamp;
 use crate::pgdatadir_mapping::{is_rel_fsm_block_key, is_rel_vm_block_key};
-use crate::tenant_config::TenantConfOpt;
+use crate::pgdatadir_mapping::{BlockNumber, CalculateLogicalSizeError};
+use crate::tenant::config::TenantConfOpt;
 use pageserver_api::reltag::RelTag;
 
 use postgres_connection::PgConnectionConfig;
@@ -51,19 +54,19 @@ use utils::{
     simple_rcu::{Rcu, RcuReadGuard},
 };
 
+use crate::page_cache;
 use crate::repository::GcResult;
 use crate::repository::{Key, Value};
 use crate::task_mgr::TaskKind;
 use crate::walreceiver::{is_broker_client_initialized, spawn_connection_manager_task};
 use crate::walredo::WalRedoManager;
-use crate::CheckpointConfig;
 use crate::METADATA_FILE_NAME;
 use crate::ZERO_PAGE;
 use crate::{is_temporary, task_mgr};
-use crate::{page_cache, storage_sync::index::LayerFileMetadata};
 
-use super::filename::LayerFileName;
-use super::storage_layer::Layer;
+use super::remote_timeline_client::index::IndexPart;
+use super::remote_timeline_client::RemoteTimelineClient;
+use super::storage_layer::{DeltaLayer, ImageLayer, Layer};
 
 #[derive(Debug, PartialEq, Eq, Clone, Copy)]
 enum FlushLoopState {
@@ -76,6 +79,8 @@ pub struct Timeline {
     conf: &'static PageServerConf,
     tenant_conf: Arc<RwLock<TenantConfOpt>>,
 
+    myself: Weak<Self>,
+
     pub tenant_id: TenantId,
     pub timeline_id: TimelineId,
 
@@ -91,10 +96,7 @@ pub struct Timeline {
     walredo_mgr: Arc<dyn WalRedoManager + Sync + Send>,
 
     /// Remote storage client.
-    ///
-    /// If Some, use it to upload all newly created layers to the remote storage,
-    /// and keep remote metadata file in sync. In the future, also use it to download
-    /// layer files on-demand.
+    /// See [`storage_sync`] module comment for details.
     pub remote_client: Option<Arc<RemoteTimelineClient>>,
 
     // What page versions do we hold in the repository? If we get a
@@ -158,7 +160,7 @@ pub struct Timeline {
 
     // List of child timelines and their branch points. This is needed to avoid
     // garbage collecting data that is still needed by the child timelines.
-    pub gc_info: RwLock<GcInfo>,
+    pub gc_info: std::sync::RwLock<GcInfo>,
 
     // It may change across major versions so for simplicity
     // keep it after running initdb for a timeline.
@@ -176,7 +178,6 @@ pub struct Timeline {
 
     /// Current logical size of the "datadir", at the last LSN.
     current_logical_size: LogicalSize,
-    initial_size_computation_started: AtomicBool,
 
     /// Information about the last processed message by the WAL receiver,
     /// or None if WAL receiver has not received anything for this timeline
@@ -186,6 +187,8 @@ pub struct Timeline {
     /// Relation size cache
     pub rel_size_cache: RwLock<HashMap<RelTag, (Lsn, BlockNumber)>>,
 
+    download_all_remote_layers_task_info: RwLock<Option<DownloadRemoteLayersTaskInfo>>,
+
     state: watch::Sender<TimelineState>,
 }
 
@@ -202,6 +205,8 @@ struct LogicalSize {
     ///
     /// NOTE: initial size is not a constant and will change between restarts.
     initial_logical_size: OnceCell<u64>,
+    /// Semaphore to track ongoing calculation of `initial_logical_size`.
+    initial_size_computation: Arc<tokio::sync::Semaphore>,
     /// Latest Lsn that has its size uncalculated, could be absent for freshly created timelines.
     initial_part_end: Option<Lsn>,
     /// All other size changes after startup, combined together.
@@ -252,6 +257,8 @@ impl LogicalSize {
     fn empty_initial() -> Self {
         Self {
             initial_logical_size: OnceCell::with_value(0),
+            //  initial_logical_size already computed, so, don't admit any calculations
+            initial_size_computation: Arc::new(Semaphore::new(0)),
             initial_part_end: None,
             size_added_after_initial: AtomicI64::new(0),
         }
@@ -260,6 +267,7 @@ impl LogicalSize {
     fn deferred_initial(compute_to: Lsn) -> Self {
         Self {
             initial_logical_size: OnceCell::new(),
+            initial_size_computation: Arc::new(Semaphore::new(1)),
             initial_part_end: Some(compute_to),
             size_added_after_initial: AtomicI64::new(0),
         }
@@ -302,12 +310,68 @@ impl LogicalSize {
     }
 }
 
+/// Returned by [`Timeline::layer_size_sum`]
+pub enum LayerSizeSum {
+    /// The result is accurate.
+    Accurate(u64),
+    // We don't know the layer file size of one or more layers.
+    // They contribute to the sum with a value of 0.
+    // Hence, the sum is a lower bound for the actualy layer file size sum.
+    ApproximateLowerBound(u64),
+}
+
+impl LayerSizeSum {
+    pub fn approximate_is_ok(self) -> u64 {
+        match self {
+            LayerSizeSum::Accurate(v) => v,
+            LayerSizeSum::ApproximateLowerBound(v) => v,
+        }
+    }
+}
+
 pub struct WalReceiverInfo {
     pub wal_source_connconf: PgConnectionConfig,
     pub last_received_msg_lsn: Lsn,
     pub last_received_msg_ts: u128,
 }
 
+/// Like `?`, but for [`PageReconstructResult`].
+/// Use it to bubble up the `NeedsDownload` and `Error` to the caller.
+///
+/// Once `std::ops::Try` is stabilized, we should use it instead of this macro.
+#[macro_export]
+macro_rules! try_no_ondemand_download {
+    ($result:expr) => {{
+        let result = $result;
+        match result {
+            PageReconstructResult::Success(value) => value,
+            PageReconstructResult::NeedsDownload(timeline, layer) => {
+                return PageReconstructResult::NeedsDownload(timeline, layer);
+            }
+            PageReconstructResult::Error(e) => return PageReconstructResult::Error(e),
+        }
+    }};
+}
+
+/// Replacement for `?` in functions that return [`PageReconstructResult`].
+///
+/// Given an `expr: Result<T, E>`, use `try_page_reconstruct_result!(expr)`
+/// instead of `(expr)?`.
+/// If `expr` is `Ok(v)`, the macro evaluates to `v`.
+/// If `expr` is `Err(e)`, the macro returns `PageReconstructResult::Error(e.into())`.
+///
+/// Once `std::ops::Try` is stabilized, we should use it instead of this macro.
+#[macro_export]
+macro_rules! try_page_reconstruct_result {
+    ($result:expr) => {{
+        let result = $result;
+        match result {
+            Ok(v) => v,
+            Err(e) => return PageReconstructResult::from(e),
+        }
+    }};
+}
+
 ///
 /// Information about how much history needs to be retained, needed by
 /// Garbage Collection.
@@ -337,6 +401,77 @@ pub struct GcInfo {
     pub pitr_cutoff: Lsn,
 }
 
+pub enum PageReconstructResult<T> {
+    Success(T),
+    /// The given RemoteLayer needs to be downloaded and replaced in the timeline's layer map
+    /// for the operation to succeed. Use [`Timeline::download_remote_layer`] to do it, then
+    /// retry the operation that returned this error.
+    NeedsDownload(Weak<Timeline>, Weak<RemoteLayer>),
+    Error(PageReconstructError),
+}
+
+/// An error happened in a get() operation.
+#[derive(thiserror::Error)]
+pub enum PageReconstructError {
+    #[error(transparent)]
+    Other(#[from] anyhow::Error), // source and Display delegate to anyhow::Error
+
+    #[error(transparent)]
+    WalRedo(#[from] crate::walredo::WalRedoError),
+}
+
+impl std::fmt::Debug for PageReconstructError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
+        match self {
+            Self::Other(err) => err.fmt(f),
+            Self::WalRedo(err) => err.fmt(f),
+        }
+    }
+}
+
+/// This impl makes it so you can substitute return type
+/// `Result<T, E>` with `PageReconstructError<T>` in functions
+/// and existing `?` will generally continue to work.
+/// The reason why  thanks to
+/// anyhow::Error that `(some error type)ensures that exis
+impl<E, T> From<E> for PageReconstructResult<T>
+where
+    E: Into<PageReconstructError>,
+{
+    fn from(e: E) -> Self {
+        Self::Error(e.into())
+    }
+}
+
+impl<T> PageReconstructResult<T> {
+    /// Treat the need for on-demand download as an error.
+    ///
+    /// **Avoid this function in new code** if you can help it,
+    /// as on-demand download will become the norm in the future,
+    /// especially once we implement layer file eviction.
+    ///
+    /// If you are in an async function, use [`with_ondemand_download`]
+    /// to do the download right here.
+    ///
+    /// If you are in a sync function, change its return type from
+    /// `Result<T, E>` to `PageReconstructResult<T>` and bubble up
+    /// the non-success cases of `PageReconstructResult<T>` to the caller.
+    /// This gives them a chance to do the download and retry.
+    /// Consider using [`try_no_ondemand_download`] for convenience.
+    ///
+    /// For more background, read the comment on [`with_ondemand_download`].
+    pub fn no_ondemand_download(self) -> anyhow::Result<T> {
+        match self {
+            PageReconstructResult::Success(value) => Ok(value),
+            // TODO print more info about the timeline
+            PageReconstructResult::NeedsDownload(_, _) => anyhow::bail!("Layer needs downloading"),
+            PageReconstructResult::Error(e) => {
+                Err(anyhow::Error::new(e).context("Failed to reconstruct the page"))
+            }
+        }
+    }
+}
+
 /// Public interface functions
 impl Timeline {
     /// Get the LSN where this branch was created
@@ -364,8 +499,10 @@ impl Timeline {
     /// the Repository implementation may incorrectly return a value from an ancestor
     /// branch, for example, or waste a lot of cycles chasing the non-existing key.
     ///
-    pub fn get(&self, key: Key, lsn: Lsn) -> anyhow::Result<Bytes> {
-        anyhow::ensure!(lsn.is_valid(), "Invalid LSN");
+    pub fn get(&self, key: Key, lsn: Lsn) -> PageReconstructResult<Bytes> {
+        if !lsn.is_valid() {
+            return PageReconstructResult::from(anyhow!("Invalid LSN"));
+        }
 
         // Check the page cache. We will get back the most recent page with lsn <= `lsn`.
         // The cached image can be returned directly if there is no WAL between the cached image
@@ -375,7 +512,7 @@ impl Timeline {
             Some((cached_lsn, cached_img)) => {
                 match cached_lsn.cmp(&lsn) {
                     Ordering::Less => {} // there might be WAL between cached_lsn and lsn, we need to check
-                    Ordering::Equal => return Ok(cached_img), // exact LSN match, return the image
+                    Ordering::Equal => return PageReconstructResult::Success(cached_img), // exact LSN match, return the image
                     Ordering::Greater => {
                         unreachable!("the returned lsn should never be after the requested lsn")
                     }
@@ -390,13 +527,18 @@ impl Timeline {
             img: cached_page_img,
         };
 
-        self.get_reconstruct_data(key, lsn, &mut reconstruct_state)?;
+        try_no_ondemand_download!(self.get_reconstruct_data(key, lsn, &mut reconstruct_state));
 
         self.metrics
             .reconstruct_time_histo
             .observe_closure_duration(|| self.reconstruct_value(key, lsn, reconstruct_state))
     }
 
+    // Like get(), but if a remote layer file is needed, it is downloaded as part of this call.
+    pub async fn get_download(&self, key: Key, lsn: Lsn) -> anyhow::Result<Bytes> {
+        with_ondemand_download(|| self.get(key, lsn)).await
+    }
+
     /// Get last or prev record separately. Same as get_last_record_rlsn().last/prev.
     pub fn get_last_record_lsn(&self) -> Lsn {
         self.last_record_lsn.load().last
@@ -423,30 +565,27 @@ impl Timeline {
         }
     }
 
-    /// Get the physical size of the timeline at the latest LSN
-    pub fn get_physical_size(&self) -> u64 {
-        self.metrics.current_physical_size_gauge.get()
+    /// The sum of the file size of all historic layers in the layer map.
+    /// This method makes no distinction between local and remote layers.
+    /// Hence, the result **does not represent local filesystem usage**.
+    pub fn layer_size_sum(&self) -> LayerSizeSum {
+        let layer_map = self.layers.read().unwrap();
+        let mut size = 0;
+        let mut no_size_cnt = 0;
+        for l in layer_map.iter_historic_layers() {
+            let (l_size, l_no_size) = l.file_size().map(|s| (s, 0)).unwrap_or((0, 1));
+            size += l_size;
+            no_size_cnt += l_no_size;
+        }
+        if no_size_cnt == 0 {
+            LayerSizeSum::Accurate(size)
+        } else {
+            LayerSizeSum::ApproximateLowerBound(size)
+        }
     }
 
-    /// Get the physical size of the timeline at the latest LSN non incrementally
-    pub fn get_physical_size_non_incremental(&self) -> anyhow::Result<u64> {
-        let timeline_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id);
-        // total size of layer files in the current timeline directory
-        let mut total_physical_size = 0;
-
-        for direntry in fs::read_dir(timeline_path)? {
-            let direntry = direntry?;
-            let fname = direntry.file_name();
-            let fname = fname.to_string_lossy();
-
-            if ImageFileName::parse_str(&fname).is_some()
-                || DeltaFileName::parse_str(&fname).is_some()
-            {
-                total_physical_size += direntry.metadata()?.len();
-            }
-        }
-
-        Ok(total_physical_size)
+    pub fn get_resident_physical_size(&self) -> u64 {
+        self.metrics.resident_physical_size_gauge.get()
     }
 
     ///
@@ -494,22 +633,10 @@ impl Timeline {
     }
 
     /// Flush to disk all data that was written with the put_* functions
-    ///
-    /// NOTE: This has nothing to do with checkpoint in PostgreSQL. We don't
-    /// know anything about them here in the repository.
     #[instrument(skip(self), fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id))]
-    pub async fn checkpoint(&self, cconf: CheckpointConfig) -> anyhow::Result<()> {
-        match cconf {
-            CheckpointConfig::Flush => {
-                self.freeze_inmem_layer(false);
-                self.flush_frozen_layers_and_wait().await
-            }
-            CheckpointConfig::Forced => {
-                self.freeze_inmem_layer(false);
-                self.flush_frozen_layers_and_wait().await?;
-                self.compact().await
-            }
-        }
+    pub async fn freeze_and_flush(&self) -> anyhow::Result<()> {
+        self.freeze_inmem_layer(false);
+        self.flush_frozen_layers_and_wait().await
     }
 
     pub async fn compact(&self) -> anyhow::Result<()> {
@@ -566,14 +693,18 @@ impl Timeline {
 
         // Define partitioning schema if needed
 
-        match self.repartition(
-            self.get_last_record_lsn(),
-            self.get_compaction_target_size(),
-        ) {
+        match self
+            .repartition(
+                self.get_last_record_lsn(),
+                self.get_compaction_target_size(),
+            )
+            .await
+        {
             Ok((partitioning, lsn)) => {
                 // 2. Create new image layers for partitions that have been modified
                 // "enough".
-                let layer_paths_to_upload = self.create_image_layers(&partitioning, lsn, false)?;
+                let layer_paths_to_upload =
+                    self.create_image_layers(&partitioning, lsn, false).await?;
                 if let Some(remote_client) = &self.remote_client {
                     for (path, layer_metadata) in layer_paths_to_upload {
                         remote_client.schedule_layer_file_upload(&path, &layer_metadata)?;
@@ -584,6 +715,18 @@ impl Timeline {
                 let timer = self.metrics.compact_time_histo.start_timer();
                 self.compact_level0(target_file_size).await?;
                 timer.stop_and_record();
+
+                // If `create_image_layers' or `compact_level0` scheduled any
+                // uploads or deletions, but didn't update the index file yet,
+                // do it now.
+                //
+                // This isn't necessary for correctness, the remote state is
+                // consistent without the uploads and deletions, and we would
+                // update the index file on next flush iteration too. But it
+                // could take a while until that happens.
+                if let Some(remote_client) = &self.remote_client {
+                    remote_client.schedule_index_upload_for_file_changes()?;
+                }
             }
             Err(err) => {
                 // no partitioning? This is normal, if the timeline was just created
@@ -609,18 +752,22 @@ impl Timeline {
     ///
     /// The size could be lagging behind the actual number, in case
     /// the initial size calculation has not been run (gets triggered on the first size access).
-    pub fn get_current_logical_size(self: &Arc<Self>) -> anyhow::Result<u64> {
+    ///
+    /// return size and boolean flag that shows if the size is exact
+    pub fn get_current_logical_size(self: &Arc<Self>) -> anyhow::Result<(u64, bool)> {
         let current_size = self.current_logical_size.current_size()?;
         debug!("Current size: {current_size:?}");
 
+        let mut is_exact = true;
         let size = current_size.size();
         if let (CurrentLogicalSize::Approximate(_), Some(init_lsn)) =
             (current_size, self.current_logical_size.initial_part_end)
         {
+            is_exact = false;
             self.try_spawn_size_init_task(init_lsn);
         }
 
-        Ok(size)
+        Ok((size, is_exact))
     }
 
     /// Check if more than 'checkpoint_distance' of WAL has been accumulated in
@@ -744,76 +891,81 @@ impl Timeline {
         walredo_mgr: Arc<dyn WalRedoManager + Send + Sync>,
         remote_client: Option<RemoteTimelineClient>,
         pg_version: u32,
-    ) -> Self {
+    ) -> Arc<Self> {
         let disk_consistent_lsn = metadata.disk_consistent_lsn();
         let (state, _) = watch::channel(TimelineState::Suspended);
 
         let (layer_flush_start_tx, _) = tokio::sync::watch::channel(0);
         let (layer_flush_done_tx, _) = tokio::sync::watch::channel((0, Ok(())));
 
-        let mut result = Timeline {
-            conf,
-            tenant_conf,
-            timeline_id,
-            tenant_id,
-            pg_version,
-            layers: RwLock::new(LayerMap::default()),
+        Arc::new_cyclic(|myself| {
+            let mut result = Timeline {
+                conf,
+                tenant_conf,
+                myself: myself.clone(),
+                timeline_id,
+                tenant_id,
+                pg_version,
+                layers: RwLock::new(LayerMap::default()),
 
-            walredo_mgr,
+                walredo_mgr,
 
-            remote_client: remote_client.map(Arc::new),
+                remote_client: remote_client.map(Arc::new),
 
-            // initialize in-memory 'last_record_lsn' from 'disk_consistent_lsn'.
-            last_record_lsn: SeqWait::new(RecordLsn {
-                last: disk_consistent_lsn,
-                prev: metadata.prev_record_lsn().unwrap_or(Lsn(0)),
-            }),
-            disk_consistent_lsn: AtomicLsn::new(disk_consistent_lsn.0),
+                // initialize in-memory 'last_record_lsn' from 'disk_consistent_lsn'.
+                last_record_lsn: SeqWait::new(RecordLsn {
+                    last: disk_consistent_lsn,
+                    prev: metadata.prev_record_lsn().unwrap_or(Lsn(0)),
+                }),
+                disk_consistent_lsn: AtomicLsn::new(disk_consistent_lsn.0),
 
-            last_freeze_at: AtomicLsn::new(disk_consistent_lsn.0),
-            last_freeze_ts: RwLock::new(Instant::now()),
+                last_freeze_at: AtomicLsn::new(disk_consistent_lsn.0),
+                last_freeze_ts: RwLock::new(Instant::now()),
 
-            ancestor_timeline: ancestor,
-            ancestor_lsn: metadata.ancestor_lsn(),
+                ancestor_timeline: ancestor,
+                ancestor_lsn: metadata.ancestor_lsn(),
 
-            metrics: TimelineMetrics::new(&tenant_id, &timeline_id),
+                metrics: TimelineMetrics::new(&tenant_id, &timeline_id),
 
-            flush_loop_state: Mutex::new(FlushLoopState::NotStarted),
+                flush_loop_state: Mutex::new(FlushLoopState::NotStarted),
 
-            layer_flush_start_tx,
-            layer_flush_done_tx,
+                layer_flush_start_tx,
+                layer_flush_done_tx,
 
-            write_lock: Mutex::new(()),
-            layer_removal_cs: Default::default(),
+                write_lock: Mutex::new(()),
+                layer_removal_cs: Default::default(),
 
-            gc_info: RwLock::new(GcInfo {
-                retain_lsns: Vec::new(),
-                horizon_cutoff: Lsn(0),
-                pitr_cutoff: Lsn(0),
-            }),
+                gc_info: std::sync::RwLock::new(GcInfo {
+                    retain_lsns: Vec::new(),
+                    horizon_cutoff: Lsn(0),
+                    pitr_cutoff: Lsn(0),
+                }),
 
-            latest_gc_cutoff_lsn: Rcu::new(metadata.latest_gc_cutoff_lsn()),
-            initdb_lsn: metadata.initdb_lsn(),
+                latest_gc_cutoff_lsn: Rcu::new(metadata.latest_gc_cutoff_lsn()),
+                initdb_lsn: metadata.initdb_lsn(),
 
-            current_logical_size: if disk_consistent_lsn.is_valid() {
-                // we're creating timeline data with some layer files existing locally,
-                // need to recalculate timeline's logical size based on data in the layers.
-                LogicalSize::deferred_initial(disk_consistent_lsn)
-            } else {
-                // we're creating timeline data without any layers existing locally,
-                // initial logical size is 0.
-                LogicalSize::empty_initial()
-            },
-            initial_size_computation_started: AtomicBool::new(false),
-            partitioning: Mutex::new((KeyPartitioning::new(), Lsn(0))),
-            repartition_threshold: 0,
+                current_logical_size: if disk_consistent_lsn.is_valid() {
+                    // we're creating timeline data with some layer files existing locally,
+                    // need to recalculate timeline's logical size based on data in the layers.
+                    LogicalSize::deferred_initial(disk_consistent_lsn)
+                } else {
+                    // we're creating timeline data without any layers existing locally,
+                    // initial logical size is 0.
+                    LogicalSize::empty_initial()
+                },
+                partitioning: Mutex::new((KeyPartitioning::new(), Lsn(0))),
+                repartition_threshold: 0,
 
-            last_received_wal: Mutex::new(None),
-            rel_size_cache: RwLock::new(HashMap::new()),
-            state,
-        };
-        result.repartition_threshold = result.get_checkpoint_distance() / 10;
-        result
+                last_received_wal: Mutex::new(None),
+                rel_size_cache: RwLock::new(HashMap::new()),
+
+                download_all_remote_layers_task_info: RwLock::new(None),
+
+                state,
+            };
+            result.repartition_threshold = result.get_checkpoint_distance() / 10;
+            result
+        })
     }
 
     pub(super) fn maybe_spawn_flush_loop(self: &Arc<Self>) {
@@ -927,11 +1079,18 @@ impl Timeline {
                     continue;
                 }
 
-                let layer =
-                    ImageLayer::new(self.conf, self.timeline_id, self.tenant_id, &imgfilename);
+                let file_size = direntry_path.metadata()?.len();
+
+                let layer = ImageLayer::new(
+                    self.conf,
+                    self.timeline_id,
+                    self.tenant_id,
+                    &imgfilename,
+                    file_size,
+                );
 
                 trace!("found layer {}", layer.path().display());
-                total_physical_size += layer.path().metadata()?.len();
+                total_physical_size += file_size;
                 layers.insert_historic(Arc::new(layer));
                 num_layers += 1;
             } else if let Some(deltafilename) = DeltaFileName::parse_str(&fname) {
@@ -951,16 +1110,23 @@ impl Timeline {
                     continue;
                 }
 
-                let layer =
-                    DeltaLayer::new(self.conf, self.timeline_id, self.tenant_id, &deltafilename);
+                let file_size = direntry_path.metadata()?.len();
+
+                let layer = DeltaLayer::new(
+                    self.conf,
+                    self.timeline_id,
+                    self.tenant_id,
+                    &deltafilename,
+                    file_size,
+                );
 
                 trace!("found layer {}", layer.path().display());
-                total_physical_size += layer.path().metadata()?.len();
+                total_physical_size += file_size;
                 layers.insert_historic(Arc::new(layer));
                 num_layers += 1;
             } else if fname == METADATA_FILE_NAME || fname.ends_with(".old") {
                 // ignore these
-            } else if crate::storage_sync::is_temp_download_file(&direntry_path) {
+            } else if remote_timeline_client::is_temp_download_file(&direntry_path) {
                 info!(
                     "skipping temp download file, reconcile_with_remote will resume / clean up: {}",
                     fname
@@ -989,7 +1155,7 @@ impl Timeline {
             num_layers, disk_consistent_lsn, total_physical_size
         );
         self.metrics
-            .current_physical_size_gauge
+            .resident_physical_size_gauge
             .set(total_physical_size);
 
         timer.stop_and_record();
@@ -997,21 +1163,14 @@ impl Timeline {
         Ok(())
     }
 
-    async fn download_missing(
+    async fn create_remote_layers(
         &self,
         index_part: &IndexPart,
-        remote_client: &RemoteTimelineClient,
         local_layers: HashMap<LayerFileName, Arc<dyn PersistentLayer>>,
         up_to_date_disk_consistent_lsn: Lsn,
     ) -> anyhow::Result<HashMap<LayerFileName, Arc<dyn PersistentLayer>>> {
         // Are we missing some files that are present in remote storage?
-        // Download them now.
-        // TODO Downloading many files this way is not efficient.
-        //     Better to use FuturesUnordered. Maybe keep as is because:
-        //    a) inplace download is a throw-away code, on-demand patch doesnt need that
-        //    b) typical case now is that there is nothing to sync, this downloads a lot
-        //       1) if there was another pageserver that came and generated new files
-        //       2) during attach of a timeline with big history which we currently do not do
+        // Create RemoteLayer instances for them.
         let mut local_only_layers = local_layers;
         for remote_layer_name in &index_part.timeline_layers {
             let local_layer = local_only_layers.remove(remote_layer_name);
@@ -1023,11 +1182,13 @@ impl Timeline {
                 .unwrap_or(LayerFileMetadata::MISSING);
 
             // Is the local layer's size different from the size stored in the
-            // remote index file? If so, rename_to_backup those files & remove
-            // local_layer form the layer map.
-            // We'll download a fresh copy of the layer file below.
+            // remote index file?
+            // If so, rename_to_backup those files & replace their local layer with
+            // a RemoteLayer in the layer map so that we re-download them on-demand.
             if let Some(local_layer) = local_layer {
-                let local_layer_path = local_layer.local_path();
+                let local_layer_path = local_layer
+                    .local_path()
+                    .expect("caller must ensure that local_layers only contains local layers");
                 ensure!(
                     local_layer_path.exists(),
                     "every layer from local_layers must exist on disk: {}",
@@ -1048,7 +1209,7 @@ impl Timeline {
                             assert!(local_layer_path.exists(), "we would leave the local_layer without a file if this does not hold: {}", local_layer_path.display());
                             anyhow::bail!("could not rename file {local_layer_path:?}: {err:?}");
                         } else {
-                            self.metrics.current_physical_size_gauge.sub(local_size);
+                            self.metrics.resident_physical_size_gauge.sub(local_size);
                             self.layers.write().unwrap().remove_historic(local_layer);
                             // fall-through to adding the remote layer
                         }
@@ -1069,7 +1230,7 @@ impl Timeline {
             }
 
             info!(
-                "remote layer does not exist locally, downloading it now: {}",
+                "remote layer does not exist locally, creating remote layer: {}",
                 remote_layer_name.file_name()
             );
 
@@ -1083,28 +1244,18 @@ impl Timeline {
                         continue;
                     }
 
-                    trace!("downloading image file: {remote_layer_name:?}");
-                    let downloaded_size = remote_client
-                        .download_layer_file(remote_layer_name, &remote_layer_metadata)
-                        .await
-                        .with_context(|| {
-                            format!("failed to download image layer {remote_layer_name:?}")
-                        })?;
-                    trace!("done");
+                    let remote_layer = RemoteLayer::new_img(
+                        self.tenant_id,
+                        self.timeline_id,
+                        imgfilename,
+                        &remote_layer_metadata,
+                    );
+                    let remote_layer = Arc::new(remote_layer);
 
-                    let image_layer =
-                        ImageLayer::new(self.conf, self.timeline_id, self.tenant_id, imgfilename);
-
-                    self.layers
-                        .write()
-                        .unwrap()
-                        .insert_historic(Arc::new(image_layer));
-                    self.metrics
-                        .current_physical_size_gauge
-                        .add(downloaded_size);
+                    self.layers.write().unwrap().insert_historic(remote_layer);
                 }
                 LayerFileName::Delta(deltafilename) => {
-                    // Create a DeltaLayer struct for each delta file.
+                    // Create a RemoteLayer for the delta file.
                     // The end-LSN is exclusive, while disk_consistent_lsn is
                     // inclusive. For example, if disk_consistent_lsn is 100, it is
                     // OK for a delta layer to have end LSN 101, but if the end LSN
@@ -1112,29 +1263,19 @@ impl Timeline {
                     // before crash.
                     if deltafilename.lsn_range.end > up_to_date_disk_consistent_lsn + 1 {
                         warn!(
-                        "found future delta layer {} on timeline {} remote_consistent_lsn is {}",
-                        deltafilename, self.timeline_id, up_to_date_disk_consistent_lsn
-                    );
+                            "found future delta layer {} on timeline {} remote_consistent_lsn is {}",
+                            deltafilename, self.timeline_id, up_to_date_disk_consistent_lsn
+                        );
                         continue;
                     }
-
-                    trace!("downloading delta file: {remote_layer_name:?}");
-                    let sz = remote_client
-                        .download_layer_file(remote_layer_name, &remote_layer_metadata)
-                        .await
-                        .with_context(|| {
-                            format!("failed to download delta layer {remote_layer_name:?}")
-                        })?;
-                    trace!("done");
-
-                    let delta_layer =
-                        DeltaLayer::new(self.conf, self.timeline_id, self.tenant_id, deltafilename);
-
-                    self.layers
-                        .write()
-                        .unwrap()
-                        .insert_historic(Arc::new(delta_layer));
-                    self.metrics.current_physical_size_gauge.add(sz);
+                    let remote_layer = RemoteLayer::new_delta(
+                        self.tenant_id,
+                        self.timeline_id,
+                        deltafilename,
+                        &remote_layer_metadata,
+                    );
+                    let remote_layer = Arc::new(remote_layer);
+                    self.layers.write().unwrap().insert_historic(remote_layer);
                 }
                 #[cfg(test)]
                 LayerFileName::Test(_) => unreachable!(),
@@ -1144,22 +1285,22 @@ impl Timeline {
         Ok(local_only_layers)
     }
 
+    /// This function will synchronize local state with what we have in remote storage.
     ///
-    /// This function will synchronize local data with what we have in remote storage.
-    /// 1. It will download missing layer files.
-    /// 2. It will update local metadata if remote one has greater `disk_consistent_lsn`.
-    /// 3. It will upload files that are missing on the remote
-    /// 4. It will update index file on the remote accordingly
-    /// TODO may be a bit cleaner to do things based on populated remote client,
-    ///     and then do things based on its upload_queue.latest_files
+    /// Steps taken:
+    /// 1. Initialize upload queue based on `index_part`.
+    /// 2. Create `RemoteLayer` instances for layers that exist only on the remote.
+    ///    The list of layers on the remote comes from `index_part`.
+    ///    The list of local layers is given by the layer map's `iter_historic_layers()`.
+    ///    So, the layer map must have been loaded already.
+    /// 3. Schedule upload of local-only layer files (which will then also update the remote
+    ///    IndexPart to include the new layer files).
     ///
-    /// This is used during tenant attach. The layer map must have been loaded
-    /// with local filesystem contents already.
-    ///
-    /// The caller should provide IndexPart if it exists on the remote storage. If it's None,
-    /// we assume that it is missing on the remote storage, which means that we initialized
-    /// a timeline and then restarted before successful upload was performed
+    /// Refer to the `storage_sync` module comment for more context.
     ///
+    /// # TODO
+    /// May be a bit cleaner to do things based on populated remote client,
+    /// and then do things based on its upload_queue.latest_files.
     #[instrument(skip(self, index_part, up_to_date_metadata))]
     pub async fn reconcile_with_remote(
         &self,
@@ -1189,8 +1330,7 @@ impl Timeline {
                     index_part.timeline_layers.len()
                 );
                 remote_client.init_upload_queue(index_part)?;
-
-                self.download_missing(index_part, remote_client, local_layers, disk_consistent_lsn)
+                self.create_remote_layers(index_part, local_layers, disk_consistent_lsn)
                     .await?
             }
             None => {
@@ -1202,7 +1342,10 @@ impl Timeline {
 
         // Are there local files that don't exist remotely? Schedule uploads for them
         for (layer_name, layer) in &local_only_layers {
-            let layer_path = layer.local_path();
+            // XXX solve this in the type system
+            let layer_path = layer
+                .local_path()
+                .expect("local_only_layers only contains local layers");
             let layer_size = layer_path
                 .metadata()
                 .with_context(|| format!("failed to get file {layer_path:?} metadata"))?
@@ -1211,9 +1354,7 @@ impl Timeline {
             remote_client
                 .schedule_layer_file_upload(layer_name, &LayerFileMetadata::new(layer_size))?;
         }
-        if !local_only_layers.is_empty() {
-            remote_client.schedule_index_upload(up_to_date_metadata)?;
-        }
+        remote_client.schedule_index_upload_for_file_changes()?;
 
         info!("Done");
 
@@ -1221,70 +1362,186 @@ impl Timeline {
     }
 
     fn try_spawn_size_init_task(self: &Arc<Self>, init_lsn: Lsn) {
-        // Atomically check if the timeline size calculation had already started.
-        // If the flag was not already set, this sets it.
-        if !self
-            .initial_size_computation_started
-            .swap(true, AtomicOrdering::SeqCst)
+        let permit = match Arc::clone(&self.current_logical_size.initial_size_computation)
+            .try_acquire_owned()
         {
-            // We need to start the computation task.
-            let self_clone = Arc::clone(self);
-            task_mgr::spawn(
-                task_mgr::BACKGROUND_RUNTIME.handle(),
-                task_mgr::TaskKind::InitialLogicalSizeCalculation,
-                Some(self.tenant_id),
-                Some(self.timeline_id),
-                "initial size calculation",
-                false,
-                async move {
-                    let mut timeline_state_updates = self_clone.subscribe_for_state_updates();
-                    let self_calculation = Arc::clone(&self_clone);
-                    tokio::select! {
-                        calculation_result = spawn_blocking(move || self_calculation.calculate_logical_size(init_lsn)) => {
-                            let calculated_size = calculation_result
-                                .context("Failed to spawn calculation result task")?
-                                .context("Failed to calculate logical size")?;
-                            match self_clone.current_logical_size.initial_logical_size.set(calculated_size) {
-                                Ok(()) => info!("Successfully calculated initial logical size"),
-                                Err(existing_size) => error!("Tried to update initial timeline size value to {calculated_size}, but the size was already set to {existing_size}, not changing"),
-                            }
-                            Ok(())
-                        },
-                        new_event = async {
-                            loop {
-                                match timeline_state_updates.changed().await {
-                                    Ok(()) => {
-                                        let new_state = *timeline_state_updates.borrow();
-                                        match new_state {
-                                            // we're running this job for active timelines only
-                                            TimelineState::Active => continue,
-                                            TimelineState::Broken | TimelineState::Stopping | TimelineState::Suspended => return Some(new_state),
-                                        }
-                                    }
-                                    Err(_sender_dropped_error) => return None,
-                                }
-                            }
-                        } => {
-                            match new_event {
-                                Some(new_state) => info!("Timeline became inactive (new state: {new_state:?}), dropping current connections until it reactivates"),
-                                None => info!("Timeline dropped state updates sender, stopping init size calculation"),
-                            }
-                            Ok(())
-                        },
+            Ok(permit) => permit,
+            Err(TryAcquireError::NoPermits) => {
+                // computation already ongoing or finished with success
+                return;
+            }
+            Err(TryAcquireError::Closed) => unreachable!("we never call close"),
+        };
+        debug_assert!(self
+            .current_logical_size
+            .initial_logical_size
+            .get()
+            .is_none());
+        // We need to start the computation task.
+        let self_clone = Arc::clone(self);
+        task_mgr::spawn(
+            task_mgr::BACKGROUND_RUNTIME.handle(),
+            task_mgr::TaskKind::InitialLogicalSizeCalculation,
+            Some(self.tenant_id),
+            Some(self.timeline_id),
+            "initial size calculation",
+            false,
+            // NB: don't log errors here, task_mgr will do that.
+            async move {
+                let calculated_size = match self_clone.logical_size_calculation_task(init_lsn).await
+                {
+                    Ok(s) => s,
+                    Err(CalculateLogicalSizeError::Cancelled) => {
+                        // Don't make noise, this is a common task.
+                        // In the unlikely case that there ihs another call to this function, we'll retry
+                        // because initial_logical_size is still None.
+                        info!("initial size calculation cancelled, likely timeline delete / tenant detach");
+                        return Ok(());
                     }
-                }.instrument(info_span!("initial_logical_size_calculation", tenant = %self.tenant_id, timeline = %self.timeline_id)),
-            );
+                    x @ Err(_) => x.context("Failed to calculate logical size")?,
+                };
+                match self_clone
+                    .current_logical_size
+                    .initial_logical_size
+                    .set(calculated_size)
+                {
+                    Ok(()) => (),
+                    Err(existing_size) => {
+                        // This shouldn't happen because the semaphore is initialized with 1.
+                        // But if it happens, just complain & report success so there are no further retries.
+                        error!("Tried to update initial timeline size value to {calculated_size}, but the size was already set to {existing_size}, not changing")
+                    }
+                }
+                // now that `initial_logical_size.is_some()`, reduce permit count to 0
+                // so that we prevent future callers from spawning this task
+                permit.forget();
+                Ok(())
+            },
+        );
+    }
+
+    pub fn spawn_ondemand_logical_size_calculation(
+        self: &Arc<Self>,
+        lsn: Lsn,
+    ) -> oneshot::Receiver<Result<u64, CalculateLogicalSizeError>> {
+        let (sender, receiver) = oneshot::channel();
+        let self_clone = Arc::clone(self);
+        task_mgr::spawn(
+            task_mgr::BACKGROUND_RUNTIME.handle(),
+            task_mgr::TaskKind::InitialLogicalSizeCalculation,
+            Some(self.tenant_id),
+            Some(self.timeline_id),
+            "ondemand logical size calculation",
+            false,
+            async move {
+                let res = self_clone.logical_size_calculation_task(lsn).await;
+                let _ = sender.send(res).ok();
+                Ok(()) // Receiver is responsible for handling errors
+            },
+        );
+        receiver
+    }
+
+    #[instrument(skip_all, fields(tenant = %self.tenant_id, timeline = %self.timeline_id))]
+    async fn logical_size_calculation_task(
+        self: &Arc<Self>,
+        init_lsn: Lsn,
+    ) -> Result<u64, CalculateLogicalSizeError> {
+        let mut timeline_state_updates = self.subscribe_for_state_updates();
+        let self_calculation = Arc::clone(self);
+        let cancel = CancellationToken::new();
+
+        let calculation = async {
+            let cancel = cancel.child_token();
+            tokio::task::spawn_blocking(move || {
+                // Run in a separate thread since this can do a lot of
+                // synchronous file IO without .await inbetween
+                // if there are no RemoteLayers that would require downloading.
+                let h = tokio::runtime::Handle::current();
+                h.block_on(self_calculation.calculate_logical_size(init_lsn, cancel))
+            })
+            .await
+            .context("Failed to spawn calculation result task")?
+        };
+        let timeline_state_cancellation = async {
+            loop {
+                match timeline_state_updates.changed().await {
+                    Ok(()) => {
+                        let new_state = *timeline_state_updates.borrow();
+                        match new_state {
+                            // we're running this job for active timelines only
+                            TimelineState::Active => continue,
+                            TimelineState::Broken
+                            | TimelineState::Stopping
+                            | TimelineState::Suspended => {
+                                break format!("aborted because timeline became inactive (new state: {new_state:?})")
+                            }
+                        }
+                    }
+                    Err(_sender_dropped_error) => {
+                        // can't happen, the sender is not dropped as long as the Timeline exists
+                        break "aborted because state watch was dropped".to_string();
+                    }
+                }
+            }
+        };
+
+        let taskmgr_shutdown_cancellation = async {
+            task_mgr::shutdown_watcher().await;
+            "aborted because task_mgr shutdown requested".to_string()
+        };
+
+        tokio::pin!(calculation);
+        loop {
+            tokio::select! {
+                res = &mut calculation =>  { return res }
+                reason = timeline_state_cancellation => {
+                    debug!(reason = reason, "cancelling calculation");
+                    cancel.cancel();
+                    return calculation.await;
+                }
+                reason = taskmgr_shutdown_cancellation => {
+                    debug!(reason = reason, "cancelling calculation");
+                    cancel.cancel();
+                    return calculation.await;
+                }
+            }
         }
     }
 
     /// Calculate the logical size of the database at the latest LSN.
     ///
     /// NOTE: counted incrementally, includes ancestors, this can be a slow operation.
-    pub fn calculate_logical_size(&self, up_to_lsn: Lsn) -> anyhow::Result<u64> {
+    async fn calculate_logical_size(
+        &self,
+        up_to_lsn: Lsn,
+        cancel: CancellationToken,
+    ) -> Result<u64, CalculateLogicalSizeError> {
         info!(
             "Calculating logical size for timeline {} at {}",
             self.timeline_id, up_to_lsn
         );
+        // These failpoints are used by python tests to ensure that we don't delete
+        // the timeline while the logical size computation is ongoing.
+        // The first failpoint is used to make this function pause.
+        // Then the python test initiates timeline delete operation in a thread.
+        // It waits for a few seconds, then arms the second failpoint and disables
+        // the first failpoint. The second failpoint prints an error if the timeline
+        // delete code has deleted the on-disk state while we're still running here.
+        // It shouldn't do that. If it does it anyway, the error will be caught
+        // by the test suite, highlighting the problem.
+        fail::fail_point!("timeline-calculate-logical-size-pause");
+        fail::fail_point!("timeline-calculate-logical-size-check-dir-exists", |_| {
+            if !self
+                .conf
+                .metadata_path(self.timeline_id, self.tenant_id)
+                .exists()
+            {
+                error!("timeline-calculate-logical-size-pre metadata file does not exist")
+            }
+            // need to return something
+            Ok(0)
+        });
         let timer = if up_to_lsn == self.initdb_lsn {
             if let Some(size) = self.current_logical_size.initialized_size() {
                 if size != 0 {
@@ -1300,7 +1557,9 @@ impl Timeline {
         } else {
             self.metrics.logical_size_histo.start_timer()
         };
-        let logical_size = self.get_current_logical_size_non_incremental(up_to_lsn)?;
+        let logical_size = self
+            .get_current_logical_size_non_incremental(up_to_lsn, cancel)
+            .await?;
         debug!("calculated logical size: {logical_size}");
         timer.stop_and_record();
         Ok(logical_size)
@@ -1333,18 +1592,27 @@ trait TraversalLayerExt {
 }
 
 impl TraversalLayerExt for Arc<dyn PersistentLayer> {
-    fn traversal_id(&self) -> String {
-        debug_assert!(
-            self.local_path().to_str().unwrap()
-                .contains(&format!("{}", self.get_timeline_id())),
-            "need timeline ID to uniquely identify the layer when tranversal crosses ancestor boundary",
-        );
-        format!("{}", self.local_path().display())
+    fn traversal_id(&self) -> TraversalId {
+        match self.local_path() {
+            Some(local_path) => {
+                debug_assert!(local_path.to_str().unwrap().contains(&format!("{}", self.get_timeline_id())),
+                    "need timeline ID to uniquely identify the layer when traversal crosses ancestor boundary",
+                );
+                format!("{}", local_path.display())
+            }
+            None => {
+                format!(
+                    "remote {}/{}",
+                    self.get_timeline_id(),
+                    self.filename().file_name()
+                )
+            }
+        }
     }
 }
 
 impl TraversalLayerExt for Arc<InMemoryLayer> {
-    fn traversal_id(&self) -> String {
+    fn traversal_id(&self) -> TraversalId {
         format!(
             "timeline {} in-memory {}",
             self.get_timeline_id(),
@@ -1367,14 +1635,14 @@ impl Timeline {
         key: Key,
         request_lsn: Lsn,
         reconstruct_state: &mut ValueReconstructState,
-    ) -> anyhow::Result<()> {
+    ) -> PageReconstructResult<()> {
         // Start from the current timeline.
         let mut timeline_owned;
         let mut timeline = self;
 
         // For debugging purposes, collect the path of layers that we traversed
         // through. It's included in the error message if we fail to find the key.
-        let mut traversal_path = Vec::<(ValueReconstructResult, Lsn, TraversalId)>::new();
+        let mut traversal_path = Vec::<TraversalPathItem>::new();
 
         let cached_lsn = if let Some((cached_lsn, _)) = &reconstruct_state.img {
             *cached_lsn
@@ -1394,12 +1662,12 @@ impl Timeline {
             // The function should have updated 'state'
             //info!("CALLED for {} at {}: {:?} with {} records, cached {}", key, cont_lsn, result, reconstruct_state.records.len(), cached_lsn);
             match result {
-                ValueReconstructResult::Complete => return Ok(()),
+                ValueReconstructResult::Complete => return PageReconstructResult::Success(()),
                 ValueReconstructResult::Continue => {
                     // If we reached an earlier cached page image, we're done.
                     if cont_lsn == cached_lsn + 1 {
                         self.metrics.materialized_page_cache_hit_counter.inc_by(1);
-                        return Ok(());
+                        return PageReconstructResult::Success(());
                     }
                     if prev_lsn <= cont_lsn {
                         // Didn't make any progress in last iteration. Error out to avoid
@@ -1432,68 +1700,139 @@ impl Timeline {
                     timeline.ancestor_lsn,
                     cont_lsn
                 );
-                let ancestor = timeline.get_ancestor_timeline()?;
+                let ancestor = match timeline.get_ancestor_timeline() {
+                    Ok(timeline) => timeline,
+                    Err(e) => return PageReconstructResult::from(e),
+                };
                 timeline_owned = ancestor;
                 timeline = &*timeline_owned;
                 prev_lsn = Lsn(u64::MAX);
-                continue;
+                continue 'outer;
             }
 
-            let layers = timeline.layers.read().unwrap();
+            #[allow(clippy::never_loop)] // see comment at bottom of this loop
+            '_layer_map_search: loop {
+                let remote_layer = {
+                    let layers = timeline.layers.read().unwrap();
 
-            // Check the open and frozen in-memory layers first, in order from newest
-            // to oldest.
-            if let Some(open_layer) = &layers.open_layer {
-                let start_lsn = open_layer.get_lsn_range().start;
-                if cont_lsn > start_lsn {
-                    //info!("CHECKING for {} at {} on open layer {}", key, cont_lsn, open_layer.filename().display());
-                    // Get all the data needed to reconstruct the page version from this layer.
-                    // But if we have an older cached page image, no need to go past that.
-                    let lsn_floor = max(cached_lsn + 1, start_lsn);
-                    result = open_layer.get_value_reconstruct_data(
-                        key,
-                        lsn_floor..cont_lsn,
-                        reconstruct_state,
-                    )?;
-                    cont_lsn = lsn_floor;
-                    traversal_path.push((result, cont_lsn, open_layer.traversal_id()));
-                    continue;
-                }
-            }
-            for frozen_layer in layers.frozen_layers.iter().rev() {
-                let start_lsn = frozen_layer.get_lsn_range().start;
-                if cont_lsn > start_lsn {
-                    //info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.filename().display());
-                    let lsn_floor = max(cached_lsn + 1, start_lsn);
-                    result = frozen_layer.get_value_reconstruct_data(
-                        key,
-                        lsn_floor..cont_lsn,
-                        reconstruct_state,
-                    )?;
-                    cont_lsn = lsn_floor;
-                    traversal_path.push((result, cont_lsn, frozen_layer.traversal_id()));
-                    continue 'outer;
-                }
-            }
+                    // Check the open and frozen in-memory layers first, in order from newest
+                    // to oldest.
+                    if let Some(open_layer) = &layers.open_layer {
+                        let start_lsn = open_layer.get_lsn_range().start;
+                        if cont_lsn > start_lsn {
+                            //info!("CHECKING for {} at {} on open layer {}", key, cont_lsn, open_layer.filename().display());
+                            // Get all the data needed to reconstruct the page version from this layer.
+                            // But if we have an older cached page image, no need to go past that.
+                            let lsn_floor = max(cached_lsn + 1, start_lsn);
+                            result = match open_layer.get_value_reconstruct_data(
+                                key,
+                                lsn_floor..cont_lsn,
+                                reconstruct_state,
+                            ) {
+                                Ok(result) => result,
+                                Err(e) => return PageReconstructResult::from(e),
+                            };
+                            cont_lsn = lsn_floor;
+                            traversal_path.push((
+                                result,
+                                cont_lsn,
+                                Box::new({
+                                    let open_layer = Arc::clone(open_layer);
+                                    move || open_layer.traversal_id()
+                                }),
+                            ));
+                            continue 'outer;
+                        }
+                    }
+                    for frozen_layer in layers.frozen_layers.iter().rev() {
+                        let start_lsn = frozen_layer.get_lsn_range().start;
+                        if cont_lsn > start_lsn {
+                            //info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.filename().display());
+                            let lsn_floor = max(cached_lsn + 1, start_lsn);
+                            result = match frozen_layer.get_value_reconstruct_data(
+                                key,
+                                lsn_floor..cont_lsn,
+                                reconstruct_state,
+                            ) {
+                                Ok(result) => result,
+                                Err(e) => return PageReconstructResult::from(e),
+                            };
+                            cont_lsn = lsn_floor;
+                            traversal_path.push((
+                                result,
+                                cont_lsn,
+                                Box::new({
+                                    let frozen_layer = Arc::clone(frozen_layer);
+                                    move || frozen_layer.traversal_id()
+                                }),
+                            ));
+                            continue 'outer;
+                        }
+                    }
 
-            if let Some(SearchResult { lsn_floor, layer }) = layers.search(key, cont_lsn)? {
-                //info!("CHECKING for {} at {} on historic layer {}", key, cont_lsn, layer.filename().display());
-
-                let lsn_floor = max(cached_lsn + 1, lsn_floor);
-                result = layer.get_value_reconstruct_data(
-                    key,
-                    lsn_floor..cont_lsn,
-                    reconstruct_state,
-                )?;
-                cont_lsn = lsn_floor;
-                traversal_path.push((result, cont_lsn, layer.traversal_id()));
-            } else if timeline.ancestor_timeline.is_some() {
-                // Nothing on this timeline. Traverse to parent
-                result = ValueReconstructResult::Continue;
-                cont_lsn = Lsn(timeline.ancestor_lsn.0 + 1);
-            } else {
-                // Nothing found
-                result = ValueReconstructResult::Missing;
+                    if let Some(SearchResult { lsn_floor, layer }) = layers.search(key, cont_lsn) {
+                        // If it's a remote layer, download it and retry.
+                        if let Some(remote_layer) =
+                            super::storage_layer::downcast_remote_layer(&layer)
+                        {
+                            // TODO: push a breadcrumb to 'traversal_path' to record the fact that
+                            // we downloaded / would need to download this layer.
+                            remote_layer // download happens outside the scope of `layers` guard object
+                        } else {
+                            // Get all the data needed to reconstruct the page version from this layer.
+                            // But if we have an older cached page image, no need to go past that.
+                            let lsn_floor = max(cached_lsn + 1, lsn_floor);
+                            result = match layer.get_value_reconstruct_data(
+                                key,
+                                lsn_floor..cont_lsn,
+                                reconstruct_state,
+                            ) {
+                                Ok(result) => result,
+                                Err(e) => return PageReconstructResult::from(e),
+                            };
+                            cont_lsn = lsn_floor;
+                            traversal_path.push((
+                                result,
+                                cont_lsn,
+                                Box::new({
+                                    let layer = Arc::clone(&layer);
+                                    move || layer.traversal_id()
+                                }),
+                            ));
+                            continue 'outer;
+                        }
+                    } else if timeline.ancestor_timeline.is_some() {
+                        // Nothing on this timeline. Traverse to parent
+                        result = ValueReconstructResult::Continue;
+                        cont_lsn = Lsn(timeline.ancestor_lsn.0 + 1);
+                        continue 'outer;
+                    } else {
+                        // Nothing found
+                        result = ValueReconstructResult::Missing;
+                        continue 'outer;
+                    }
+                };
+                // Indicate to the caller that we need remote_layer replaced with a downloaded
+                // layer in the layer map. The control flow could be a lot simpler, but the point
+                // of this commit is to prepare this function to
+                // 1. become async
+                // 2. do the download right here, using
+                //    ```
+                //    download_remote_layer().await?;
+                //    continue 'layer_map_search;
+                //    ```
+                // For (2), current rustc requires that the layers lock guard is not in scope.
+                // Hence, the complicated control flow.
+                let remote_layer_as_persistent: Arc<dyn PersistentLayer> =
+                    Arc::clone(&remote_layer) as Arc<dyn PersistentLayer>;
+                info!(
+                    "need remote layer {}",
+                    remote_layer_as_persistent.traversal_id()
+                );
+                return PageReconstructResult::NeedsDownload(
+                    Weak::clone(&timeline.myself),
+                    Arc::downgrade(&remote_layer),
+                );
             }
         }
     }
@@ -1710,9 +2049,11 @@ impl Timeline {
         let lsn_range = frozen_layer.get_lsn_range();
         let layer_paths_to_upload =
             if lsn_range.start == self.initdb_lsn && lsn_range.end == Lsn(self.initdb_lsn.0 + 1) {
-                let (partitioning, _lsn) =
-                    self.repartition(self.initdb_lsn, self.get_compaction_target_size())?;
-                self.create_image_layers(&partitioning, self.initdb_lsn, true)?
+                let (partitioning, _lsn) = self
+                    .repartition(self.initdb_lsn, self.get_compaction_target_size())
+                    .await?;
+                self.create_image_layers(&partitioning, self.initdb_lsn, true)
+                    .await?
             } else {
                 // normal case, write out a L0 delta layer file.
                 let (delta_path, metadata) = self.create_delta_layer(&frozen_layer)?;
@@ -1809,13 +2150,9 @@ impl Timeline {
 
         if let Some(remote_client) = &self.remote_client {
             for (path, layer_metadata) in layer_paths_to_upload {
-                remote_client
-                    .schedule_layer_file_upload(&path, &layer_metadata)
-                    .context("schedule_layer_file_upload")?;
+                remote_client.schedule_layer_file_upload(&path, &layer_metadata)?;
             }
-            remote_client
-                .schedule_index_upload(&metadata)
-                .context("schedule_layer_file_upload")?;
+            remote_client.schedule_index_upload_for_metadata_update(&metadata)?;
         }
 
         Ok(())
@@ -1853,7 +2190,7 @@ impl Timeline {
         // update the timeline's physical size
         let sz = new_delta_path.metadata()?.len();
 
-        self.metrics.current_physical_size_gauge.add(sz);
+        self.metrics.resident_physical_size_gauge.add(sz);
         // update metrics
         self.metrics.num_persistent_files_created.inc_by(1);
         self.metrics.persistent_bytes_written.inc_by(sz);
@@ -1861,15 +2198,28 @@ impl Timeline {
         Ok((new_delta_filename, LayerFileMetadata::new(sz)))
     }
 
-    fn repartition(&self, lsn: Lsn, partition_size: u64) -> anyhow::Result<(KeyPartitioning, Lsn)> {
-        let mut partitioning_guard = self.partitioning.lock().unwrap();
-        if partitioning_guard.1 == Lsn(0)
-            || lsn.0 - partitioning_guard.1 .0 > self.repartition_threshold
+    async fn repartition(
+        &self,
+        lsn: Lsn,
+        partition_size: u64,
+    ) -> anyhow::Result<(KeyPartitioning, Lsn)> {
         {
-            let keyspace = self.collect_keyspace(lsn)?;
-            let partitioning = keyspace.partition(partition_size);
+            let partitioning_guard = self.partitioning.lock().unwrap();
+            if partitioning_guard.1 != Lsn(0)
+                && lsn.0 - partitioning_guard.1 .0 <= self.repartition_threshold
+            {
+                // no repartitioning needed
+                return Ok((partitioning_guard.0.clone(), partitioning_guard.1));
+            }
+        }
+        let keyspace = self.collect_keyspace(lsn).await?;
+        let partitioning = keyspace.partition(partition_size);
+
+        let mut partitioning_guard = self.partitioning.lock().unwrap();
+        if lsn > partitioning_guard.1 {
             *partitioning_guard = (partitioning, lsn);
-            return Ok((partitioning_guard.0.clone(), lsn));
+        } else {
+            warn!("Concurrent repartitioning of keyspace. This unexpected, but probably harmless");
         }
         Ok((partitioning_guard.0.clone(), partitioning_guard.1))
     }
@@ -1915,7 +2265,7 @@ impl Timeline {
         Ok(false)
     }
 
-    fn create_image_layers(
+    async fn create_image_layers(
         &self,
         partitioning: &KeyPartitioning,
         lsn: Lsn,
@@ -1942,7 +2292,7 @@ impl Timeline {
                 for range in &partition.ranges {
                     let mut key = range.start;
                     while key < range.end {
-                        let img = match self.get(key, lsn) {
+                        let img = match self.get_download(key, lsn).await {
                             Ok(img) => img,
                             Err(err) => {
                                 // If we fail to reconstruct a VM or FSM page, we can zero the
@@ -2005,7 +2355,9 @@ impl Timeline {
 
             layer_paths_to_upload.insert(path, LayerFileMetadata::new(metadata.len()));
 
-            self.metrics.current_physical_size_gauge.add(metadata.len());
+            self.metrics
+                .resident_physical_size_gauge
+                .add(metadata.len());
             layers.insert_historic(Arc::new(l));
         }
         drop(layers);
@@ -2083,38 +2435,40 @@ impl Timeline {
 
         // This iterator walks through all key-value pairs from all the layers
         // we're compacting, in key, LSN order.
-        let all_values_iter = deltas_to_compact
-            .iter()
-            .map(|l| l.iter())
-            .kmerge_by(|a, b| {
-                if let Ok((a_key, a_lsn, _)) = a {
-                    if let Ok((b_key, b_lsn, _)) = b {
-                        match a_key.cmp(b_key) {
-                            Ordering::Less => true,
-                            Ordering::Equal => a_lsn <= b_lsn,
-                            Ordering::Greater => false,
+        let all_values_iter =
+            itertools::process_results(deltas_to_compact.iter().map(|l| l.iter()), |iter_iter| {
+                iter_iter.kmerge_by(|a, b| {
+                    if let Ok((a_key, a_lsn, _)) = a {
+                        if let Ok((b_key, b_lsn, _)) = b {
+                            match a_key.cmp(b_key) {
+                                Ordering::Less => true,
+                                Ordering::Equal => a_lsn <= b_lsn,
+                                Ordering::Greater => false,
+                            }
+                        } else {
+                            false
                         }
                     } else {
-                        false
+                        true
                     }
-                } else {
-                    true
-                }
-            });
+                })
+            })?;
 
         // This iterator walks through all keys and is needed to calculate size used by each key
-        let mut all_keys_iter = deltas_to_compact
-            .iter()
-            .map(|l| l.key_iter())
-            .kmerge_by(|a, b| {
-                let (a_key, a_lsn, _) = a;
-                let (b_key, b_lsn, _) = b;
-                match a_key.cmp(b_key) {
-                    Ordering::Less => true,
-                    Ordering::Equal => a_lsn <= b_lsn,
-                    Ordering::Greater => false,
-                }
-            });
+        let mut all_keys_iter = itertools::process_results(
+            deltas_to_compact.iter().map(|l| l.key_iter()),
+            |iter_iter| {
+                iter_iter.kmerge_by(|a, b| {
+                    let (a_key, a_lsn, _) = a;
+                    let (b_key, b_lsn, _) = b;
+                    match a_key.cmp(b_key) {
+                        Ordering::Less => true,
+                        Ordering::Equal => a_lsn <= b_lsn,
+                        Ordering::Greater => false,
+                    }
+                })
+            },
+        )?;
 
         // Merge the contents of all the input delta layers into a new set
         // of delta layers, based on the current partitioning.
@@ -2284,6 +2638,11 @@ impl Timeline {
             deltas_to_compact,
         } = self.compact_level0_phase1(target_file_size).await?;
 
+        if new_layers.is_empty() && deltas_to_compact.is_empty() {
+            // nothing to do
+            return Ok(());
+        }
+
         // Before deleting any layers, we need to wait for their upload ops to finish.
         // See storage_sync module level comment on consistency.
         // Do it here because we don't want to hold self.layers.write() while waiting.
@@ -2310,7 +2669,9 @@ impl Timeline {
             }
 
             // update the timeline's physical size
-            self.metrics.current_physical_size_gauge.add(metadata.len());
+            self.metrics
+                .resident_physical_size_gauge
+                .add(metadata.len());
 
             new_layer_paths.insert(new_delta_path, LayerFileMetadata::new(metadata.len()));
             let x: Arc<dyn PersistentLayer + 'static> = Arc::new(l);
@@ -2321,10 +2682,11 @@ impl Timeline {
         // delete the old ones
         let mut layer_names_to_delete = Vec::with_capacity(deltas_to_compact.len());
         for l in deltas_to_compact {
-            let path = l.local_path();
-            self.metrics
-                .current_physical_size_gauge
-                .sub(path.metadata()?.len());
+            if let Some(path) = l.local_path() {
+                self.metrics
+                    .resident_physical_size_gauge
+                    .sub(path.metadata()?.len());
+            }
             layer_names_to_delete.push(l.filename());
             l.delete()?;
             layers.remove_historic(l);
@@ -2365,55 +2727,71 @@ impl Timeline {
     ///
     /// The 'pitr' duration is used to calculate a 'pitr_cutoff', which can be used to determine
     /// whether a record is needed for PITR.
-    pub(super) fn update_gc_info(
+    ///
+    /// NOTE: This function holds a short-lived lock to protect the 'gc_info'
+    /// field, so that the three values passed as argument are stored
+    /// atomically. But the caller is responsible for ensuring that no new
+    /// branches are created that would need to be included in 'retain_lsns',
+    /// for example. The caller should hold `Tenant::gc_cs` lock to ensure
+    /// that.
+    ///
+    pub(super) async fn update_gc_info(
         &self,
         retain_lsns: Vec<Lsn>,
         cutoff_horizon: Lsn,
         pitr: Duration,
     ) -> anyhow::Result<()> {
-        let mut gc_info = self.gc_info.write().unwrap();
-
-        gc_info.horizon_cutoff = cutoff_horizon;
-        gc_info.retain_lsns = retain_lsns;
-
-        // Calculate pitr cutoff point.
-        // If we cannot determine a cutoff LSN, be conservative and don't GC anything.
-        let mut pitr_cutoff_lsn: Lsn;
-
-        if pitr != Duration::ZERO {
-            // conservative, safe default is to remove nothing, when we have no
-            // commit timestamp data available
-            pitr_cutoff_lsn = *self.get_latest_gc_cutoff_lsn();
-
-            // First, calculate pitr_cutoff_timestamp and then convert it to LSN.
-            // If we don't have enough data to convert to LSN,
-            // play safe and don't remove any layers.
+        // First, calculate pitr_cutoff_timestamp and then convert it to LSN.
+        //
+        // Some unit tests depend on garbage-collection working even when
+        // CLOG data is missing, so that find_lsn_for_timestamp() doesn't
+        // work, so avoid calling it altogether if time-based retention is not
+        // configured. It would be pointless anyway.
+        let pitr_cutoff = if pitr != Duration::ZERO {
             let now = SystemTime::now();
             if let Some(pitr_cutoff_timestamp) = now.checked_sub(pitr) {
                 let pitr_timestamp = to_pg_timestamp(pitr_cutoff_timestamp);
 
-                match self.find_lsn_for_timestamp(pitr_timestamp)? {
-                    LsnForTimestamp::Present(lsn) => pitr_cutoff_lsn = lsn,
+                match with_ondemand_download(|| self.find_lsn_for_timestamp(pitr_timestamp)).await?
+                {
+                    LsnForTimestamp::Present(lsn) => lsn,
                     LsnForTimestamp::Future(lsn) => {
+                        // The timestamp is in the future. That sounds impossible,
+                        // but what it really means is that there hasn't been
+                        // any commits since the cutoff timestamp.
                         debug!("future({})", lsn);
-                        pitr_cutoff_lsn = gc_info.horizon_cutoff;
+                        cutoff_horizon
                     }
                     LsnForTimestamp::Past(lsn) => {
                         debug!("past({})", lsn);
+                        // conservative, safe default is to remove nothing, when we
+                        // have no commit timestamp data available
+                        *self.get_latest_gc_cutoff_lsn()
                     }
                     LsnForTimestamp::NoData(lsn) => {
                         debug!("nodata({})", lsn);
+                        // conservative, safe default is to remove nothing, when we
+                        // have no commit timestamp data available
+                        *self.get_latest_gc_cutoff_lsn()
                     }
                 }
-                debug!("pitr_cutoff_lsn = {:?}", pitr_cutoff_lsn)
+            } else {
+                // If we don't have enough data to convert to LSN,
+                // play safe and don't remove any layers.
+                *self.get_latest_gc_cutoff_lsn()
             }
         } else {
-            // No time-based retention. (Some unit tests depend on garbage-collection
-            // working even when CLOG data is missing, so that find_lsn_for_timestamp()
-            // above doesn't work.)
-            pitr_cutoff_lsn = gc_info.horizon_cutoff;
-        }
-        gc_info.pitr_cutoff = pitr_cutoff_lsn;
+            // No time-based retention was configured. Set time-based cutoff to
+            // same as LSN based.
+            cutoff_horizon
+        };
+
+        // Grab the lock and update the values
+        *self.gc_info.write().unwrap() = GcInfo {
+            retain_lsns,
+            horizon_cutoff: cutoff_horizon,
+            pitr_cutoff,
+        };
 
         Ok(())
     }
@@ -2487,9 +2865,6 @@ impl Timeline {
             );
             write_guard.store_and_unlock(new_gc_cutoff).wait();
         }
-        // Persist the new GC cutoff value in the metadata file, before
-        // we actually remove anything.
-        self.update_metadata_file(self.disk_consistent_lsn.load(), HashMap::new())?;
 
         info!("GC starting");
 
@@ -2600,19 +2975,34 @@ impl Timeline {
             layers_to_remove.push(Arc::clone(&l));
         }
 
-        // Actually delete the layers from disk and remove them from the map.
-        // (couldn't do this in the loop above, because you cannot modify a collection
-        // while iterating it. BTreeMap::retain() would be another option)
-        let mut layer_names_to_delete = Vec::with_capacity(layers_to_remove.len());
-        for doomed_layer in layers_to_remove {
-            let path = doomed_layer.local_path();
-            self.metrics
-                .current_physical_size_gauge
-                .sub(path.metadata()?.len());
-            layer_names_to_delete.push(doomed_layer.filename());
-            doomed_layer.delete()?;
-            layers.remove_historic(doomed_layer);
-            result.layers_removed += 1;
+        if !layers_to_remove.is_empty() {
+            // Persist the new GC cutoff value in the metadata file, before
+            // we actually remove anything.
+            self.update_metadata_file(self.disk_consistent_lsn.load(), HashMap::new())?;
+
+            // Actually delete the layers from disk and remove them from the map.
+            // (couldn't do this in the loop above, because you cannot modify a collection
+            // while iterating it. BTreeMap::retain() would be another option)
+            let mut layer_names_to_delete = Vec::with_capacity(layers_to_remove.len());
+            for doomed_layer in layers_to_remove {
+                if let Some(path) = doomed_layer.local_path() {
+                    self.metrics
+                        .resident_physical_size_gauge
+                        .sub(path.metadata()?.len());
+                }
+                layer_names_to_delete.push(doomed_layer.filename());
+                doomed_layer.delete()?; // FIXME: schedule succeeded deletions before returning?
+                layers.remove_historic(doomed_layer);
+                result.layers_removed += 1;
+            }
+
+            if result.layers_removed != 0 {
+                fail_point!("after-timeline-gc-removed-layers");
+            }
+
+            if let Some(remote_client) = &self.remote_client {
+                remote_client.schedule_layer_file_deletion(&layer_names_to_delete)?;
+            }
         }
 
         info!(
@@ -2620,14 +3010,6 @@ impl Timeline {
             result.layers_removed, new_gc_cutoff
         );
 
-        if result.layers_removed != 0 {
-            fail_point!("after-timeline-gc-removed-layers");
-        }
-
-        if let Some(remote_client) = &self.remote_client {
-            remote_client.schedule_layer_file_deletion(&layer_names_to_delete)?;
-        }
-
         result.elapsed = now.elapsed()?;
         Ok(result)
     }
@@ -2640,7 +3022,7 @@ impl Timeline {
         key: Key,
         request_lsn: Lsn,
         mut data: ValueReconstructState,
-    ) -> anyhow::Result<Bytes> {
+    ) -> PageReconstructResult<Bytes> {
         // Perform WAL redo if needed
         data.records.reverse();
 
@@ -2652,9 +3034,11 @@ impl Timeline {
                     key,
                     img_lsn
                 );
-                Ok(img.clone())
+                PageReconstructResult::Success(img.clone())
             } else {
-                bail!("base image for {} at {} not found", key, request_lsn);
+                PageReconstructResult::from(anyhow!(
+                    "base image for {key} at {request_lsn} not found"
+                ))
             }
         } else {
             // We need to do WAL redo.
@@ -2662,12 +3046,12 @@ impl Timeline {
             // If we don't have a base image, then the oldest WAL record better initialize
             // the page
             if data.img.is_none() && !data.records.first().unwrap().1.will_init() {
-                bail!(
+                PageReconstructResult::from(anyhow!(
                     "Base image for {} at {} not found, but got {} WAL records",
                     key,
                     request_lsn,
                     data.records.len()
-                );
+                ))
             } else {
                 if data.img.is_some() {
                     trace!(
@@ -2682,14 +3066,18 @@ impl Timeline {
 
                 let last_rec_lsn = data.records.last().unwrap().0;
 
-                let img = self
+                let img = match self
                     .walredo_mgr
                     .request_redo(key, request_lsn, data.img, data.records, self.pg_version)
-                    .context("Failed to reconstruct a page image:")?;
+                    .context("Failed to reconstruct a page image:")
+                {
+                    Ok(img) => img,
+                    Err(e) => return PageReconstructResult::from(e),
+                };
 
                 if img.len() == page_cache::PAGE_SZ {
                     let cache = page_cache::get();
-                    cache
+                    if let Err(e) = cache
                         .memorize_materialized_page(
                             self.tenant_id,
                             self.timeline_id,
@@ -2697,29 +3085,347 @@ impl Timeline {
                             last_rec_lsn,
                             &img,
                         )
-                        .context("Materialized page memoization failed")?;
+                        .context("Materialized page memoization failed")
+                    {
+                        return PageReconstructResult::from(e);
+                    }
                 }
 
-                Ok(img)
+                PageReconstructResult::Success(img)
+            }
+        }
+    }
+
+    /// Download a layer file from remote storage and insert it into the layer map.
+    ///
+    /// It's safe to call this function for the same layer concurrently. In that case:
+    /// - If the layer has already been downloaded, `OK(...)` is returned.
+    /// - If the layer is currently being downloaded, we wait until that download succeeded / failed.
+    ///     - If it succeeded, we return `Ok(...)`.
+    ///     - If it failed, we or another concurrent caller will initiate a new download attempt.
+    ///
+    /// Download errors are classified and retried if appropriate by the underlying RemoteTimelineClient function.
+    /// It has an internal limit for the maximum number of retries and prints appropriate log messages.
+    /// If we exceed the limit, it returns an error, and this function passes it through.
+    /// The caller _could_ retry further by themselves by calling this function again, but _should not_ do it.
+    /// The reason is that they cannot distinguish permanent errors from temporary ones, whereas
+    /// the underlying RemoteTimelineClient can.
+    ///
+    /// There is no internal timeout or slowness detection.
+    /// If the caller has a deadline or needs a timeout, they can simply stop polling:
+    /// we're **cancellation-safe** because the download happens in a separate task_mgr task.
+    /// So, the current download attempt will run to completion even if we stop polling.
+    #[instrument(skip_all, fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id, layer=%remote_layer.short_id()))]
+    pub async fn download_remote_layer(
+        self: Arc<Self>,
+        remote_layer: Arc<RemoteLayer>,
+    ) -> anyhow::Result<()> {
+        let permit = match Arc::clone(&remote_layer.ongoing_download)
+            .acquire_owned()
+            .await
+        {
+            Ok(permit) => permit,
+            Err(_closed) => {
+                info!("download of layer has already finished");
+                return Ok(());
+            }
+        };
+
+        let (sender, receiver) = tokio::sync::oneshot::channel();
+        // Spawn a task so that download does not outlive timeline when we detach tenant / delete timeline.
+        task_mgr::spawn(
+            &tokio::runtime::Handle::current(),
+            TaskKind::RemoteDownloadTask,
+            Some(self.tenant_id),
+            Some(self.timeline_id),
+            &format!("download layer {}", remote_layer.short_id()),
+            false,
+            async move {
+                let remote_client = self.remote_client.as_ref().unwrap();
+
+                // Does retries + exponential back-off internally.
+                // When this fails, don't layer further retry attempts here.
+                let result = remote_client
+                    .download_layer_file(&remote_layer.file_name, &remote_layer.layer_metadata)
+                    .await;
+
+                if let Ok(size) = &result {
+                    // XXX the temp file is still around in Err() case
+                    // and consumes space until we clean up upon pageserver restart.
+                    self.metrics.resident_physical_size_gauge.add(*size);
+
+                    // Download complete. Replace the RemoteLayer with the corresponding
+                    // Delta- or ImageLayer in the layer map.
+                    let new_layer = remote_layer.create_downloaded_layer(self.conf, *size);
+                    let mut layers = self.layers.write().unwrap();
+                    {
+                        let l: Arc<dyn PersistentLayer> = remote_layer.clone();
+                        layers.remove_historic(l);
+                    }
+                    layers.insert_historic(new_layer);
+                    drop(layers);
+
+                    // Now that we've inserted the download into the layer map,
+                    // close the semaphore. This will make other waiters for
+                    // this download return Ok(()).
+                    assert!(!remote_layer.ongoing_download.is_closed());
+                    remote_layer.ongoing_download.close();
+                } else {
+                    // Keep semaphore open. We'll drop the permit at the end of the function.
+                }
+
+                // Don't treat it as an error if the task that triggered the download
+                // is no longer interested in the result.
+                sender.send(result.map(|_sz| ())).ok();
+
+                // In case we failed and there are other waiters, this will make one
+                // of them retry the download in a new task.
+                // XXX: This resets the exponential backoff because it's a new call to
+                // download_layer file.
+                drop(permit);
+
+                Ok(())
+            },
+        );
+
+        receiver.await.context("download task cancelled")?
+    }
+
+    pub async fn spawn_download_all_remote_layers(
+        self: Arc<Self>,
+    ) -> Result<DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskInfo> {
+        let mut status_guard = self.download_all_remote_layers_task_info.write().unwrap();
+        if let Some(st) = &*status_guard {
+            match &st.state {
+                DownloadRemoteLayersTaskState::Running => {
+                    return Err(st.clone());
+                }
+                DownloadRemoteLayersTaskState::ShutDown
+                | DownloadRemoteLayersTaskState::Completed => {
+                    *status_guard = None;
+                }
+            }
+        }
+
+        let self_clone = Arc::clone(&self);
+        let task_id = task_mgr::spawn(
+            task_mgr::BACKGROUND_RUNTIME.handle(),
+            task_mgr::TaskKind::DownloadAllRemoteLayers,
+            Some(self.tenant_id),
+            Some(self.timeline_id),
+            "download all remote layers task",
+            false,
+            async move {
+                self_clone.download_all_remote_layers().await;
+                let mut status_guard = self_clone.download_all_remote_layers_task_info.write().unwrap();
+                 match &mut *status_guard {
+                    None => {
+                        warn!("tasks status is supposed to be Some(), since we are running");
+                    }
+                    Some(st) => {
+                        let exp_task_id = format!("{}", task_mgr::current_task_id().unwrap());
+                        if st.task_id != exp_task_id {
+                            warn!("task id changed while we were still running, expecting {} but have {}", exp_task_id, st.task_id);
+                        } else {
+                            st.state = DownloadRemoteLayersTaskState::Completed;
+                        }
+                    }
+                };
+                Ok(())
+            }
+            .instrument(info_span!(parent: None, "download_all_remote_layers", tenant = %self.tenant_id, timeline = %self.timeline_id))
+        );
+
+        let initial_info = DownloadRemoteLayersTaskInfo {
+            task_id: format!("{task_id}"),
+            state: DownloadRemoteLayersTaskState::Running,
+            total_layer_count: 0,
+            successful_download_count: 0,
+            failed_download_count: 0,
+        };
+        *status_guard = Some(initial_info.clone());
+
+        Ok(initial_info)
+    }
+
+    async fn download_all_remote_layers(self: &Arc<Self>) {
+        let mut downloads: FuturesUnordered<_> = {
+            let layers = self.layers.read().unwrap();
+            layers
+                .iter_historic_layers()
+                .filter_map(|l| l.downcast_remote_layer())
+                .map({
+                    |l| {
+                        let self_clone = Arc::clone(self);
+                        self_clone.download_remote_layer(l)
+                    }
+                })
+                .collect()
+        };
+
+        macro_rules! lock_status {
+            ($st:ident) => {
+                let mut st = self.download_all_remote_layers_task_info.write().unwrap();
+                let st = st
+                    .as_mut()
+                    .expect("this function is only called after the task has been spawned");
+                assert_eq!(
+                    st.task_id,
+                    format!(
+                        "{}",
+                        task_mgr::current_task_id().expect("we run inside a task_mgr task")
+                    )
+                );
+                let $st = st;
+            };
+        }
+
+        {
+            lock_status!(st);
+            st.total_layer_count = downloads.len().try_into().unwrap();
+        }
+        loop {
+            tokio::select! {
+                dl = downloads.next() => {
+                    lock_status!(st);
+                    match dl {
+                        None => break,
+                        Some(Ok(())) => {
+                            st.successful_download_count += 1;
+                        },
+                        Some(Err(e)) => {
+                            error!(error = %e, "layer download failed");
+                            st.failed_download_count += 1;
+                        }
+                    }
+                }
+                _ = task_mgr::shutdown_watcher() => {
+                    // Kind of pointless to watch for shutdowns here,
+                    // as download_remote_layer spawns other task_mgr tasks internally.
+                    lock_status!(st);
+                    st.state = DownloadRemoteLayersTaskState::ShutDown;
+                }
+            }
+        }
+        {
+            lock_status!(st);
+            st.state = DownloadRemoteLayersTaskState::Completed;
+        }
+    }
+
+    pub fn get_download_all_remote_layers_task_info(&self) -> Option<DownloadRemoteLayersTaskInfo> {
+        self.download_all_remote_layers_task_info
+            .read()
+            .unwrap()
+            .clone()
+    }
+}
+
+/// Helper function to deal with [`PageReconstructResult`].
+///
+/// Takes a sync closure that returns a [`PageReconstructResult`].
+/// If it is [`PageReconstructResult::NeedsDownload`],
+/// do the download and retry the closure.
+///
+/// ### Background
+///
+/// This is a crutch to make on-demand downloads efficient in
+/// our async-sync-async sandwich codebase. Some context:
+///
+/// - The code that does the downloads uses async Rust.
+/// - The code that initiates download is many levels of sync Rust.
+/// - The sync code must wait for the download to finish to
+///   make further progress.
+/// - The sync code is invoked directly from async functions upstack.
+///
+/// Example (there are also much worse ones where the sandwich is taller)
+///
+///   async handle_get_page_at_lsn_request        page_service.rs
+///     sync get_rel_page_at_lsn                  timeline.rs
+///       sync timeline.get                       timeline.rs
+///         sync get_reconstruct_data             timeline.rs
+///           async download_remote_layer         timeline.rs
+///
+/// It is not possible to Timeline::download_remote_layer().await within
+/// get_reconstruct_data, so instead, we return [`PageReconstructResult::NeedsDownload`]
+/// which contains references to the [`Timeline`] and [`RemoteLayer`].
+/// We bubble that error upstack to the async code, which can then call
+/// `Timeline::download_remote_layer().await`.
+/// That is _efficient_ because tokio can use the same OS thread to do
+/// other work while we're waiting for the download.
+///
+/// It is a deliberate decision to use a new result type to communicate
+/// the need for download instead of adding another variant to [`PageReconstructError`].
+/// The reason is that with the latter approach, any place that does
+/// `?` on a `Result<T, PageReconstructError>` will implicitly ignore the
+/// need for download. We want that to be explicit, so that
+/// - the code base becomes greppable for places that don't do a download
+/// - future code changes will need to explicilty address for on-demand download
+///
+/// Alternatives to consider in the future:
+///
+/// - Inside `get_reconstruct_data`, we can std::thread::spawn a thread
+///   and use it to block_on the download_remote_layer future.
+///   That is obviously inefficient as it creates one thread per download.
+/// - Convert everything to async. The problem here is that the sync
+///   functions are used by many other sync functions. So, the scope
+///   creep of such a conversion is tremendous.
+/// - Compromise between the two: implement async functions for each sync
+///   function. Switch over the hot code paths (GetPage()) to use the
+///   async path, so that the hot path doesn't  spawn threads. Other code
+///   paths would remain sync initially, and get converted to async over time.
+///
+pub async fn with_ondemand_download<F, T>(mut f: F) -> Result<T, anyhow::Error>
+where
+    F: Send + FnMut() -> PageReconstructResult<T>,
+    T: Send,
+{
+    loop {
+        let closure_result = f();
+        match closure_result {
+            PageReconstructResult::NeedsDownload(weak_timeline, weak_remote_layer) => {
+                // if the timeline is gone, it has likely been deleted / tenant detached
+                let tl = weak_timeline.upgrade().context("timeline is gone")?;
+                // if the remote layer got removed, retry the function, it might succeed now
+                let remote_layer = match weak_remote_layer.upgrade() {
+                    None => {
+                        info!("remote layer is gone, retrying closure");
+                        continue;
+                    }
+                    Some(l) => l,
+                };
+                // Does retries internally
+                tl.download_remote_layer(remote_layer).await?;
+                // Download successful, retry the closure
+                continue;
+            }
+            PageReconstructResult::Success(closure_value) => return Ok(closure_value),
+            PageReconstructResult::Error(e) => {
+                return Err(anyhow::Error::new(e).context("Failed to reconstruct the page"))
             }
         }
     }
 }
 
+type TraversalPathItem = (
+    ValueReconstructResult,
+    Lsn,
+    Box<dyn FnOnce() -> TraversalId>,
+);
+
 /// Helper function for get_reconstruct_data() to add the path of layers traversed
 /// to an error, as anyhow context information.
-fn layer_traversal_error(
-    msg: String,
-    path: Vec<(ValueReconstructResult, Lsn, TraversalId)>,
-) -> anyhow::Result<()> {
+fn layer_traversal_error(msg: String, path: Vec<TraversalPathItem>) -> PageReconstructResult<()> {
     // We want the original 'msg' to be the outermost context. The outermost context
     // is the most high-level information, which also gets propagated to the client.
     let mut msg_iter = path
-        .iter()
+        .into_iter()
         .map(|(r, c, l)| {
             format!(
                 "layer traversal: result {:?}, cont_lsn {}, layer: {}",
-                r, c, l,
+                r,
+                c,
+                l(),
             )
         })
         .chain(std::iter::once(msg));
@@ -2727,7 +3433,8 @@ fn layer_traversal_error(
     let err = anyhow!(msg_iter.next().unwrap());
 
     // Append all subsequent traversals, and the error message 'msg', as contexts.
-    Err(msg_iter.fold(err, |err, msg| err.context(msg)))
+    let msg = msg_iter.fold(err, |err, msg| err.context(msg));
+    PageReconstructResult::from(msg)
 }
 
 /// Various functions to mutate the timeline.
@@ -2787,9 +3494,9 @@ fn rename_to_backup(path: &Path) -> anyhow::Result<()> {
     let mut new_path = path.to_owned();
 
     for i in 0u32.. {
-        new_path.set_file_name(format!("{}.{}.old", filename, i));
+        new_path.set_file_name(format!("{filename}.{i}.old"));
         if !new_path.exists() {
-            std::fs::rename(&path, &new_path)?;
+            std::fs::rename(path, &new_path)?;
             return Ok(());
         }
     }
diff --git a/pageserver/src/tenant/upload_queue.rs b/pageserver/src/tenant/upload_queue.rs
new file mode 100644
index 0000000000..790b2f59aa
--- /dev/null
+++ b/pageserver/src/tenant/upload_queue.rs
@@ -0,0 +1,213 @@
+use crate::metrics::RemoteOpFileKind;
+
+use super::storage_layer::LayerFileName;
+use crate::tenant::metadata::TimelineMetadata;
+use crate::tenant::remote_timeline_client::index::IndexPart;
+use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
+use std::collections::{HashMap, VecDeque};
+use std::fmt::Debug;
+
+use std::sync::Arc;
+use tracing::info;
+
+use std::sync::atomic::AtomicU32;
+use utils::lsn::Lsn;
+
+// clippy warns that Uninitialized is much smaller than Initialized, which wastes
+// memory for Uninitialized variants. Doesn't matter in practice, there are not
+// that many upload queues in a running pageserver, and most of them are initialized
+// anyway.
+#[allow(clippy::large_enum_variant)]
+pub(crate) enum UploadQueue {
+    Uninitialized,
+    Initialized(UploadQueueInitialized),
+    Stopped(UploadQueueStopped),
+}
+
+impl UploadQueue {
+    fn as_str(&self) -> &'static str {
+        match self {
+            UploadQueue::Uninitialized => "Uninitialized",
+            UploadQueue::Initialized(_) => "Initialized",
+            UploadQueue::Stopped(_) => "Stopped",
+        }
+    }
+}
+
+/// This keeps track of queued and in-progress tasks.
+pub(crate) struct UploadQueueInitialized {
+    /// Counter to assign task IDs
+    pub(crate) task_counter: u64,
+
+    /// All layer files stored in the remote storage, taking into account all
+    /// in-progress and queued operations
+    pub(crate) latest_files: HashMap<LayerFileName, LayerFileMetadata>,
+
+    /// How many file uploads or deletions been scheduled, since the
+    /// last (scheduling of) metadata index upload?
+    pub(crate) latest_files_changes_since_metadata_upload_scheduled: u64,
+
+    /// Metadata stored in the remote storage, taking into account all
+    /// in-progress and queued operations.
+    /// DANGER: do not return to outside world, e.g., safekeepers.
+    pub(crate) latest_metadata: TimelineMetadata,
+
+    /// `disk_consistent_lsn` from the last metadata file that was successfully
+    /// uploaded. `Lsn(0)` if nothing was uploaded yet.
+    /// Unlike `latest_files` or `latest_metadata`, this value is never ahead.
+    /// Safekeeper can rely on it to make decisions for WAL storage.
+    pub(crate) last_uploaded_consistent_lsn: Lsn,
+
+    // Breakdown of different kinds of tasks currently in-progress
+    pub(crate) num_inprogress_layer_uploads: usize,
+    pub(crate) num_inprogress_metadata_uploads: usize,
+    pub(crate) num_inprogress_deletions: usize,
+
+    /// Tasks that are currently in-progress. In-progress means that a tokio Task
+    /// has been launched for it. An in-progress task can be busy uploading, but it can
+    /// also be waiting on the `concurrency_limiter` Semaphore in S3Bucket, or it can
+    /// be waiting for retry in `exponential_backoff`.
+    pub(crate) inprogress_tasks: HashMap<u64, Arc<UploadTask>>,
+
+    /// Queued operations that have not been launched yet. They might depend on previous
+    /// tasks to finish. For example, metadata upload cannot be performed before all
+    /// preceding layer file uploads have completed.
+    pub(crate) queued_operations: VecDeque<UploadOp>,
+}
+
+pub(crate) struct UploadQueueStopped {
+    pub(crate) last_uploaded_consistent_lsn: Lsn,
+}
+
+impl UploadQueue {
+    pub(crate) fn initialize_empty_remote(
+        &mut self,
+        metadata: &TimelineMetadata,
+    ) -> anyhow::Result<&mut UploadQueueInitialized> {
+        match self {
+            UploadQueue::Uninitialized => (),
+            UploadQueue::Initialized(_) | UploadQueue::Stopped(_) => {
+                anyhow::bail!("already initialized, state {}", self.as_str())
+            }
+        }
+
+        info!("initializing upload queue for empty remote");
+
+        let state = UploadQueueInitialized {
+            // As described in the doc comment, it's ok for `latest_files` and `latest_metadata` to be ahead.
+            latest_files: HashMap::new(),
+            latest_files_changes_since_metadata_upload_scheduled: 0,
+            latest_metadata: metadata.clone(),
+            // We haven't uploaded anything yet, so, `last_uploaded_consistent_lsn` must be 0 to prevent
+            // safekeepers from garbage-collecting anything.
+            last_uploaded_consistent_lsn: Lsn(0),
+            // what follows are boring default initializations
+            task_counter: 0,
+            num_inprogress_layer_uploads: 0,
+            num_inprogress_metadata_uploads: 0,
+            num_inprogress_deletions: 0,
+            inprogress_tasks: HashMap::new(),
+            queued_operations: VecDeque::new(),
+        };
+
+        *self = UploadQueue::Initialized(state);
+        Ok(self.initialized_mut().expect("we just set it"))
+    }
+
+    pub(crate) fn initialize_with_current_remote_index_part(
+        &mut self,
+        index_part: &IndexPart,
+    ) -> anyhow::Result<&mut UploadQueueInitialized> {
+        match self {
+            UploadQueue::Uninitialized => (),
+            UploadQueue::Initialized(_) | UploadQueue::Stopped(_) => {
+                anyhow::bail!("already initialized, state {}", self.as_str())
+            }
+        }
+
+        let mut files = HashMap::with_capacity(index_part.timeline_layers.len());
+        for layer_name in &index_part.timeline_layers {
+            let layer_metadata = index_part
+                .layer_metadata
+                .get(layer_name)
+                .map(LayerFileMetadata::from)
+                .unwrap_or(LayerFileMetadata::MISSING);
+            files.insert(layer_name.to_owned(), layer_metadata);
+        }
+
+        let index_part_metadata = index_part.parse_metadata()?;
+        info!(
+            "initializing upload queue with remote index_part.disk_consistent_lsn: {}",
+            index_part_metadata.disk_consistent_lsn()
+        );
+
+        let state = UploadQueueInitialized {
+            latest_files: files,
+            latest_files_changes_since_metadata_upload_scheduled: 0,
+            latest_metadata: index_part_metadata.clone(),
+            last_uploaded_consistent_lsn: index_part_metadata.disk_consistent_lsn(),
+            // what follows are boring default initializations
+            task_counter: 0,
+            num_inprogress_layer_uploads: 0,
+            num_inprogress_metadata_uploads: 0,
+            num_inprogress_deletions: 0,
+            inprogress_tasks: HashMap::new(),
+            queued_operations: VecDeque::new(),
+        };
+
+        *self = UploadQueue::Initialized(state);
+        Ok(self.initialized_mut().expect("we just set it"))
+    }
+
+    pub(crate) fn initialized_mut(&mut self) -> anyhow::Result<&mut UploadQueueInitialized> {
+        match self {
+            UploadQueue::Uninitialized | UploadQueue::Stopped(_) => {
+                anyhow::bail!("queue is in state {}", self.as_str())
+            }
+            UploadQueue::Initialized(x) => Ok(x),
+        }
+    }
+}
+
+/// An in-progress upload or delete task.
+#[derive(Debug)]
+pub(crate) struct UploadTask {
+    /// Unique ID of this task. Used as the key in `inprogress_tasks` above.
+    pub(crate) task_id: u64,
+    pub(crate) retries: AtomicU32,
+
+    pub(crate) op: UploadOp,
+}
+
+#[derive(Debug)]
+pub(crate) enum UploadOp {
+    /// Upload a layer file
+    UploadLayer(LayerFileName, LayerFileMetadata),
+
+    /// Upload the metadata file
+    UploadMetadata(IndexPart, Lsn),
+
+    /// Delete a file.
+    Delete(RemoteOpFileKind, LayerFileName),
+
+    /// Barrier. When the barrier operation is reached,
+    Barrier(tokio::sync::watch::Sender<()>),
+}
+
+impl std::fmt::Display for UploadOp {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        match self {
+            UploadOp::UploadLayer(path, metadata) => {
+                write!(
+                    f,
+                    "UploadLayer({}, size={:?})",
+                    path.file_name(),
+                    metadata.file_size()
+                )
+            }
+            UploadOp::UploadMetadata(_, lsn) => write!(f, "UploadMetadata(lsn: {})", lsn),
+            UploadOp::Delete(_, path) => write!(f, "Delete({})", path.file_name()),
+            UploadOp::Barrier(_) => write!(f, "Barrier"),
+        }
+    }
+}
diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs
index 46e4acd50c..fb216123c1 100644
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -12,7 +12,7 @@
 //!
 use crate::metrics::{STORAGE_IO_SIZE, STORAGE_IO_TIME};
 use once_cell::sync::OnceCell;
-use std::fs::{File, OpenOptions};
+use std::fs::{self, File, OpenOptions};
 use std::io::{Error, ErrorKind, Read, Seek, SeekFrom, Write};
 use std::os::unix::fs::FileExt;
 use std::path::{Path, PathBuf};
@@ -240,6 +240,10 @@ impl VirtualFile {
         self.with_file("fsync", |file| file.sync_all())?
     }
 
+    pub fn metadata(&self) -> Result<fs::Metadata, Error> {
+        self.with_file("metadata", |file| file.metadata())?
+    }
+
     /// Helper function that looks up the underlying File for this VirtualFile,
     /// opening it and evicting some other File if necessary. It calls 'func'
     /// with the physical File.
diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs
index e8a2e99f06..1c974f7e2a 100644
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -21,7 +21,6 @@
 //! redo Postgres process, but some records it can handle directly with
 //! bespoken Rust code.
 
-use anyhow::Context;
 use postgres_ffi::v14::nonrelfile_utils::clogpage_precedes;
 use postgres_ffi::v14::nonrelfile_utils::slru_may_delete_clogsegment;
 use postgres_ffi::{fsm_logical_to_physical, page_is_new, page_set_lsn};
@@ -32,6 +31,7 @@ use tracing::*;
 
 use crate::pgdatadir_mapping::*;
 use crate::tenant::Timeline;
+use crate::tenant::{with_ondemand_download, PageReconstructError};
 use crate::walrecord::*;
 use crate::ZERO_PAGE;
 use pageserver_api::reltag::{RelTag, SlruKind};
@@ -52,10 +52,11 @@ pub struct WalIngest<'a> {
 }
 
 impl<'a> WalIngest<'a> {
-    pub fn new(timeline: &Timeline, startpoint: Lsn) -> Result<WalIngest> {
+    pub async fn new(timeline: &Timeline, startpoint: Lsn) -> anyhow::Result<WalIngest> {
         // Fetch the latest checkpoint into memory, so that we can compare with it
         // quickly in `ingest_record` and update it when it changes.
-        let checkpoint_bytes = timeline.get_checkpoint(startpoint)?;
+        let checkpoint_bytes =
+            with_ondemand_download(|| timeline.get_checkpoint(startpoint)).await?;
         let checkpoint = CheckPoint::decode(&checkpoint_bytes)?;
         trace!("CheckPoint.nextXid = {}", checkpoint.nextXid.value);
 
@@ -74,16 +75,15 @@ impl<'a> WalIngest<'a> {
     /// Helper function to parse a WAL record and call the Timeline's PUT functions for all the
     /// relations/pages that the record affects.
     ///
-    pub fn ingest_record(
+    pub async fn ingest_record(
         &mut self,
         recdata: Bytes,
         lsn: Lsn,
-        modification: &mut DatadirModification,
+        modification: &mut DatadirModification<'_>,
         decoded: &mut DecodedWALRecord,
-    ) -> Result<()> {
+    ) -> anyhow::Result<()> {
         modification.lsn = lsn;
-        decode_wal_record(recdata, decoded, self.timeline.pg_version)
-            .context("failed decoding wal record")?;
+        decode_wal_record(recdata, decoded, self.timeline.pg_version)?;
 
         let mut buf = decoded.record.clone();
         buf.advance(decoded.main_data_offset);
@@ -98,7 +98,8 @@ impl<'a> WalIngest<'a> {
         if decoded.xl_rmid == pg_constants::RM_HEAP_ID
             || decoded.xl_rmid == pg_constants::RM_HEAP2_ID
         {
-            self.ingest_heapam_record(&mut buf, modification, decoded)?;
+            self.ingest_heapam_record(&mut buf, modification, decoded)
+                .await?;
         }
         // Handle other special record types
         if decoded.xl_rmid == pg_constants::RM_SMGR_ID
@@ -112,7 +113,8 @@ impl<'a> WalIngest<'a> {
                 == pg_constants::XLOG_SMGR_TRUNCATE
         {
             let truncate = XlSmgrTruncate::decode(&mut buf);
-            self.ingest_xlog_smgr_truncate(modification, &truncate)?;
+            self.ingest_xlog_smgr_truncate(modification, &truncate)
+                .await?;
         } else if decoded.xl_rmid == pg_constants::RM_DBASE_ID {
             debug!(
                 "handle RM_DBASE_ID for Postgres version {:?}",
@@ -125,7 +127,8 @@ impl<'a> WalIngest<'a> {
                     let createdb = XlCreateDatabase::decode(&mut buf);
                     debug!("XLOG_DBASE_CREATE v14");
 
-                    self.ingest_xlog_dbase_create(modification, &createdb)?;
+                    self.ingest_xlog_dbase_create(modification, &createdb)
+                        .await?;
                 } else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
                     == postgres_ffi::v14::bindings::XLOG_DBASE_DROP
                 {
@@ -148,7 +151,8 @@ impl<'a> WalIngest<'a> {
                     // So we can reuse XlCreateDatabase here.
                     debug!("XLOG_DBASE_CREATE_FILE_COPY");
                     let createdb = XlCreateDatabase::decode(&mut buf);
-                    self.ingest_xlog_dbase_create(modification, &createdb)?;
+                    self.ingest_xlog_dbase_create(modification, &createdb)
+                        .await?;
                 } else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
                     == postgres_ffi::v15::bindings::XLOG_DBASE_DROP
                 {
@@ -173,11 +177,13 @@ impl<'a> WalIngest<'a> {
                     segno,
                     rpageno,
                     ZERO_PAGE.clone(),
-                )?;
+                )
+                .await?;
             } else {
                 assert!(info == pg_constants::CLOG_TRUNCATE);
                 let xlrec = XlClogTruncate::decode(&mut buf);
-                self.ingest_clog_truncate_record(modification, &xlrec)?;
+                self.ingest_clog_truncate_record(modification, &xlrec)
+                    .await?;
             }
         } else if decoded.xl_rmid == pg_constants::RM_XACT_ID {
             let info = decoded.xl_info & pg_constants::XLOG_XACT_OPMASK;
@@ -188,7 +194,8 @@ impl<'a> WalIngest<'a> {
                     modification,
                     &parsed_xact,
                     info == pg_constants::XLOG_XACT_COMMIT,
-                )?;
+                )
+                .await?;
             } else if info == pg_constants::XLOG_XACT_COMMIT_PREPARED
                 || info == pg_constants::XLOG_XACT_ABORT_PREPARED
             {
@@ -198,7 +205,8 @@ impl<'a> WalIngest<'a> {
                     modification,
                     &parsed_xact,
                     info == pg_constants::XLOG_XACT_COMMIT_PREPARED,
-                )?;
+                )
+                .await?;
                 // Remove twophase file. see RemoveTwoPhaseFile() in postgres code
                 trace!(
                     "Drop twophaseFile for xid {} parsed_xact.xid {} here at {}",
@@ -223,7 +231,8 @@ impl<'a> WalIngest<'a> {
                     segno,
                     rpageno,
                     ZERO_PAGE.clone(),
-                )?;
+                )
+                .await?;
             } else if info == pg_constants::XLOG_MULTIXACT_ZERO_MEM_PAGE {
                 let pageno = buf.get_u32_le();
                 let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
@@ -234,7 +243,8 @@ impl<'a> WalIngest<'a> {
                     segno,
                     rpageno,
                     ZERO_PAGE.clone(),
-                )?;
+                )
+                .await?;
             } else if info == pg_constants::XLOG_MULTIXACT_CREATE_ID {
                 let xlrec = XlMultiXactCreate::decode(&mut buf);
                 self.ingest_multixact_create_record(modification, &xlrec)?;
@@ -279,7 +289,8 @@ impl<'a> WalIngest<'a> {
         // Iterate through all the blocks that the record modifies, and
         // "put" a separate copy of the record for each block.
         for blk in decoded.blocks.iter() {
-            self.ingest_decoded_block(modification, lsn, decoded, blk)?;
+            self.ingest_decoded_block(modification, lsn, decoded, blk)
+                .await?;
         }
 
         // If checkpoint data was updated, store the new version in the repository
@@ -297,18 +308,18 @@ impl<'a> WalIngest<'a> {
         Ok(())
     }
 
-    fn ingest_decoded_block(
+    async fn ingest_decoded_block(
         &mut self,
-        modification: &mut DatadirModification,
+        modification: &mut DatadirModification<'_>,
         lsn: Lsn,
         decoded: &DecodedWALRecord,
         blk: &DecodedBkpBlock,
-    ) -> Result<()> {
+    ) -> Result<(), PageReconstructError> {
         let rel = RelTag {
             spcnode: blk.rnode_spcnode,
             dbnode: blk.rnode_dbnode,
             relnode: blk.rnode_relnode,
-            forknum: blk.forknum as u8,
+            forknum: blk.forknum,
         };
 
         //
@@ -345,23 +356,25 @@ impl<'a> WalIngest<'a> {
                 page_set_lsn(&mut image, lsn)
             }
             assert_eq!(image.len(), BLCKSZ as usize);
-            self.put_rel_page_image(modification, rel, blk.blkno, image.freeze())?;
+            self.put_rel_page_image(modification, rel, blk.blkno, image.freeze())
+                .await?;
         } else {
             let rec = NeonWalRecord::Postgres {
                 will_init: blk.will_init || blk.apply_image,
                 rec: decoded.record.clone(),
             };
-            self.put_rel_wal_record(modification, rel, blk.blkno, rec)?;
+            self.put_rel_wal_record(modification, rel, blk.blkno, rec)
+                .await?;
         }
         Ok(())
     }
 
-    fn ingest_heapam_record(
+    async fn ingest_heapam_record(
         &mut self,
         buf: &mut Bytes,
-        modification: &mut DatadirModification,
+        modification: &mut DatadirModification<'_>,
         decoded: &mut DecodedWALRecord,
-    ) -> Result<()> {
+    ) -> anyhow::Result<()> {
         // Handle VM bit updates that are implicitly part of heap records.
 
         // First, look at the record to determine which VM bits need
@@ -440,7 +453,7 @@ impl<'a> WalIngest<'a> {
             // replaying it would fail to find the previous image of the page, because
             // it doesn't exist. So check if the VM page(s) exist, and skip the WAL
             // record if it doesn't.
-            let vm_size = self.get_relsize(vm_rel, modification.lsn)?;
+            let vm_size = self.get_relsize(vm_rel, modification.lsn).await?;
             if let Some(blknum) = new_vm_blk {
                 if blknum >= vm_size {
                     new_vm_blk = None;
@@ -465,7 +478,8 @@ impl<'a> WalIngest<'a> {
                             old_heap_blkno,
                             flags: pg_constants::VISIBILITYMAP_VALID_BITS,
                         },
-                    )?;
+                    )
+                    .await?;
                 } else {
                     // Clear VM bits for one heap page, or for two pages that reside on
                     // different VM pages.
@@ -479,7 +493,8 @@ impl<'a> WalIngest<'a> {
                                 old_heap_blkno: None,
                                 flags: pg_constants::VISIBILITYMAP_VALID_BITS,
                             },
-                        )?;
+                        )
+                        .await?;
                     }
                     if let Some(old_vm_blk) = old_vm_blk {
                         self.put_rel_wal_record(
@@ -491,7 +506,8 @@ impl<'a> WalIngest<'a> {
                                 old_heap_blkno,
                                 flags: pg_constants::VISIBILITYMAP_VALID_BITS,
                             },
-                        )?;
+                        )
+                        .await?;
                     }
                 }
             }
@@ -501,11 +517,11 @@ impl<'a> WalIngest<'a> {
     }
 
     /// Subroutine of ingest_record(), to handle an XLOG_DBASE_CREATE record.
-    fn ingest_xlog_dbase_create(
+    async fn ingest_xlog_dbase_create(
         &mut self,
-        modification: &mut DatadirModification,
+        modification: &mut DatadirModification<'_>,
         rec: &XlCreateDatabase,
-    ) -> Result<()> {
+    ) -> anyhow::Result<()> {
         let db_id = rec.db_id;
         let tablespace_id = rec.tablespace_id;
         let src_db_id = rec.src_db_id;
@@ -518,16 +534,22 @@ impl<'a> WalIngest<'a> {
         // get calls instead.
         let req_lsn = modification.tline.get_last_record_lsn();
 
-        let rels = modification
-            .tline
-            .list_rels(src_tablespace_id, src_db_id, req_lsn)?;
+        let rels = with_ondemand_download(|| {
+            modification
+                .tline
+                .list_rels(src_tablespace_id, src_db_id, req_lsn)
+        })
+        .await?;
 
         debug!("ingest_xlog_dbase_create: {} rels", rels.len());
 
         // Copy relfilemap
-        let filemap = modification
-            .tline
-            .get_relmap_file(src_tablespace_id, src_db_id, req_lsn)?;
+        let filemap = with_ondemand_download(|| {
+            modification
+                .tline
+                .get_relmap_file(src_tablespace_id, src_db_id, req_lsn)
+        })
+        .await?;
         modification.put_relmap_file(tablespace_id, db_id, filemap)?;
 
         let mut num_rels_copied = 0;
@@ -536,7 +558,9 @@ impl<'a> WalIngest<'a> {
             assert_eq!(src_rel.spcnode, src_tablespace_id);
             assert_eq!(src_rel.dbnode, src_db_id);
 
-            let nblocks = modification.tline.get_rel_size(src_rel, req_lsn, true)?;
+            let nblocks =
+                with_ondemand_download(|| modification.tline.get_rel_size(src_rel, req_lsn, true))
+                    .await?;
             let dst_rel = RelTag {
                 spcnode: tablespace_id,
                 dbnode: db_id,
@@ -551,9 +575,12 @@ impl<'a> WalIngest<'a> {
             for blknum in 0..nblocks {
                 debug!("copying block {} from {} to {}", blknum, src_rel, dst_rel);
 
-                let content = modification
-                    .tline
-                    .get_rel_page_at_lsn(src_rel, blknum, req_lsn, true)?;
+                let content = with_ondemand_download(|| {
+                    modification
+                        .tline
+                        .get_rel_page_at_lsn(src_rel, blknum, req_lsn, true)
+                })
+                .await?;
                 modification.put_rel_page_image(dst_rel, blknum, content)?;
                 num_blocks_copied += 1;
             }
@@ -572,7 +599,7 @@ impl<'a> WalIngest<'a> {
         &mut self,
         modification: &mut DatadirModification,
         rec: &XlSmgrCreate,
-    ) -> Result<()> {
+    ) -> anyhow::Result<()> {
         let rel = RelTag {
             spcnode: rec.rnode.spcnode,
             dbnode: rec.rnode.dbnode,
@@ -586,11 +613,11 @@ impl<'a> WalIngest<'a> {
     /// Subroutine of ingest_record(), to handle an XLOG_SMGR_TRUNCATE record.
     ///
     /// This is the same logic as in PostgreSQL's smgr_redo() function.
-    fn ingest_xlog_smgr_truncate(
+    async fn ingest_xlog_smgr_truncate(
         &mut self,
-        modification: &mut DatadirModification,
+        modification: &mut DatadirModification<'_>,
         rec: &XlSmgrTruncate,
-    ) -> Result<()> {
+    ) -> anyhow::Result<()> {
         let spcnode = rec.rnode.spcnode;
         let dbnode = rec.rnode.dbnode;
         let relnode = rec.rnode.relnode;
@@ -620,7 +647,7 @@ impl<'a> WalIngest<'a> {
                 modification.put_rel_page_image(rel, fsm_physical_page_no, ZERO_PAGE.clone())?;
                 fsm_physical_page_no += 1;
             }
-            let nblocks = self.get_relsize(rel, modification.lsn)?;
+            let nblocks = self.get_relsize(rel, modification.lsn).await?;
             if nblocks > fsm_physical_page_no {
                 // check if something to do: FSM is larger than truncate position
                 self.put_rel_truncation(modification, rel, fsm_physical_page_no)?;
@@ -641,7 +668,7 @@ impl<'a> WalIngest<'a> {
                 modification.put_rel_page_image(rel, vm_page_no, ZERO_PAGE.clone())?;
                 vm_page_no += 1;
             }
-            let nblocks = self.get_relsize(rel, modification.lsn)?;
+            let nblocks = self.get_relsize(rel, modification.lsn).await?;
             if nblocks > vm_page_no {
                 // check if something to do: VM is larger than truncate position
                 self.put_rel_truncation(modification, rel, vm_page_no)?;
@@ -652,12 +679,12 @@ impl<'a> WalIngest<'a> {
 
     /// Subroutine of ingest_record(), to handle an XLOG_XACT_* records.
     ///
-    fn ingest_xact_record(
+    async fn ingest_xact_record(
         &mut self,
-        modification: &mut DatadirModification,
+        modification: &mut DatadirModification<'_>,
         parsed: &XlXactParsedRecord,
         is_commit: bool,
-    ) -> Result<()> {
+    ) -> anyhow::Result<()> {
         // Record update of CLOG pages
         let mut pageno = parsed.xid / pg_constants::CLOG_XACTS_PER_PAGE;
         let mut segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
@@ -713,7 +740,9 @@ impl<'a> WalIngest<'a> {
                     relnode: xnode.relnode,
                 };
                 let last_lsn = self.timeline.get_last_record_lsn();
-                if modification.tline.get_rel_exists(rel, last_lsn, true)? {
+                if with_ondemand_download(|| modification.tline.get_rel_exists(rel, last_lsn, true))
+                    .await?
+                {
                     self.put_rel_drop(modification, rel)?;
                 }
             }
@@ -721,11 +750,11 @@ impl<'a> WalIngest<'a> {
         Ok(())
     }
 
-    fn ingest_clog_truncate_record(
+    async fn ingest_clog_truncate_record(
         &mut self,
-        modification: &mut DatadirModification,
+        modification: &mut DatadirModification<'_>,
         xlrec: &XlClogTruncate,
-    ) -> Result<()> {
+    ) -> anyhow::Result<()> {
         info!(
             "RM_CLOG_ID truncate pageno {} oldestXid {} oldestXidDB {}",
             xlrec.pageno, xlrec.oldest_xid, xlrec.oldest_xid_db
@@ -765,10 +794,14 @@ impl<'a> WalIngest<'a> {
         // it. So we use the previous record's LSN in the get calls
         // instead.
         let req_lsn = modification.tline.get_last_record_lsn();
-        for segno in modification
-            .tline
-            .list_slru_segments(SlruKind::Clog, req_lsn)?
-        {
+
+        let slru_segments = with_ondemand_download(|| {
+            modification
+                .tline
+                .list_slru_segments(SlruKind::Clog, req_lsn)
+        })
+        .await?;
+        for segno in slru_segments {
             let segpage = segno * pg_constants::SLRU_PAGES_PER_SEGMENT;
             if slru_may_delete_clogsegment(segpage, xlrec.pageno) {
                 modification.drop_slru_segment(SlruKind::Clog, segno)?;
@@ -917,26 +950,26 @@ impl<'a> WalIngest<'a> {
         Ok(())
     }
 
-    fn put_rel_page_image(
+    async fn put_rel_page_image(
         &mut self,
-        modification: &mut DatadirModification,
+        modification: &mut DatadirModification<'_>,
         rel: RelTag,
         blknum: BlockNumber,
         img: Bytes,
-    ) -> Result<()> {
-        self.handle_rel_extend(modification, rel, blknum)?;
+    ) -> anyhow::Result<()> {
+        self.handle_rel_extend(modification, rel, blknum).await?;
         modification.put_rel_page_image(rel, blknum, img)?;
         Ok(())
     }
 
-    fn put_rel_wal_record(
+    async fn put_rel_wal_record(
         &mut self,
-        modification: &mut DatadirModification,
+        modification: &mut DatadirModification<'_>,
         rel: RelTag,
         blknum: BlockNumber,
         rec: NeonWalRecord,
-    ) -> Result<()> {
-        self.handle_rel_extend(modification, rel, blknum)?;
+    ) -> anyhow::Result<()> {
+        self.handle_rel_extend(modification, rel, blknum).await?;
         modification.put_rel_wal_record(rel, blknum, rec)?;
         Ok(())
     }
@@ -946,7 +979,7 @@ impl<'a> WalIngest<'a> {
         modification: &mut DatadirModification,
         rel: RelTag,
         nblocks: BlockNumber,
-    ) -> Result<()> {
+    ) -> anyhow::Result<()> {
         modification.put_rel_truncation(rel, nblocks)?;
         Ok(())
     }
@@ -956,33 +989,37 @@ impl<'a> WalIngest<'a> {
         Ok(())
     }
 
-    fn get_relsize(&mut self, rel: RelTag, lsn: Lsn) -> Result<BlockNumber> {
-        let nblocks = if !self.timeline.get_rel_exists(rel, lsn, true)? {
+    async fn get_relsize(&mut self, rel: RelTag, lsn: Lsn) -> anyhow::Result<BlockNumber> {
+        let exists =
+            with_ondemand_download(|| self.timeline.get_rel_exists(rel, lsn, true)).await?;
+        let nblocks = if !exists {
             0
         } else {
-            self.timeline.get_rel_size(rel, lsn, true)?
+            with_ondemand_download(|| self.timeline.get_rel_size(rel, lsn, true)).await?
         };
         Ok(nblocks)
     }
 
-    fn handle_rel_extend(
+    async fn handle_rel_extend(
         &mut self,
-        modification: &mut DatadirModification,
+        modification: &mut DatadirModification<'_>,
         rel: RelTag,
         blknum: BlockNumber,
-    ) -> Result<()> {
+    ) -> anyhow::Result<()> {
         let new_nblocks = blknum + 1;
         // Check if the relation exists. We implicitly create relations on first
         // record.
         // TODO: would be nice if to be more explicit about it
         let last_lsn = modification.lsn;
-        let old_nblocks = if !self.timeline.get_rel_exists(rel, last_lsn, true)? {
-            // create it with 0 size initially, the logic below will extend it
-            modification.put_rel_creation(rel, 0)?;
-            0
-        } else {
-            self.timeline.get_rel_size(rel, last_lsn, true)?
-        };
+        let old_nblocks =
+            if !with_ondemand_download(|| self.timeline.get_rel_exists(rel, last_lsn, true)).await?
+            {
+                // create it with 0 size initially, the logic below will extend it
+                modification.put_rel_creation(rel, 0)?;
+                0
+            } else {
+                with_ondemand_download(|| self.timeline.get_rel_size(rel, last_lsn, true)).await?
+            };
 
         if new_nblocks > old_nblocks {
             //info!("extending {} {} to {}", rel, old_nblocks, new_nblocks);
@@ -996,26 +1033,27 @@ impl<'a> WalIngest<'a> {
         Ok(())
     }
 
-    fn put_slru_page_image(
+    async fn put_slru_page_image(
         &mut self,
-        modification: &mut DatadirModification,
+        modification: &mut DatadirModification<'_>,
         kind: SlruKind,
         segno: u32,
         blknum: BlockNumber,
         img: Bytes,
-    ) -> Result<()> {
-        self.handle_slru_extend(modification, kind, segno, blknum)?;
+    ) -> anyhow::Result<()> {
+        self.handle_slru_extend(modification, kind, segno, blknum)
+            .await?;
         modification.put_slru_page_image(kind, segno, blknum, img)?;
         Ok(())
     }
 
-    fn handle_slru_extend(
+    async fn handle_slru_extend(
         &mut self,
-        modification: &mut DatadirModification,
+        modification: &mut DatadirModification<'_>,
         kind: SlruKind,
         segno: u32,
         blknum: BlockNumber,
-    ) -> Result<()> {
+    ) -> anyhow::Result<()> {
         // we don't use a cache for this like we do for relations. SLRUS are explcitly
         // extended with ZEROPAGE records, not with commit records, so it happens
         // a lot less frequently.
@@ -1025,15 +1063,17 @@ impl<'a> WalIngest<'a> {
         // record.
         // TODO: would be nice if to be more explicit about it
         let last_lsn = self.timeline.get_last_record_lsn();
-        let old_nblocks = if !self
-            .timeline
-            .get_slru_segment_exists(kind, segno, last_lsn)?
+        let old_nblocks = if !with_ondemand_download(|| {
+            self.timeline.get_slru_segment_exists(kind, segno, last_lsn)
+        })
+        .await?
         {
             // create it with 0 size initially, the logic below will extend it
             modification.put_slru_segment_creation(kind, segno, 0)?;
             0
         } else {
-            self.timeline.get_slru_segment_size(kind, segno, last_lsn)?
+            with_ondemand_download(|| self.timeline.get_slru_segment_size(kind, segno, last_lsn))
+                .await?
         };
 
         if new_nblocks > old_nblocks {
@@ -1081,12 +1121,12 @@ mod tests {
 
     static ZERO_CHECKPOINT: Bytes = Bytes::from_static(&[0u8; SIZEOF_CHECKPOINT]);
 
-    fn init_walingest_test(tline: &Timeline) -> Result<WalIngest> {
+    async fn init_walingest_test(tline: &Timeline) -> Result<WalIngest> {
         let mut m = tline.begin_modification(Lsn(0x10));
         m.put_checkpoint(ZERO_CHECKPOINT.clone())?;
         m.put_relmap_file(0, 111, Bytes::from(""))?; // dummy relmapper file
         m.commit()?;
-        let walingest = WalIngest::new(tline, Lsn(0x10))?;
+        let walingest = WalIngest::new(tline, Lsn(0x10)).await?;
 
         Ok(walingest)
     }
@@ -1095,62 +1135,107 @@ mod tests {
     async fn test_relsize() -> Result<()> {
         let tenant = TenantHarness::create("test_relsize")?.load().await;
         let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?;
-        let mut walingest = init_walingest_test(&*tline)?;
+        let mut walingest = init_walingest_test(&tline).await?;
 
         let mut m = tline.begin_modification(Lsn(0x20));
         walingest.put_rel_creation(&mut m, TESTREL_A)?;
-        walingest.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"))?;
+        walingest
+            .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"))
+            .await?;
         m.commit()?;
         let mut m = tline.begin_modification(Lsn(0x30));
-        walingest.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 3"))?;
+        walingest
+            .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 3"))
+            .await?;
         m.commit()?;
         let mut m = tline.begin_modification(Lsn(0x40));
-        walingest.put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1 at 4"))?;
+        walingest
+            .put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1 at 4"))
+            .await?;
         m.commit()?;
         let mut m = tline.begin_modification(Lsn(0x50));
-        walingest.put_rel_page_image(&mut m, TESTREL_A, 2, TEST_IMG("foo blk 2 at 5"))?;
+        walingest
+            .put_rel_page_image(&mut m, TESTREL_A, 2, TEST_IMG("foo blk 2 at 5"))
+            .await?;
         m.commit()?;
 
-        assert_current_logical_size(&*tline, Lsn(0x50));
+        assert_current_logical_size(&tline, Lsn(0x50));
 
         // The relation was created at LSN 2, not visible at LSN 1 yet.
-        assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x10), false)?, false);
-        assert!(tline.get_rel_size(TESTREL_A, Lsn(0x10), false).is_err());
+        assert_eq!(
+            tline
+                .get_rel_exists(TESTREL_A, Lsn(0x10), false)
+                .no_ondemand_download()?,
+            false
+        );
+        assert!(tline
+            .get_rel_size(TESTREL_A, Lsn(0x10), false)
+            .no_ondemand_download()
+            .is_err());
 
-        assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x20), false)?, true);
-        assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x20), false)?, 1);
-        assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x50), false)?, 3);
+        assert_eq!(
+            tline
+                .get_rel_exists(TESTREL_A, Lsn(0x20), false)
+                .no_ondemand_download()?,
+            true
+        );
+        assert_eq!(
+            tline
+                .get_rel_size(TESTREL_A, Lsn(0x20), false)
+                .no_ondemand_download()?,
+            1
+        );
+        assert_eq!(
+            tline
+                .get_rel_size(TESTREL_A, Lsn(0x50), false)
+                .no_ondemand_download()?,
+            3
+        );
 
         // Check page contents at each LSN
         assert_eq!(
-            tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x20), false)?,
+            tline
+                .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x20), false)
+                .no_ondemand_download()?,
             TEST_IMG("foo blk 0 at 2")
         );
 
         assert_eq!(
-            tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x30), false)?,
+            tline
+                .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x30), false)
+                .no_ondemand_download()?,
             TEST_IMG("foo blk 0 at 3")
         );
 
         assert_eq!(
-            tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x40), false)?,
+            tline
+                .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x40), false)
+                .no_ondemand_download()?,
             TEST_IMG("foo blk 0 at 3")
         );
         assert_eq!(
-            tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x40), false)?,
+            tline
+                .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x40), false)
+                .no_ondemand_download()?,
             TEST_IMG("foo blk 1 at 4")
         );
 
         assert_eq!(
-            tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x50), false)?,
+            tline
+                .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x50), false)
+                .no_ondemand_download()?,
             TEST_IMG("foo blk 0 at 3")
         );
         assert_eq!(
-            tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x50), false)?,
+            tline
+                .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x50), false)
+                .no_ondemand_download()?,
             TEST_IMG("foo blk 1 at 4")
         );
         assert_eq!(
-            tline.get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50), false)?,
+            tline
+                .get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50), false)
+                .no_ondemand_download()?,
             TEST_IMG("foo blk 2 at 5")
         );
 
@@ -1158,23 +1243,39 @@ mod tests {
         let mut m = tline.begin_modification(Lsn(0x60));
         walingest.put_rel_truncation(&mut m, TESTREL_A, 2)?;
         m.commit()?;
-        assert_current_logical_size(&*tline, Lsn(0x60));
+        assert_current_logical_size(&tline, Lsn(0x60));
 
         // Check reported size and contents after truncation
-        assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x60), false)?, 2);
         assert_eq!(
-            tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x60), false)?,
+            tline
+                .get_rel_size(TESTREL_A, Lsn(0x60), false)
+                .no_ondemand_download()?,
+            2
+        );
+        assert_eq!(
+            tline
+                .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x60), false)
+                .no_ondemand_download()?,
             TEST_IMG("foo blk 0 at 3")
         );
         assert_eq!(
-            tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x60), false)?,
+            tline
+                .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x60), false)
+                .no_ondemand_download()?,
             TEST_IMG("foo blk 1 at 4")
         );
 
         // should still see the truncated block with older LSN
-        assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x50), false)?, 3);
         assert_eq!(
-            tline.get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50), false)?,
+            tline
+                .get_rel_size(TESTREL_A, Lsn(0x50), false)
+                .no_ondemand_download()?,
+            3
+        );
+        assert_eq!(
+            tline
+                .get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50), false)
+                .no_ondemand_download()?,
             TEST_IMG("foo blk 2 at 5")
         );
 
@@ -1182,35 +1283,62 @@ mod tests {
         let mut m = tline.begin_modification(Lsn(0x68));
         walingest.put_rel_truncation(&mut m, TESTREL_A, 0)?;
         m.commit()?;
-        assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x68), false)?, 0);
+        assert_eq!(
+            tline
+                .get_rel_size(TESTREL_A, Lsn(0x68), false)
+                .no_ondemand_download()?,
+            0
+        );
 
         // Extend from 0 to 2 blocks, leaving a gap
         let mut m = tline.begin_modification(Lsn(0x70));
-        walingest.put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1"))?;
+        walingest
+            .put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1"))
+            .await?;
         m.commit()?;
-        assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x70), false)?, 2);
         assert_eq!(
-            tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x70), false)?,
+            tline
+                .get_rel_size(TESTREL_A, Lsn(0x70), false)
+                .no_ondemand_download()?,
+            2
+        );
+        assert_eq!(
+            tline
+                .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x70), false)
+                .no_ondemand_download()?,
             ZERO_PAGE
         );
         assert_eq!(
-            tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x70), false)?,
+            tline
+                .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x70), false)
+                .no_ondemand_download()?,
             TEST_IMG("foo blk 1")
         );
 
         // Extend a lot more, leaving a big gap that spans across segments
         let mut m = tline.begin_modification(Lsn(0x80));
-        walingest.put_rel_page_image(&mut m, TESTREL_A, 1500, TEST_IMG("foo blk 1500"))?;
+        walingest
+            .put_rel_page_image(&mut m, TESTREL_A, 1500, TEST_IMG("foo blk 1500"))
+            .await?;
         m.commit()?;
-        assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x80), false)?, 1501);
+        assert_eq!(
+            tline
+                .get_rel_size(TESTREL_A, Lsn(0x80), false)
+                .no_ondemand_download()?,
+            1501
+        );
         for blk in 2..1500 {
             assert_eq!(
-                tline.get_rel_page_at_lsn(TESTREL_A, blk, Lsn(0x80), false)?,
+                tline
+                    .get_rel_page_at_lsn(TESTREL_A, blk, Lsn(0x80), false)
+                    .no_ondemand_download()?,
                 ZERO_PAGE
             );
         }
         assert_eq!(
-            tline.get_rel_page_at_lsn(TESTREL_A, 1500, Lsn(0x80), false)?,
+            tline
+                .get_rel_page_at_lsn(TESTREL_A, 1500, Lsn(0x80), false)
+                .no_ondemand_download()?,
             TEST_IMG("foo blk 1500")
         );
 
@@ -1223,15 +1351,27 @@ mod tests {
     async fn test_drop_extend() -> Result<()> {
         let tenant = TenantHarness::create("test_drop_extend")?.load().await;
         let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?;
-        let mut walingest = init_walingest_test(&*tline)?;
+        let mut walingest = init_walingest_test(&tline).await?;
 
         let mut m = tline.begin_modification(Lsn(0x20));
-        walingest.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"))?;
+        walingest
+            .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"))
+            .await?;
         m.commit()?;
 
         // Check that rel exists and size is correct
-        assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x20), false)?, true);
-        assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x20), false)?, 1);
+        assert_eq!(
+            tline
+                .get_rel_exists(TESTREL_A, Lsn(0x20), false)
+                .no_ondemand_download()?,
+            true
+        );
+        assert_eq!(
+            tline
+                .get_rel_size(TESTREL_A, Lsn(0x20), false)
+                .no_ondemand_download()?,
+            1
+        );
 
         // Drop rel
         let mut m = tline.begin_modification(Lsn(0x30));
@@ -1239,19 +1379,36 @@ mod tests {
         m.commit()?;
 
         // Check that rel is not visible anymore
-        assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x30), false)?, false);
+        assert_eq!(
+            tline
+                .get_rel_exists(TESTREL_A, Lsn(0x30), false)
+                .no_ondemand_download()?,
+            false
+        );
 
         // FIXME: should fail
         //assert!(tline.get_rel_size(TESTREL_A, Lsn(0x30), false)?.is_none());
 
         // Re-create it
         let mut m = tline.begin_modification(Lsn(0x40));
-        walingest.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 4"))?;
+        walingest
+            .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 4"))
+            .await?;
         m.commit()?;
 
         // Check that rel exists and size is correct
-        assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x40), false)?, true);
-        assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x40), false)?, 1);
+        assert_eq!(
+            tline
+                .get_rel_exists(TESTREL_A, Lsn(0x40), false)
+                .no_ondemand_download()?,
+            true
+        );
+        assert_eq!(
+            tline
+                .get_rel_size(TESTREL_A, Lsn(0x40), false)
+                .no_ondemand_download()?,
+            1
+        );
 
         Ok(())
     }
@@ -1263,30 +1420,52 @@ mod tests {
     async fn test_truncate_extend() -> Result<()> {
         let tenant = TenantHarness::create("test_truncate_extend")?.load().await;
         let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?;
-        let mut walingest = init_walingest_test(&*tline)?;
+        let mut walingest = init_walingest_test(&tline).await?;
 
         // Create a 20 MB relation (the size is arbitrary)
         let relsize = 20 * 1024 * 1024 / 8192;
         let mut m = tline.begin_modification(Lsn(0x20));
         for blkno in 0..relsize {
             let data = format!("foo blk {} at {}", blkno, Lsn(0x20));
-            walingest.put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data))?;
+            walingest
+                .put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data))
+                .await?;
         }
         m.commit()?;
 
         // The relation was created at LSN 20, not visible at LSN 1 yet.
-        assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x10), false)?, false);
-        assert!(tline.get_rel_size(TESTREL_A, Lsn(0x10), false).is_err());
+        assert_eq!(
+            tline
+                .get_rel_exists(TESTREL_A, Lsn(0x10), false)
+                .no_ondemand_download()?,
+            false
+        );
+        assert!(tline
+            .get_rel_size(TESTREL_A, Lsn(0x10), false)
+            .no_ondemand_download()
+            .is_err());
 
-        assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x20), false)?, true);
-        assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x20), false)?, relsize);
+        assert_eq!(
+            tline
+                .get_rel_exists(TESTREL_A, Lsn(0x20), false)
+                .no_ondemand_download()?,
+            true
+        );
+        assert_eq!(
+            tline
+                .get_rel_size(TESTREL_A, Lsn(0x20), false)
+                .no_ondemand_download()?,
+            relsize
+        );
 
         // Check relation content
         for blkno in 0..relsize {
             let lsn = Lsn(0x20);
             let data = format!("foo blk {} at {}", blkno, lsn);
             assert_eq!(
-                tline.get_rel_page_at_lsn(TESTREL_A, blkno, lsn, false)?,
+                tline
+                    .get_rel_page_at_lsn(TESTREL_A, blkno, lsn, false)
+                    .no_ondemand_download()?,
                 TEST_IMG(&data)
             );
         }
@@ -1298,24 +1477,38 @@ mod tests {
         m.commit()?;
 
         // Check reported size and contents after truncation
-        assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x60), false)?, 1);
+        assert_eq!(
+            tline
+                .get_rel_size(TESTREL_A, Lsn(0x60), false)
+                .no_ondemand_download()?,
+            1
+        );
 
         for blkno in 0..1 {
             let lsn = Lsn(0x20);
             let data = format!("foo blk {} at {}", blkno, lsn);
             assert_eq!(
-                tline.get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x60), false)?,
+                tline
+                    .get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x60), false)
+                    .no_ondemand_download()?,
                 TEST_IMG(&data)
             );
         }
 
         // should still see all blocks with older LSN
-        assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x50), false)?, relsize);
+        assert_eq!(
+            tline
+                .get_rel_size(TESTREL_A, Lsn(0x50), false)
+                .no_ondemand_download()?,
+            relsize
+        );
         for blkno in 0..relsize {
             let lsn = Lsn(0x20);
             let data = format!("foo blk {} at {}", blkno, lsn);
             assert_eq!(
-                tline.get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x50), false)?,
+                tline
+                    .get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x50), false)
+                    .no_ondemand_download()?,
                 TEST_IMG(&data)
             );
         }
@@ -1326,18 +1519,32 @@ mod tests {
         let mut m = tline.begin_modification(lsn);
         for blkno in 0..relsize {
             let data = format!("foo blk {} at {}", blkno, lsn);
-            walingest.put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data))?;
+            walingest
+                .put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data))
+                .await?;
         }
         m.commit()?;
 
-        assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x80), false)?, true);
-        assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x80), false)?, relsize);
+        assert_eq!(
+            tline
+                .get_rel_exists(TESTREL_A, Lsn(0x80), false)
+                .no_ondemand_download()?,
+            true
+        );
+        assert_eq!(
+            tline
+                .get_rel_size(TESTREL_A, Lsn(0x80), false)
+                .no_ondemand_download()?,
+            relsize
+        );
         // Check relation content
         for blkno in 0..relsize {
             let lsn = Lsn(0x80);
             let data = format!("foo blk {} at {}", blkno, lsn);
             assert_eq!(
-                tline.get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x80), false)?,
+                tline
+                    .get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x80), false)
+                    .no_ondemand_download()?,
                 TEST_IMG(&data)
             );
         }
@@ -1351,21 +1558,25 @@ mod tests {
     async fn test_large_rel() -> Result<()> {
         let tenant = TenantHarness::create("test_large_rel")?.load().await;
         let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?;
-        let mut walingest = init_walingest_test(&*tline)?;
+        let mut walingest = init_walingest_test(&tline).await?;
 
         let mut lsn = 0x10;
         for blknum in 0..RELSEG_SIZE + 1 {
             lsn += 0x10;
             let mut m = tline.begin_modification(Lsn(lsn));
             let img = TEST_IMG(&format!("foo blk {} at {}", blknum, Lsn(lsn)));
-            walingest.put_rel_page_image(&mut m, TESTREL_A, blknum as BlockNumber, img)?;
+            walingest
+                .put_rel_page_image(&mut m, TESTREL_A, blknum as BlockNumber, img)
+                .await?;
             m.commit()?;
         }
 
-        assert_current_logical_size(&*tline, Lsn(lsn));
+        assert_current_logical_size(&tline, Lsn(lsn));
 
         assert_eq!(
-            tline.get_rel_size(TESTREL_A, Lsn(lsn), false)?,
+            tline
+                .get_rel_size(TESTREL_A, Lsn(lsn), false)
+                .no_ondemand_download()?,
             RELSEG_SIZE + 1
         );
 
@@ -1374,8 +1585,13 @@ mod tests {
         let mut m = tline.begin_modification(Lsn(lsn));
         walingest.put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE)?;
         m.commit()?;
-        assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(lsn), false)?, RELSEG_SIZE);
-        assert_current_logical_size(&*tline, Lsn(lsn));
+        assert_eq!(
+            tline
+                .get_rel_size(TESTREL_A, Lsn(lsn), false)
+                .no_ondemand_download()?,
+            RELSEG_SIZE
+        );
+        assert_current_logical_size(&tline, Lsn(lsn));
 
         // Truncate another block
         lsn += 0x10;
@@ -1383,10 +1599,12 @@ mod tests {
         walingest.put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE - 1)?;
         m.commit()?;
         assert_eq!(
-            tline.get_rel_size(TESTREL_A, Lsn(lsn), false)?,
+            tline
+                .get_rel_size(TESTREL_A, Lsn(lsn), false)
+                .no_ondemand_download()?,
             RELSEG_SIZE - 1
         );
-        assert_current_logical_size(&*tline, Lsn(lsn));
+        assert_current_logical_size(&tline, Lsn(lsn));
 
         // Truncate to 1500, and then truncate all the way down to 0, one block at a time
         // This tests the behavior at segment boundaries
@@ -1397,13 +1615,15 @@ mod tests {
             walingest.put_rel_truncation(&mut m, TESTREL_A, size as BlockNumber)?;
             m.commit()?;
             assert_eq!(
-                tline.get_rel_size(TESTREL_A, Lsn(lsn), false)?,
+                tline
+                    .get_rel_size(TESTREL_A, Lsn(lsn), false)
+                    .no_ondemand_download()?,
                 size as BlockNumber
             );
 
             size -= 1;
         }
-        assert_current_logical_size(&*tline, Lsn(lsn));
+        assert_current_logical_size(&tline, Lsn(lsn));
 
         Ok(())
     }
diff --git a/pageserver/src/walreceiver.rs b/pageserver/src/walreceiver.rs
index e627e9ecd0..aaf46579a7 100644
--- a/pageserver/src/walreceiver.rs
+++ b/pageserver/src/walreceiver.rs
@@ -44,10 +44,13 @@ pub async fn init_broker_client(conf: &'static PageServerConf) -> anyhow::Result
     let broker_endpoint = conf.broker_endpoint.clone();
 
     // Note: we do not attempt connecting here (but validate endpoints sanity).
-    let broker_client = storage_broker::connect(broker_endpoint.clone()).context(format!(
-        "Failed to create broker client to {}",
-        &conf.broker_endpoint
-    ))?;
+    let broker_client =
+        storage_broker::connect(broker_endpoint.clone(), conf.broker_keepalive_interval).context(
+            format!(
+                "Failed to create broker client to {}",
+                &conf.broker_endpoint
+            ),
+        )?;
 
     if BROKER_CLIENT.set(broker_client).is_err() {
         panic!("broker already initialized");
@@ -126,15 +129,21 @@ impl<E: Clone> TaskHandle<E> {
         match self.events_receiver.changed().await {
             Ok(()) => TaskEvent::Update((self.events_receiver.borrow()).clone()),
             Err(_task_channel_part_dropped) => {
-                TaskEvent::End(match self.join_handle.take() {
+                TaskEvent::End(match self.join_handle.as_mut() {
                     Some(jh) => {
                         if !jh.is_finished() {
                             warn!("sender is dropped while join handle is still alive");
                         }
 
-                        jh.await
+                        let res = jh
+                            .await
                             .map_err(|e| anyhow::anyhow!("Failed to join task: {e}"))
-                            .and_then(|x| x)
+                            .and_then(|x| x);
+
+                        // For cancellation-safety, drop join_handle only after successful .await.
+                        self.join_handle = None;
+
+                        res
                     }
                     None => {
                         // Another option is to have an enum, join handle or result and give away the reference to it
diff --git a/pageserver/src/walreceiver/connection_manager.rs b/pageserver/src/walreceiver/connection_manager.rs
index 8048707480..8b60e59305 100644
--- a/pageserver/src/walreceiver/connection_manager.rs
+++ b/pageserver/src/walreceiver/connection_manager.rs
@@ -145,21 +145,17 @@ async fn connection_manager_loop_step(
                 let wal_connection = walreceiver_state.wal_connection.as_mut()
                     .expect("Should have a connection, as checked by the corresponding select! guard");
                 match wal_connection_update {
-                    TaskEvent::Update(c) => {
-                        match c {
-                            TaskStateUpdate::Init | TaskStateUpdate::Started => {},
-                            TaskStateUpdate::Progress(status) => {
-                                if status.has_processed_wal {
-                                    // We have advanced last_record_lsn by processing the WAL received
-                                    // from this safekeeper. This is good enough to clean unsuccessful
-                                    // retries history and allow reconnecting to this safekeeper without
-                                    // sleeping for a long time.
-                                    walreceiver_state.wal_connection_retries.remove(&wal_connection.sk_id);
-                                }
-                                wal_connection.status = status.to_owned();
-                            }
+                    TaskEvent::Update(TaskStateUpdate::Init | TaskStateUpdate::Started) => {},
+                    TaskEvent::Update(TaskStateUpdate::Progress(new_status)) => {
+                        if new_status.has_processed_wal {
+                            // We have advanced last_record_lsn by processing the WAL received
+                            // from this safekeeper. This is good enough to clean unsuccessful
+                            // retries history and allow reconnecting to this safekeeper without
+                            // sleeping for a long time.
+                            walreceiver_state.wal_connection_retries.remove(&wal_connection.sk_id);
                         }
-                    },
+                        wal_connection.status = new_status;
+                    }
                     TaskEvent::End(walreceiver_task_result) => {
                         match walreceiver_task_result {
                             Ok(()) => debug!("WAL receiving task finished"),
@@ -210,7 +206,18 @@ async fn connection_manager_loop_step(
                 }
             },
 
-            _ = async { tokio::time::sleep(time_until_next_retry.unwrap()).await }, if time_until_next_retry.is_some() => {}
+            Some(()) = async {
+                match time_until_next_retry {
+                    Some(sleep_time) => {
+                        tokio::time::sleep(sleep_time).await;
+                        Some(())
+                    },
+                    None => {
+                        debug!("No candidates to retry, waiting indefinitely for the broker events");
+                        None
+                    }
+                }
+            } => debug!("Waking up for the next retry after waiting for {time_until_next_retry:?}"),
         }
 
         if let Some(new_candidate) = walreceiver_state.next_connection_candidate() {
@@ -400,7 +407,7 @@ impl WalreceiverState {
                 .await
                 .context("walreceiver connection handling failure")
             }
-            .instrument(info_span!("walreceiver_connection", id = %id))
+            .instrument(info_span!("walreceiver_connection", id = %id, node_id = %new_sk_id))
         });
 
         let now = Utc::now().naive_utc();
@@ -480,20 +487,25 @@ impl WalreceiverState {
             .values()
             .filter_map(|retry| retry.next_retry_at)
             .filter(|next_retry_at| next_retry_at > &now)
-            .min();
+            .min()?;
 
-        next_retry_at.and_then(|next_retry_at| (next_retry_at - now).to_std().ok())
+        (next_retry_at - now).to_std().ok()
     }
 
     /// Adds another broker timeline into the state, if its more recent than the one already added there for the same key.
     fn register_timeline_update(&mut self, timeline_update: SafekeeperTimelineInfo) {
-        self.wal_stream_candidates.insert(
-            NodeId(timeline_update.safekeeper_id),
+        let new_safekeeper_id = NodeId(timeline_update.safekeeper_id);
+        let old_entry = self.wal_stream_candidates.insert(
+            new_safekeeper_id,
             BrokerSkTimeline {
                 timeline: timeline_update,
                 latest_update: Utc::now().naive_utc(),
             },
         );
+
+        if old_entry.is_none() {
+            info!("New SK node was added: {new_safekeeper_id}");
+        }
     }
 
     /// Cleans up stale broker records and checks the rest for the new connection candidate.
@@ -720,12 +732,13 @@ impl WalreceiverState {
     /// Remove candidates which haven't sent broker updates for a while.
     fn cleanup_old_candidates(&mut self) {
         let mut node_ids_to_remove = Vec::with_capacity(self.wal_stream_candidates.len());
+        let lagging_wal_timeout = self.lagging_wal_timeout;
 
         self.wal_stream_candidates.retain(|node_id, broker_info| {
             if let Ok(time_since_latest_broker_update) =
                 (Utc::now().naive_utc() - broker_info.latest_update).to_std()
             {
-                let should_retain = time_since_latest_broker_update < self.lagging_wal_timeout;
+                let should_retain = time_since_latest_broker_update < lagging_wal_timeout;
                 if !should_retain {
                     node_ids_to_remove.push(*node_id);
                 }
@@ -735,8 +748,11 @@ impl WalreceiverState {
             }
         });
 
-        for node_id in node_ids_to_remove {
-            self.wal_connection_retries.remove(&node_id);
+        if !node_ids_to_remove.is_empty() {
+            for node_id in node_ids_to_remove {
+                info!("Safekeeper node {node_id} did not send events for over {lagging_wal_timeout:?}, not retrying the connections");
+                self.wal_connection_retries.remove(&node_id);
+            }
         }
     }
 
@@ -789,7 +805,7 @@ fn wal_stream_connection_config(
     auth_token: Option<&str>,
 ) -> anyhow::Result<PgConnectionConfig> {
     let (host, port) =
-        parse_host_port(&listen_pg_addr_str).context("Unable to parse listen_pg_addr_str")?;
+        parse_host_port(listen_pg_addr_str).context("Unable to parse listen_pg_addr_str")?;
     let port = port.unwrap_or(5432);
     Ok(PgConnectionConfig::new_host_port(host, port)
         .extend_options([
@@ -883,10 +899,10 @@ mod tests {
         state.wal_connection = Some(WalConnection {
             started_at: now,
             sk_id: connected_sk_id,
-            status: connection_status.clone(),
+            status: connection_status,
             connection_task: TaskHandle::spawn(move |sender, _| async move {
                 sender
-                    .send(TaskStateUpdate::Progress(connection_status.clone()))
+                    .send(TaskStateUpdate::Progress(connection_status))
                     .ok();
                 Ok(())
             }),
@@ -1045,10 +1061,10 @@ mod tests {
         state.wal_connection = Some(WalConnection {
             started_at: now,
             sk_id: connected_sk_id,
-            status: connection_status.clone(),
+            status: connection_status,
             connection_task: TaskHandle::spawn(move |sender, _| async move {
                 sender
-                    .send(TaskStateUpdate::Progress(connection_status.clone()))
+                    .send(TaskStateUpdate::Progress(connection_status))
                     .ok();
                 Ok(())
             }),
@@ -1110,10 +1126,10 @@ mod tests {
         state.wal_connection = Some(WalConnection {
             started_at: now,
             sk_id: NodeId(1),
-            status: connection_status.clone(),
+            status: connection_status,
             connection_task: TaskHandle::spawn(move |sender, _| async move {
                 sender
-                    .send(TaskStateUpdate::Progress(connection_status.clone()))
+                    .send(TaskStateUpdate::Progress(connection_status))
                     .ok();
                 Ok(())
             }),
diff --git a/pageserver/src/walreceiver/walreceiver_connection.rs b/pageserver/src/walreceiver/walreceiver_connection.rs
index cf2a99f1b5..aca5e8e019 100644
--- a/pageserver/src/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/walreceiver/walreceiver_connection.rs
@@ -1,6 +1,7 @@
 //! Actual Postgres connection handler to stream WAL to the server.
 
 use std::{
+    error::Error,
     str::FromStr,
     sync::Arc,
     time::{Duration, SystemTime},
@@ -11,7 +12,7 @@ use bytes::BytesMut;
 use chrono::{NaiveDateTime, Utc};
 use fail::fail_point;
 use futures::StreamExt;
-use postgres::{SimpleQueryMessage, SimpleQueryRow};
+use postgres::{error::SqlState, SimpleQueryMessage, SimpleQueryRow};
 use postgres_ffi::v14::xlog_utils::normalize_lsn;
 use postgres_ffi::WAL_SEGMENT_SIZE;
 use postgres_protocol::message::backend::ReplicationMessage;
@@ -32,10 +33,10 @@ use crate::{
 use postgres_connection::PgConnectionConfig;
 use postgres_ffi::waldecoder::WalStreamDecoder;
 use pq_proto::ReplicationFeedback;
-use utils::lsn::Lsn;
+use utils::{lsn::Lsn, postgres_backend_async::is_expected_io_error};
 
 /// Status of the connection.
-#[derive(Debug, Clone)]
+#[derive(Debug, Clone, Copy)]
 pub struct WalConnectionStatus {
     /// If we were able to initiate a postgres connection, this means that safekeeper process is at least running.
     pub is_connected: bool,
@@ -68,10 +69,17 @@ pub async fn handle_walreceiver_connection(
         let mut config = wal_source_connconf.to_tokio_postgres_config();
         config.application_name("pageserver");
         config.replication_mode(tokio_postgres::config::ReplicationMode::Physical);
-        time::timeout(connect_timeout, config.connect(postgres::NoTls))
-            .await
-            .context("Timed out while waiting for walreceiver connection to open")?
-            .context("Failed to open walreceiver connection")?
+        match time::timeout(connect_timeout, config.connect(postgres::NoTls)).await {
+            Ok(Ok(client_and_conn)) => client_and_conn,
+            Ok(Err(conn_err)) => {
+                let expected_error = ignore_expected_errors(conn_err)?;
+                info!("DB connection stream finished: {expected_error}");
+                return Ok(());
+            }
+            Err(elapsed) => anyhow::bail!(
+                "Timed out while waiting {elapsed} for walreceiver connection to open"
+            ),
+        }
     };
 
     info!("connected!");
@@ -83,7 +91,7 @@ pub async fn handle_walreceiver_connection(
         streaming_lsn: None,
         commit_lsn: None,
     };
-    if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status.clone())) {
+    if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status)) {
         warn!("Wal connection event listener dropped right after connection init, aborting the connection: {e}");
         return Ok(());
     }
@@ -103,10 +111,8 @@ pub async fn handle_walreceiver_connection(
                 connection_result = connection => match connection_result{
                     Ok(()) => info!("Walreceiver db connection closed"),
                     Err(connection_error) => {
-                        if connection_error.is_closed() {
-                            info!("Connection closed regularly: {connection_error}")
-                        } else {
-                            warn!("Connection aborted: {connection_error}")
+                        if let Err(e) = ignore_expected_errors(connection_error) {
+                            warn!("Connection aborted: {e:#}")
                         }
                     }
                 },
@@ -135,7 +141,7 @@ pub async fn handle_walreceiver_connection(
     connection_status.latest_connection_update = Utc::now().naive_utc();
     connection_status.latest_wal_update = Utc::now().naive_utc();
     connection_status.commit_lsn = Some(end_of_wal);
-    if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status.clone())) {
+    if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status)) {
         warn!("Wal connection event listener dropped after IDENTIFY_SYSTEM, aborting the connection: {e}");
         return Ok(());
     }
@@ -173,7 +179,7 @@ pub async fn handle_walreceiver_connection(
 
     let mut waldecoder = WalStreamDecoder::new(startpoint, timeline.pg_version);
 
-    let mut walingest = WalIngest::new(timeline.as_ref(), startpoint)?;
+    let mut walingest = WalIngest::new(timeline.as_ref(), startpoint).await?;
 
     while let Some(replication_message) = {
         select! {
@@ -184,7 +190,15 @@ pub async fn handle_walreceiver_connection(
             replication_message = physical_stream.next() => replication_message,
         }
     } {
-        let replication_message = replication_message?;
+        let replication_message = match replication_message {
+            Ok(message) => message,
+            Err(replication_error) => {
+                let expected_error = ignore_expected_errors(replication_error)?;
+                info!("Replication stream finished: {expected_error}");
+                return Ok(());
+            }
+        };
+
         let now = Utc::now().naive_utc();
         let last_rec_lsn_before_msg = last_rec_lsn;
 
@@ -207,7 +221,7 @@ pub async fn handle_walreceiver_connection(
             }
             &_ => {}
         };
-        if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status.clone())) {
+        if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status)) {
             warn!("Wal connection event listener dropped, aborting the connection: {e}");
             return Ok(());
         }
@@ -236,8 +250,9 @@ pub async fn handle_walreceiver_connection(
                         ensure!(lsn.is_aligned());
 
                         walingest
-                            .ingest_record(recdata, lsn, &mut modification, &mut decoded)
-                            .context("could not ingest record at {lsn}")?;
+                            .ingest_record(recdata.clone(), lsn, &mut modification, &mut decoded)
+                            .await
+                            .with_context(|| format!("could not ingest record at {lsn}"))?;
 
                         fail_point!("walreceiver-after-ingest");
 
@@ -273,8 +288,7 @@ pub async fn handle_walreceiver_connection(
         if !connection_status.has_processed_wal && last_rec_lsn > last_rec_lsn_before_msg {
             // We have successfully processed at least one WAL record.
             connection_status.has_processed_wal = true;
-            if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status.clone()))
-            {
+            if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status)) {
                 warn!("Wal connection event listener dropped, aborting the connection: {e}");
                 return Ok(());
             }
@@ -313,10 +327,11 @@ pub async fn handle_walreceiver_connection(
 
             // Send the replication feedback message.
             // Regular standby_status_update fields are put into this message.
+            let (timeline_logical_size, _) = timeline
+                .get_current_logical_size()
+                .context("Status update creation failed to get current logical size")?;
             let status_update = ReplicationFeedback {
-                current_timeline_size: timeline
-                    .get_current_logical_size()
-                    .context("Status update creation failed to get current logical size")?,
+                current_timeline_size: timeline_logical_size,
                 ps_writelsn: write_lsn,
                 ps_flushlsn: flush_lsn,
                 ps_applylsn: apply_lsn,
@@ -386,3 +401,32 @@ async fn identify_system(client: &mut Client) -> anyhow::Result<IdentifySystem>
         Err(IdentifyError.into())
     }
 }
+
+/// We don't want to report connectivity problems as real errors towards connection manager because
+/// 1. they happen frequently enough to make server logs hard to read and
+/// 2. the connection manager can retry other safekeeper.
+///
+/// If this function returns `Ok(pg_error)`, it's such an error.
+/// The caller should log it at info level and then report to connection manager that we're done handling this connection.
+/// Connection manager will then handle reconnections.
+///
+/// If this function returns an `Err()`, the caller can bubble it up using `?`.
+/// The connection manager will log the error at ERROR level.
+fn ignore_expected_errors(pg_error: postgres::Error) -> anyhow::Result<postgres::Error> {
+    if pg_error.is_closed()
+        || pg_error
+            .source()
+            .and_then(|source| source.downcast_ref::<std::io::Error>())
+            .map(is_expected_io_error)
+            .unwrap_or(false)
+    {
+        return Ok(pg_error);
+    } else if let Some(db_error) = pg_error.as_db_error() {
+        if db_error.code() == &SqlState::CONNECTION_FAILURE
+            && db_error.message().contains("end streaming")
+        {
+            return Ok(pg_error);
+        }
+    }
+    Err(pg_error).context("connection error")
+}
diff --git a/pageserver/src/walrecord.rs b/pageserver/src/walrecord.rs
index 38fb9a4247..7581140934 100644
--- a/pageserver/src/walrecord.rs
+++ b/pageserver/src/walrecord.rs
@@ -1,6 +1,7 @@
 //!
 //! Functions for parsing WAL records.
 //!
+
 use anyhow::Result;
 use bytes::{Buf, Bytes};
 use postgres_ffi::pg_constants;
diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs
index ca7cfb7413..7cf489562b 100644
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -409,7 +409,7 @@ impl PostgresRedoManager {
                     key
                 );
                 for &xid in xids {
-                    let pageno = xid as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
+                    let pageno = xid / pg_constants::CLOG_XACTS_PER_PAGE;
                     let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
                     let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
 
@@ -459,7 +459,7 @@ impl PostgresRedoManager {
                     key
                 );
                 for &xid in xids {
-                    let pageno = xid as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
+                    let pageno = xid / pg_constants::CLOG_XACTS_PER_PAGE;
                     let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
                     let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
 
@@ -647,7 +647,7 @@ impl PostgresRedoProcess {
 
         info!("running initdb in {}", datadir.display());
         let initdb = Command::new(pg_bin_dir_path.join("initdb"))
-            .args(&["-D", &datadir.to_string_lossy()])
+            .args(["-D", &datadir.to_string_lossy()])
             .arg("-N")
             .env_clear()
             .env("LD_LIBRARY_PATH", &pg_lib_dir_path)
diff --git a/pgxn/neon/Makefile b/pgxn/neon/Makefile
index 7f4e30a12e..ec377dbb1e 100644
--- a/pgxn/neon/Makefile
+++ b/pgxn/neon/Makefile
@@ -4,11 +4,12 @@
 MODULE_big = neon
 OBJS = \
 	$(WIN32RES) \
+	file_cache.o \
 	libpagestore.o \
 	libpqwalproposer.o \
+	neon.o \
 	pagestore_smgr.o \
 	relsize_cache.o \
-	neon.o \
 	walproposer.o \
 	walproposer_utils.o
 
diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c
new file mode 100644
index 0000000000..96c2461e2d
--- /dev/null
+++ b/pgxn/neon/file_cache.c
@@ -0,0 +1,597 @@
+/*
+ *
+ * file_cache.c
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *	  pgxn/neon/file_cache.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include <sys/file.h>
+#include <unistd.h>
+#include <fcntl.h>
+
+#include "postgres.h"
+#include "funcapi.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "pagestore_client.h"
+#include "access/parallel.h"
+#include "postmaster/bgworker.h"
+#include "storage/relfilenode.h"
+#include "storage/buf_internals.h"
+#include "storage/latch.h"
+#include "storage/ipc.h"
+#include "storage/lwlock.h"
+#include "utils/dynahash.h"
+#include "utils/guc.h"
+#include "storage/fd.h"
+#include "storage/pg_shmem.h"
+#include "storage/buf_internals.h"
+
+/*
+ * Local file cache is used to temporary store relations pages in local file system.
+ * All blocks of all relations are stored inside one file and addressed using shared hash map.
+ * Currently LRU eviction policy based on L2 list is used as replacement algorithm.
+ * As far as manipulation of L2-list requires global critical section, we are not using partitioned hash.
+ * Also we are using exclusive lock even for read operation because LRU requires relinking element in L2 list.
+ * If this lock become a bottleneck, we can consider other eviction strategies, for example clock algorithm.
+ *
+ * Cache is always reconstructed at node startup, so we do not need to save mapping somewhere and worry about
+ * its consistency.
+ */
+
+/* Local file storage allocation chunk.
+ * Should be power of two and not less than 32. Using larger than page chunks can
+ * 1. Reduce hash-map memory footprint: 8TB database contains billion pages
+ *    and size of hash entry is 40 bytes, so we need 40Gb just for hash map.
+ *    1Mb chunks can reduce hash map size to 320Mb.
+ * 2. Improve access locality, subsequent pages will be allocated together improving seqscan speed
+ */
+#define BLOCKS_PER_CHUNK	128 /* 1Mb chunk */
+#define MB					((uint64)1024*1024)
+
+#define SIZE_MB_TO_CHUNKS(size) ((uint32)((size) * MB / BLCKSZ / BLOCKS_PER_CHUNK))
+
+typedef struct FileCacheEntry
+{
+	BufferTag	key;
+	uint32		offset;
+	uint32		access_count;
+	uint32		bitmap[BLOCKS_PER_CHUNK/32];
+	dlist_node	lru_node; /* LRU list node */
+} FileCacheEntry;
+
+typedef struct FileCacheControl
+{
+	uint32 size; /* size of cache file in chunks */
+	dlist_head lru; /* double linked list for LRU replacement algorithm */
+} FileCacheControl;
+
+static HTAB* lfc_hash;
+static int   lfc_desc;
+static LWLockId lfc_lock;
+static int   lfc_max_size;
+static int   lfc_size_limit;
+static char* lfc_path;
+static  FileCacheControl* lfc_ctl;
+static shmem_startup_hook_type prev_shmem_startup_hook;
+#if PG_VERSION_NUM>=150000
+static shmem_request_hook_type prev_shmem_request_hook;
+#endif
+
+static void
+lfc_shmem_startup(void)
+{
+	bool found;
+	static HASHCTL info;
+
+	if (prev_shmem_startup_hook)
+	{
+		prev_shmem_startup_hook();
+	}
+
+	LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
+
+	lfc_ctl = (FileCacheControl*)ShmemInitStruct("lfc", sizeof(FileCacheControl), &found);
+	if (!found)
+	{
+		uint32 lfc_size = SIZE_MB_TO_CHUNKS(lfc_max_size);
+		lfc_lock = (LWLockId)GetNamedLWLockTranche("lfc_lock");
+		info.keysize = sizeof(BufferTag);
+		info.entrysize = sizeof(FileCacheEntry);
+		lfc_hash = ShmemInitHash("lfc_hash",
+								 /* lfc_size+1 because we add new element to hash table before eviction of victim */
+								 lfc_size+1, lfc_size+1,
+								 &info,
+								 HASH_ELEM | HASH_BLOBS);
+		lfc_ctl->size = 0;
+		dlist_init(&lfc_ctl->lru);
+
+		/* Remove file cache on restart */
+		(void)unlink(lfc_path);
+	}
+	LWLockRelease(AddinShmemInitLock);
+}
+
+static void
+lfc_shmem_request(void)
+{
+#if PG_VERSION_NUM>=150000
+	if (prev_shmem_request_hook)
+		prev_shmem_request_hook();
+#endif
+
+	RequestAddinShmemSpace(sizeof(FileCacheControl) + hash_estimate_size(SIZE_MB_TO_CHUNKS(lfc_max_size)+1, sizeof(FileCacheEntry)));
+	RequestNamedLWLockTranche("lfc_lock", 1);
+}
+
+bool
+lfc_check_limit_hook(int *newval, void **extra, GucSource source)
+{
+	if (*newval > lfc_max_size)
+	{
+		elog(ERROR, "neon.file_cache_size_limit can not be larger than neon.max_file_cache_size");
+		return false;
+	}
+	return true;
+}
+
+void
+lfc_change_limit_hook(int newval, void *extra)
+{
+	uint32 new_size = SIZE_MB_TO_CHUNKS(newval);
+	/*
+	 * Stats collector detach shared memory, so we should not try to access shared memory here.
+	 * Parallel workers first assign default value (0), so not perform truncation in parallel workers.
+	 */
+	if (!lfc_ctl || !UsedShmemSegAddr || IsParallelWorker())
+		return;
+
+	/* Open cache file if not done yet */
+	if (lfc_desc == 0)
+	{
+		lfc_desc = BasicOpenFile(lfc_path, O_RDWR|O_CREAT);
+		if (lfc_desc < 0) {
+			elog(LOG, "Failed to open file cache %s: %m", lfc_path);
+			lfc_size_limit = 0; /* disable file cache */
+			return;
+		}
+	}
+	LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
+	while (new_size < lfc_ctl->size && !dlist_is_empty(&lfc_ctl->lru))
+	{
+		/* Shrink cache by throwing away least recently accessed chunks and returning their space to file system */
+		FileCacheEntry* victim = dlist_container(FileCacheEntry, lru_node, dlist_pop_head_node(&lfc_ctl->lru));
+		Assert(victim->access_count == 0);
+#ifdef FALLOC_FL_PUNCH_HOLE
+		if (fallocate(lfc_desc, FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE, (off_t)victim->offset*BLOCKS_PER_CHUNK*BLCKSZ, BLOCKS_PER_CHUNK*BLCKSZ) < 0)
+			elog(LOG, "Failed to punch hole in file: %m");
+#endif
+		hash_search(lfc_hash, &victim->key, HASH_REMOVE, NULL);
+		lfc_ctl->size -= 1;
+	}
+	elog(LOG, "set local file cache limit to %d", new_size);
+	LWLockRelease(lfc_lock);
+}
+
+void
+lfc_init(void)
+{
+	/*
+	 * In order to create our shared memory area, we have to be loaded via
+	 * shared_preload_libraries.
+	 */
+	if (!process_shared_preload_libraries_in_progress)
+		elog(ERROR, "Neon module should be loaded via shared_preload_libraries");
+
+	DefineCustomIntVariable("neon.max_file_cache_size",
+							"Maximal size of Neon local file cache",
+							NULL,
+							&lfc_max_size,
+							0, /* disabled by default */
+							0,
+							INT_MAX,
+							PGC_POSTMASTER,
+							GUC_UNIT_MB,
+							NULL,
+							NULL,
+							NULL);
+
+	DefineCustomIntVariable("neon.file_cache_size_limit",
+							"Current limit for size of Neon local file cache",
+							NULL,
+							&lfc_size_limit,
+							0, /* disabled by default */
+							0,
+							INT_MAX,
+							PGC_SIGHUP,
+							GUC_UNIT_MB,
+							NULL,
+							lfc_change_limit_hook,
+							NULL);
+
+	DefineCustomStringVariable("neon.file_cache_path",
+							   "Path to local file cache (can be raw device)",
+							   NULL,
+							   &lfc_path,
+							   "file.cache",
+							   PGC_POSTMASTER,
+							   0,
+							   NULL,
+							   NULL,
+							   NULL);
+
+	if (lfc_max_size == 0)
+		return;
+
+	prev_shmem_startup_hook = shmem_startup_hook;
+	shmem_startup_hook = lfc_shmem_startup;
+#if PG_VERSION_NUM>=150000
+	prev_shmem_request_hook = shmem_request_hook;
+	shmem_request_hook = lfc_shmem_request;
+#else
+	lfc_shmem_request();
+#endif
+}
+
+/*
+ * Check if page is present in the cache.
+ * Returns true if page is found in local cache.
+ */
+bool
+lfc_cache_contains(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno)
+{
+	BufferTag tag;
+	FileCacheEntry* entry;
+	int chunk_offs = blkno & (BLOCKS_PER_CHUNK-1);
+	bool found;
+	uint32 hash;
+
+	if (lfc_size_limit == 0) /* fast exit if file cache is disabled */
+		return false;
+
+	tag.rnode = rnode;
+	tag.forkNum = forkNum;
+	tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK-1);
+	hash = get_hash_value(lfc_hash, &tag);
+
+	LWLockAcquire(lfc_lock, LW_SHARED);
+	entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL);
+	found = entry != NULL && (entry->bitmap[chunk_offs >> 5] & (1 << (chunk_offs & 31))) != 0;
+	LWLockRelease(lfc_lock);
+	return found;
+}
+
+/*
+ * Try to read page from local cache.
+ * Returns true if page is found in local cache.
+ * In case of error lfc_size_limit is set to zero to disable any further opera-tins with cache.
+ */
+bool
+lfc_read(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
+		 char *buffer)
+{
+	BufferTag tag;
+	FileCacheEntry* entry;
+	ssize_t rc;
+	int chunk_offs = blkno & (BLOCKS_PER_CHUNK-1);
+	bool result = true;
+	uint32 hash;
+
+	if (lfc_size_limit == 0) /* fast exit if file cache is disabled */
+		return false;
+
+	tag.rnode = rnode;
+	tag.forkNum = forkNum;
+	tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK-1);
+	hash = get_hash_value(lfc_hash, &tag);
+
+	LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
+	entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL);
+	if (entry == NULL || (entry->bitmap[chunk_offs >> 5] & (1 << (chunk_offs & 31))) == 0)
+	{
+		/* Page is not cached */
+		LWLockRelease(lfc_lock);
+		return false;
+	}
+	/* Unlink entry from LRU list to pin it for the duration of IO operation */
+	if (entry->access_count++ == 0)
+		dlist_delete(&entry->lru_node);
+	LWLockRelease(lfc_lock);
+
+	/* Open cache file if not done yet */
+	if (lfc_desc == 0)
+	{
+		lfc_desc = BasicOpenFile(lfc_path, O_RDWR|O_CREAT);
+		if (lfc_desc < 0) {
+			elog(LOG, "Failed to open file cache %s: %m", lfc_path);
+			lfc_size_limit = 0; /* disable file cache */
+			result = false;
+		}
+	}
+
+	if (lfc_desc > 0)
+	{
+		rc = pread(lfc_desc, buffer, BLCKSZ, ((off_t)entry->offset*BLOCKS_PER_CHUNK + chunk_offs)*BLCKSZ);
+		if (rc != BLCKSZ)
+		{
+			elog(INFO, "Failed to read file cache: %m");
+			lfc_size_limit = 0; /* disable file cache */
+			result = false;
+		}
+	}
+
+	/* Place entry to the head of LRU list */
+	LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
+	Assert(entry->access_count > 0);
+	if (--entry->access_count == 0)
+		dlist_push_tail(&lfc_ctl->lru, &entry->lru_node);
+	LWLockRelease(lfc_lock);
+
+	return result;
+}
+
+/*
+ * Put page in local file cache.
+ * If cache is full then evict some other page.
+ */
+void
+lfc_write(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
+		  char *buffer)
+{
+	BufferTag tag;
+	FileCacheEntry* entry;
+	ssize_t rc;
+	bool found;
+	int chunk_offs = blkno & (BLOCKS_PER_CHUNK-1);
+	uint32 hash;
+
+	if (lfc_size_limit == 0) /* fast exit if file cache is disabled */
+		return;
+
+	tag.rnode = rnode;
+	tag.forkNum = forkNum;
+	tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK-1);
+	hash = get_hash_value(lfc_hash, &tag);
+
+	LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
+	entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_ENTER, &found);
+
+	if (found)
+	{
+		/* Unlink entry from LRU list to pin it for the duration of IO operation */
+		if (entry->access_count++ == 0)
+			dlist_delete(&entry->lru_node);
+	}
+	else
+	{
+		/*
+		 * We have two choices if all cache pages are pinned (i.e. used in IO operations):
+		 * 1. Wait until some of this operation is completed and pages is unpinned
+		 * 2. Allocate one more chunk, so that specified cache size is more recommendation than hard limit.
+		 * As far as probability of such event (that all pages are pinned) is considered to be very very small:
+		 * there are should be very large number of concurrent IO operations and them are limited by max_connections,
+		 * we prefer not to complicate code and use second approach.
+		 */
+		if (lfc_ctl->size >= SIZE_MB_TO_CHUNKS(lfc_size_limit) && !dlist_is_empty(&lfc_ctl->lru))
+		{
+			/* Cache overflow: evict least recently used chunk */
+			FileCacheEntry* victim = dlist_container(FileCacheEntry, lru_node, dlist_pop_head_node(&lfc_ctl->lru));
+			Assert(victim->access_count == 0);
+			entry->offset = victim->offset; /* grab victim's chunk */
+			hash_search(lfc_hash, &victim->key, HASH_REMOVE, NULL);
+			elog(LOG, "Swap file cache page");
+		}
+		else
+			entry->offset = lfc_ctl->size++; /* allocate new chunk at end of file */
+		entry->access_count = 1;
+		memset(entry->bitmap, 0, sizeof entry->bitmap);
+	}
+	LWLockRelease(lfc_lock);
+
+	/* Open cache file if not done yet */
+	if (lfc_desc == 0)
+	{
+		lfc_desc = BasicOpenFile(lfc_path, O_RDWR|O_CREAT);
+		if (lfc_desc < 0) {
+			elog(LOG, "Failed to open file cache %s: %m", lfc_path);
+			lfc_size_limit = 0; /* disable file cache */
+		}
+	}
+	if (lfc_desc > 0)
+	{
+		rc = pwrite(lfc_desc, buffer, BLCKSZ, ((off_t)entry->offset*BLOCKS_PER_CHUNK + chunk_offs)*BLCKSZ);
+		if (rc != BLCKSZ)
+		{
+			elog(INFO, "Failed to write file cache: %m");
+			lfc_size_limit = 0; /* disable file cache */
+		}
+	}
+	/* Place entry to the head of LRU list */
+	LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
+	Assert(entry->access_count > 0);
+	if (--entry->access_count == 0)
+		dlist_push_tail(&lfc_ctl->lru, &entry->lru_node);
+	if (lfc_size_limit != 0)
+		entry->bitmap[chunk_offs >> 5] |= (1 << (chunk_offs & 31));
+	LWLockRelease(lfc_lock);
+}
+
+
+/*
+ * Record structure holding the to be exposed cache data.
+ */
+typedef struct
+{
+	uint32		pageoffs;
+	Oid			relfilenode;
+	Oid			reltablespace;
+	Oid			reldatabase;
+	ForkNumber	forknum;
+	BlockNumber blocknum;
+	uint16		accesscount;
+} LocalCachePagesRec;
+
+/*
+ * Function context for data persisting over repeated calls.
+ */
+typedef struct
+{
+	TupleDesc	tupdesc;
+	LocalCachePagesRec *record;
+} LocalCachePagesContext;
+
+/*
+ * Function returning data from the local file cache
+ * relation node/tablespace/database/blocknum and access_counter
+ */
+PG_FUNCTION_INFO_V1(local_cache_pages);
+
+#define NUM_LOCALCACHE_PAGES_ELEM	7
+
+Datum
+local_cache_pages(PG_FUNCTION_ARGS)
+{
+	FuncCallContext *funcctx;
+	Datum		result;
+	MemoryContext oldcontext;
+	LocalCachePagesContext *fctx;	/* User function context. */
+	TupleDesc	tupledesc;
+	TupleDesc	expected_tupledesc;
+	HeapTuple	tuple;
+
+	if (SRF_IS_FIRSTCALL())
+	{
+        HASH_SEQ_STATUS status;
+		FileCacheEntry* entry;
+		uint32 n_pages = 0;
+		uint32 i;
+
+		funcctx = SRF_FIRSTCALL_INIT();
+
+		/* Switch context when allocating stuff to be used in later calls */
+		oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
+
+		/* Create a user function context for cross-call persistence */
+		fctx = (LocalCachePagesContext *) palloc(sizeof(LocalCachePagesContext));
+
+		/*
+		 * To smoothly support upgrades from version 1.0 of this extension
+		 * transparently handle the (non-)existence of the pinning_backends
+		 * column. We unfortunately have to get the result type for that... -
+		 * we can't use the result type determined by the function definition
+		 * without potentially crashing when somebody uses the old (or even
+		 * wrong) function definition though.
+		 */
+		if (get_call_result_type(fcinfo, NULL, &expected_tupledesc) != TYPEFUNC_COMPOSITE)
+			elog(ERROR, "return type must be a row type");
+
+		if (expected_tupledesc->natts != NUM_LOCALCACHE_PAGES_ELEM)
+			elog(ERROR, "incorrect number of output arguments");
+
+		/* Construct a tuple descriptor for the result rows. */
+		tupledesc = CreateTemplateTupleDesc(expected_tupledesc->natts);
+		TupleDescInitEntry(tupledesc, (AttrNumber) 1, "pageoffs",
+						   INT8OID, -1, 0);
+		TupleDescInitEntry(tupledesc, (AttrNumber) 2, "relfilenode",
+						   OIDOID, -1, 0);
+		TupleDescInitEntry(tupledesc, (AttrNumber) 3, "reltablespace",
+						   OIDOID, -1, 0);
+		TupleDescInitEntry(tupledesc, (AttrNumber) 4, "reldatabase",
+						   OIDOID, -1, 0);
+		TupleDescInitEntry(tupledesc, (AttrNumber) 5, "relforknumber",
+						   INT2OID, -1, 0);
+		TupleDescInitEntry(tupledesc, (AttrNumber) 6, "relblocknumber",
+						   INT8OID, -1, 0);
+		TupleDescInitEntry(tupledesc, (AttrNumber) 7, "accesscount",
+						   INT4OID, -1, 0);
+
+		fctx->tupdesc = BlessTupleDesc(tupledesc);
+
+		LWLockAcquire(lfc_lock, LW_SHARED);
+
+        hash_seq_init(&status, lfc_hash);
+        while ((entry = hash_seq_search(&status)) != NULL)
+		{
+			for (int i = 0; i < BLOCKS_PER_CHUNK; i++)
+				n_pages += (entry->bitmap[i >> 5] & (1 << (i & 31))) != 0;
+		}
+		fctx->record = (LocalCachePagesRec *)
+			MemoryContextAllocHuge(CurrentMemoryContext,
+								   sizeof(LocalCachePagesRec) * n_pages);
+
+		/* Set max calls and remember the user function context. */
+		funcctx->max_calls = n_pages;
+		funcctx->user_fctx = fctx;
+
+		/* Return to original context when allocating transient memory */
+		MemoryContextSwitchTo(oldcontext);
+
+		/*
+		 * Scan through all the buffers, saving the relevant fields in the
+		 * fctx->record structure.
+		 *
+		 * We don't hold the partition locks, so we don't get a consistent
+		 * snapshot across all buffers, but we do grab the buffer header
+		 * locks, so the information of each buffer is self-consistent.
+		 */
+		n_pages = 0;
+        hash_seq_init(&status, lfc_hash);
+        while ((entry = hash_seq_search(&status)) != NULL)
+		{
+			for (int i = 0; i < BLOCKS_PER_CHUNK; i++)
+			{
+				if (entry->bitmap[i >> 5] & (1 << (i & 31)))
+				{
+					fctx->record[n_pages].pageoffs = entry->offset*BLOCKS_PER_CHUNK + i;
+					fctx->record[n_pages].relfilenode = entry->key.rnode.relNode;
+					fctx->record[n_pages].reltablespace = entry->key.rnode.spcNode;
+					fctx->record[n_pages].reldatabase = entry->key.rnode.dbNode;
+					fctx->record[n_pages].forknum = entry->key.forkNum;
+					fctx->record[n_pages].blocknum = entry->key.blockNum + i;
+					fctx->record[n_pages].accesscount = entry->access_count;
+					n_pages += 1;
+				}
+			}
+		}
+		Assert(n_pages == funcctx->max_calls);
+		LWLockRelease(lfc_lock);
+	}
+
+	funcctx = SRF_PERCALL_SETUP();
+
+	/* Get the saved state */
+	fctx = funcctx->user_fctx;
+
+	if (funcctx->call_cntr < funcctx->max_calls)
+	{
+		uint32		i = funcctx->call_cntr;
+		Datum		values[NUM_LOCALCACHE_PAGES_ELEM];
+		bool		nulls[NUM_LOCALCACHE_PAGES_ELEM] = {
+			false, false, false, false, false, false, false
+		};
+
+		values[0] = Int64GetDatum((int64) fctx->record[i].pageoffs);
+		values[1] = ObjectIdGetDatum(fctx->record[i].relfilenode);
+		values[2] = ObjectIdGetDatum(fctx->record[i].reltablespace);
+		values[3] = ObjectIdGetDatum(fctx->record[i].reldatabase);
+		values[4] = ObjectIdGetDatum(fctx->record[i].forknum);
+		values[5] = Int64GetDatum((int64) fctx->record[i].blocknum);
+		values[6] = Int32GetDatum(fctx->record[i].accesscount);
+
+		/* Build and return the tuple. */
+		tuple = heap_form_tuple(fctx->tupdesc, values, nulls);
+		result = HeapTupleGetDatum(tuple);
+
+		SRF_RETURN_NEXT(funcctx, result);
+	}
+	else
+		SRF_RETURN_DONE(funcctx);
+}
diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c
index 1aba2e1ede..88e3a12d96 100644
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -111,6 +111,7 @@ pageserver_connect()
 				PQfinish(pageserver_conn);
 				pageserver_conn = NULL;
 				FreeWaitEventSet(pageserver_conn_wes);
+				pageserver_conn_wes = NULL;
 
 				neon_log(ERROR, "could not complete handshake with pageserver: %s",
 						 msg);
@@ -179,7 +180,10 @@ pageserver_disconnect(void)
 		prefetch_on_ps_disconnect();
 	}
 	if (pageserver_conn_wes != NULL)
+	{
 		FreeWaitEventSet(pageserver_conn_wes);
+		pageserver_conn_wes = NULL;
+	}
 }
 
 static void
@@ -206,7 +210,7 @@ pageserver_send(NeonRequest * request)
 	 */
 	if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0)
 	{
-		char	   *msg = PQerrorMessage(pageserver_conn);
+		char	   *msg = pchomp(PQerrorMessage(pageserver_conn));
 
 		pageserver_disconnect();
 		neon_log(ERROR, "failed to send page request: %s", msg);
@@ -239,29 +243,33 @@ pageserver_receive(void)
 	PG_TRY();
 	{
 		/* read response */
-		resp_buff.len = call_PQgetCopyData(&resp_buff.data);
-		resp_buff.cursor = 0;
+		int			rc;
 
-		if (resp_buff.len < 0)
+		rc = call_PQgetCopyData(&resp_buff.data);
+		if (rc >= 0)
 		{
-			if (resp_buff.len == -1)
+			resp_buff.len = rc;
+			resp_buff.cursor = 0;
+			resp = nm_unpack_response(&resp_buff);
+			PQfreemem(resp_buff.data);
+
+			if (message_level_is_interesting(PageStoreTrace))
 			{
-				pageserver_disconnect();
-				return NULL;
+				char	   *msg = nm_to_string((NeonMessage *) resp);
+
+				neon_log(PageStoreTrace, "got response: %s", msg);
+				pfree(msg);
 			}
-			else if (resp_buff.len == -2)
-				neon_log(ERROR, "could not read COPY data: %s", PQerrorMessage(pageserver_conn));
 		}
-		resp = nm_unpack_response(&resp_buff);
-		PQfreemem(resp_buff.data);
-
-		if (message_level_is_interesting(PageStoreTrace))
+		else if (rc == -1)
 		{
-			char	   *msg = nm_to_string((NeonMessage *) resp);
-
-			neon_log(PageStoreTrace, "got response: %s", msg);
-			pfree(msg);
+			pageserver_disconnect();
+			resp = NULL;
 		}
+		else if (rc == -2)
+			neon_log(ERROR, "could not read COPY data: %s", PQerrorMessage(pageserver_conn));
+		else
+			neon_log(ERROR, "unexpected PQgetCopyData return value: %d", rc);
 	}
 	PG_CATCH();
 	{
@@ -420,7 +428,7 @@ pg_init_libpagestore(void)
 							   NULL, NULL, NULL);
 
     DefineCustomStringVariable("neon.safekeeper_token_env",
-                               "the environment variable containing JWT token for authentication with Safekeepers, the convention is to either unset or set to $ZENITH_AUTH_TOKEN",
+                               "the environment variable containing JWT token for authentication with Safekeepers, the convention is to either unset or set to $NEON_AUTH_TOKEN",
                                NULL,
                                &safekeeper_token_env,
                                NULL,
@@ -516,4 +524,5 @@ pg_init_libpagestore(void)
 		smgr_init_hook = smgr_init_neon;
 		dbsize_hook = neon_dbsize;
 	}
+	lfc_init();
 }
diff --git a/pgxn/neon/neon--1.0.sql b/pgxn/neon/neon--1.0.sql
index 58b98a5923..6cf111ea6a 100644
--- a/pgxn/neon/neon--1.0.sql
+++ b/pgxn/neon/neon--1.0.sql
@@ -22,3 +22,13 @@ AS 'MODULE_PATHNAME', 'backpressure_throttling_time'
 LANGUAGE C STRICT
 PARALLEL UNSAFE;
 
+CREATE FUNCTION local_cache_pages()
+RETURNS SETOF RECORD
+AS 'MODULE_PATHNAME', 'local_cache_pages'
+LANGUAGE C PARALLEL SAFE;
+
+-- Create a view for convenient access.
+CREATE VIEW local_cache AS
+	SELECT P.* FROM local_cache_pages() AS P
+	(pageoffs int8, relfilenode oid, reltablespace oid, reldatabase oid,
+	 relforknumber int2, relblocknumber int8, accesscount int4);
diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h
index 170a0cb72d..831756b849 100644
--- a/pgxn/neon/pagestore_client.h
+++ b/pgxn/neon/pagestore_client.h
@@ -203,4 +203,11 @@ extern void set_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumbe
 extern void update_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size);
 extern void forget_cached_relsize(RelFileNode rnode, ForkNumber forknum);
 
+/* functions for local file cache */
+extern void lfc_write(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, char *buffer);
+extern bool lfc_read(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, char *buffer);
+extern bool lfc_cache_contains(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno);
+extern void lfc_init(void);
+
+
 #endif
diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c
index 73bf330baf..0b34cb3ca9 100644
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -1669,7 +1669,7 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
 	 * (leaving holes). But this rule is violated in PG-15 where CreateAndCopyRelationData
 	 * call smgrextend for destination relation n using size of source relation
 	 */
-	get_cached_relsize(reln->smgr_rnode.node, forkNum, &n_blocks);
+	n_blocks = neon_nblocks(reln, forkNum);
 	while (n_blocks < blkno)
 		neon_wallog_page(reln, forkNum, n_blocks++, buffer, true);
 
@@ -1684,6 +1684,8 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
 		 forkNum, blkno,
 		 (uint32) (lsn >> 32), (uint32) lsn);
 
+	lfc_write(reln->smgr_rnode.node, forkNum, blkno, buffer);
+
 #ifdef DEBUG_COMPARE_LOCAL
 	if (IS_LOCAL_REL(reln))
 		mdextend(reln, forkNum, blkno, buffer, skipFsync);
@@ -1757,6 +1759,9 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
 			elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
 	}
 
+	if (lfc_cache_contains(reln->smgr_rnode.node, forknum, blocknum))
+		return false;
+
 	tag = (BufferTag) {
 		.rnode = reln->smgr_rnode.node,
 		.forkNum = forknum,
@@ -1899,6 +1904,7 @@ neon_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
 	{
 		case T_NeonGetPageResponse:
 			memcpy(buffer, ((NeonGetPageResponse *) resp)->page, BLCKSZ);
+			lfc_write(rnode, forkNum, blkno, buffer);
 			break;
 
 		case T_NeonErrorResponse:
@@ -1950,6 +1956,12 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
 			elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
 	}
 
+	/* Try to read from local file cache */
+	if (lfc_read(reln->smgr_rnode.node, forkNum, blkno, buffer))
+	{
+		return;
+	}
+
 	request_lsn = neon_get_request_lsn(&latest, reln->smgr_rnode.node, forkNum, blkno);
 	neon_read_at_lsn(reln->smgr_rnode.node, forkNum, blkno, request_lsn, latest, buffer);
 
@@ -2111,6 +2123,8 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 		 forknum, blocknum,
 		 (uint32) (lsn >> 32), (uint32) lsn);
 
+	lfc_write(reln->smgr_rnode.node, forknum, blocknum, buffer);
+
 #ifdef DEBUG_COMPARE_LOCAL
 	if (IS_LOCAL_REL(reln))
 		mdwrite(reln, forknum, blocknum, buffer, skipFsync);
diff --git a/poetry.lock b/poetry.lock
index 2fa7f03679..edbcddd576 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -941,11 +941,11 @@ xray = ["aws-xray-sdk (>=0.93,!=0.96)", "setuptools"]
 
 [[package]]
 name = "mypy"
-version = "0.971"
+version = "0.991"
 description = "Optional static typing for Python"
 category = "dev"
 optional = false
-python-versions = ">=3.6"
+python-versions = ">=3.7"
 
 [package.dependencies]
 mypy-extensions = ">=0.4.3"
@@ -954,6 +954,7 @@ typing-extensions = ">=3.10"
 
 [package.extras]
 dmypy = ["psutil (>=4.0)"]
+install-types = ["pip"]
 python2 = ["typed-ast (>=1.4.0,<2)"]
 reports = ["lxml"]
 
@@ -1227,6 +1228,17 @@ pytest = ">=6.1.0"
 [package.extras]
 testing = ["coverage (>=6.2)", "flaky (>=3.5.0)", "hypothesis (>=5.7.1)", "mypy (>=0.931)", "pytest-trio (>=0.7.0)"]
 
+[[package]]
+name = "pytest-httpserver"
+version = "1.0.6"
+description = "pytest-httpserver is a httpserver for pytest"
+category = "main"
+optional = false
+python-versions = ">=3.7,<4.0"
+
+[package.dependencies]
+Werkzeug = ">=2.0.0"
+
 [[package]]
 name = "pytest-lazy-fixture"
 version = "0.6.3"
@@ -1406,7 +1418,7 @@ pbr = "*"
 
 [[package]]
 name = "setuptools"
-version = "65.5.0"
+version = "65.5.1"
 description = "Easily download, build, install, upgrade, and uninstall Python packages"
 category = "main"
 optional = false
@@ -1414,7 +1426,7 @@ python-versions = ">=3.7"
 
 [package.extras]
 docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-hoverxref (<2)", "sphinx-inline-tabs", "sphinx-notfound-page (==0.8.3)", "sphinx-reredirects", "sphinxcontrib-towncrier"]
-testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8 (<5)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "mock", "pip (>=19.1)", "pip-run (>=8.8)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"]
+testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8 (<5)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pip (>=19.1)", "pip-run (>=8.8)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-timeout", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"]
 testing-integration = ["build[virtualenv]", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"]
 
 [[package]]
@@ -1583,7 +1595,7 @@ testing = ["func-timeout", "jaraco.itertools", "pytest (>=6)", "pytest-black (>=
 [metadata]
 lock-version = "1.1"
 python-versions = "^3.9"
-content-hash = "98d63eaa73253882440e0fc8cdb305bb536944768c5ba313c25d0ee65f546544"
+content-hash = "af44b269c235a6fd59dacb4ff9e05cbc13a79b57254a8d5d4bde934bd5691a70"
 
 [metadata.files]
 aiopg = [
@@ -1949,29 +1961,36 @@ moto = [
     {file = "moto-3.1.18.tar.gz", hash = "sha256:1e05276a62aa5a4aa821b441647c2cbaa2ea175388980b10d5de88d41b327cf7"},
 ]
 mypy = [
-    {file = "mypy-0.971-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:f2899a3cbd394da157194f913a931edfd4be5f274a88041c9dc2d9cdcb1c315c"},
-    {file = "mypy-0.971-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:98e02d56ebe93981c41211c05adb630d1d26c14195d04d95e49cd97dbc046dc5"},
-    {file = "mypy-0.971-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:19830b7dba7d5356d3e26e2427a2ec91c994cd92d983142cbd025ebe81d69cf3"},
-    {file = "mypy-0.971-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:02ef476f6dcb86e6f502ae39a16b93285fef97e7f1ff22932b657d1ef1f28655"},
-    {file = "mypy-0.971-cp310-cp310-win_amd64.whl", hash = "sha256:25c5750ba5609a0c7550b73a33deb314ecfb559c350bb050b655505e8aed4103"},
-    {file = "mypy-0.971-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:d3348e7eb2eea2472db611486846742d5d52d1290576de99d59edeb7cd4a42ca"},
-    {file = "mypy-0.971-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:3fa7a477b9900be9b7dd4bab30a12759e5abe9586574ceb944bc29cddf8f0417"},
-    {file = "mypy-0.971-cp36-cp36m-win_amd64.whl", hash = "sha256:2ad53cf9c3adc43cf3bea0a7d01a2f2e86db9fe7596dfecb4496a5dda63cbb09"},
-    {file = "mypy-0.971-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:855048b6feb6dfe09d3353466004490b1872887150c5bb5caad7838b57328cc8"},
-    {file = "mypy-0.971-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:23488a14a83bca6e54402c2e6435467a4138785df93ec85aeff64c6170077fb0"},
-    {file = "mypy-0.971-cp37-cp37m-win_amd64.whl", hash = "sha256:4b21e5b1a70dfb972490035128f305c39bc4bc253f34e96a4adf9127cf943eb2"},
-    {file = "mypy-0.971-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:9796a2ba7b4b538649caa5cecd398d873f4022ed2333ffde58eaf604c4d2cb27"},
-    {file = "mypy-0.971-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:5a361d92635ad4ada1b1b2d3630fc2f53f2127d51cf2def9db83cba32e47c856"},
-    {file = "mypy-0.971-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:b793b899f7cf563b1e7044a5c97361196b938e92f0a4343a5d27966a53d2ec71"},
-    {file = "mypy-0.971-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:d1ea5d12c8e2d266b5fb8c7a5d2e9c0219fedfeb493b7ed60cd350322384ac27"},
-    {file = "mypy-0.971-cp38-cp38-win_amd64.whl", hash = "sha256:23c7ff43fff4b0df93a186581885c8512bc50fc4d4910e0f838e35d6bb6b5e58"},
-    {file = "mypy-0.971-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:1f7656b69974a6933e987ee8ffb951d836272d6c0f81d727f1d0e2696074d9e6"},
-    {file = "mypy-0.971-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:d2022bfadb7a5c2ef410d6a7c9763188afdb7f3533f22a0a32be10d571ee4bbe"},
-    {file = "mypy-0.971-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ef943c72a786b0f8d90fd76e9b39ce81fb7171172daf84bf43eaf937e9f220a9"},
-    {file = "mypy-0.971-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:d744f72eb39f69312bc6c2abf8ff6656973120e2eb3f3ec4f758ed47e414a4bf"},
-    {file = "mypy-0.971-cp39-cp39-win_amd64.whl", hash = "sha256:77a514ea15d3007d33a9e2157b0ba9c267496acf12a7f2b9b9f8446337aac5b0"},
-    {file = "mypy-0.971-py3-none-any.whl", hash = "sha256:0d054ef16b071149917085f51f89555a576e2618d5d9dd70bd6eea6410af3ac9"},
-    {file = "mypy-0.971.tar.gz", hash = "sha256:40b0f21484238269ae6a57200c807d80debc6459d444c0489a102d7c6a75fa56"},
+    {file = "mypy-0.991-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:7d17e0a9707d0772f4a7b878f04b4fd11f6f5bcb9b3813975a9b13c9332153ab"},
+    {file = "mypy-0.991-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0714258640194d75677e86c786e80ccf294972cc76885d3ebbb560f11db0003d"},
+    {file = "mypy-0.991-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:0c8f3be99e8a8bd403caa8c03be619544bc2c77a7093685dcf308c6b109426c6"},
+    {file = "mypy-0.991-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc9ec663ed6c8f15f4ae9d3c04c989b744436c16d26580eaa760ae9dd5d662eb"},
+    {file = "mypy-0.991-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:4307270436fd7694b41f913eb09210faff27ea4979ecbcd849e57d2da2f65305"},
+    {file = "mypy-0.991-cp310-cp310-win_amd64.whl", hash = "sha256:901c2c269c616e6cb0998b33d4adbb4a6af0ac4ce5cd078afd7bc95830e62c1c"},
+    {file = "mypy-0.991-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:d13674f3fb73805ba0c45eb6c0c3053d218aa1f7abead6e446d474529aafc372"},
+    {file = "mypy-0.991-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:1c8cd4fb70e8584ca1ed5805cbc7c017a3d1a29fb450621089ffed3e99d1857f"},
+    {file = "mypy-0.991-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:209ee89fbb0deed518605edddd234af80506aec932ad28d73c08f1400ef80a33"},
+    {file = "mypy-0.991-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:37bd02ebf9d10e05b00d71302d2c2e6ca333e6c2a8584a98c00e038db8121f05"},
+    {file = "mypy-0.991-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:26efb2fcc6b67e4d5a55561f39176821d2adf88f2745ddc72751b7890f3194ad"},
+    {file = "mypy-0.991-cp311-cp311-win_amd64.whl", hash = "sha256:3a700330b567114b673cf8ee7388e949f843b356a73b5ab22dd7cff4742a5297"},
+    {file = "mypy-0.991-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:1f7d1a520373e2272b10796c3ff721ea1a0712288cafaa95931e66aa15798813"},
+    {file = "mypy-0.991-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:641411733b127c3e0dab94c45af15fea99e4468f99ac88b39efb1ad677da5711"},
+    {file = "mypy-0.991-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:3d80e36b7d7a9259b740be6d8d906221789b0d836201af4234093cae89ced0cd"},
+    {file = "mypy-0.991-cp37-cp37m-win_amd64.whl", hash = "sha256:e62ebaad93be3ad1a828a11e90f0e76f15449371ffeecca4a0a0b9adc99abcef"},
+    {file = "mypy-0.991-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:b86ce2c1866a748c0f6faca5232059f881cda6dda2a893b9a8373353cfe3715a"},
+    {file = "mypy-0.991-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:ac6e503823143464538efda0e8e356d871557ef60ccd38f8824a4257acc18d93"},
+    {file = "mypy-0.991-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:0cca5adf694af539aeaa6ac633a7afe9bbd760df9d31be55ab780b77ab5ae8bf"},
+    {file = "mypy-0.991-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a12c56bf73cdab116df96e4ff39610b92a348cc99a1307e1da3c3768bbb5b135"},
+    {file = "mypy-0.991-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:652b651d42f155033a1967739788c436491b577b6a44e4c39fb340d0ee7f0d70"},
+    {file = "mypy-0.991-cp38-cp38-win_amd64.whl", hash = "sha256:4175593dc25d9da12f7de8de873a33f9b2b8bdb4e827a7cae952e5b1a342e243"},
+    {file = "mypy-0.991-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:98e781cd35c0acf33eb0295e8b9c55cdbef64fcb35f6d3aa2186f289bed6e80d"},
+    {file = "mypy-0.991-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6d7464bac72a85cb3491c7e92b5b62f3dcccb8af26826257760a552a5e244aa5"},
+    {file = "mypy-0.991-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:c9166b3f81a10cdf9b49f2d594b21b31adadb3d5e9db9b834866c3258b695be3"},
+    {file = "mypy-0.991-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b8472f736a5bfb159a5e36740847808f6f5b659960115ff29c7cecec1741c648"},
+    {file = "mypy-0.991-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:5e80e758243b97b618cdf22004beb09e8a2de1af481382e4d84bc52152d1c476"},
+    {file = "mypy-0.991-cp39-cp39-win_amd64.whl", hash = "sha256:74e259b5c19f70d35fcc1ad3d56499065c601dfe94ff67ae48b85596b9ec1461"},
+    {file = "mypy-0.991-py3-none-any.whl", hash = "sha256:de32edc9b0a7e67c2775e574cb061a537660e51210fbf6006b0b36ea695ae9bb"},
+    {file = "mypy-0.991.tar.gz", hash = "sha256:3c0165ba8f354a6d9881809ef29f1a9318a236a6d81c690094c5df32107bde06"},
 ]
 mypy-boto3-s3 = [
     {file = "mypy-boto3-s3-1.26.0.post1.tar.gz", hash = "sha256:6d7079f8c739dc993cbedad0736299c413b297814b73795a3855a79169ecc938"},
@@ -2157,6 +2176,10 @@ pytest-asyncio = [
     {file = "pytest-asyncio-0.19.0.tar.gz", hash = "sha256:ac4ebf3b6207259750bc32f4c1d8fcd7e79739edbc67ad0c58dd150b1d072fed"},
     {file = "pytest_asyncio-0.19.0-py3-none-any.whl", hash = "sha256:7a97e37cfe1ed296e2e84941384bdd37c376453912d397ed39293e0916f521fa"},
 ]
+pytest-httpserver = [
+    {file = "pytest_httpserver-1.0.6-py3-none-any.whl", hash = "sha256:ac2379acc91fe8bdbe2911c93af8dd130e33b5899fb9934d15669480739c6d32"},
+    {file = "pytest_httpserver-1.0.6.tar.gz", hash = "sha256:9040d07bf59ac45d8de3db1d4468fd2d1d607975e4da4c872ecc0402cdbf7b3e"},
+]
 pytest-lazy-fixture = [
     {file = "pytest-lazy-fixture-0.6.3.tar.gz", hash = "sha256:0e7d0c7f74ba33e6e80905e9bfd81f9d15ef9a790de97993e34213deb5ad10ac"},
     {file = "pytest_lazy_fixture-0.6.3-py3-none-any.whl", hash = "sha256:e0b379f38299ff27a653f03eaa69b08a6fd4484e46fd1c9907d984b9f9daeda6"},
@@ -2260,8 +2283,8 @@ sarif-om = [
     {file = "sarif_om-1.0.4.tar.gz", hash = "sha256:cd5f416b3083e00d402a92e449a7ff67af46f11241073eea0461802a3b5aef98"},
 ]
 setuptools = [
-    {file = "setuptools-65.5.0-py3-none-any.whl", hash = "sha256:f62ea9da9ed6289bfe868cd6845968a2c854d1427f8548d52cae02a42b4f0356"},
-    {file = "setuptools-65.5.0.tar.gz", hash = "sha256:512e5536220e38146176efb833d4a62aa726b7bbff82cfbc8ba9eaa3996e0b17"},
+    {file = "setuptools-65.5.1-py3-none-any.whl", hash = "sha256:d0b9a8433464d5800cbe05094acf5c6d52a91bfac9b52bcfc4d41382be5d5d31"},
+    {file = "setuptools-65.5.1.tar.gz", hash = "sha256:e197a19aa8ec9722928f2206f8de752def0e4c9fc6953527360d1c36d94ddb2f"},
 ]
 six = [
     {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"},
diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml
index 14a5450d5e..cbc067093e 100644
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -2,6 +2,7 @@
 name = "proxy"
 version = "0.1.0"
 edition = "2021"
+license = "Apache-2.0"
 
 [dependencies]
 anyhow = "1.0"
@@ -16,12 +17,14 @@ hashbrown = "0.12"
 hex = "0.4.3"
 hmac = "0.12.1"
 hyper = "0.14"
+hyper-tungstenite = "0.8.1"
 itertools = "0.10.3"
 md5 = "0.7.0"
 once_cell = "1.13.0"
 parking_lot = "0.12"
 pin-project-lite = "0.2.7"
 rand = "0.8.3"
+regex = "1.4.5"
 reqwest = { version = "0.11", default-features = false, features = [ "json", "rustls-tls" ] }
 routerify = "3"
 rustls = "0.20.0"
@@ -33,12 +36,14 @@ sha2 = "0.10.2"
 socket2 = "0.4.4"
 thiserror = "1.0.30"
 tokio = { version = "1.17", features = ["macros"] }
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
 tokio-rustls = "0.23.0"
+tls-listener = { version = "0.5.1", features = ["rustls", "hyper-h1"] }
 tracing = "0.1.36"
 tracing-subscriber = { version = "0.3", features = ["env-filter"] }
 url = "2.2.2"
 uuid = { version = "1.2", features = ["v4", "serde"] }
+webpki-roots = "0.22.5"
 x509-parser = "0.14"
 
 metrics = { path = "../libs/metrics" }
diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs
index 4b937f017a..e6a179a040 100644
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -8,7 +8,9 @@ pub use console::{GetAuthInfoError, WakeComputeError};
 
 use crate::{
     auth::{self, AuthFlow, ClientCredentials},
-    compute, http, mgmt, stream, url,
+    compute,
+    console::messages::MetricsAuxInfo,
+    http, mgmt, stream, url,
     waiters::{self, Waiter, Waiters},
 };
 use once_cell::sync::Lazy;
@@ -126,25 +128,13 @@ pub struct AuthSuccess<T> {
     pub value: T,
 }
 
-impl<T> AuthSuccess<T> {
-    /// Very similar to [`std::option::Option::map`].
-    /// Maps [`AuthSuccess<T>`] to [`AuthSuccess<R>`] by applying
-    /// a function to a contained value.
-    pub fn map<R>(self, f: impl FnOnce(T) -> R) -> AuthSuccess<R> {
-        AuthSuccess {
-            reported_auth_ok: self.reported_auth_ok,
-            value: f(self.value),
-        }
-    }
-}
-
 /// Info for establishing a connection to a compute node.
 /// This is what we get after auth succeeded, but not before!
 pub struct NodeInfo {
-    /// Project from [`auth::ClientCredentials`].
-    pub project: String,
     /// Compute node connection params.
     pub config: compute::ConnCfg,
+    /// Labels for proxy's metrics.
+    pub aux: MetricsAuxInfo,
 }
 
 impl BackendType<'_, ClientCredentials<'_>> {
@@ -159,7 +149,7 @@ impl BackendType<'_, ClientCredentials<'_>> {
         // If there's no project so far, that entails that client doesn't
         // support SNI or other means of passing the project name.
         // We now expect to see a very specific payload in the place of password.
-        let fetch_magic_payload = async {
+        let fetch_magic_payload = |client| async {
             warn!("project name not specified, resorting to the password hack auth flow");
             let payload = AuthFlow::new(client)
                 .begin(auth::PasswordHack)
@@ -171,38 +161,61 @@ impl BackendType<'_, ClientCredentials<'_>> {
             auth::Result::Ok(payload)
         };
 
+        // If we want to use cleartext password flow, we can read the password
+        // from the client and pretend that it's a magic payload (PasswordHack hack).
+        let fetch_plaintext_password = |client| async {
+            info!("using cleartext password flow");
+            let payload = AuthFlow::new(client)
+                .begin(auth::CleartextPassword)
+                .await?
+                .authenticate()
+                .await?;
+
+            auth::Result::Ok(auth::password_hack::PasswordHackPayload {
+                project: String::new(),
+                password: payload,
+            })
+        };
+
         // TODO: find a proper way to merge those very similar blocks.
-        let (mut config, payload) = match self {
+        let (mut node, payload) = match self {
             Console(endpoint, creds) if creds.project.is_none() => {
-                let payload = fetch_magic_payload.await?;
+                let payload = fetch_magic_payload(client).await?;
 
                 let mut creds = creds.as_ref();
                 creds.project = Some(payload.project.as_str().into());
-                let config = console::Api::new(endpoint, extra, &creds)
+                let node = console::Api::new(endpoint, extra, &creds)
                     .wake_compute()
                     .await?;
 
-                (config, payload)
+                (node, payload)
+            }
+            Console(endpoint, creds) if creds.use_cleartext_password_flow => {
+                // This is a hack to allow cleartext password in secure connections (wss).
+                let payload = fetch_plaintext_password(client).await?;
+                let creds = creds.as_ref();
+                let node = console::Api::new(endpoint, extra, &creds)
+                    .wake_compute()
+                    .await?;
+
+                (node, payload)
             }
             Postgres(endpoint, creds) if creds.project.is_none() => {
-                let payload = fetch_magic_payload.await?;
+                let payload = fetch_magic_payload(client).await?;
 
                 let mut creds = creds.as_ref();
                 creds.project = Some(payload.project.as_str().into());
-                let config = postgres::Api::new(endpoint, &creds).wake_compute().await?;
+                let node = postgres::Api::new(endpoint, &creds).wake_compute().await?;
 
-                (config, payload)
+                (node, payload)
             }
             _ => return Ok(None),
         };
 
-        config.password(payload.password);
+        node.config.password(payload.password);
         Ok(Some(AuthSuccess {
             reported_auth_ok: false,
-            value: NodeInfo {
-                project: payload.project,
-                config,
-            },
+            value: node,
         }))
     }
 
@@ -233,10 +246,6 @@ impl BackendType<'_, ClientCredentials<'_>> {
                 console::Api::new(&endpoint, extra, &creds)
                     .handle_user(client)
                     .await?
-                    .map(|config| NodeInfo {
-                        project: creds.project.unwrap().into_owned(),
-                        config,
-                    })
             }
             Postgres(endpoint, creds) => {
                 info!("performing mock authentication using a local postgres instance");
@@ -245,10 +254,6 @@ impl BackendType<'_, ClientCredentials<'_>> {
                 postgres::Api::new(&endpoint, &creds)
                     .handle_user(client)
                     .await?
-                    .map(|config| NodeInfo {
-                        project: creds.project.unwrap().into_owned(),
-                        config,
-                    })
             }
             // NOTE: this auth backend doesn't use client credentials.
             Link(url) => {
diff --git a/proxy/src/auth/backend/console.rs b/proxy/src/auth/backend/console.rs
index 040870fc8e..b3e3fd0c10 100644
--- a/proxy/src/auth/backend/console.rs
+++ b/proxy/src/auth/backend/console.rs
@@ -1,16 +1,16 @@
 //! Cloud API V2.
 
-use super::{AuthSuccess, ConsoleReqExtra};
+use super::{AuthSuccess, ConsoleReqExtra, NodeInfo};
 use crate::{
     auth::{self, AuthFlow, ClientCredentials},
     compute,
+    console::messages::{ConsoleError, GetRoleSecret, WakeCompute},
     error::{io_error, UserFacingError},
     http, sasl, scram,
     stream::PqStream,
 };
 use futures::TryFutureExt;
 use reqwest::StatusCode as HttpStatusCode;
-use serde::Deserialize;
 use std::future::Future;
 use thiserror::Error;
 use tokio::io::{AsyncRead, AsyncWrite};
@@ -136,24 +136,6 @@ impl UserFacingError for WakeComputeError {
     }
 }
 
-/// Console's response which holds client's auth secret.
-#[derive(Deserialize, Debug)]
-struct GetRoleSecret {
-    role_secret: Box<str>,
-}
-
-/// Console's response which holds compute node's `host:port` pair.
-#[derive(Deserialize, Debug)]
-struct WakeCompute {
-    address: Box<str>,
-}
-
-/// Console's error response with human-readable description.
-#[derive(Deserialize, Debug)]
-struct ConsoleError {
-    error: Box<str>,
-}
-
 /// Auth secret which is managed by the cloud.
 pub enum AuthInfo {
     /// Md5 hash of user's password.
@@ -194,7 +176,7 @@ impl<'a> Api<'a> {
     pub(super) async fn handle_user(
         &'a self,
         client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin + Send>,
-    ) -> auth::Result<AuthSuccess<compute::ConnCfg>> {
+    ) -> auth::Result<AuthSuccess<NodeInfo>> {
         handle_user(client, self, Self::get_auth_info, Self::wake_compute).await
     }
 }
@@ -238,7 +220,7 @@ impl Api<'_> {
     }
 
     /// Wake up the compute node and return the corresponding connection info.
-    pub async fn wake_compute(&self) -> Result<compute::ConnCfg, WakeComputeError> {
+    pub async fn wake_compute(&self) -> Result<NodeInfo, WakeComputeError> {
         let request_id = uuid::Uuid::new_v4().to_string();
         async {
             let request = self
@@ -269,7 +251,10 @@ impl Api<'_> {
                 .dbname(self.creds.dbname)
                 .user(self.creds.user);
 
-            Ok(config)
+            Ok(NodeInfo {
+                config,
+                aux: body.aux,
+            })
         }
         .map_err(crate::error::log_error)
         .instrument(info_span!("wake_compute", id = request_id))
@@ -284,11 +269,11 @@ pub(super) async fn handle_user<'a, Endpoint, GetAuthInfo, WakeCompute>(
     endpoint: &'a Endpoint,
     get_auth_info: impl FnOnce(&'a Endpoint) -> GetAuthInfo,
     wake_compute: impl FnOnce(&'a Endpoint) -> WakeCompute,
-) -> auth::Result<AuthSuccess<compute::ConnCfg>>
+) -> auth::Result<AuthSuccess<NodeInfo>>
 where
     Endpoint: AsRef<ClientCredentials<'a>>,
     GetAuthInfo: Future<Output = Result<Option<AuthInfo>, GetAuthInfoError>>,
-    WakeCompute: Future<Output = Result<compute::ConnCfg, WakeComputeError>>,
+    WakeCompute: Future<Output = Result<NodeInfo, WakeComputeError>>,
 {
     let creds = endpoint.as_ref();
 
@@ -325,19 +310,20 @@ where
         }
     };
 
-    let mut config = wake_compute(endpoint).await?;
+    let mut node = wake_compute(endpoint).await?;
     if let Some(keys) = scram_keys {
-        config.auth_keys(tokio_postgres::config::AuthKeys::ScramSha256(keys));
+        use tokio_postgres::config::AuthKeys;
+        node.config.auth_keys(AuthKeys::ScramSha256(keys));
     }
 
     Ok(AuthSuccess {
         reported_auth_ok: false,
-        value: config,
+        value: node,
     })
 }
 
 /// Parse http response body, taking status code into account.
-async fn parse_body<T: for<'a> Deserialize<'a>>(
+async fn parse_body<T: for<'a> serde::Deserialize<'a>>(
     response: reqwest::Response,
 ) -> Result<T, ApiError> {
     let status = response.status();
diff --git a/proxy/src/auth/backend/link.rs b/proxy/src/auth/backend/link.rs
index 440a55f194..e16bbc70e4 100644
--- a/proxy/src/auth/backend/link.rs
+++ b/proxy/src/auth/backend/link.rs
@@ -1,6 +1,6 @@
 use super::{AuthSuccess, NodeInfo};
 use crate::{auth, compute, error::UserFacingError, stream::PqStream, waiters};
-use pq_proto::{BeMessage as Be, BeParameterStatusMessage};
+use pq_proto::BeMessage as Be;
 use thiserror::Error;
 use tokio::io::{AsyncRead, AsyncWrite};
 use tracing::{info, info_span};
@@ -60,7 +60,7 @@ pub async fn handle_user(
         info!(parent: &span, "sending the auth URL to the user");
         client
             .write_message_noflush(&Be::AuthenticationOk)?
-            .write_message_noflush(&BeParameterStatusMessage::encoding())?
+            .write_message_noflush(&Be::CLIENT_ENCODING)?
             .write_message(&Be::NoticeResponse(&greeting))
             .await?;
 
@@ -86,8 +86,8 @@ pub async fn handle_user(
     Ok(AuthSuccess {
         reported_auth_ok: true,
         value: NodeInfo {
-            project: db_info.project,
             config,
+            aux: db_info.aux,
         },
     })
 }
diff --git a/proxy/src/auth/backend/postgres.rs b/proxy/src/auth/backend/postgres.rs
index 8f16dc9fa8..260342f103 100644
--- a/proxy/src/auth/backend/postgres.rs
+++ b/proxy/src/auth/backend/postgres.rs
@@ -2,7 +2,7 @@
 
 use super::{
     console::{self, AuthInfo, GetAuthInfoError, WakeComputeError},
-    AuthSuccess,
+    AuthSuccess, NodeInfo,
 };
 use crate::{
     auth::{self, ClientCredentials},
@@ -57,7 +57,7 @@ impl<'a> Api<'a> {
     pub(super) async fn handle_user(
         &'a self,
         client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin + Send>,
-    ) -> auth::Result<AuthSuccess<compute::ConnCfg>> {
+    ) -> auth::Result<AuthSuccess<NodeInfo>> {
         // We reuse user handling logic from a production module.
         console::handle_user(client, self, Self::get_auth_info, Self::wake_compute).await
     }
@@ -103,7 +103,7 @@ impl Api<'_> {
     }
 
     /// We don't need to wake anything locally, so we just return the connection info.
-    pub async fn wake_compute(&self) -> Result<compute::ConnCfg, WakeComputeError> {
+    pub async fn wake_compute(&self) -> Result<NodeInfo, WakeComputeError> {
         let mut config = compute::ConnCfg::new();
         config
             .host(self.endpoint.host_str().unwrap_or("localhost"))
@@ -111,7 +111,10 @@ impl Api<'_> {
             .dbname(self.creds.dbname)
             .user(self.creds.user);
 
-        Ok(config)
+        Ok(NodeInfo {
+            config,
+            aux: Default::default(),
+        })
     }
 }
 
diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs
index 0a3b84bb52..3b71bef9aa 100644
--- a/proxy/src/auth/credentials.rs
+++ b/proxy/src/auth/credentials.rs
@@ -34,6 +34,9 @@ pub struct ClientCredentials<'a> {
     pub user: &'a str,
     pub dbname: &'a str,
     pub project: Option<Cow<'a, str>>,
+    /// If `True`, we'll use the old cleartext password flow. This is used for
+    /// websocket connections, which want to minimize the number of round trips.
+    pub use_cleartext_password_flow: bool,
 }
 
 impl ClientCredentials<'_> {
@@ -50,6 +53,7 @@ impl<'a> ClientCredentials<'a> {
             user: self.user,
             dbname: self.dbname,
             project: self.project().map(Cow::Borrowed),
+            use_cleartext_password_flow: self.use_cleartext_password_flow,
         }
     }
 }
@@ -59,6 +63,7 @@ impl<'a> ClientCredentials<'a> {
         params: &'a StartupMessageParams,
         sni: Option<&str>,
         common_name: Option<&str>,
+        use_cleartext_password_flow: bool,
     ) -> Result<Self, ClientCredsParseError> {
         use ClientCredsParseError::*;
 
@@ -108,6 +113,7 @@ impl<'a> ClientCredentials<'a> {
             user = user,
             dbname = dbname,
             project = project.as_deref(),
+            use_cleartext_password_flow = use_cleartext_password_flow,
             "credentials"
         );
 
@@ -115,6 +121,7 @@ impl<'a> ClientCredentials<'a> {
             user,
             dbname,
             project,
+            use_cleartext_password_flow,
         })
     }
 }
@@ -141,7 +148,7 @@ mod tests {
         let options = StartupMessageParams::new([("user", "john_doe")]);
 
         // TODO: check that `creds.dbname` is None.
-        let creds = ClientCredentials::parse(&options, None, None)?;
+        let creds = ClientCredentials::parse(&options, None, None, false)?;
         assert_eq!(creds.user, "john_doe");
 
         Ok(())
@@ -151,7 +158,7 @@ mod tests {
     fn parse_missing_project() -> anyhow::Result<()> {
         let options = StartupMessageParams::new([("user", "john_doe"), ("database", "world")]);
 
-        let creds = ClientCredentials::parse(&options, None, None)?;
+        let creds = ClientCredentials::parse(&options, None, None, false)?;
         assert_eq!(creds.user, "john_doe");
         assert_eq!(creds.dbname, "world");
         assert_eq!(creds.project, None);
@@ -166,7 +173,7 @@ mod tests {
         let sni = Some("foo.localhost");
         let common_name = Some("localhost");
 
-        let creds = ClientCredentials::parse(&options, sni, common_name)?;
+        let creds = ClientCredentials::parse(&options, sni, common_name, false)?;
         assert_eq!(creds.user, "john_doe");
         assert_eq!(creds.dbname, "world");
         assert_eq!(creds.project.as_deref(), Some("foo"));
@@ -182,7 +189,7 @@ mod tests {
             ("options", "-ckey=1 project=bar -c geqo=off"),
         ]);
 
-        let creds = ClientCredentials::parse(&options, None, None)?;
+        let creds = ClientCredentials::parse(&options, None, None, false)?;
         assert_eq!(creds.user, "john_doe");
         assert_eq!(creds.dbname, "world");
         assert_eq!(creds.project.as_deref(), Some("bar"));
@@ -201,7 +208,7 @@ mod tests {
         let sni = Some("baz.localhost");
         let common_name = Some("localhost");
 
-        let creds = ClientCredentials::parse(&options, sni, common_name)?;
+        let creds = ClientCredentials::parse(&options, sni, common_name, false)?;
         assert_eq!(creds.user, "john_doe");
         assert_eq!(creds.dbname, "world");
         assert_eq!(creds.project.as_deref(), Some("baz"));
@@ -220,7 +227,8 @@ mod tests {
         let sni = Some("second.localhost");
         let common_name = Some("localhost");
 
-        let err = ClientCredentials::parse(&options, sni, common_name).expect_err("should fail");
+        let err =
+            ClientCredentials::parse(&options, sni, common_name, false).expect_err("should fail");
         match err {
             InconsistentProjectNames { domain, option } => {
                 assert_eq!(option, "first");
@@ -237,7 +245,8 @@ mod tests {
         let sni = Some("project.localhost");
         let common_name = Some("example.com");
 
-        let err = ClientCredentials::parse(&options, sni, common_name).expect_err("should fail");
+        let err =
+            ClientCredentials::parse(&options, sni, common_name, false).expect_err("should fail");
         match err {
             InconsistentSni { sni, cn } => {
                 assert_eq!(sni, "project.localhost");
diff --git a/proxy/src/auth/flow.rs b/proxy/src/auth/flow.rs
index d9ee50894d..4b982c0c5e 100644
--- a/proxy/src/auth/flow.rs
+++ b/proxy/src/auth/flow.rs
@@ -37,6 +37,17 @@ impl AuthMethod for PasswordHack {
     }
 }
 
+/// Use clear-text password auth called `password` in docs
+/// <https://www.postgresql.org/docs/current/auth-password.html>
+pub struct CleartextPassword;
+
+impl AuthMethod for CleartextPassword {
+    #[inline(always)]
+    fn first_message(&self) -> BeMessage<'_> {
+        Be::AuthenticationCleartextPassword
+    }
+}
+
 /// This wrapper for [`PqStream`] performs client authentication.
 #[must_use]
 pub struct AuthFlow<'a, Stream, State> {
@@ -86,6 +97,18 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, PasswordHack> {
     }
 }
 
+impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, CleartextPassword> {
+    /// Perform user authentication. Raise an error in case authentication failed.
+    pub async fn authenticate(self) -> super::Result<Vec<u8>> {
+        let msg = self.stream.read_password_message().await?;
+        let password = msg
+            .strip_suffix(&[0])
+            .ok_or(AuthErrorImpl::MalformedPassword("missing terminator"))?;
+
+        Ok(password.to_vec())
+    }
+}
+
 /// Stream wrapper for handling [SCRAM](crate::scram) auth.
 impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, Scram<'_>> {
     /// Perform user authentication. Raise an error in case authentication failed.
diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs
index 4c5edb9673..094db73061 100644
--- a/proxy/src/compute.rs
+++ b/proxy/src/compute.rs
@@ -8,18 +8,17 @@ use tokio::net::TcpStream;
 use tokio_postgres::NoTls;
 use tracing::{error, info};
 
+const COULD_NOT_CONNECT: &str = "Could not connect to compute node";
+
 #[derive(Debug, Error)]
 pub enum ConnectionError {
     /// This error doesn't seem to reveal any secrets; for instance,
     /// [`tokio_postgres::error::Kind`] doesn't contain ip addresses and such.
-    #[error("Failed to connect to the compute node: {0}")]
+    #[error("{COULD_NOT_CONNECT}: {0}")]
     Postgres(#[from] tokio_postgres::Error),
 
-    #[error("Failed to connect to the compute node")]
-    FailedToConnectToCompute,
-
-    #[error("Failed to fetch compute node version")]
-    FailedToFetchPgVersion,
+    #[error("{COULD_NOT_CONNECT}: {0}")]
+    CouldNotConnect(#[from] io::Error),
 }
 
 impl UserFacingError for ConnectionError {
@@ -29,10 +28,10 @@ impl UserFacingError for ConnectionError {
             // This helps us drop irrelevant library-specific prefixes.
             // TODO: propagate severity level and other parameters.
             Postgres(err) => match err.as_db_error() {
-                Some(err) => err.message().to_string(),
+                Some(err) => err.message().to_owned(),
                 None => err.to_string(),
             },
-            other => other.to_string(),
+            _ => COULD_NOT_CONNECT.to_owned(),
         }
     }
 }
@@ -44,12 +43,12 @@ pub type ScramKeys = tokio_postgres::config::ScramKeys<32>;
 /// Eventually, `tokio_postgres` will be replaced with something better.
 /// Newtype allows us to implement methods on top of it.
 #[repr(transparent)]
-pub struct ConnCfg(pub tokio_postgres::Config);
+pub struct ConnCfg(Box<tokio_postgres::Config>);
 
 impl ConnCfg {
     /// Construct a new connection config.
     pub fn new() -> Self {
-        Self(tokio_postgres::Config::new())
+        Self(Default::default())
     }
 }
 
@@ -95,7 +94,7 @@ impl ConnCfg {
                 io::ErrorKind::Other,
                 format!(
                     "couldn't connect: bad compute config, \
-                        ports and hosts entries' count does not match: {:?}",
+                     ports and hosts entries' count does not match: {:?}",
                     self.0
                 ),
             ));
@@ -131,8 +130,8 @@ impl ConnCfg {
 pub struct PostgresConnection {
     /// Socket connected to a compute node.
     pub stream: TcpStream,
-    /// PostgreSQL version of this instance.
-    pub version: String,
+    /// PostgreSQL connection parameters.
+    pub params: std::collections::HashMap<String, String>,
 }
 
 impl ConnCfg {
@@ -156,6 +155,7 @@ impl ConnCfg {
             self.0.application_name(app_name);
         }
 
+        // TODO: This is especially ugly...
         if let Some(replication) = params.get("replication") {
             use tokio_postgres::config::ReplicationMode;
             match replication {
@@ -172,22 +172,24 @@ impl ConnCfg {
         // TODO: extend the list of the forwarded startup parameters.
         // Currently, tokio-postgres doesn't allow us to pass
         // arbitrary parameters, but the ones above are a good start.
+        //
+        // This and the reverse params problem can be better addressed
+        // in a bespoke connection machinery (a new library for that sake).
 
-        let (socket_addr, mut stream) = self
-            .connect_raw()
-            .await
-            .map_err(|_| ConnectionError::FailedToConnectToCompute)?;
-
-        // TODO: establish a secure connection to the DB
-        let (client, conn) = self.0.connect_raw(&mut stream, NoTls).await?;
-        let version = conn
-            .parameter("server_version")
-            .ok_or(ConnectionError::FailedToFetchPgVersion)?
-            .into();
-
+        // TODO: establish a secure connection to the DB.
+        let (socket_addr, mut stream) = self.connect_raw().await?;
+        let (client, connection) = self.0.connect_raw(&mut stream, NoTls).await?;
         info!("connected to user's compute node at {socket_addr}");
+
+        // This is very ugly but as of now there's no better way to
+        // extract the connection parameters from tokio-postgres' connection.
+        // TODO: solve this problem in a more elegant manner (e.g. the new library).
+        let params = connection.parameters;
+
+        // NB: CancelToken is supposed to hold socket_addr, but we use connect_raw.
+        // Yet another reason to rework the connection establishing code.
         let cancel_closure = CancelClosure::new(socket_addr, client.cancel_token());
-        let db = PostgresConnection { stream, version };
+        let db = PostgresConnection { stream, params };
 
         Ok((db, cancel_closure))
     }
diff --git a/proxy/src/console.rs b/proxy/src/console.rs
new file mode 100644
index 0000000000..78f09ac9e1
--- /dev/null
+++ b/proxy/src/console.rs
@@ -0,0 +1,5 @@
+///! Various stuff for dealing with the Neon Console.
+///! Later we might move some API wrappers here.
+
+/// Payloads used in the console's APIs.
+pub mod messages;
diff --git a/proxy/src/console/messages.rs b/proxy/src/console/messages.rs
new file mode 100644
index 0000000000..63a97069b8
--- /dev/null
+++ b/proxy/src/console/messages.rs
@@ -0,0 +1,190 @@
+use serde::Deserialize;
+use std::fmt;
+
+/// Generic error response with human-readable description.
+/// Note that we can't always present it to user as is.
+#[derive(Debug, Deserialize)]
+pub struct ConsoleError {
+    pub error: Box<str>,
+}
+
+/// Response which holds client's auth secret, e.g. [`crate::scram::ServerSecret`].
+/// Returned by the `/proxy_get_role_secret` API method.
+#[derive(Deserialize)]
+pub struct GetRoleSecret {
+    pub role_secret: Box<str>,
+}
+
+// Manually implement debug to omit sensitive info.
+impl fmt::Debug for GetRoleSecret {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.debug_struct("GetRoleSecret").finish_non_exhaustive()
+    }
+}
+
+/// Response which holds compute node's `host:port` pair.
+/// Returned by the `/proxy_wake_compute` API method.
+#[derive(Debug, Deserialize)]
+pub struct WakeCompute {
+    pub address: Box<str>,
+    pub aux: MetricsAuxInfo,
+}
+
+/// Async response which concludes the link auth flow.
+/// Also known as `kickResponse` in the console.
+#[derive(Debug, Deserialize)]
+pub struct KickSession<'a> {
+    /// Session ID is assigned by the proxy.
+    pub session_id: &'a str,
+
+    /// Compute node connection params.
+    #[serde(deserialize_with = "KickSession::parse_db_info")]
+    pub result: DatabaseInfo,
+}
+
+impl KickSession<'_> {
+    fn parse_db_info<'de, D>(des: D) -> Result<DatabaseInfo, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        #[derive(Deserialize)]
+        enum Wrapper {
+            // Currently, console only reports `Success`.
+            // `Failure(String)` used to be here... RIP.
+            Success(DatabaseInfo),
+        }
+
+        Wrapper::deserialize(des).map(|x| match x {
+            Wrapper::Success(info) => info,
+        })
+    }
+}
+
+/// Compute node connection params.
+#[derive(Deserialize)]
+pub struct DatabaseInfo {
+    pub host: String,
+    pub port: u16,
+    pub dbname: String,
+    pub user: String,
+    /// Console always provides a password, but it might
+    /// be inconvenient for debug with local PG instance.
+    pub password: Option<String>,
+    pub aux: MetricsAuxInfo,
+}
+
+// Manually implement debug to omit sensitive info.
+impl fmt::Debug for DatabaseInfo {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        f.debug_struct("DatabaseInfo")
+            .field("host", &self.host)
+            .field("port", &self.port)
+            .field("dbname", &self.dbname)
+            .field("user", &self.user)
+            .finish_non_exhaustive()
+    }
+}
+
+/// Various labels for prometheus metrics.
+/// Also known as `ProxyMetricsAuxInfo` in the console.
+#[derive(Debug, Deserialize, Default)]
+pub struct MetricsAuxInfo {
+    pub endpoint_id: Box<str>,
+    pub project_id: Box<str>,
+    pub branch_id: Box<str>,
+}
+
+impl MetricsAuxInfo {
+    /// Definitions of labels for traffic metric.
+    pub const TRAFFIC_LABELS: &'static [&'static str] = &[
+        // Received (rx) / sent (tx).
+        "direction",
+        // ID of a project.
+        "project_id",
+        // ID of an endpoint within a project.
+        "endpoint_id",
+        // ID of a branch within a project (snapshot).
+        "branch_id",
+    ];
+
+    /// Values of labels for traffic metric.
+    // TODO: add more type safety (validate arity & positions).
+    pub fn traffic_labels(&self, direction: &'static str) -> [&str; 4] {
+        [
+            direction,
+            &self.project_id,
+            &self.endpoint_id,
+            &self.branch_id,
+        ]
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use serde_json::json;
+
+    fn dummy_aux() -> serde_json::Value {
+        json!({
+            "endpoint_id": "endpoint",
+            "project_id": "project",
+            "branch_id": "branch",
+        })
+    }
+
+    #[test]
+    fn parse_kick_session() -> anyhow::Result<()> {
+        // This is what the console's kickResponse looks like.
+        let json = json!({
+            "session_id": "deadbeef",
+            "result": {
+                "Success": {
+                    "host": "localhost",
+                    "port": 5432,
+                    "dbname": "postgres",
+                    "user": "john_doe",
+                    "password": "password",
+                    "aux": dummy_aux(),
+                }
+            }
+        });
+        let _: KickSession = serde_json::from_str(&json.to_string())?;
+
+        Ok(())
+    }
+
+    #[test]
+    fn parse_db_info() -> anyhow::Result<()> {
+        // with password
+        let _: DatabaseInfo = serde_json::from_value(json!({
+            "host": "localhost",
+            "port": 5432,
+            "dbname": "postgres",
+            "user": "john_doe",
+            "password": "password",
+            "aux": dummy_aux(),
+        }))?;
+
+        // without password
+        let _: DatabaseInfo = serde_json::from_value(json!({
+            "host": "localhost",
+            "port": 5432,
+            "dbname": "postgres",
+            "user": "john_doe",
+            "aux": dummy_aux(),
+        }))?;
+
+        // new field (forward compatibility)
+        let _: DatabaseInfo = serde_json::from_value(json!({
+            "host": "localhost",
+            "port": 5432,
+            "dbname": "postgres",
+            "user": "john_doe",
+            "project": "hello_world",
+            "N.E.W": "forward compatibility check",
+            "aux": dummy_aux(),
+        }))?;
+
+        Ok(())
+    }
+}
diff --git a/proxy/src/http.rs b/proxy/src/http.rs
index 096a33d73d..e847edc8bd 100644
--- a/proxy/src/http.rs
+++ b/proxy/src/http.rs
@@ -1,4 +1,5 @@
 pub mod server;
+pub mod websocket;
 
 use crate::url::ApiUrl;
 
diff --git a/proxy/src/http/websocket.rs b/proxy/src/http/websocket.rs
new file mode 100644
index 0000000000..33c2752307
--- /dev/null
+++ b/proxy/src/http/websocket.rs
@@ -0,0 +1,263 @@
+use bytes::{Buf, Bytes};
+use futures::{Sink, Stream, StreamExt};
+use hyper::server::accept::{self};
+use hyper::server::conn::AddrIncoming;
+use hyper::upgrade::Upgraded;
+use hyper::{Body, Request, Response, StatusCode};
+use hyper_tungstenite::{tungstenite, WebSocketStream};
+use hyper_tungstenite::{tungstenite::Message, HyperWebsocket};
+use pin_project_lite::pin_project;
+use tokio::net::TcpListener;
+
+use std::convert::Infallible;
+use std::future::ready;
+use std::pin::Pin;
+use std::sync::Arc;
+use std::task::{Context, Poll};
+use tls_listener::TlsListener;
+
+use tokio::io::{self, AsyncBufRead, AsyncRead, AsyncWrite, ReadBuf};
+
+use tracing::{error, info, info_span, warn, Instrument};
+use utils::http::{error::ApiError, json::json_response};
+
+use crate::cancellation::CancelMap;
+use crate::config::ProxyConfig;
+use crate::proxy::handle_ws_client;
+
+pin_project! {
+    /// This is a wrapper around a WebSocketStream that implements AsyncRead and AsyncWrite.
+    pub struct WebSocketRW {
+        #[pin]
+        stream: WebSocketStream<Upgraded>,
+        chunk: Option<bytes::Bytes>,
+    }
+}
+
+// FIXME: explain why this is safe or try to remove `unsafe impl`.
+unsafe impl Sync for WebSocketRW {}
+
+impl WebSocketRW {
+    pub fn new(stream: WebSocketStream<Upgraded>) -> Self {
+        Self {
+            stream,
+            chunk: None,
+        }
+    }
+
+    fn has_chunk(&self) -> bool {
+        if let Some(ref chunk) = self.chunk {
+            chunk.remaining() > 0
+        } else {
+            false
+        }
+    }
+}
+
+fn ws_err_into(e: tungstenite::Error) -> io::Error {
+    io::Error::new(io::ErrorKind::Other, e.to_string())
+}
+
+impl AsyncWrite for WebSocketRW {
+    fn poll_write(
+        self: Pin<&mut Self>,
+        cx: &mut Context<'_>,
+        buf: &[u8],
+    ) -> Poll<Result<usize, io::Error>> {
+        let mut this = self.project();
+        match this.stream.as_mut().poll_ready(cx) {
+            Poll::Ready(Ok(())) => {
+                if let Err(e) = this
+                    .stream
+                    .as_mut()
+                    .start_send(Message::Binary(buf.to_vec()))
+                {
+                    Poll::Ready(Err(ws_err_into(e)))
+                } else {
+                    Poll::Ready(Ok(buf.len()))
+                }
+            }
+            Poll::Ready(Err(e)) => Poll::Ready(Err(ws_err_into(e))),
+            Poll::Pending => {
+                cx.waker().wake_by_ref();
+                Poll::Pending
+            }
+        }
+    }
+
+    fn poll_flush(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Result<(), io::Error>> {
+        self.project().stream.poll_flush(cx).map_err(ws_err_into)
+    }
+
+    fn poll_shutdown(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Result<(), io::Error>> {
+        self.project().stream.poll_close(cx).map_err(ws_err_into)
+    }
+}
+
+impl AsyncRead for WebSocketRW {
+    fn poll_read(
+        mut self: Pin<&mut Self>,
+        cx: &mut Context<'_>,
+        buf: &mut ReadBuf<'_>,
+    ) -> Poll<io::Result<()>> {
+        if buf.remaining() == 0 {
+            return Poll::Ready(Ok(()));
+        }
+
+        let inner_buf = match self.as_mut().poll_fill_buf(cx) {
+            Poll::Ready(Ok(buf)) => buf,
+            Poll::Ready(Err(err)) => return Poll::Ready(Err(err)),
+            Poll::Pending => return Poll::Pending,
+        };
+        let len = std::cmp::min(inner_buf.len(), buf.remaining());
+        buf.put_slice(&inner_buf[..len]);
+
+        self.consume(len);
+        Poll::Ready(Ok(()))
+    }
+}
+
+impl AsyncBufRead for WebSocketRW {
+    fn poll_fill_buf(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<io::Result<&[u8]>> {
+        loop {
+            if self.as_mut().has_chunk() {
+                let buf = self.project().chunk.as_ref().unwrap().chunk();
+                return Poll::Ready(Ok(buf));
+            } else {
+                match self.as_mut().project().stream.poll_next(cx) {
+                    Poll::Ready(Some(Ok(message))) => match message {
+                        Message::Text(_) => {}
+                        Message::Binary(chunk) => {
+                            *self.as_mut().project().chunk = Some(Bytes::from(chunk));
+                        }
+                        Message::Ping(_) => {
+                            // No need to send a reply: tungstenite takes care of this for you.
+                        }
+                        Message::Pong(_) => {}
+                        Message::Close(_) => {
+                            // No need to send a reply: tungstenite takes care of this for you.
+                            return Poll::Ready(Ok(&[]));
+                        }
+                        Message::Frame(_) => {
+                            unreachable!();
+                        }
+                    },
+                    Poll::Ready(Some(Err(err))) => return Poll::Ready(Err(ws_err_into(err))),
+                    Poll::Ready(None) => return Poll::Ready(Ok(&[])),
+                    Poll::Pending => return Poll::Pending,
+                }
+            }
+        }
+    }
+
+    fn consume(self: Pin<&mut Self>, amt: usize) {
+        if amt > 0 {
+            self.project()
+                .chunk
+                .as_mut()
+                .expect("No chunk present")
+                .advance(amt);
+        }
+    }
+}
+
+async fn serve_websocket(
+    websocket: HyperWebsocket,
+    config: &ProxyConfig,
+    cancel_map: &CancelMap,
+    session_id: uuid::Uuid,
+    hostname: Option<String>,
+) -> anyhow::Result<()> {
+    let websocket = websocket.await?;
+    handle_ws_client(
+        config,
+        cancel_map,
+        session_id,
+        WebSocketRW::new(websocket),
+        hostname,
+    )
+    .await?;
+    Ok(())
+}
+
+async fn ws_handler(
+    mut request: Request<Body>,
+    config: &'static ProxyConfig,
+    cancel_map: Arc<CancelMap>,
+    session_id: uuid::Uuid,
+) -> Result<Response<Body>, ApiError> {
+    let host = request
+        .headers()
+        .get("host")
+        .and_then(|h| h.to_str().ok())
+        .and_then(|h| h.split(':').next())
+        .map(|s| s.to_string());
+
+    // Check if the request is a websocket upgrade request.
+    if hyper_tungstenite::is_upgrade_request(&request) {
+        let (response, websocket) = hyper_tungstenite::upgrade(&mut request, None)
+            .map_err(|e| ApiError::BadRequest(e.into()))?;
+
+        tokio::spawn(async move {
+            if let Err(e) = serve_websocket(websocket, config, &cancel_map, session_id, host).await
+            {
+                error!("error in websocket connection: {:?}", e);
+            }
+        });
+
+        // Return the response so the spawned future can continue.
+        Ok(response)
+    } else {
+        json_response(StatusCode::OK, "Connect with a websocket client")
+    }
+}
+
+pub async fn task_main(
+    ws_listener: TcpListener,
+    config: &'static ProxyConfig,
+) -> anyhow::Result<()> {
+    scopeguard::defer! {
+        info!("websocket server has shut down");
+    }
+
+    let tls_config = config.tls_config.as_ref().map(|cfg| cfg.to_server_config());
+    let tls_acceptor: tokio_rustls::TlsAcceptor = match tls_config {
+        Some(config) => config.into(),
+        None => {
+            warn!("TLS config is missing, WebSocket Secure server will not be started");
+            return Ok(());
+        }
+    };
+
+    let addr_incoming = AddrIncoming::from_listener(ws_listener)?;
+
+    let tls_listener = TlsListener::new(tls_acceptor, addr_incoming).filter(|conn| {
+        if let Err(err) = conn {
+            error!("failed to accept TLS connection for websockets: {:?}", err);
+            ready(false)
+        } else {
+            ready(true)
+        }
+    });
+
+    let make_svc = hyper::service::make_service_fn(|_stream| async move {
+        Ok::<_, Infallible>(hyper::service::service_fn(
+            move |req: Request<Body>| async move {
+                let cancel_map = Arc::new(CancelMap::default());
+                let session_id = uuid::Uuid::new_v4();
+                ws_handler(req, config, cancel_map, session_id)
+                    .instrument(info_span!(
+                        "ws-client",
+                        session = format_args!("{session_id}")
+                    ))
+                    .await
+            },
+        ))
+    });
+
+    hyper::Server::builder(accept::from_stream(tls_listener))
+        .serve(make_svc)
+        .await?;
+
+    Ok(())
+}
diff --git a/proxy/src/main.rs b/proxy/src/main.rs
index 2855d1f900..aa6766c102 100644
--- a/proxy/src/main.rs
+++ b/proxy/src/main.rs
@@ -8,6 +8,7 @@ mod auth;
 mod cancellation;
 mod compute;
 mod config;
+mod console;
 mod error;
 mod http;
 mod mgmt;
@@ -109,12 +110,23 @@ async fn main() -> anyhow::Result<()> {
     info!("Starting proxy on {proxy_address}");
     let proxy_listener = TcpListener::bind(proxy_address).await?;
 
-    let tasks = [
+    let mut tasks = vec![
         tokio::spawn(http::server::task_main(http_listener)),
         tokio::spawn(proxy::task_main(config, proxy_listener)),
         tokio::task::spawn_blocking(move || mgmt::thread_main(mgmt_listener)),
-    ]
-    .map(flatten_err);
+    ];
+
+    if let Some(wss_address) = arg_matches.get_one::<String>("wss") {
+        let wss_address: SocketAddr = wss_address.parse()?;
+        info!("Starting wss on {}", wss_address);
+        let wss_listener = TcpListener::bind(wss_address).await?;
+        tasks.push(tokio::spawn(http::websocket::task_main(
+            wss_listener,
+            config,
+        )));
+    }
+
+    let tasks = tasks.into_iter().map(flatten_err);
 
     set_build_info_metric(GIT_VERSION);
     // This will block until all tasks have completed.
@@ -154,6 +166,11 @@ fn cli() -> clap::Command {
                 .help("listen for incoming http connections (metrics, etc) on ip:port")
                 .default_value("127.0.0.1:7001"),
         )
+        .arg(
+            Arg::new("wss")
+                .long("wss")
+                .help("listen for incoming wss connections on ip:port"),
+        )
         .arg(
             Arg::new("uri")
                 .short('u')
diff --git a/proxy/src/mgmt.rs b/proxy/src/mgmt.rs
index 23e10b5a9b..cf83b48ae0 100644
--- a/proxy/src/mgmt.rs
+++ b/proxy/src/mgmt.rs
@@ -1,13 +1,18 @@
-use crate::auth;
+use crate::{
+    auth,
+    console::messages::{DatabaseInfo, KickSession},
+};
 use anyhow::Context;
 use pq_proto::{BeMessage, SINGLE_COL_ROWDESC};
-use serde::Deserialize;
 use std::{
     net::{TcpListener, TcpStream},
     thread,
 };
 use tracing::{error, info, info_span};
-use utils::postgres_backend::{self, AuthType, PostgresBackend};
+use utils::{
+    postgres_backend::{self, AuthType, PostgresBackend},
+    postgres_backend_async::QueryError,
+};
 
 /// Console management API listener thread.
 /// It spawns console response handlers needed for the link auth.
@@ -45,68 +50,18 @@ pub fn thread_main(listener: TcpListener) -> anyhow::Result<()> {
     }
 }
 
-fn handle_connection(socket: TcpStream) -> anyhow::Result<()> {
+fn handle_connection(socket: TcpStream) -> Result<(), QueryError> {
     let pgbackend = PostgresBackend::new(socket, AuthType::Trust, None, true)?;
     pgbackend.run(&mut MgmtHandler)
 }
 
-/// Known as `kickResponse` in the console.
-#[derive(Debug, Deserialize)]
-struct PsqlSessionResponse {
-    session_id: String,
-    result: PsqlSessionResult,
-}
-
-#[derive(Debug, Deserialize)]
-enum PsqlSessionResult {
-    Success(DatabaseInfo),
-    Failure(String),
-}
-
 /// A message received by `mgmt` when a compute node is ready.
 pub type ComputeReady = Result<DatabaseInfo, String>;
 
-impl PsqlSessionResult {
-    fn into_compute_ready(self) -> ComputeReady {
-        match self {
-            Self::Success(db_info) => Ok(db_info),
-            Self::Failure(message) => Err(message),
-        }
-    }
-}
-
-/// Compute node connection params provided by the console.
-/// This struct and its parents are mgmt API implementation
-/// detail and thus should remain in this module.
-// TODO: restore deserialization tests from git history.
-#[derive(Deserialize)]
-pub struct DatabaseInfo {
-    pub host: String,
-    pub port: u16,
-    pub dbname: String,
-    pub user: String,
-    /// Console always provides a password, but it might
-    /// be inconvenient for debug with local PG instance.
-    pub password: Option<String>,
-    pub project: String,
-}
-
-// Manually implement debug to omit sensitive info.
-impl std::fmt::Debug for DatabaseInfo {
-    fn fmt(&self, fmt: &mut std::fmt::Formatter) -> std::fmt::Result {
-        fmt.debug_struct("DatabaseInfo")
-            .field("host", &self.host)
-            .field("port", &self.port)
-            .field("dbname", &self.dbname)
-            .field("user", &self.user)
-            .finish_non_exhaustive()
-    }
-}
-
 // TODO: replace with an http-based protocol.
 struct MgmtHandler;
 impl postgres_backend::Handler for MgmtHandler {
-    fn process_query(&mut self, pgb: &mut PostgresBackend, query: &str) -> anyhow::Result<()> {
+    fn process_query(&mut self, pgb: &mut PostgresBackend, query: &str) -> Result<(), QueryError> {
         try_process_query(pgb, query).map_err(|e| {
             error!("failed to process response: {e:?}");
             e
@@ -114,14 +69,14 @@ impl postgres_backend::Handler for MgmtHandler {
     }
 }
 
-fn try_process_query(pgb: &mut PostgresBackend, query: &str) -> anyhow::Result<()> {
-    let resp: PsqlSessionResponse = serde_json::from_str(query)?;
+fn try_process_query(pgb: &mut PostgresBackend, query: &str) -> Result<(), QueryError> {
+    let resp: KickSession = serde_json::from_str(query).context("Failed to parse query as json")?;
 
     let span = info_span!("event", session_id = resp.session_id);
     let _enter = span.enter();
     info!("got response: {:?}", resp.result);
 
-    match auth::backend::notify(&resp.session_id, resp.result.into_compute_ready()) {
+    match auth::backend::notify(resp.session_id, Ok(resp.result)) {
         Ok(()) => {
             pgb.write_message_noflush(&SINGLE_COL_ROWDESC)?
                 .write_message_noflush(&BeMessage::DataRow(&[Some(b"ok")]))?
@@ -129,49 +84,9 @@ fn try_process_query(pgb: &mut PostgresBackend, query: &str) -> anyhow::Result<(
         }
         Err(e) => {
             error!("failed to deliver response to per-client task");
-            pgb.write_message(&BeMessage::ErrorResponse(&e.to_string()))?;
+            pgb.write_message(&BeMessage::ErrorResponse(&e.to_string(), None))?;
         }
     }
 
     Ok(())
 }
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use serde_json::json;
-
-    #[test]
-    fn parse_db_info() -> anyhow::Result<()> {
-        // with password
-        let _: DatabaseInfo = serde_json::from_value(json!({
-            "host": "localhost",
-            "port": 5432,
-            "dbname": "postgres",
-            "user": "john_doe",
-            "password": "password",
-            "project": "hello_world",
-        }))?;
-
-        // without password
-        let _: DatabaseInfo = serde_json::from_value(json!({
-            "host": "localhost",
-            "port": 5432,
-            "dbname": "postgres",
-            "user": "john_doe",
-            "project": "hello_world",
-        }))?;
-
-        // new field (forward compatibility)
-        let _: DatabaseInfo = serde_json::from_value(json!({
-            "host": "localhost",
-            "port": 5432,
-            "dbname": "postgres",
-            "user": "john_doe",
-            "project": "hello_world",
-            "N.E.W": "forward compatibility check",
-        }))?;
-
-        Ok(())
-    }
-}
diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs
index da3cb144e3..63573d49c0 100644
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -11,7 +11,7 @@ use anyhow::{bail, Context};
 use futures::TryFutureExt;
 use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec};
 use once_cell::sync::Lazy;
-use pq_proto::{BeMessage as Be, *};
+use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams};
 use std::sync::Arc;
 use tokio::io::{AsyncRead, AsyncWrite};
 use tracing::{error, info, info_span, Instrument};
@@ -39,12 +39,7 @@ static NUM_BYTES_PROXIED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
     register_int_counter_vec!(
         "proxy_io_bytes_per_client",
         "Number of bytes sent/received between client and backend.",
-        &[
-            // Received (rx) / sent (tx).
-            "direction",
-            // Proxy can keep calling it `project` internally.
-            "endpoint_id"
-        ]
+        crate::console::messages::MetricsAuxInfo::TRAFFIC_LABELS,
     )
     .unwrap()
 });
@@ -87,6 +82,47 @@ pub async fn task_main(
     }
 }
 
+pub async fn handle_ws_client(
+    config: &ProxyConfig,
+    cancel_map: &CancelMap,
+    session_id: uuid::Uuid,
+    stream: impl AsyncRead + AsyncWrite + Unpin + Send,
+    hostname: Option<String>,
+) -> anyhow::Result<()> {
+    // The `closed` counter will increase when this future is destroyed.
+    NUM_CONNECTIONS_ACCEPTED_COUNTER.inc();
+    scopeguard::defer! {
+        NUM_CONNECTIONS_CLOSED_COUNTER.inc();
+    }
+
+    let tls = config.tls_config.as_ref();
+    let hostname = hostname.as_deref();
+
+    // TLS is None here, because the connection is already encrypted.
+    let do_handshake = handshake(stream, None, cancel_map).instrument(info_span!("handshake"));
+    let (mut stream, params) = match do_handshake.await? {
+        Some(x) => x,
+        None => return Ok(()), // it's a cancellation request
+    };
+
+    // Extract credentials which we're going to use for auth.
+    let creds = {
+        let common_name = tls.and_then(|tls| tls.common_name.as_deref());
+        let result = config
+            .auth_backend
+            .as_ref()
+            .map(|_| auth::ClientCredentials::parse(&params, hostname, common_name, true))
+            .transpose();
+
+        async { result }.or_else(|e| stream.throw_error(e)).await?
+    };
+
+    let client = Client::new(stream, creds, &params, session_id);
+    cancel_map
+        .with_session(|session| client.connect_to_db(session))
+        .await
+}
+
 async fn handle_client(
     config: &ProxyConfig,
     cancel_map: &CancelMap,
@@ -113,7 +149,7 @@ async fn handle_client(
         let result = config
             .auth_backend
             .as_ref()
-            .map(|_| auth::ClientCredentials::parse(&params, sni, common_name))
+            .map(|_| auth::ClientCredentials::parse(&params, sni, common_name, false))
             .transpose();
 
         async { result }.or_else(|e| stream.throw_error(e)).await?
@@ -255,29 +291,32 @@ impl<S: AsyncRead + AsyncWrite + Unpin + Send> Client<'_, S> {
         // Note that we do this only (for the most part) after we've connected
         // to a compute (see above) which performs its own authentication.
         if !auth_result.reported_auth_ok {
-            stream
-                .write_message_noflush(&Be::AuthenticationOk)?
-                .write_message_noflush(&BeParameterStatusMessage::encoding())?;
+            stream.write_message_noflush(&Be::AuthenticationOk)?;
+        }
+
+        // Forward all postgres connection params to the client.
+        // Right now the implementation is very hacky and inefficent (ideally,
+        // we don't need an intermediate hashmap), but at least it should be correct.
+        for (name, value) in &db.params {
+            // TODO: Theoretically, this could result in a big pile of params...
+            stream.write_message_noflush(&Be::ParameterStatus {
+                name: name.as_bytes(),
+                value: value.as_bytes(),
+            })?;
         }
 
         stream
-            .write_message_noflush(&BeMessage::ParameterStatus(
-                BeParameterStatusMessage::ServerVersion(&db.version),
-            ))?
             .write_message_noflush(&Be::BackendKeyData(cancel_key_data))?
-            .write_message(&BeMessage::ReadyForQuery)
+            .write_message(&Be::ReadyForQuery)
             .await?;
 
-        // TODO: add more identifiers.
-        let metric_id = node.project;
-
-        let m_sent = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["tx", &metric_id]);
+        let m_sent = NUM_BYTES_PROXIED_COUNTER.with_label_values(&node.aux.traffic_labels("tx"));
         let mut client = MeasuredStream::new(stream.into_inner(), |cnt| {
             // Number of bytes we sent to the client (outbound).
             m_sent.inc_by(cnt as u64);
         });
 
-        let m_recv = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["rx", &metric_id]);
+        let m_recv = NUM_BYTES_PROXIED_COUNTER.with_label_values(&node.aux.traffic_labels("rx"));
         let mut db = MeasuredStream::new(db.stream, |cnt| {
             // Number of bytes the client sent to the compute node (inbound).
             m_recv.inc_by(cnt as u64);
diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs
index 24fbc57b99..ed429df421 100644
--- a/proxy/src/proxy/tests.rs
+++ b/proxy/src/proxy/tests.rs
@@ -139,8 +139,8 @@ async fn dummy_proxy(
 
     stream
         .write_message_noflush(&Be::AuthenticationOk)?
-        .write_message_noflush(&BeParameterStatusMessage::encoding())?
-        .write_message(&BeMessage::ReadyForQuery)
+        .write_message_noflush(&Be::CLIENT_ENCODING)?
+        .write_message(&Be::ReadyForQuery)
         .await?;
 
     Ok(())
diff --git a/proxy/src/scram/secret.rs b/proxy/src/scram/secret.rs
index 89668465fa..424beccec9 100644
--- a/proxy/src/scram/secret.rs
+++ b/proxy/src/scram/secret.rs
@@ -48,7 +48,7 @@ impl ServerSecret {
 
         Self {
             iterations: 4096,
-            salt_base64: base64::encode(&mocked_salt),
+            salt_base64: base64::encode(mocked_salt),
             stored_key: ScramKey::default(),
             server_key: ScramKey::default(),
             doomed: true,
@@ -68,7 +68,7 @@ impl ServerSecret {
 
         Some(Self {
             iterations,
-            salt_base64: base64::encode(&salt),
+            salt_base64: base64::encode(salt),
             stored_key: password.client_key().sha256(),
             server_key: password.server_key(),
             doomed: false,
diff --git a/proxy/src/stream.rs b/proxy/src/stream.rs
index 19e1479068..02a0fabe9a 100644
--- a/proxy/src/stream.rs
+++ b/proxy/src/stream.rs
@@ -2,7 +2,7 @@ use crate::error::UserFacingError;
 use anyhow::bail;
 use bytes::BytesMut;
 use pin_project_lite::pin_project;
-use pq_proto::{BeMessage, FeMessage, FeStartupPacket};
+use pq_proto::{BeMessage, ConnectionError, FeMessage, FeStartupPacket};
 use rustls::ServerConfig;
 use std::pin::Pin;
 use std::sync::Arc;
@@ -47,18 +47,13 @@ fn err_connection() -> io::Error {
     io::Error::new(io::ErrorKind::ConnectionAborted, "connection is lost")
 }
 
-// TODO: change error type of `FeMessage::read_fut`
-fn from_anyhow(e: anyhow::Error) -> io::Error {
-    io::Error::new(io::ErrorKind::Other, e.to_string())
-}
-
 impl<S: AsyncRead + Unpin> PqStream<S> {
     /// Receive [`FeStartupPacket`], which is a first packet sent by a client.
     pub async fn read_startup_packet(&mut self) -> io::Result<FeStartupPacket> {
         // TODO: `FeStartupPacket::read_fut` should return `FeStartupPacket`
         let msg = FeStartupPacket::read_fut(&mut self.stream)
             .await
-            .map_err(from_anyhow)?
+            .map_err(ConnectionError::into_io_error)?
             .ok_or_else(err_connection)?;
 
         match msg {
@@ -80,7 +75,7 @@ impl<S: AsyncRead + Unpin> PqStream<S> {
     async fn read_message(&mut self) -> io::Result<FeMessage> {
         FeMessage::read_fut(&mut self.stream)
             .await
-            .map_err(from_anyhow)?
+            .map_err(ConnectionError::into_io_error)?
             .ok_or_else(err_connection)
     }
 }
@@ -112,7 +107,8 @@ impl<S: AsyncWrite + Unpin> PqStream<S> {
     /// This method exists due to `&str` not implementing `Into<anyhow::Error>`.
     pub async fn throw_error_str<T>(&mut self, error: &'static str) -> anyhow::Result<T> {
         tracing::info!("forwarding error to user: {error}");
-        self.write_message(&BeMessage::ErrorResponse(error)).await?;
+        self.write_message(&BeMessage::ErrorResponse(error, None))
+            .await?;
         bail!(error)
     }
 
@@ -124,7 +120,8 @@ impl<S: AsyncWrite + Unpin> PqStream<S> {
     {
         let msg = error.to_string_client();
         tracing::info!("forwarding error to user: {msg}");
-        self.write_message(&BeMessage::ErrorResponse(&msg)).await?;
+        self.write_message(&BeMessage::ErrorResponse(&msg, None))
+            .await?;
         bail!(error)
     }
 }
diff --git a/pyproject.toml b/pyproject.toml
index b297f7f70b..b4fb7a9e7d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -32,10 +32,11 @@ toml = "^0.10.2"
 psutil = "^5.9.4"
 types-psutil = "^5.9.5.4"
 types-toml = "^0.10.8"
+pytest-httpserver = "^1.0.6"
 
 [tool.poetry.dev-dependencies]
 flake8 = "^5.0.4"
-mypy = "==0.971"
+mypy = "==0.991"
 black = "^22.6.0"
 isort = "^5.10.1"
 
@@ -60,10 +61,8 @@ skip = [
 ]
 
 [tool.mypy]
-# mypy uses regex
 exclude = "^vendor/"
-# some tests don't typecheck when this flag is set
-check_untyped_defs = false
+check_untyped_defs = true
 # Help mypy find imports when running against list of individual files.
 # Without this line it would behave differently when executed on the entire project.
 mypy_path = "$MYPY_CONFIG_FILE_DIR:$MYPY_CONFIG_FILE_DIR/test_runner"
diff --git a/run_clippy.sh b/run_clippy.sh
index bf770432d0..fe0e745d7d 100755
--- a/run_clippy.sh
+++ b/run_clippy.sh
@@ -9,8 +9,8 @@
 # In vscode, this setting is Rust-analyzer>Check On Save:Command
 
 
-# Not every feature is supported in macOS builds, e.g. `profiling`,
-# avoid running regular linting script that checks every feature.
+# Not every feature is supported in macOS builds. Avoid running regular linting
+# script that checks every feature.
 if [[ "$OSTYPE" == "darwin"* ]]; then
     # no extra features to test currently, add more here when needed
     cargo clippy --locked --all --all-targets --features testing -- -A unknown_lints -D warnings
diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml
index d11ef1711a..d0c804fe4e 100644
--- a/safekeeper/Cargo.toml
+++ b/safekeeper/Cargo.toml
@@ -2,6 +2,7 @@
 name = "safekeeper"
 version = "0.1.0"
 edition = "2021"
+license = "Apache-2.0"
 
 [dependencies]
 async-stream = "0.3"
@@ -20,8 +21,8 @@ hyper = "0.14"
 nix = "0.25"
 once_cell = "1.13.0"
 parking_lot = "0.12.1"
-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
-postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
+postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
+postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
 regex = "1.4.5"
 serde = { version = "1.0", features = ["derive"] }
 serde_json = "1"
@@ -29,7 +30,7 @@ serde_with = "2.0"
 signal-hook = "0.3.10"
 thiserror = "1"
 tokio = { version = "1.17", features = ["macros", "fs"] }
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
 toml_edit = { version = "0.14", features = ["easy"] }
 tracing = "0.1.27"
 url = "2.2.2"
diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs
index cab5053b5b..b130ea86bd 100644
--- a/safekeeper/src/bin/safekeeper.rs
+++ b/safekeeper/src/bin/safekeeper.rs
@@ -82,6 +82,9 @@ struct Args {
     /// established; plaintext otherwise.
     #[arg(long, default_value = DEFAULT_ENDPOINT, verbatim_doc_comment)]
     broker_endpoint: Uri,
+    /// Broker keepalive interval.
+    #[arg(long, value_parser= humantime::parse_duration, default_value = storage_broker::DEFAULT_KEEPALIVE_INTERVAL)]
+    broker_keepalive_interval: Duration,
     /// Peer safekeeper is considered dead after not receiving heartbeats from
     /// it during this period passed as a human readable duration.
     #[arg(long, value_parser= humantime::parse_duration, default_value = DEFAULT_HEARTBEAT_TIMEOUT)]
@@ -126,28 +129,47 @@ fn main() -> anyhow::Result<()> {
     logging::init(LogFormat::from_config(&args.log_format)?)?;
     info!("version: {GIT_VERSION}");
 
+    let args_workdir = &args.datadir;
+    let workdir = args_workdir.canonicalize().with_context(|| {
+        format!("Failed to get the absolute path for input workdir {args_workdir:?}")
+    })?;
+
     // Change into the data directory.
-    std::env::set_current_dir(&args.datadir)?;
+    std::env::set_current_dir(&workdir)?;
 
     // Set or read our ID.
-    let id = set_id(&args.datadir, args.id.map(NodeId))?;
+    let id = set_id(&workdir, args.id.map(NodeId))?;
     if args.init {
         return Ok(());
     }
 
+    let auth = match args.auth_validation_public_key_path.as_ref() {
+        None => {
+            info!("auth is disabled");
+            None
+        }
+        Some(path) => {
+            info!("loading JWT auth key from {}", path.display());
+            Some(Arc::new(
+                JwtAuth::from_key_path(path).context("failed to load the auth key")?,
+            ))
+        }
+    };
+
     let conf = SafeKeeperConf {
-        workdir: args.datadir,
+        workdir,
         my_id: id,
         listen_pg_addr: args.listen_pg,
         listen_http_addr: args.listen_http,
         no_sync: args.no_sync,
         broker_endpoint: args.broker_endpoint,
+        broker_keepalive_interval: args.broker_keepalive_interval,
         heartbeat_timeout: args.heartbeat_timeout,
         remote_storage: args.remote_storage,
         max_offloader_lag_bytes: args.max_offloader_lag,
         backup_runtime_threads: args.wal_backup_threads,
         wal_backup_enabled: !args.disable_wal_backup,
-        auth_validation_public_key_path: args.auth_validation_public_key_path,
+        auth,
     };
 
     // initialize sentry if SENTRY_DSN is provided
@@ -177,19 +199,6 @@ fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
         e
     })?;
 
-    let auth = match conf.auth_validation_public_key_path.as_ref() {
-        None => {
-            info!("auth is disabled");
-            None
-        }
-        Some(path) => {
-            info!("loading JWT auth key from {}", path.display());
-            Some(Arc::new(
-                JwtAuth::from_key_path(path).context("failed to load the auth key")?,
-            ))
-        }
-    };
-
     // Register metrics collector for active timelines. It's important to do this
     // after daemonizing, otherwise process collector will be upset.
     let timeline_collector = safekeeper::metrics::TimelineCollector::new();
@@ -203,12 +212,11 @@ fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
     GlobalTimelines::init(conf.clone(), wal_backup_launcher_tx)?;
 
     let conf_ = conf.clone();
-    let auth_ = auth.clone();
     threads.push(
         thread::Builder::new()
             .name("http_endpoint_thread".into())
             .spawn(|| {
-                let router = http::make_router(conf_, auth_);
+                let router = http::make_router(conf_);
                 endpoint::serve_thread_main(
                     router,
                     http_listener,
@@ -221,11 +229,7 @@ fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
     let conf_cloned = conf.clone();
     let safekeeper_thread = thread::Builder::new()
         .name("safekeeper thread".into())
-        .spawn(|| {
-            if let Err(e) = wal_service::thread_main(conf_cloned, pg_listener, auth) {
-                info!("safekeeper thread terminated: {e}");
-            }
-        })
+        .spawn(|| wal_service::thread_main(conf_cloned, pg_listener))
         .unwrap();
 
     threads.push(safekeeper_thread);
@@ -235,7 +239,6 @@ fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
         thread::Builder::new()
             .name("broker thread".into())
             .spawn(|| {
-                // TODO: add auth?
                 broker::thread_main(conf_);
             })?,
     );
@@ -304,7 +307,8 @@ fn set_id(workdir: &Path, given_id: Option<NodeId>) -> Result<NodeId> {
                 } else {
                     bail!("safekeeper id is not specified");
                 };
-                let mut f = File::create(&id_file_path)?;
+                let mut f = File::create(&id_file_path)
+                    .with_context(|| format!("Failed to create id file at {id_file_path:?}"))?;
                 f.write_all(my_id.to_string().as_bytes())?;
                 f.sync_all()?;
                 info!("initialized safekeeper id {}", my_id);
diff --git a/safekeeper/src/broker.rs b/safekeeper/src/broker.rs
index df2dc92efe..92f35bf51f 100644
--- a/safekeeper/src/broker.rs
+++ b/safekeeper/src/broker.rs
@@ -66,7 +66,7 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> {
 
 /// Subscribe and fetch all the interesting data from the broker.
 async fn pull_loop(conf: SafeKeeperConf) -> Result<()> {
-    let mut client = storage_broker::connect(conf.broker_endpoint)?;
+    let mut client = storage_broker::connect(conf.broker_endpoint, conf.broker_keepalive_interval)?;
 
     // TODO: subscribe only to local timelines instead of all
     let request = SubscribeSafekeeperInfoRequest {
diff --git a/safekeeper/src/control_file.rs b/safekeeper/src/control_file.rs
index f4a0f8520c..ba5e453e41 100644
--- a/safekeeper/src/control_file.rs
+++ b/safekeeper/src/control_file.rs
@@ -239,7 +239,7 @@ mod test {
         conf: &SafeKeeperConf,
         ttid: &TenantTimelineId,
     ) -> Result<(FileStorage, SafeKeeperState)> {
-        fs::create_dir_all(&conf.timeline_dir(ttid)).expect("failed to create timeline dir");
+        fs::create_dir_all(conf.timeline_dir(ttid)).expect("failed to create timeline dir");
         Ok((
             FileStorage::restore_new(ttid, conf)?,
             FileStorage::load_control_file_conf(conf, ttid)?,
@@ -250,7 +250,7 @@ mod test {
         conf: &SafeKeeperConf,
         ttid: &TenantTimelineId,
     ) -> Result<(FileStorage, SafeKeeperState)> {
-        fs::create_dir_all(&conf.timeline_dir(ttid)).expect("failed to create timeline dir");
+        fs::create_dir_all(conf.timeline_dir(ttid)).expect("failed to create timeline dir");
         let state = SafeKeeperState::empty();
         let storage = FileStorage::create_new(ttid, conf, state.clone())?;
         Ok((storage, state))
diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs
index 05527303ca..60df5dd372 100644
--- a/safekeeper/src/handler.rs
+++ b/safekeeper/src/handler.rs
@@ -8,16 +8,16 @@ use crate::receive_wal::ReceiveWalConn;
 use crate::send_wal::ReplicationConn;
 
 use crate::{GlobalTimelines, SafeKeeperConf};
-use anyhow::{bail, ensure, Context, Result};
+use anyhow::Context;
 
 use postgres_ffi::PG_TLI;
 use regex::Regex;
 
 use pq_proto::{BeMessage, FeStartupPacket, RowDescriptor, INT4_OID, TEXT_OID};
 use std::str;
-use std::sync::Arc;
 use tracing::info;
-use utils::auth::{Claims, JwtAuth, Scope};
+use utils::auth::{Claims, Scope};
+use utils::postgres_backend_async::QueryError;
 use utils::{
     id::{TenantId, TenantTimelineId, TimelineId},
     lsn::Lsn,
@@ -32,7 +32,6 @@ pub struct SafekeeperPostgresHandler {
     pub tenant_id: Option<TenantId>,
     pub timeline_id: Option<TimelineId>,
     pub ttid: TenantTimelineId,
-    auth: Option<Arc<JwtAuth>>,
     claims: Option<Claims>,
 }
 
@@ -44,7 +43,7 @@ enum SafekeeperPostgresCommand {
     JSONCtrl { cmd: AppendLogicalMessage },
 }
 
-fn parse_cmd(cmd: &str) -> Result<SafekeeperPostgresCommand> {
+fn parse_cmd(cmd: &str) -> anyhow::Result<SafekeeperPostgresCommand> {
     if cmd.starts_with("START_WAL_PUSH") {
         Ok(SafekeeperPostgresCommand::StartWalPush)
     } else if cmd.starts_with("START_REPLICATION") {
@@ -64,13 +63,17 @@ fn parse_cmd(cmd: &str) -> Result<SafekeeperPostgresCommand> {
             cmd: serde_json::from_str(cmd)?,
         })
     } else {
-        bail!("unsupported command {}", cmd);
+        anyhow::bail!("unsupported command {cmd}");
     }
 }
 
 impl postgres_backend::Handler for SafekeeperPostgresHandler {
     // tenant_id and timeline_id are passed in connection string params
-    fn startup(&mut self, _pgb: &mut PostgresBackend, sm: &FeStartupPacket) -> Result<()> {
+    fn startup(
+        &mut self,
+        _pgb: &mut PostgresBackend,
+        sm: &FeStartupPacket,
+    ) -> Result<(), QueryError> {
         if let FeStartupPacket::StartupMessage { params, .. } = sm {
             if let Some(options) = params.options_raw() {
                 for opt in options {
@@ -79,10 +82,14 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler {
                     // https://github.com/neondatabase/neon/pull/2433#discussion_r970005064
                     match opt.split_once('=') {
                         Some(("ztenantid", value)) | Some(("tenant_id", value)) => {
-                            self.tenant_id = Some(value.parse()?);
+                            self.tenant_id = Some(value.parse().with_context(|| {
+                                format!("Failed to parse {value} as tenant id")
+                            })?);
                         }
                         Some(("ztimelineid", value)) | Some(("timeline_id", value)) => {
-                            self.timeline_id = Some(value.parse()?);
+                            self.timeline_id = Some(value.parse().with_context(|| {
+                                format!("Failed to parse {value} as timeline id")
+                            })?);
                         }
                         _ => continue,
                     }
@@ -95,7 +102,9 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler {
 
             Ok(())
         } else {
-            bail!("Safekeeper received unexpected initial message: {:?}", sm);
+            Err(QueryError::Other(anyhow::anyhow!(
+                "Safekeeper received unexpected initial message: {sm:?}"
+            )))
         }
     }
 
@@ -103,20 +112,20 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler {
         &mut self,
         _pgb: &mut PostgresBackend,
         jwt_response: &[u8],
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), QueryError> {
         // this unwrap is never triggered, because check_auth_jwt only called when auth_type is NeonJWT
         // which requires auth to be present
         let data = self
+            .conf
             .auth
             .as_ref()
             .unwrap()
-            .decode(str::from_utf8(jwt_response)?)?;
+            .decode(str::from_utf8(jwt_response).context("jwt response is not UTF-8")?)?;
 
-        if matches!(data.claims.scope, Scope::Tenant) {
-            ensure!(
-                data.claims.tenant_id.is_some(),
+        if matches!(data.claims.scope, Scope::Tenant) && data.claims.tenant_id.is_none() {
+            return Err(QueryError::Other(anyhow::anyhow!(
                 "jwt token scope is Tenant, but tenant id is missing"
-            )
+            )));
         }
 
         info!(
@@ -128,7 +137,11 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler {
         Ok(())
     }
 
-    fn process_query(&mut self, pgb: &mut PostgresBackend, query_string: &str) -> Result<()> {
+    fn process_query(
+        &mut self,
+        pgb: &mut PostgresBackend,
+        query_string: &str,
+    ) -> Result<(), QueryError> {
         if query_string
             .to_ascii_lowercase()
             .starts_with("set datestyle to ")
@@ -149,39 +162,45 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler {
         self.check_permission(Some(tenant_id))?;
         self.ttid = TenantTimelineId::new(tenant_id, timeline_id);
 
-        match cmd {
+        let res = match cmd {
             SafekeeperPostgresCommand::StartWalPush => ReceiveWalConn::new(pgb).run(self),
             SafekeeperPostgresCommand::StartReplication { start_lsn } => {
                 ReplicationConn::new(pgb).run(self, pgb, start_lsn)
             }
             SafekeeperPostgresCommand::IdentifySystem => self.handle_identify_system(pgb),
             SafekeeperPostgresCommand::JSONCtrl { ref cmd } => handle_json_ctrl(self, pgb, cmd),
-        }
-        .context(format!(
-            "Failed to process query for timeline {timeline_id}"
-        ))?;
+        };
 
-        Ok(())
+        match res {
+            Ok(()) => Ok(()),
+            Err(QueryError::Disconnected(connection_error)) => {
+                info!("Timeline {tenant_id}/{timeline_id} query failed with connection error: {connection_error}");
+                Err(QueryError::Disconnected(connection_error))
+            }
+            Err(QueryError::Other(e)) => Err(QueryError::Other(e.context(format!(
+                "Failed to process query for timeline {}",
+                self.ttid
+            )))),
+        }
     }
 }
 
 impl SafekeeperPostgresHandler {
-    pub fn new(conf: SafeKeeperConf, auth: Option<Arc<JwtAuth>>) -> Self {
+    pub fn new(conf: SafeKeeperConf) -> Self {
         SafekeeperPostgresHandler {
             conf,
             appname: None,
             tenant_id: None,
             timeline_id: None,
             ttid: TenantTimelineId::empty(),
-            auth,
             claims: None,
         }
     }
 
     // when accessing management api supply None as an argument
     // when using to authorize tenant pass corresponding tenant id
-    fn check_permission(&self, tenant_id: Option<TenantId>) -> Result<()> {
-        if self.auth.is_none() {
+    fn check_permission(&self, tenant_id: Option<TenantId>) -> anyhow::Result<()> {
+        if self.conf.auth.is_none() {
             // auth is set to Trust, nothing to check so just return ok
             return Ok(());
         }
@@ -198,7 +217,7 @@ impl SafekeeperPostgresHandler {
     ///
     /// Handle IDENTIFY_SYSTEM replication command
     ///
-    fn handle_identify_system(&mut self, pgb: &mut PostgresBackend) -> Result<()> {
+    fn handle_identify_system(&mut self, pgb: &mut PostgresBackend) -> Result<(), QueryError> {
         let tli = GlobalTimelines::get(self.ttid)?;
 
         let lsn = if self.is_walproposer_recovery() {
diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs
index a9a9eb3388..a917d61678 100644
--- a/safekeeper/src/http/routes.rs
+++ b/safekeeper/src/http/routes.rs
@@ -277,12 +277,9 @@ async fn record_safekeeper_info(mut request: Request<Body>) -> Result<Response<B
 }
 
 /// Safekeeper http router.
-pub fn make_router(
-    conf: SafeKeeperConf,
-    auth: Option<Arc<JwtAuth>>,
-) -> RouterBuilder<hyper::Body, ApiError> {
+pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError> {
     let mut router = endpoint::make_router();
-    if auth.is_some() {
+    if conf.auth.is_some() {
         router = router.middleware(auth_middleware(|request| {
             #[allow(clippy::mutable_key_type)]
             static ALLOWLIST_ROUTES: Lazy<HashSet<Uri>> =
@@ -298,6 +295,7 @@ pub fn make_router(
 
     // NB: on any changes do not forget to update the OpenAPI spec
     // located nearby (/safekeeper/src/http/openapi_spec.yaml).
+    let auth = conf.auth.clone();
     router
         .data(Arc::new(conf))
         .data(auth)
diff --git a/safekeeper/src/json_ctrl.rs b/safekeeper/src/json_ctrl.rs
index 746b4461b7..32a24a4978 100644
--- a/safekeeper/src/json_ctrl.rs
+++ b/safekeeper/src/json_ctrl.rs
@@ -8,11 +8,12 @@
 
 use std::sync::Arc;
 
-use anyhow::Result;
+use anyhow::Context;
 use bytes::Bytes;
 use serde::{Deserialize, Serialize};
 use tracing::*;
 use utils::id::TenantTimelineId;
+use utils::postgres_backend_async::QueryError;
 
 use crate::handler::SafekeeperPostgresHandler;
 use crate::safekeeper::{AcceptorProposerMessage, AppendResponse, ServerInfo};
@@ -47,7 +48,7 @@ pub struct AppendLogicalMessage {
     pg_version: u32,
 }
 
-#[derive(Serialize, Deserialize)]
+#[derive(Debug, Serialize, Deserialize)]
 struct AppendResult {
     // safekeeper state after append
     state: SafeKeeperState,
@@ -62,8 +63,8 @@ pub fn handle_json_ctrl(
     spg: &SafekeeperPostgresHandler,
     pgb: &mut PostgresBackend,
     append_request: &AppendLogicalMessage,
-) -> Result<()> {
-    info!("JSON_CTRL request: {:?}", append_request);
+) -> Result<(), QueryError> {
+    info!("JSON_CTRL request: {append_request:?}");
 
     // need to init safekeeper state before AppendRequest
     let tli = prepare_safekeeper(spg.ttid, append_request.pg_version)?;
@@ -78,7 +79,8 @@ pub fn handle_json_ctrl(
         state: tli.get_state().1,
         inserted_wal,
     };
-    let response_data = serde_json::to_vec(&response)?;
+    let response_data = serde_json::to_vec(&response)
+        .with_context(|| format!("Response {response:?} is not a json array"))?;
 
     pgb.write_message_noflush(&BeMessage::RowDescription(&[RowDescriptor {
         name: b"json",
@@ -93,7 +95,7 @@ pub fn handle_json_ctrl(
 
 /// Prepare safekeeper to process append requests without crashes,
 /// by sending ProposerGreeting with default server.wal_seg_size.
-fn prepare_safekeeper(ttid: TenantTimelineId, pg_version: u32) -> Result<Arc<Timeline>> {
+fn prepare_safekeeper(ttid: TenantTimelineId, pg_version: u32) -> anyhow::Result<Arc<Timeline>> {
     GlobalTimelines::create(
         ttid,
         ServerInfo {
@@ -106,7 +108,7 @@ fn prepare_safekeeper(ttid: TenantTimelineId, pg_version: u32) -> Result<Arc<Tim
     )
 }
 
-fn send_proposer_elected(tli: &Arc<Timeline>, term: Term, lsn: Lsn) -> Result<()> {
+fn send_proposer_elected(tli: &Arc<Timeline>, term: Term, lsn: Lsn) -> anyhow::Result<()> {
     // add new term to existing history
     let history = tli.get_state().1.acceptor_state.term_history;
     let history = history.up_to(lsn.checked_sub(1u64).unwrap());
@@ -125,7 +127,7 @@ fn send_proposer_elected(tli: &Arc<Timeline>, term: Term, lsn: Lsn) -> Result<()
     Ok(())
 }
 
-#[derive(Serialize, Deserialize)]
+#[derive(Debug, Serialize, Deserialize)]
 struct InsertedWAL {
     begin_lsn: Lsn,
     end_lsn: Lsn,
@@ -134,7 +136,10 @@ struct InsertedWAL {
 
 /// Extend local WAL with new LogicalMessage record. To do that,
 /// create AppendRequest with new WAL and pass it to safekeeper.
-fn append_logical_message(tli: &Arc<Timeline>, msg: &AppendLogicalMessage) -> Result<InsertedWAL> {
+fn append_logical_message(
+    tli: &Arc<Timeline>,
+    msg: &AppendLogicalMessage,
+) -> anyhow::Result<InsertedWAL> {
     let wal_data = encode_logical_message(&msg.lm_prefix, &msg.lm_message);
     let sk_state = tli.get_state().1;
 
diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs
index 60a1911068..891d73533f 100644
--- a/safekeeper/src/lib.rs
+++ b/safekeeper/src/lib.rs
@@ -24,7 +24,9 @@ pub mod wal_service;
 pub mod wal_storage;
 
 mod timelines_global_map;
+use std::sync::Arc;
 pub use timelines_global_map::GlobalTimelines;
+use utils::auth::JwtAuth;
 
 pub mod defaults {
     pub use safekeeper_api::{
@@ -51,12 +53,13 @@ pub struct SafeKeeperConf {
     pub listen_http_addr: String,
     pub no_sync: bool,
     pub broker_endpoint: Uri,
+    pub broker_keepalive_interval: Duration,
     pub heartbeat_timeout: Duration,
     pub remote_storage: Option<RemoteStorageConfig>,
     pub max_offloader_lag_bytes: u64,
     pub backup_runtime_threads: Option<usize>,
     pub wal_backup_enabled: bool,
-    pub auth_validation_public_key_path: Option<PathBuf>,
+    pub auth: Option<Arc<JwtAuth>>,
 }
 
 impl SafeKeeperConf {
@@ -83,9 +86,10 @@ impl SafeKeeperConf {
             broker_endpoint: storage_broker::DEFAULT_ENDPOINT
                 .parse()
                 .expect("failed to parse default broker endpoint"),
+            broker_keepalive_interval: Duration::from_secs(5),
             backup_runtime_threads: None,
             wal_backup_enabled: true,
-            auth_validation_public_key_path: None,
+            auth: None,
             heartbeat_timeout: Duration::new(5, 0),
             max_offloader_lag_bytes: defaults::DEFAULT_MAX_OFFLOADER_LAG_BYTES,
         }
diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs
index d4d3d37737..b21770686c 100644
--- a/safekeeper/src/metrics.rs
+++ b/safekeeper/src/metrics.rs
@@ -425,7 +425,7 @@ impl Collector for TimelineCollector {
                 .set(tli.num_computes as i64);
             self.acceptor_term
                 .with_label_values(labels)
-                .set(tli.persisted_state.acceptor_state.term as u64);
+                .set(tli.persisted_state.acceptor_state.term);
             self.written_wal_bytes
                 .with_label_values(labels)
                 .set(tli.wal_storage.write_wal_bytes);
diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs
index 6577e8c4d6..671e5470a0 100644
--- a/safekeeper/src/receive_wal.rs
+++ b/safekeeper/src/receive_wal.rs
@@ -2,11 +2,13 @@
 //! Gets messages from the network, passes them down to consensus module and
 //! sends replies back.
 
-use anyhow::{anyhow, bail, Result};
+use anyhow::anyhow;
+use anyhow::Context;
 
 use bytes::BytesMut;
 use tracing::*;
 use utils::lsn::Lsn;
+use utils::postgres_backend_async::QueryError;
 
 use crate::safekeeper::ServerInfo;
 use crate::timeline::Timeline;
@@ -43,7 +45,7 @@ impl<'pg> ReceiveWalConn<'pg> {
     }
 
     // Send message to the postgres
-    fn write_msg(&mut self, msg: &AcceptorProposerMessage) -> Result<()> {
+    fn write_msg(&mut self, msg: &AcceptorProposerMessage) -> anyhow::Result<()> {
         let mut buf = BytesMut::with_capacity(128);
         msg.serialize(&mut buf)?;
         self.pg_backend.write_message(&BeMessage::CopyData(&buf))?;
@@ -51,8 +53,8 @@ impl<'pg> ReceiveWalConn<'pg> {
     }
 
     /// Receive WAL from wal_proposer
-    pub fn run(&mut self, spg: &mut SafekeeperPostgresHandler) -> Result<()> {
-        let _enter = info_span!("WAL acceptor", timeline = %spg.timeline_id.unwrap()).entered();
+    pub fn run(&mut self, spg: &mut SafekeeperPostgresHandler) -> Result<(), QueryError> {
+        let _enter = info_span!("WAL acceptor", ttid = %spg.ttid).entered();
 
         // Notify the libpq client that it's allowed to send `CopyData` messages
         self.pg_backend
@@ -69,7 +71,7 @@ impl<'pg> ReceiveWalConn<'pg> {
         let tli = match next_msg {
             ProposerAcceptorMessage::Greeting(ref greeting) => {
                 info!(
-                    "start handshake with wal proposer {} sysid {} timeline {}",
+                    "start handshake with walproposer {} sysid {} timeline {}",
                     self.peer_addr, greeting.system_id, greeting.tli,
                 );
                 let server_info = ServerInfo {
@@ -79,7 +81,11 @@ impl<'pg> ReceiveWalConn<'pg> {
                 };
                 GlobalTimelines::create(spg.ttid, server_info, Lsn::INVALID, Lsn::INVALID)?
             }
-            _ => bail!("unexpected message {:?} instead of greeting", next_msg),
+            _ => {
+                return Err(QueryError::Other(anyhow::anyhow!(
+                    "unexpected message {next_msg:?} instead of greeting"
+                )))
+            }
         };
 
         let mut next_msg = Some(next_msg);
@@ -134,25 +140,32 @@ impl<'pg> ReceiveWalConn<'pg> {
 
 struct ProposerPollStream {
     msg_rx: Receiver<ProposerAcceptorMessage>,
-    read_thread: Option<thread::JoinHandle<Result<()>>>,
+    read_thread: Option<thread::JoinHandle<Result<(), QueryError>>>,
 }
 
 impl ProposerPollStream {
-    fn new(mut r: ReadStream) -> Result<Self> {
+    fn new(mut r: ReadStream) -> anyhow::Result<Self> {
         let (msg_tx, msg_rx) = channel();
 
         let read_thread = thread::Builder::new()
             .name("Read WAL thread".into())
-            .spawn(move || -> Result<()> {
+            .spawn(move || -> Result<(), QueryError> {
                 loop {
                     let copy_data = match FeMessage::read(&mut r)? {
-                        Some(FeMessage::CopyData(bytes)) => bytes,
-                        Some(msg) => bail!("expected `CopyData` message, found {:?}", msg),
-                        None => bail!("connection closed unexpectedly"),
-                    };
+                        Some(FeMessage::CopyData(bytes)) => Ok(bytes),
+                        Some(msg) => Err(QueryError::Other(anyhow::anyhow!(
+                            "expected `CopyData` message, found {msg:?}"
+                        ))),
+                        None => Err(QueryError::from(std::io::Error::new(
+                            std::io::ErrorKind::ConnectionAborted,
+                            "walproposer closed the connection",
+                        ))),
+                    }?;
 
                     let msg = ProposerAcceptorMessage::parse(copy_data)?;
-                    msg_tx.send(msg)?;
+                    msg_tx
+                        .send(msg)
+                        .context("Failed to send the proposer message")?;
                 }
                 // msg_tx will be dropped here, this will also close msg_rx
             })?;
@@ -163,17 +176,19 @@ impl ProposerPollStream {
         })
     }
 
-    fn recv_msg(&mut self) -> Result<ProposerAcceptorMessage> {
+    fn recv_msg(&mut self) -> Result<ProposerAcceptorMessage, QueryError> {
         self.msg_rx.recv().map_err(|_| {
             // return error from the read thread
             let res = match self.read_thread.take() {
                 Some(thread) => thread.join(),
-                None => return anyhow!("read thread is gone"),
+                None => return QueryError::Other(anyhow::anyhow!("read thread is gone")),
             };
 
             match res {
-                Ok(Ok(())) => anyhow!("unexpected result from read thread"),
-                Err(err) => anyhow!("read thread panicked: {:?}", err),
+                Ok(Ok(())) => {
+                    QueryError::Other(anyhow::anyhow!("unexpected result from read thread"))
+                }
+                Err(err) => QueryError::Other(anyhow::anyhow!("read thread panicked: {err:?}")),
                 Ok(Err(err)) => err,
             }
         })
diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs
index 2c13f81476..fa973a3ede 100644
--- a/safekeeper/src/safekeeper.rs
+++ b/safekeeper/src/safekeeper.rs
@@ -182,7 +182,7 @@ pub struct SafeKeeperState {
     /// All WAL segments next to one containing local_start_lsn are
     /// filled with data from the beginning.
     pub local_start_lsn: Lsn,
-    /// Part of WAL acknowledged by quorum and available locally. Always points
+    /// Part of WAL acknowledged by quorum *and available locally*. Always points
     /// to record boundary.
     pub commit_lsn: Lsn,
     /// LSN that points to the end of the last backed up segment. Useful to
@@ -501,10 +501,6 @@ impl AcceptorProposerMessage {
 /// - messages from compute (proposers) and provides replies
 /// - messages from broker peers
 pub struct SafeKeeper<CTRL: control_file::Storage, WAL: wal_storage::Storage> {
-    /// Maximum commit_lsn between all nodes, can be ahead of local flush_lsn.
-    /// Note: be careful to set only if we are sure our WAL (term history) matches
-    /// committed one.
-    pub global_commit_lsn: Lsn,
     /// LSN since the proposer safekeeper currently talking to appends WAL;
     /// determines epoch switch point.
     pub epoch_start_lsn: Lsn,
@@ -537,7 +533,6 @@ where
         }
 
         Ok(SafeKeeper {
-            global_commit_lsn: state.commit_lsn,
             epoch_start_lsn: Lsn(0),
             inmem: SafekeeperMemState {
                 commit_lsn: state.commit_lsn,
@@ -639,10 +634,12 @@ where
 
         // system_id will be updated on mismatch
         if self.state.server.system_id != msg.system_id {
-            warn!(
-                "unexpected system ID arrived, got {}, expected {}",
-                msg.system_id, self.state.server.system_id
-            );
+            if self.state.server.system_id != 0 {
+                warn!(
+                    "unexpected system ID arrived, got {}, expected {}",
+                    msg.system_id, self.state.server.system_id
+                );
+            }
 
             let mut state = self.state.clone();
             state.server.system_id = msg.system_id;
@@ -653,8 +650,9 @@ where
         }
 
         info!(
-            "processed greeting from proposer {:?}, sending term {:?}",
-            msg.proposer_id, self.state.acceptor_state.term
+            "processed greeting from walproposer {}, sending term {:?}",
+            msg.proposer_id.map(|b| format!("{:X}", b)).join(""),
+            self.state.acceptor_state.term
         );
         Ok(Some(AcceptorProposerMessage::Greeting(AcceptorGreeting {
             term: self.state.acceptor_state.term,
@@ -727,6 +725,24 @@ where
             return Ok(None);
         }
 
+        // This might happen in a rare race when another (old) connection from
+        // the same walproposer writes + flushes WAL after this connection
+        // already sent flush_lsn in VoteRequest. It is generally safe to
+        // proceed, but to prevent commit_lsn surprisingly going down we should
+        // either refuse the session (simpler) or skip the part we already have
+        // from the stream (can be implemented).
+        if msg.term == self.get_epoch() && self.flush_lsn() > msg.start_streaming_at {
+            bail!("refusing ProposerElected which is going to overwrite correct WAL: term={}, flush_lsn={}, start_streaming_at={}; restarting the handshake should help",
+                   msg.term, self.flush_lsn(), msg.start_streaming_at)
+        }
+        // Otherwise this shouldn't happen.
+        assert!(
+            msg.start_streaming_at >= self.inmem.commit_lsn,
+            "attempt to truncate committed data: start_streaming_at={}, commit_lsn={}",
+            msg.start_streaming_at,
+            self.inmem.commit_lsn
+        );
+
         // TODO: cross check divergence point, check if msg.start_streaming_at corresponds to
         // intersection of our history and history from msg
 
@@ -759,7 +775,6 @@ where
             // NB: on new clusters, this happens at the same time as
             // timeline_start_lsn initialization, it is taken outside to provide
             // upgrade.
-            self.global_commit_lsn = max(self.global_commit_lsn, state.timeline_start_lsn);
             self.inmem.commit_lsn = max(self.inmem.commit_lsn, state.timeline_start_lsn);
 
             // Initializing backup_lsn is useful to avoid making backup think it should upload 0 segment.
@@ -778,10 +793,21 @@ where
         Ok(None)
     }
 
-    /// Advance commit_lsn taking into account what we have locally
-    fn update_commit_lsn(&mut self) -> Result<()> {
-        let commit_lsn = min(self.global_commit_lsn, self.flush_lsn());
-        assert!(commit_lsn >= self.inmem.commit_lsn);
+    /// Advance commit_lsn taking into account what we have locally.
+    ///
+    /// Note: it is assumed that 'WAL we have is from the right term' check has
+    /// already been done outside.
+    fn update_commit_lsn(&mut self, mut candidate: Lsn) -> Result<()> {
+        // Both peers and walproposer communicate this value, we might already
+        // have a fresher (higher) version.
+        candidate = max(candidate, self.inmem.commit_lsn);
+        let commit_lsn = min(candidate, self.flush_lsn());
+        assert!(
+            commit_lsn >= self.inmem.commit_lsn,
+            "commit_lsn monotonicity violated: old={} new={}",
+            self.inmem.commit_lsn,
+            commit_lsn
+        );
 
         self.inmem.commit_lsn = commit_lsn;
 
@@ -847,14 +873,11 @@ where
             self.wal_store.flush_wal()?;
         }
 
-        // Update global_commit_lsn
+        // Update commit_lsn.
         if msg.h.commit_lsn != Lsn(0) {
-            // We also obtain commit lsn from peers, so value arrived here might be stale (less)
-            self.global_commit_lsn = max(self.global_commit_lsn, msg.h.commit_lsn);
+            self.update_commit_lsn(msg.h.commit_lsn)?;
         }
-
         self.inmem.peer_horizon_lsn = msg.h.truncate_lsn;
-        self.update_commit_lsn()?;
 
         // Update truncate and commit LSN in control file.
         // To avoid negative impact on performance of extra fsync, do it only
@@ -886,10 +909,6 @@ where
     /// Flush WAL to disk. Return AppendResponse with latest LSNs.
     fn handle_flush(&mut self) -> Result<Option<AcceptorProposerMessage>> {
         self.wal_store.flush_wal()?;
-
-        // commit_lsn can be updated because we have new flushed data locally.
-        self.update_commit_lsn()?;
-
         Ok(Some(AcceptorProposerMessage::AppendResponse(
             self.append_response(),
         )))
@@ -904,8 +923,7 @@ where
             // commit_lsn if our history matches (is part of) history of advanced
             // commit_lsn provider.
             if sk_info.last_log_term == self.get_epoch() {
-                self.global_commit_lsn = max(Lsn(sk_info.commit_lsn), self.global_commit_lsn);
-                self.update_commit_lsn()?;
+                self.update_commit_lsn(Lsn(sk_info.commit_lsn))?;
             }
         }
 
diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs
index a3481430d0..20600ab694 100644
--- a/safekeeper/src/send_wal.rs
+++ b/safekeeper/src/send_wal.rs
@@ -5,7 +5,7 @@ use crate::handler::SafekeeperPostgresHandler;
 use crate::timeline::{ReplicaState, Timeline};
 use crate::wal_storage::WalReader;
 use crate::GlobalTimelines;
-use anyhow::{bail, Context, Result};
+use anyhow::Context;
 
 use bytes::Bytes;
 use postgres_ffi::get_current_timestamp;
@@ -15,7 +15,8 @@ use std::cmp::min;
 use std::net::Shutdown;
 use std::sync::Arc;
 use std::time::Duration;
-use std::{str, thread};
+use std::{io, str, thread};
+use utils::postgres_backend_async::QueryError;
 
 use pq_proto::{BeMessage, FeMessage, ReplicationFeedback, WalSndKeepAlive, XLogDataBody};
 use tokio::sync::watch::Receiver;
@@ -91,7 +92,7 @@ impl ReplicationConn {
     fn background_thread(
         mut stream_in: ReadStream,
         replica_guard: Arc<ReplicationConnGuard>,
-    ) -> Result<()> {
+    ) -> anyhow::Result<()> {
         let replica_id = replica_guard.replica;
         let timeline = &replica_guard.timeline;
 
@@ -140,7 +141,7 @@ impl ReplicationConn {
                     // Shutdown the connection, because rust-postgres client cannot be dropped
                     // when connection is alive.
                     let _ = stream_in.shutdown(Shutdown::Both);
-                    bail!("Copy failed");
+                    anyhow::bail!("Copy failed");
                 }
                 _ => {
                     // We only handle `CopyData`, 'Sync', 'CopyFail' messages. Anything else is ignored.
@@ -160,8 +161,8 @@ impl ReplicationConn {
         spg: &mut SafekeeperPostgresHandler,
         pgb: &mut PostgresBackend,
         mut start_pos: Lsn,
-    ) -> Result<()> {
-        let _enter = info_span!("WAL sender", timeline = %spg.timeline_id.unwrap()).entered();
+    ) -> Result<(), QueryError> {
+        let _enter = info_span!("WAL sender", ttid = %spg.ttid).entered();
 
         let tli = GlobalTimelines::get(spg.ttid)?;
 
@@ -256,8 +257,10 @@ impl ReplicationConn {
                         // to right pageserver.
                         if tli.should_walsender_stop(replica_id) {
                             // Shut down, timeline is suspended.
-                            // TODO create proper error type for this
-                            bail!("end streaming to {:?}", spg.appname);
+                            return Err(QueryError::from(io::Error::new(
+                                io::ErrorKind::ConnectionAborted,
+                                format!("end streaming to {:?}", spg.appname),
+                            )));
                         }
 
                         // timeout expired: request pageserver status
@@ -265,8 +268,7 @@ impl ReplicationConn {
                             sent_ptr: end_pos.0,
                             timestamp: get_current_timestamp(),
                             request_reply: true,
-                        }))
-                        .context("Failed to send KeepAlive message")?;
+                        }))?;
                         continue;
                     }
                 }
@@ -301,7 +303,7 @@ impl ReplicationConn {
 const POLL_STATE_TIMEOUT: Duration = Duration::from_secs(1);
 
 // Wait until we have commit_lsn > lsn or timeout expires. Returns latest commit_lsn.
-async fn wait_for_lsn(rx: &mut Receiver<Lsn>, lsn: Lsn) -> Result<Option<Lsn>> {
+async fn wait_for_lsn(rx: &mut Receiver<Lsn>, lsn: Lsn) -> anyhow::Result<Option<Lsn>> {
     let commit_lsn: Lsn = *rx.borrow();
     if commit_lsn > lsn {
         return Ok(Some(commit_lsn));
diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs
index ae4d4cce09..fc971ca753 100644
--- a/safekeeper/src/wal_backup.rs
+++ b/safekeeper/src/wal_backup.rs
@@ -346,9 +346,7 @@ impl WalBackupTask {
                         backup_lsn, commit_lsn, e
                     );
 
-                    if retry_attempt < u32::MAX {
-                        retry_attempt += 1;
-                    }
+                    retry_attempt = retry_attempt.saturating_add(1);
                 }
             }
         }
@@ -387,7 +385,7 @@ async fn backup_single_segment(
 ) -> Result<()> {
     let segment_file_path = seg.file_path(timeline_dir)?;
     let remote_segment_path = segment_file_path
-        .strip_prefix(&workspace_dir)
+        .strip_prefix(workspace_dir)
         .context("Failed to strip workspace dir prefix")
         .and_then(RemotePath::new)
         .with_context(|| {
diff --git a/safekeeper/src/wal_service.rs b/safekeeper/src/wal_service.rs
index fd8f9d9dcf..3ca651d060 100644
--- a/safekeeper/src/wal_service.rs
+++ b/safekeeper/src/wal_service.rs
@@ -2,35 +2,28 @@
 //!   WAL service listens for client connections and
 //!   receive WAL from wal_proposer and send it to WAL receivers
 //!
-use anyhow::Result;
 use regex::Regex;
 use std::net::{TcpListener, TcpStream};
-use std::sync::Arc;
 use std::thread;
 use tracing::*;
-use utils::auth::JwtAuth;
+use utils::postgres_backend_async::QueryError;
 
 use crate::handler::SafekeeperPostgresHandler;
 use crate::SafeKeeperConf;
 use utils::postgres_backend::{AuthType, PostgresBackend};
 
 /// Accept incoming TCP connections and spawn them into a background thread.
-pub fn thread_main(
-    conf: SafeKeeperConf,
-    listener: TcpListener,
-    auth: Option<Arc<JwtAuth>>,
-) -> Result<()> {
+pub fn thread_main(conf: SafeKeeperConf, listener: TcpListener) -> ! {
     loop {
         match listener.accept() {
             Ok((socket, peer_addr)) => {
                 debug!("accepted connection from {}", peer_addr);
                 let conf = conf.clone();
 
-                let auth = auth.clone();
                 let _ = thread::Builder::new()
                     .name("WAL service thread".into())
                     .spawn(move || {
-                        if let Err(err) = handle_socket(socket, conf, auth) {
+                        if let Err(err) = handle_socket(socket, conf) {
                             error!("connection handler exited: {}", err);
                         }
                     })
@@ -51,25 +44,17 @@ fn get_tid() -> u64 {
 
 /// This is run by `thread_main` above, inside a background thread.
 ///
-fn handle_socket(
-    socket: TcpStream,
-    conf: SafeKeeperConf,
-    auth: Option<Arc<JwtAuth>>,
-) -> Result<()> {
+fn handle_socket(socket: TcpStream, conf: SafeKeeperConf) -> Result<(), QueryError> {
     let _enter = info_span!("", tid = ?get_tid()).entered();
 
     socket.set_nodelay(true)?;
 
-    let mut conn_handler = SafekeeperPostgresHandler::new(conf, auth.clone());
-    let pgbackend = PostgresBackend::new(
-        socket,
-        match auth {
-            None => AuthType::Trust,
-            Some(_) => AuthType::NeonJWT,
-        },
-        None,
-        false,
-    )?;
+    let auth_type = match conf.auth {
+        None => AuthType::Trust,
+        Some(_) => AuthType::NeonJWT,
+    };
+    let mut conn_handler = SafekeeperPostgresHandler::new(conf);
+    let pgbackend = PostgresBackend::new(socket, auth_type, None, false)?;
     // libpq replication protocol between safekeeper and replicas/pagers
     pgbackend.run(&mut conn_handler)?;
 
diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs
index 52368bb719..41457868fe 100644
--- a/safekeeper/src/wal_storage.rs
+++ b/safekeeper/src/wal_storage.rs
@@ -223,7 +223,7 @@ impl PhysicalStorage {
             // Rename partial file to completed file
             let (wal_file_path, wal_file_partial_path) =
                 wal_file_paths(&self.timeline_dir, segno, self.wal_seg_size)?;
-            fs::rename(&wal_file_partial_path, &wal_file_path)?;
+            fs::rename(wal_file_partial_path, wal_file_path)?;
         } else {
             // otherwise, file can be reused later
             self.file = Some(file);
@@ -249,7 +249,7 @@ impl PhysicalStorage {
 
         while !buf.is_empty() {
             // Extract WAL location for this block
-            let xlogoff = self.write_lsn.segment_offset(self.wal_seg_size) as usize;
+            let xlogoff = self.write_lsn.segment_offset(self.wal_seg_size);
             let segno = self.write_lsn.segment_number(self.wal_seg_size);
 
             // If crossing a WAL boundary, only write up until we reach wal segment size.
@@ -366,7 +366,7 @@ impl Storage for PhysicalStorage {
             self.fdatasync_file(&mut unflushed_file)?;
         }
 
-        let xlogoff = end_pos.segment_offset(self.wal_seg_size) as usize;
+        let xlogoff = end_pos.segment_offset(self.wal_seg_size);
         let segno = end_pos.segment_number(self.wal_seg_size);
 
         // Remove all segments after the given LSN.
@@ -383,7 +383,7 @@ impl Storage for PhysicalStorage {
             // Make segment partial once again
             let (wal_file_path, wal_file_partial_path) =
                 wal_file_paths(&self.timeline_dir, segno, self.wal_seg_size)?;
-            fs::rename(&wal_file_path, &wal_file_partial_path)?;
+            fs::rename(wal_file_path, wal_file_partial_path)?;
         }
 
         // Update LSNs
@@ -416,7 +416,7 @@ fn remove_segments_from_disk(
     let mut min_removed = u64::MAX;
     let mut max_removed = u64::MIN;
 
-    for entry in fs::read_dir(&timeline_dir)? {
+    for entry in fs::read_dir(timeline_dir)? {
         let entry = entry?;
         let entry_path = entry.path();
         let fname = entry_path.file_name().unwrap();
@@ -499,7 +499,7 @@ impl WalReader {
 
         // How much to read and send in message? We cannot cross the WAL file
         // boundary, and we don't want send more than provided buffer.
-        let xlogoff = self.pos.segment_offset(self.wal_seg_size) as usize;
+        let xlogoff = self.pos.segment_offset(self.wal_seg_size);
         let send_size = min(buf.len(), self.wal_seg_size - xlogoff);
 
         // Read some data from the file.
@@ -518,7 +518,7 @@ impl WalReader {
 
     /// Open WAL segment at the current position of the reader.
     async fn open_segment(&self) -> Result<Pin<Box<dyn AsyncRead>>> {
-        let xlogoff = self.pos.segment_offset(self.wal_seg_size) as usize;
+        let xlogoff = self.pos.segment_offset(self.wal_seg_size);
         let segno = self.pos.segment_number(self.wal_seg_size);
         let wal_file_name = XLogFileName(PG_TLI, segno, self.wal_seg_size);
         let wal_file_path = self.timeline_dir.join(wal_file_name);
diff --git a/scripts/export_import_between_pageservers.py b/scripts/export_import_between_pageservers.py
index 1734038661..d83a74ae14 100755
--- a/scripts/export_import_between_pageservers.py
+++ b/scripts/export_import_between_pageservers.py
@@ -318,14 +318,8 @@ def remote_consistent_lsn(
     detail = pageserver_http_client.timeline_detail(tenant, timeline)
 
     lsn_str = detail["remote_consistent_lsn"]
-    if lsn_str is None:
-        # No remote information at all. This happens right after creating
-        # a timeline, before any part of it has been uploaded to remote
-        # storage yet.
-        return 0
-    else:
-        assert isinstance(lsn_str, str)
-        return lsn_from_hex(lsn_str)
+    assert isinstance(lsn_str, str)
+    return lsn_from_hex(lsn_str)
 
 
 def wait_for_upload(
@@ -448,15 +442,15 @@ def add_missing_rels(base_tar, output_tar, log_dir, pg_bin, tmp_pg_port: int):
 
 
 def get_rlsn(pageserver_connstr, tenant_id, timeline_id):
-    conn = psycopg2.connect(pageserver_connstr)
-    conn.autocommit = True
-    with conn.cursor() as cur:
-        cmd = f"get_last_record_rlsn {tenant_id} {timeline_id}"
-        cur.execute(cmd)
-        res = cur.fetchone()
-        prev_lsn = res[0]
-        last_lsn = res[1]
-    conn.close()
+    with closing(psycopg2.connect(pageserver_connstr)) as conn:
+        conn.autocommit = True
+        with conn.cursor() as cur:
+            cmd = f"get_last_record_rlsn {tenant_id} {timeline_id}"
+            cur.execute(cmd)
+            res = cur.fetchone()
+            assert res is not None
+            prev_lsn = res[0]
+            last_lsn = res[1]
 
     return last_lsn, prev_lsn
 
diff --git a/storage_broker/Cargo.toml b/storage_broker/Cargo.toml
index 7aa33a5234..180c506254 100644
--- a/storage_broker/Cargo.toml
+++ b/storage_broker/Cargo.toml
@@ -2,6 +2,7 @@
 name = "storage_broker"
 version = "0.1.0"
 edition = "2021"
+license = "Apache-2.0"
 
 [features]
 bench = []
diff --git a/storage_broker/benches/rps.rs b/storage_broker/benches/rps.rs
index 73141318b8..f3544a7cb8 100644
--- a/storage_broker/benches/rps.rs
+++ b/storage_broker/benches/rps.rs
@@ -88,7 +88,7 @@ fn tli_from_u64(i: u64) -> Vec<u8> {
 async fn subscribe(client: Option<BrokerClientChannel>, counter: Arc<AtomicU64>, i: u64) {
     let mut client = match client {
         Some(c) => c,
-        None => storage_broker::connect(DEFAULT_ENDPOINT).unwrap(),
+        None => storage_broker::connect(DEFAULT_ENDPOINT, Duration::from_secs(5)).unwrap(),
     };
 
     let key = SubscriptionKey::TenantTimelineId(ProtoTenantTimelineId {
@@ -112,7 +112,7 @@ async fn subscribe(client: Option<BrokerClientChannel>, counter: Arc<AtomicU64>,
 async fn publish(client: Option<BrokerClientChannel>, n_keys: u64) {
     let mut client = match client {
         Some(c) => c,
-        None => storage_broker::connect(DEFAULT_ENDPOINT).unwrap(),
+        None => storage_broker::connect(DEFAULT_ENDPOINT, Duration::from_secs(5)).unwrap(),
     };
     let mut counter: u64 = 0;
 
@@ -152,7 +152,7 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
     }
     let h = tokio::spawn(progress_reporter(counters.clone()));
 
-    let c = storage_broker::connect(DEFAULT_ENDPOINT).unwrap();
+    let c = storage_broker::connect(DEFAULT_ENDPOINT, Duration::from_secs(5)).unwrap();
 
     for i in 0..args.num_subs {
         let c = Some(c.clone());
@@ -160,7 +160,7 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
     }
     for _i in 0..args.num_pubs {
         let c = None;
-        tokio::spawn(publish(c, args.num_subs as u64));
+        tokio::spawn(publish(c, args.num_subs));
     }
 
     h.await?;
diff --git a/storage_broker/src/bin/storage_broker.rs b/storage_broker/src/bin/storage_broker.rs
index 1a743394ad..6d80e96bf1 100644
--- a/storage_broker/src/bin/storage_broker.rs
+++ b/storage_broker/src/bin/storage_broker.rs
@@ -39,7 +39,9 @@ use storage_broker::metrics::{NUM_PUBS, NUM_SUBS_ALL, NUM_SUBS_TIMELINE};
 use storage_broker::proto::broker_service_server::{BrokerService, BrokerServiceServer};
 use storage_broker::proto::subscribe_safekeeper_info_request::SubscriptionKey as ProtoSubscriptionKey;
 use storage_broker::proto::{SafekeeperTimelineInfo, SubscribeSafekeeperInfoRequest};
-use storage_broker::{parse_proto_ttid, EitherBody, DEFAULT_LISTEN_ADDR};
+use storage_broker::{
+    parse_proto_ttid, EitherBody, DEFAULT_KEEPALIVE_INTERVAL, DEFAULT_LISTEN_ADDR,
+};
 use utils::id::TenantTimelineId;
 use utils::logging::{self, LogFormat};
 use utils::project_git_version;
@@ -47,8 +49,8 @@ use utils::sentry_init::{init_sentry, release_name};
 
 project_git_version!(GIT_VERSION);
 
-const DEFAULT_CHAN_SIZE: usize = 128;
-const DEFAULT_HTTP2_KEEPALIVE_INTERVAL: &str = "5000ms";
+const DEFAULT_CHAN_SIZE: usize = 32;
+const DEFAULT_ALL_KEYS_CHAN_SIZE: usize = 16384;
 
 #[derive(Parser, Debug)]
 #[command(version = GIT_VERSION, about = "Broker for neon storage nodes communication", long_about = None)]
@@ -56,11 +58,14 @@ struct Args {
     /// Endpoint to listen on.
     #[arg(short, long, default_value = DEFAULT_LISTEN_ADDR)]
     listen_addr: SocketAddr,
-    /// Size of the queue to the subscriber.
+    /// Size of the queue to the per timeline subscriber.
     #[arg(long, default_value_t = DEFAULT_CHAN_SIZE)]
-    chan_size: usize,
+    timeline_chan_size: usize,
+    /// Size of the queue to the all keys subscriber.
+    #[arg(long, default_value_t = DEFAULT_ALL_KEYS_CHAN_SIZE)]
+    all_keys_chan_size: usize,
     /// HTTP/2 keepalive interval.
-    #[arg(long, value_parser= humantime::parse_duration, default_value = DEFAULT_HTTP2_KEEPALIVE_INTERVAL)]
+    #[arg(long, value_parser= humantime::parse_duration, default_value = DEFAULT_KEEPALIVE_INTERVAL)]
     http2_keepalive_interval: Duration,
     /// Format for logging, either 'plain' or 'json'.
     #[arg(long, default_value = "plain")]
@@ -108,7 +113,7 @@ struct SharedState {
 }
 
 impl SharedState {
-    pub fn new(chan_size: usize) -> Self {
+    pub fn new(all_keys_chan_size: usize) -> Self {
         SharedState {
             next_pub_id: 0,
             num_pubs: 0,
@@ -116,7 +121,7 @@ impl SharedState {
             num_subs_to_timelines: 0,
             chans_to_timeline_subs: HashMap::new(),
             num_subs_to_all: 0,
-            chan_to_all_subs: broadcast::channel(chan_size).0,
+            chan_to_all_subs: broadcast::channel(all_keys_chan_size).0,
         }
     }
 
@@ -139,7 +144,7 @@ impl SharedState {
     pub fn register_subscriber(
         &mut self,
         sub_key: SubscriptionKey,
-        chan_size: usize,
+        timeline_chan_size: usize,
     ) -> (SubId, broadcast::Receiver<SafekeeperTimelineInfo>) {
         let sub_id = self.next_sub_id;
         self.next_sub_id += 1;
@@ -158,7 +163,7 @@ impl SharedState {
                     self.chans_to_timeline_subs
                         .entry(ttid)
                         .or_insert(ChanToTimelineSub {
-                            chan: broadcast::channel(chan_size).0,
+                            chan: broadcast::channel(timeline_chan_size).0,
                             num_subscribers: 0,
                         });
                 chan_to_timeline_sub.num_subscribers += 1;
@@ -200,7 +205,7 @@ impl SharedState {
 #[derive(Clone)]
 struct Registry {
     shared_state: Arc<RwLock<SharedState>>,
-    chan_size: usize,
+    timeline_chan_size: usize,
 }
 
 impl Registry {
@@ -232,7 +237,7 @@ impl Registry {
         let (sub_id, sub_rx) = self
             .shared_state
             .write()
-            .register_subscriber(sub_key, self.chan_size);
+            .register_subscriber(sub_key, self.timeline_chan_size);
         info!(
             "subscription started id={}, key={:?}, addr={:?}",
             sub_id, sub_key, remote_addr
@@ -369,9 +374,9 @@ impl BrokerService for Broker {
                     Err(RecvError::Lagged(skipped_msg)) => {
                         missed_msgs += skipped_msg;
                         if let Poll::Ready(_) = futures::poll!(Box::pin(warn_interval.tick())) {
-                            error!("subscription id={}, key={:?}, addr={:?} dropped {} messages, channel is full",
+                            warn!("subscription id={}, key={:?} addr={:?} dropped {} messages, channel is full",
                                 subscriber.id, subscriber.key, subscriber.remote_addr, missed_msgs);
-                            Err(Status::new(Code::Internal, "full channel"))?;
+                            missed_msgs = 0;
                         }
                     }
                     Err(RecvError::Closed) => {
@@ -428,8 +433,8 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
     info!("version: {GIT_VERSION}");
 
     let registry = Registry {
-        shared_state: Arc::new(RwLock::new(SharedState::new(args.chan_size))),
-        chan_size: args.chan_size,
+        shared_state: Arc::new(RwLock::new(SharedState::new(args.all_keys_chan_size))),
+        timeline_chan_size: args.timeline_chan_size,
     };
     let storage_broker_impl = Broker {
         registry: registry.clone(),
@@ -523,7 +528,7 @@ mod tests {
     async fn test_registry() {
         let registry = Registry {
             shared_state: Arc::new(RwLock::new(SharedState::new(16))),
-            chan_size: 16,
+            timeline_chan_size: 16,
         };
 
         // subscribe to timeline 2
diff --git a/storage_broker/src/lib.rs b/storage_broker/src/lib.rs
index 0629caa2fb..8441aaf625 100644
--- a/storage_broker/src/lib.rs
+++ b/storage_broker/src/lib.rs
@@ -1,6 +1,7 @@
 use hyper::body::HttpBody;
 use std::pin::Pin;
 use std::task::{Context, Poll};
+use std::time::Duration;
 use tonic::codegen::StdError;
 use tonic::transport::{ClientTlsConfig, Endpoint};
 use tonic::{transport::Channel, Code, Status};
@@ -12,6 +13,10 @@ use proto::{
 
 // Code generated by protobuf.
 pub mod proto {
+    // Tonic does derives as `#[derive(Clone, PartialEq, ::prost::Message)]`
+    // we don't use these types for anything but broker data transmission,
+    // so it's ok to ignore this one.
+    #![allow(clippy::derive_partial_eq_without_eq)]
     tonic::include_proto!("storage_broker");
 }
 
@@ -26,6 +31,8 @@ pub use hyper::Uri;
 pub const DEFAULT_LISTEN_ADDR: &str = "127.0.0.1:50051";
 pub const DEFAULT_ENDPOINT: &str = const_format::formatcp!("http://{DEFAULT_LISTEN_ADDR}");
 
+pub const DEFAULT_KEEPALIVE_INTERVAL: &str = "5000 ms";
+
 // BrokerServiceClient charged with tonic provided Channel transport; helps to
 // avoid depending on tonic directly in user crates.
 pub type BrokerClientChannel = BrokerServiceClient<Channel>;
@@ -33,7 +40,7 @@ pub type BrokerClientChannel = BrokerServiceClient<Channel>;
 // Create connection object configured to run TLS if schema starts with https://
 // and plain text otherwise. Connection is lazy, only endpoint sanity is
 // validated here.
-pub fn connect<U>(endpoint: U) -> anyhow::Result<BrokerClientChannel>
+pub fn connect<U>(endpoint: U, keepalive_interval: Duration) -> anyhow::Result<BrokerClientChannel>
 where
     U: std::convert::TryInto<Uri>,
     U::Error: std::error::Error + Send + Sync + 'static,
@@ -46,6 +53,10 @@ where
         let tls = ClientTlsConfig::new();
         tonic_endpoint = tonic_endpoint.tls_config(tls)?;
     }
+    tonic_endpoint = tonic_endpoint
+        .http2_keep_alive_interval(keepalive_interval)
+        .keep_alive_while_idle(true);
+    //  keep_alive_timeout is 20s by default on both client and server side
     let channel = tonic_endpoint.connect_lazy();
     Ok(BrokerClientChannel::new(channel))
 }
diff --git a/test_runner/fixtures/benchmark_fixture.py b/test_runner/fixtures/benchmark_fixture.py
index 27fb0a60b2..b1489b7ab1 100644
--- a/test_runner/fixtures/benchmark_fixture.py
+++ b/test_runner/fixtures/benchmark_fixture.py
@@ -11,7 +11,7 @@ from datetime import datetime
 from pathlib import Path
 
 # Type-related stuff
-from typing import Callable, ClassVar, Iterator, Optional
+from typing import Callable, ClassVar, Dict, Iterator, Optional
 
 import pytest
 from _pytest.config import Config
@@ -135,23 +135,26 @@ class PgBenchRunResult:
 
 @dataclasses.dataclass
 class PgBenchInitResult:
-    REGEX: ClassVar[re.Pattern] = re.compile(  # type: ignore[type-arg]
-        r"done in (\d+\.\d+) s "
-        r"\("
-        r"(?:drop tables (\d+\.\d+) s)?(?:, )?"
-        r"(?:create tables (\d+\.\d+) s)?(?:, )?"
-        r"(?:client-side generate (\d+\.\d+) s)?(?:, )?"
-        r"(?:vacuum (\d+\.\d+) s)?(?:, )?"
-        r"(?:primary keys (\d+\.\d+) s)?(?:, )?"
-        r"\)\."
-    )
+    # Taken from https://github.com/postgres/postgres/blob/REL_15_1/src/bin/pgbench/pgbench.c#L5144-L5171
+    EXTRACTORS: ClassVar[Dict[str, re.Pattern]] = {  # type: ignore[type-arg]
+        "drop_tables": re.compile(r"drop tables (\d+\.\d+) s"),
+        "create_tables": re.compile(r"create tables (\d+\.\d+) s"),
+        "client_side_generate": re.compile(r"client-side generate (\d+\.\d+) s"),
+        "server_side_generate": re.compile(r"server-side generate (\d+\.\d+) s"),
+        "vacuum": re.compile(r"vacuum (\d+\.\d+) s"),
+        "primary_keys": re.compile(r"primary keys (\d+\.\d+) s"),
+        "foreign_keys": re.compile(r"foreign keys (\d+\.\d+) s"),
+        "total": re.compile(r"done in (\d+\.\d+) s"),  # Total time printed by pgbench
+    }
 
-    total: float
+    total: Optional[float]
     drop_tables: Optional[float]
     create_tables: Optional[float]
     client_side_generate: Optional[float]
+    server_side_generate: Optional[float]
     vacuum: Optional[float]
     primary_keys: Optional[float]
+    foreign_keys: Optional[float]
     duration: float
     start_timestamp: int
     end_timestamp: int
@@ -164,25 +167,35 @@ class PgBenchInitResult:
         start_timestamp: int,
         end_timestamp: int,
     ):
-        # Parses pgbench initialize output for default initialization steps (dtgvp)
+        # Parses pgbench initialize output
         # Example: done in 5.66 s (drop tables 0.05 s, create tables 0.31 s, client-side generate 2.01 s, vacuum 0.53 s, primary keys 0.38 s).
 
         last_line = stderr.splitlines()[-1]
 
-        if (m := cls.REGEX.match(last_line)) is not None:
-            total, drop_tables, create_tables, client_side_generate, vacuum, primary_keys = [
-                float(v) for v in m.groups() if v is not None
-            ]
-        else:
+        timings: Dict[str, Optional[float]] = {}
+        last_line_items = re.split(r"\(|\)|,", last_line)
+        for item in last_line_items:
+            for key, regex in cls.EXTRACTORS.items():
+                if (m := regex.match(item.strip())) is not None:
+                    if key in timings:
+                        raise RuntimeError(
+                            f"can't store pgbench results for repeated action `{key}`"
+                        )
+
+                    timings[key] = float(m.group(1))
+
+        if not timings or "total" not in timings:
             raise RuntimeError(f"can't parse pgbench initialize results from `{last_line}`")
 
         return cls(
-            total=total,
-            drop_tables=drop_tables,
-            create_tables=create_tables,
-            client_side_generate=client_side_generate,
-            vacuum=vacuum,
-            primary_keys=primary_keys,
+            total=timings["total"],
+            drop_tables=timings.get("drop_tables", 0.0),
+            create_tables=timings.get("create_tables", 0.0),
+            client_side_generate=timings.get("client_side_generate", 0.0),
+            server_side_generate=timings.get("server_side_generate", 0.0),
+            vacuum=timings.get("vacuum", 0.0),
+            primary_keys=timings.get("primary_keys", 0.0),
+            foreign_keys=timings.get("foreign_keys", 0.0),
             duration=duration,
             start_timestamp=start_timestamp,
             end_timestamp=end_timestamp,
@@ -326,8 +339,10 @@ class NeonBenchmarker:
             "drop_tables",
             "create_tables",
             "client_side_generate",
+            "server_side_generate",
             "vacuum",
             "primary_keys",
+            "foreign_keys",
         ]
         for metric in metrics:
             if (value := getattr(result, metric)) is not None:
diff --git a/test_runner/fixtures/compare_fixtures.py b/test_runner/fixtures/compare_fixtures.py
index 291f924379..be1f146735 100644
--- a/test_runner/fixtures/compare_fixtures.py
+++ b/test_runner/fixtures/compare_fixtures.py
@@ -115,6 +115,7 @@ class NeonCompare(PgCompare):
         return self._pg_bin
 
     def flush(self):
+        self.pageserver_http_client.timeline_checkpoint(self.env.initial_tenant, self.timeline)
         self.pageserver_http_client.timeline_gc(self.env.initial_tenant, self.timeline, 0)
 
     def compact(self):
@@ -176,7 +177,7 @@ class VanillaCompare(PgCompare):
         self.cur = self.conn.cursor()
 
     @property
-    def pg(self) -> PgProtocol:
+    def pg(self) -> VanillaPostgres:
         return self._pg
 
     @property
diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py
index 86ab4425ed..8b78e06c22 100644
--- a/test_runner/fixtures/metrics.py
+++ b/test_runner/fixtures/metrics.py
@@ -39,9 +39,16 @@ def parse_metrics(text: str, name: str = "") -> Metrics:
     return metrics
 
 
+PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS: Tuple[str, ...] = (
+    "pageserver_remote_timeline_client_calls_unfinished",
+    *[f"pageserver_remote_timeline_client_calls_started_{x}" for x in ["bucket", "count", "sum"]],
+    *[f"pageserver_remote_operation_seconds_{x}" for x in ["bucket", "count", "sum"]],
+    "pageserver_remote_physical_size",
+)
+
 PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
     "pageserver_current_logical_size",
-    "pageserver_current_physical_size",
+    "pageserver_resident_physical_size",
     "pageserver_getpage_reconstruct_seconds_bucket",
     "pageserver_getpage_reconstruct_seconds_count",
     "pageserver_getpage_reconstruct_seconds_sum",
@@ -62,4 +69,5 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
     "pageserver_wait_lsn_seconds_sum",
     "pageserver_created_persistent_files_total",
     "pageserver_written_persistent_bytes_total",
+    *PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS,
 )
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 818853a4ac..f284be8753 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -18,6 +18,7 @@ from contextlib import closing, contextmanager
 from dataclasses import dataclass, field
 from enum import Flag, auto
 from functools import cached_property
+from itertools import chain, product
 from pathlib import Path
 from types import TracebackType
 from typing import Any, Dict, Iterator, List, Optional, Tuple, Type, Union, cast
@@ -26,6 +27,7 @@ import asyncpg
 import backoff  # type: ignore
 import boto3
 import jwt
+import prometheus_client
 import psycopg2
 import pytest
 import requests
@@ -33,6 +35,7 @@ from _pytest.config import Config
 from _pytest.config.argparsing import Parser
 from _pytest.fixtures import FixtureRequest
 from fixtures.log_helper import log
+from fixtures.metrics import parse_metrics
 from fixtures.types import Lsn, TenantId, TimelineId
 from fixtures.utils import (
     ATTACHMENT_NAME_REGEX,
@@ -41,6 +44,7 @@ from fixtures.utils import (
     get_self_dir,
     subprocess_capture,
 )
+from prometheus_client.parser import text_string_to_metric_families
 
 # Type-related stuff
 from psycopg2.extensions import connection as PgConnection
@@ -286,24 +290,19 @@ def port_distributor(worker_base_port: int) -> PortDistributor:
     return PortDistributor(base_port=worker_base_port, port_number=WORKER_PORT_NUM)
 
 
-@pytest.fixture(scope="session")
+@pytest.fixture(scope="function")
 def default_broker(
-    request: FixtureRequest,
     port_distributor: PortDistributor,
-    top_output_dir: Path,
+    test_output_dir: Path,
     neon_binpath: Path,
 ) -> Iterator[NeonBroker]:
     # multiple pytest sessions could get launched in parallel, get them different ports/datadirs
     client_port = port_distributor.get_port()
-    broker_logfile = (
-        get_test_output_dir(request, top_output_dir) / f"storage_broker_{client_port}.log"
-    )
-    broker_logfile.parents[0].mkdir(exist_ok=True, parents=True)
+    broker_logfile = test_output_dir / "repo" / "storage_broker.log"
 
     broker = NeonBroker(logfile=broker_logfile, port=client_port, neon_binpath=neon_binpath)
     yield broker
     broker.stop()
-    allure_attach_from_dir(Path(broker_logfile))
 
 
 @pytest.fixture(scope="session")
@@ -598,6 +597,7 @@ class NeonEnvBuilder:
         rust_log_override: Optional[str] = None,
         default_branch_name: str = DEFAULT_BRANCH_NAME,
         preserve_database_files: bool = False,
+        initial_tenant: Optional[TenantId] = None,
     ):
         self.repo_dir = repo_dir
         self.rust_log_override = rust_log_override
@@ -620,19 +620,30 @@ class NeonEnvBuilder:
         self.pg_distrib_dir = pg_distrib_dir
         self.pg_version = pg_version
         self.preserve_database_files = preserve_database_files
+        self.initial_tenant = initial_tenant or TenantId.generate()
 
-    def init(self) -> NeonEnv:
+    def init_configs(self) -> NeonEnv:
         # Cannot create more than one environment from one builder
         assert self.env is None, "environment already initialized"
         self.env = NeonEnv(self)
         return self.env
 
     def start(self):
+        assert self.env is not None, "environment is not already initialized, call init() first"
         self.env.start()
 
     def init_start(self) -> NeonEnv:
-        env = self.init()
+        env = self.init_configs()
         self.start()
+
+        # Prepare the default branch to start the postgres on later.
+        # Pageserver itself does not create tenants and timelines, until started first and asked via HTTP API.
+        log.info(
+            f"Services started, creating initial tenant {env.initial_tenant} and its initial timeline"
+        )
+        initial_tenant, initial_timeline = env.neon_cli.create_tenant(tenant_id=env.initial_tenant)
+        log.info(f"Initial timeline {initial_tenant}/{initial_timeline} created successfully")
+
         return env
 
     def enable_remote_storage(
@@ -756,6 +767,11 @@ class NeonEnvBuilder:
             log.info("no remote storage was set up, skipping cleanup")
             return
 
+        # Making mypy happy with allowing only `S3Storage` further.
+        # `self.remote_storage_prefix` is coupled with `S3Storage` storage type,
+        # so this line effectively a no-op
+        assert isinstance(self.remote_storage, S3Storage)
+
         if self.keep_remote_storage_contents:
             log.info("keep_remote_storage_contents skipping remote storage cleanup")
             return
@@ -771,7 +787,8 @@ class NeonEnvBuilder:
             Prefix=self.remote_storage_prefix,
         )
 
-        objects_to_delete = {"Objects": []}
+        # Using Any because DeleteTypeDef (from boto3-stubs) doesn't fit our case
+        objects_to_delete: Any = {"Objects": []}
         cnt = 0
         for item in pages.search("Contents"):
             # weirdly when nothing is found it returns [None]
@@ -786,16 +803,17 @@ class NeonEnvBuilder:
                     Bucket=self.remote_storage.bucket_name,
                     Delete=objects_to_delete,
                 )
-                objects_to_delete = dict(Objects=[])
+                objects_to_delete = {"Objects": []}
                 cnt += 1
 
         # flush rest
         if len(objects_to_delete["Objects"]):
             self.remote_storage_client.delete_objects(
-                Bucket=self.remote_storage.bucket_name, Delete=objects_to_delete
+                Bucket=self.remote_storage.bucket_name,
+                Delete=objects_to_delete,
             )
 
-        log.info("deleted %s objects from remote storage", cnt)
+        log.info(f"deleted {cnt} objects from remote storage")
 
     def __enter__(self) -> "NeonEnvBuilder":
         return self
@@ -884,12 +902,12 @@ class NeonEnv:
 
         # generate initial tenant ID here instead of letting 'neon init' generate it,
         # so that we don't need to dig it out of the config file afterwards.
-        self.initial_tenant = TenantId.generate()
+        self.initial_tenant = config.initial_tenant
 
         # Create a config file corresponding to the options
         toml = textwrap.dedent(
             f"""
-            default_tenant_id = '{self.initial_tenant}'
+            default_tenant_id = '{config.initial_tenant}'
         """
         )
 
@@ -1012,7 +1030,7 @@ def _shared_simple_env(
 
     if os.environ.get("TEST_SHARED_FIXTURES") is None:
         # Create the environment in the per-test output directory
-        repo_dir = get_test_output_dir(request, top_output_dir) / "repo"
+        repo_dir = get_test_repo_dir(request, top_output_dir)
     else:
         # We're running shared fixtures. Share a single directory.
         repo_dir = top_output_dir / "shared_repo"
@@ -1201,8 +1219,22 @@ class PageserverHttpClient(requests.Session):
         # there are no tests for those right now.
         return size
 
-    def timeline_list(self, tenant_id: TenantId) -> List[Dict[str, Any]]:
-        res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline")
+    def timeline_list(
+        self,
+        tenant_id: TenantId,
+        include_non_incremental_logical_size: bool = False,
+        include_timeline_dir_layer_file_size_sum: bool = False,
+    ) -> List[Dict[str, Any]]:
+
+        params = {}
+        if include_non_incremental_logical_size:
+            params["include-non-incremental-logical-size"] = "yes"
+        if include_timeline_dir_layer_file_size_sum:
+            params["include-timeline-dir-layer-file-size-sum"] = "yes"
+
+        res = self.get(
+            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline", params=params
+        )
         self.verbose_error(res)
         res_json = res.json()
         assert isinstance(res_json, list)
@@ -1236,13 +1268,13 @@ class PageserverHttpClient(requests.Session):
         tenant_id: TenantId,
         timeline_id: TimelineId,
         include_non_incremental_logical_size: bool = False,
-        include_non_incremental_physical_size: bool = False,
+        include_timeline_dir_layer_file_size_sum: bool = False,
     ) -> Dict[Any, Any]:
         params = {}
         if include_non_incremental_logical_size:
             params["include-non-incremental-logical-size"] = "yes"
-        if include_non_incremental_physical_size:
-            params["include-non-incremental-physical-size"] = "yes"
+        if include_timeline_dir_layer_file_size_sum:
+            params["include-timeline-dir-layer-file-size-sum"] = "yes"
 
         res = self.get(
             f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}",
@@ -1317,11 +1349,115 @@ class PageserverHttpClient(requests.Session):
         res_json = res.json()
         assert res_json is None
 
+    def timeline_spawn_download_remote_layers(
+        self, tenant_id: TenantId, timeline_id: TimelineId
+    ) -> dict[str, Any]:
+
+        res = self.post(
+            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/download_remote_layers",
+        )
+        self.verbose_error(res)
+        res_json = res.json()
+        assert res_json is not None
+        assert isinstance(res_json, dict)
+        return res_json
+
+    def timeline_poll_download_remote_layers_status(
+        self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        spawn_response: dict[str, Any],
+        poll_state=None,
+    ) -> None | dict[str, Any]:
+        res = self.get(
+            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/download_remote_layers",
+        )
+        self.verbose_error(res)
+        res_json = res.json()
+        assert res_json is not None
+        assert isinstance(res_json, dict)
+
+        # assumption in this API client here is that nobody else spawns the task
+        assert res_json["task_id"] == spawn_response["task_id"]
+
+        if poll_state is None or res_json["state"] == poll_state:
+            return res_json
+        return None
+
+    def timeline_download_remote_layers(
+        self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        errors_ok=False,
+        at_least_one_download=True,
+    ):
+        res = self.timeline_spawn_download_remote_layers(tenant_id, timeline_id)
+        while True:
+            completed = self.timeline_poll_download_remote_layers_status(
+                tenant_id, timeline_id, res, poll_state="Completed"
+            )
+            if not completed:
+                time.sleep(0.1)
+                continue
+            if not errors_ok:
+                assert completed["failed_download_count"] == 0
+            if at_least_one_download:
+                assert completed["successful_download_count"] > 0
+            return completed
+
     def get_metrics(self) -> str:
         res = self.get(f"http://localhost:{self.port}/metrics")
         self.verbose_error(res)
         return res.text
 
+    def get_timeline_metric(self, tenant_id: TenantId, timeline_id: TimelineId, metric_name: str):
+        raw = self.get_metrics()
+        family: List[prometheus_client.Metric] = list(text_string_to_metric_families(raw))
+        [metric] = [m for m in family if m.name == metric_name]
+        [sample] = [
+            s
+            for s in metric.samples
+            if s.labels["tenant_id"] == str(tenant_id)
+            and s.labels["timeline_id"] == str(timeline_id)
+        ]
+        return sample.value
+
+    def get_remote_timeline_client_metric(
+        self,
+        metric_name: str,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        file_kind: str,
+        op_kind: str,
+    ) -> Optional[float]:
+        metrics = parse_metrics(self.get_metrics(), "pageserver")
+        matches = metrics.query_all(
+            name=metric_name,
+            filter={
+                "tenant_id": str(tenant_id),
+                "timeline_id": str(timeline_id),
+                "file_kind": str(file_kind),
+                "op_kind": str(op_kind),
+            },
+        )
+        if len(matches) == 0:
+            value = None
+        elif len(matches) == 1:
+            value = matches[0].value
+            assert value is not None
+        else:
+            assert len(matches) < 2, "above filter should uniquely identify metric"
+        return value
+
+    def get_metric_value(self, name: str) -> Optional[str]:
+        metrics = self.get_metrics()
+        relevant = [line for line in metrics.splitlines() if line.startswith(name)]
+        if len(relevant) == 0:
+            log.info(f'could not find metric "{name}"')
+            return None
+        assert len(relevant) == 1
+        return relevant[0].lstrip(name).strip()
+
 
 @dataclass
 class PageserverPort:
@@ -1432,6 +1568,7 @@ class NeonCli(AbstractNeonCli):
         tenant_id: Optional[TenantId] = None,
         timeline_id: Optional[TimelineId] = None,
         conf: Optional[Dict[str, str]] = None,
+        set_default: bool = False,
     ) -> Tuple[TenantId, TimelineId]:
         """
         Creates a new tenant, returns its id and its initial timeline's id.
@@ -1440,47 +1577,51 @@ class NeonCli(AbstractNeonCli):
             tenant_id = TenantId.generate()
         if timeline_id is None:
             timeline_id = TimelineId.generate()
-        if conf is None:
-            res = self.raw_cli(
-                [
-                    "tenant",
-                    "create",
-                    "--tenant-id",
-                    str(tenant_id),
-                    "--timeline-id",
-                    str(timeline_id),
-                    "--pg-version",
-                    self.env.pg_version,
-                ]
-            )
-        else:
-            res = self.raw_cli(
-                [
-                    "tenant",
-                    "create",
-                    "--tenant-id",
-                    str(tenant_id),
-                    "--timeline-id",
-                    str(timeline_id),
-                    "--pg-version",
-                    self.env.pg_version,
-                ]
-                + sum(list(map(lambda kv: (["-c", kv[0] + ":" + kv[1]]), conf.items())), [])
+
+        args = [
+            "tenant",
+            "create",
+            "--tenant-id",
+            str(tenant_id),
+            "--timeline-id",
+            str(timeline_id),
+            "--pg-version",
+            self.env.pg_version,
+        ]
+        if conf is not None:
+            args.extend(
+                chain.from_iterable(
+                    product(["-c"], (f"{key}:{value}" for key, value in conf.items()))
+                )
             )
+        if set_default:
+            args.append("--set-default")
+
+        res = self.raw_cli(args)
         res.check_returncode()
         return tenant_id, timeline_id
 
+    def set_default(self, tenant_id: TenantId):
+        """
+        Update default tenant for future operations that require tenant_id.
+        """
+        res = self.raw_cli(["tenant", "set-default", "--tenant-id", str(tenant_id)])
+        res.check_returncode()
+
     def config_tenant(self, tenant_id: TenantId, conf: Dict[str, str]):
         """
         Update tenant config.
         """
-        if conf is None:
-            res = self.raw_cli(["tenant", "config", "--tenant-id", str(tenant_id)])
-        else:
-            res = self.raw_cli(
-                ["tenant", "config", "--tenant-id", str(tenant_id)]
-                + sum(list(map(lambda kv: (["-c", kv[0] + ":" + kv[1]]), conf.items())), [])
+
+        args = ["tenant", "config", "--tenant-id", str(tenant_id)]
+        if conf is not None:
+            args.extend(
+                chain.from_iterable(
+                    product(["-c"], (f"{key}:{value}" for key, value in conf.items()))
+                )
             )
+
+        res = self.raw_cli(args)
         res.check_returncode()
 
     def list_tenants(self) -> "subprocess.CompletedProcess[str]":
@@ -1515,36 +1656,6 @@ class NeonCli(AbstractNeonCli):
 
         return TimelineId(str(created_timeline_id))
 
-    def create_root_branch(
-        self,
-        branch_name: str,
-        tenant_id: Optional[TenantId] = None,
-    ):
-        cmd = [
-            "timeline",
-            "create",
-            "--branch-name",
-            branch_name,
-            "--tenant-id",
-            str(tenant_id or self.env.initial_tenant),
-            "--pg-version",
-            self.env.pg_version,
-        ]
-
-        res = self.raw_cli(cmd)
-        res.check_returncode()
-
-        matches = CREATE_TIMELINE_ID_EXTRACTOR.search(res.stdout)
-
-        created_timeline_id = None
-        if matches is not None:
-            created_timeline_id = matches.group("timeline_id")
-
-        if created_timeline_id is None:
-            raise Exception("could not find timeline id after `neon timeline create` invocation")
-        else:
-            return TimelineId(created_timeline_id)
-
     def create_branch(
         self,
         new_branch_name: str = DEFAULT_BRANCH_NAME,
@@ -1600,17 +1711,12 @@ class NeonCli(AbstractNeonCli):
     def init(
         self,
         config_toml: str,
-        initial_timeline_id: Optional[TimelineId] = None,
     ) -> "subprocess.CompletedProcess[str]":
         with tempfile.NamedTemporaryFile(mode="w+") as tmp:
             tmp.write(config_toml)
             tmp.flush()
 
-            cmd = ["init", f"--config={tmp.name}"]
-            if initial_timeline_id:
-                cmd.extend(["--timeline-id", str(initial_timeline_id)])
-
-            cmd.extend(["--pg-version", self.env.pg_version])
+            cmd = ["init", f"--config={tmp.name}", "--pg-version", self.env.pg_version]
 
             append_pageserver_param_overrides(
                 params_to_update=cmd,
@@ -1619,7 +1725,12 @@ class NeonCli(AbstractNeonCli):
                 pageserver_config_override=self.env.pageserver.config_override,
             )
 
-            res = self.raw_cli(cmd)
+            s3_env_vars = None
+            if self.env.remote_storage is not None and isinstance(
+                self.env.remote_storage, S3Storage
+            ):
+                s3_env_vars = self.env.remote_storage.access_env_vars()
+            res = self.raw_cli(cmd, extra_env_vars=s3_env_vars)
             res.check_returncode()
             return res
 
@@ -1742,6 +1853,12 @@ class NeonCli(AbstractNeonCli):
 
         return self.raw_cli(args, check_return_code=check_return_code)
 
+    def start(self, check_return_code=True) -> "subprocess.CompletedProcess[str]":
+        return self.raw_cli(["start"], check_return_code=check_return_code)
+
+    def stop(self, check_return_code=True) -> "subprocess.CompletedProcess[str]":
+        return self.raw_cli(["stop"], check_return_code=check_return_code)
+
 
 class WalCraft(AbstractNeonCli):
     """
@@ -1796,14 +1913,17 @@ class NeonPageserver(PgProtocol):
             ".*wal receiver task finished with an error: walreceiver connection handling failure.*",
             ".*Shutdown task error: walreceiver connection handling failure.*",
             ".*wal_connection_manager.*tcp connect error: Connection refused.*",
-            ".*query handler for .* failed: Connection reset by peer.*",
-            ".*serving compute connection task.*exited with error: Broken pipe.*",
-            ".*Connection aborted: error communicating with the server: Broken pipe.*",
-            ".*Connection aborted: error communicating with the server: Transport endpoint is not connected.*",
-            ".*Connection aborted: error communicating with the server: Connection reset by peer.*",
+            ".*query handler for .* failed: Socket IO error: Connection reset by peer.*",
+            ".*serving compute connection task.*exited with error: Postgres connection error.*",
+            ".*serving compute connection task.*exited with error: Connection reset by peer.*",
+            ".*serving compute connection task.*exited with error: Postgres query error.*",
+            ".*Connection aborted: connection error: error communicating with the server: Broken pipe.*",
+            ".*Connection aborted: connection error: error communicating with the server: Transport endpoint is not connected.*",
+            ".*Connection aborted: connection error: error communicating with the server: Connection reset by peer.*",
             ".*kill_and_wait_impl.*: wait successful.*",
-            ".*end streaming to Some.*",
+            ".*Replication stream finished: db error: ERROR: Socket IO error: end streaming to Some.*",
             ".*query handler for 'pagestream.*failed: Broken pipe.*",  # pageserver notices compute shut down
+            ".*query handler for 'pagestream.*failed: Connection reset by peer.*",  # pageserver notices compute shut down
             # safekeeper connection can fail with this, in the window between timeline creation
             # and streaming start
             ".*Failed to process query for timeline .*: state uninitialized, no data to read.*",
@@ -1873,10 +1993,6 @@ class NeonPageserver(PgProtocol):
         if '"testing"' not in self.version:
             pytest.skip("pageserver was built without 'testing' feature")
 
-    def is_profiling_enabled_or_skip(self):
-        if '"profiling"' not in self.version:
-            pytest.skip("pageserver was built without 'profiling' feature")
-
     def http_client(self, auth_token: Optional[str] = None) -> PageserverHttpClient:
         return PageserverHttpClient(
             port=self.service_port.http,
@@ -1903,6 +2019,28 @@ class NeonPageserver(PgProtocol):
 
         assert not errors
 
+    def log_contains(self, pattern: str) -> Optional[str]:
+        """Check that the pageserver log contains a line that matches the given regex"""
+        logfile = open(os.path.join(self.env.repo_dir, "pageserver.log"), "r")
+
+        contains_re = re.compile(pattern)
+
+        # XXX: Our rust logging machinery buffers the messages, so if you
+        # call this function immediately after it's been logged, there is
+        # no guarantee it is already present in the log file. This hasn't
+        # been a problem in practice, our python tests are not fast enough
+        # to hit that race condition.
+        while True:
+            line = logfile.readline()
+            if not line:
+                break
+
+            if contains_re.search(line):
+                # found it!
+                return line
+
+        return None
+
 
 def append_pageserver_param_overrides(
     params_to_update: List[str],
@@ -2749,7 +2887,7 @@ class NeonBroker:
         log.info(f'starting storage_broker to listen incoming connections at "{listen_addr}"')
         with open(self.logfile, "wb") as logfile:
             args = [
-                self.neon_binpath / "storage_broker",
+                str(self.neon_binpath / "storage_broker"),
                 f"--listen-addr={listen_addr}",
             ]
             self.handle = subprocess.Popen(args, stdout=logfile, stderr=logfile)
@@ -2785,6 +2923,10 @@ def get_test_output_dir(request: FixtureRequest, top_output_dir: Path) -> Path:
     return test_dir
 
 
+def get_test_repo_dir(request: FixtureRequest, top_output_dir: Path) -> Path:
+    return get_test_output_dir(request, top_output_dir) / "repo"
+
+
 def pytest_addoption(parser: Parser):
     parser.addoption(
         "--preserve-database-files",
@@ -2961,13 +3103,55 @@ def check_restored_datadir_content(
     assert (mismatch, error) == ([], [])
 
 
-def assert_no_in_progress_downloads_for_tenant(
-    pageserver_http_client: PageserverHttpClient,
-    tenant: TenantId,
+def wait_until(number_of_iterations: int, interval: float, func):
+    """
+    Wait until 'func' returns successfully, without exception. Returns the
+    last return value from the function.
+    """
+    last_exception = None
+    for i in range(number_of_iterations):
+        try:
+            res = func()
+        except Exception as e:
+            log.info("waiting for %s iteration %s failed", func, i + 1)
+            last_exception = e
+            time.sleep(interval)
+            continue
+        return res
+    raise Exception("timed out while waiting for %s" % func) from last_exception
+
+
+def wait_while(number_of_iterations: int, interval: float, func):
+    """
+    Wait until 'func' returns false, or throws an exception.
+    """
+    for i in range(number_of_iterations):
+        try:
+            if not func():
+                return
+            log.info("waiting for %s iteration %s failed", func, i + 1)
+            time.sleep(interval)
+            continue
+        except Exception:
+            return
+    raise Exception("timed out while waiting for %s" % func)
+
+
+def assert_tenant_status(
+    pageserver_http_client: PageserverHttpClient, tenant: TenantId, expected_status: str
 ):
     tenant_status = pageserver_http_client.tenant_status(tenant)
-    assert tenant_status["has_in_progress_downloads"] is False, tenant_status
-    assert tenant_status["state"] == "Active"
+    log.info(f"tenant_status: {tenant_status}")
+    assert tenant_status["state"] == expected_status, tenant_status
+
+
+def tenant_exists(ps_http: PageserverHttpClient, tenant_id: TenantId):
+    tenants = ps_http.tenant_list()
+    matching = [t for t in tenants if TenantId(t["id"]) == tenant_id]
+    assert len(matching) < 2
+    if len(matching) == 0:
+        return None
+    return matching[0]
 
 
 def remote_consistent_lsn(
@@ -2975,14 +3159,15 @@ def remote_consistent_lsn(
 ) -> Lsn:
     detail = pageserver_http_client.timeline_detail(tenant, timeline)
 
-    lsn_str = detail["remote_consistent_lsn"]
-    if lsn_str is None:
+    if detail["remote_consistent_lsn"] is None:
         # No remote information at all. This happens right after creating
         # a timeline, before any part of it has been uploaded to remote
         # storage yet.
         return Lsn(0)
-    assert isinstance(lsn_str, str)
-    return Lsn(lsn_str)
+    else:
+        lsn_str = detail["remote_consistent_lsn"]
+        assert isinstance(lsn_str, str)
+        return Lsn(lsn_str)
 
 
 def wait_for_upload(
@@ -2995,6 +3180,7 @@ def wait_for_upload(
     for i in range(20):
         current_lsn = remote_consistent_lsn(pageserver_http_client, tenant, timeline)
         if current_lsn >= lsn:
+            log.info("wait finished")
             return
         log.info(
             "waiting for remote_consistent_lsn to reach {}, now {}, iteration {}".format(
@@ -3084,3 +3270,34 @@ def fork_at_current_lsn(
     """
     current_lsn = pg.safe_psql("SELECT pg_current_wal_lsn()")[0][0]
     return env.neon_cli.create_branch(new_branch_name, ancestor_branch_name, tenant_id, current_lsn)
+
+
+def wait_for_sk_commit_lsn_to_arrive_at_pageserver_last_record_lsn(
+    tenant_id: TenantId,
+    timeline_id: TimelineId,
+    safekeepers: List[Safekeeper],
+    pageserver: NeonPageserver,
+):
+    sk_commit_lsns = [
+        sk.http_client().timeline_status(tenant_id, timeline_id).commit_lsn for sk in safekeepers
+    ]
+    lsn = max(sk_commit_lsns)
+    ps_http = pageserver.http_client()
+    wait_for_last_record_lsn(ps_http, tenant_id, timeline_id, lsn)
+    return lsn
+
+
+def wait_for_sk_commit_lsn_to_reach_remote_storage(
+    tenant_id: TenantId,
+    timeline_id: TimelineId,
+    safekeepers: List[Safekeeper],
+    pageserver: NeonPageserver,
+):
+    lsn = wait_for_sk_commit_lsn_to_arrive_at_pageserver_last_record_lsn(
+        tenant_id, timeline_id, safekeepers, pageserver
+    )
+    ps_http = pageserver.http_client()
+    # force a checkpoint to trigger upload
+    ps_http.timeline_checkpoint(tenant_id, timeline_id)
+    wait_for_upload(ps_http, tenant_id, timeline_id, lsn)
+    return lsn
diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py
index 1fb9eb72e6..df83fc6377 100644
--- a/test_runner/fixtures/utils.py
+++ b/test_runner/fixtures/utils.py
@@ -148,7 +148,7 @@ def get_scale_for_db(size_mb: int) -> int:
 
 
 ATTACHMENT_NAME_REGEX: re.Pattern = re.compile(  # type: ignore[type-arg]
-    r"flamegraph\.svg|regression\.diffs|.+\.(?:log|stderr|stdout|filediff|metrics|html)"
+    r"regression\.diffs|.+\.(?:log|stderr|stdout|filediff|metrics|html)"
 )
 
 
diff --git a/test_runner/performance/README.md b/test_runner/performance/README.md
index a32ce87c33..c1a57fb28b 100644
--- a/test_runner/performance/README.md
+++ b/test_runner/performance/README.md
@@ -1,12 +1,8 @@
 # Running locally
 
-First make a release build. The profiling flag is optional, used only for tests that
-generate flame graphs. The `-s` flag just silences a lot of output, and makes it
+First make a release build. The `-s` flag silences a lot of output, and makes it
 easier to see if you have compile errors without scrolling up.
-`BUILD_TYPE=release CARGO_BUILD_FLAGS="--features=testing,profiling" make -s -j8`
-
-NOTE: the `profiling` flag only works on linux because we use linux-specific
-libc APIs like `libc::timer_t`.
+`BUILD_TYPE=release CARGO_BUILD_FLAGS="--features=testing" make -s -j8`
 
 Then run the tests
 `NEON_BIN=./target/release poetry run pytest test_runner/performance"`
diff --git a/test_runner/performance/test_copy.py b/test_runner/performance/test_copy.py
index 01b2097112..a91c78e867 100644
--- a/test_runner/performance/test_copy.py
+++ b/test_runner/performance/test_copy.py
@@ -1,5 +1,6 @@
 from contextlib import closing
 from io import BufferedReader, RawIOBase
+from typing import Optional
 
 from fixtures.compare_fixtures import PgCompare
 
@@ -8,7 +9,7 @@ class CopyTestData(RawIOBase):
     def __init__(self, rows: int):
         self.rows = rows
         self.rownum = 0
-        self.linebuf = None
+        self.linebuf: Optional[bytes] = None
         self.ptr = 0
 
     def readable(self):
diff --git a/test_runner/performance/test_perf_pgbench.py b/test_runner/performance/test_perf_pgbench.py
index 015cc40a72..2b8760dff2 100644
--- a/test_runner/performance/test_perf_pgbench.py
+++ b/test_runner/performance/test_perf_pgbench.py
@@ -8,14 +8,14 @@ from typing import Dict, List
 
 import pytest
 from fixtures.benchmark_fixture import MetricReport, PgBenchInitResult, PgBenchRunResult
-from fixtures.compare_fixtures import NeonCompare, PgCompare
+from fixtures.compare_fixtures import PgCompare
 from fixtures.utils import get_scale_for_db
 
 
 @enum.unique
 class PgBenchLoadType(enum.Enum):
     INIT = "init"
-    SIMPLE_UPDATE = "simple_update"
+    SIMPLE_UPDATE = "simple-update"
     SELECT_ONLY = "select-only"
 
 
@@ -94,7 +94,9 @@ def run_test_pgbench(env: PgCompare, scale: int, duration: int, workload_type: P
 
     if workload_type == PgBenchLoadType.INIT:
         # Run initialize
-        init_pgbench(env, ["pgbench", f"-s{scale}", "-i", connstr], password=password)
+        init_pgbench(
+            env, ["pgbench", f"-s{scale}", "-i", "-I", "dtGvp", connstr], password=password
+        )
 
     if workload_type == PgBenchLoadType.SIMPLE_UPDATE:
         # Run simple-update workload
@@ -174,28 +176,6 @@ def test_pgbench(neon_with_baseline: PgCompare, scale: int, duration: int):
     run_test_pgbench(neon_with_baseline, scale, duration, PgBenchLoadType.SELECT_ONLY)
 
 
-# Run the pgbench tests, and generate a flamegraph from it
-# This requires that the pageserver was built with the 'profiling' feature.
-#
-# TODO: If the profiling is cheap enough, there's no need to run the same test
-# twice, with and without profiling. But for now, run it separately, so that we
-# can see how much overhead the profiling adds.
-@pytest.mark.parametrize("scale", get_scales_matrix())
-@pytest.mark.parametrize("duration", get_durations_matrix())
-def test_pgbench_flamegraph(zenbenchmark, pg_bin, neon_env_builder, scale: int, duration: int):
-    neon_env_builder.pageserver_config_override = """
-profiling="page_requests"
-"""
-    env = neon_env_builder.init_start()
-    env.pageserver.is_profiling_enabled_or_skip()
-    env.neon_cli.create_branch("empty", "main")
-
-    neon_compare = NeonCompare(zenbenchmark, env, pg_bin, "pgbench")
-    run_test_pgbench(neon_compare, scale, duration, PgBenchLoadType.INIT)
-    run_test_pgbench(neon_compare, scale, duration, PgBenchLoadType.SIMPLE_UPDATE)
-    run_test_pgbench(neon_compare, scale, duration, PgBenchLoadType.SELECT_ONLY)
-
-
 # The following 3 tests run on an existing database as it was set up by previous tests,
 # and leaves the database in a state that would be used in the next tests.
 # Modifying the definition order of these functions or adding other remote tests in between will alter results.
diff --git a/test_runner/performance/test_seqscans.py b/test_runner/performance/test_seqscans.py
index a61d64553d..bd84724405 100644
--- a/test_runner/performance/test_seqscans.py
+++ b/test_runner/performance/test_seqscans.py
@@ -22,15 +22,16 @@ from pytest_lazyfixture import lazy_fixture  # type: ignore
     ],
 )
 @pytest.mark.parametrize(
-    "env, scale",
+    "env,scale",
     [
         # Run on all envs. Use 200x larger table on remote cluster to make sure
         # it doesn't fit in shared buffers, which are larger on remote than local.
         pytest.param(lazy_fixture("neon_compare"), 1, id="neon"),
         pytest.param(lazy_fixture("vanilla_compare"), 1, id="vanilla"),
-        pytest.param(
-            lazy_fixture("remote_compare"), 200, id="remote", marks=pytest.mark.remote_cluster
-        ),
+        # Reenable after switching per-test projects created via API
+        # pytest.param(
+        #     lazy_fixture("remote_compare"), 200, id="remote", marks=pytest.mark.remote_cluster
+        # ),
     ],
 )
 def test_seqscans(env: PgCompare, scale: int, rows: int, iters: int, workers: int):
@@ -45,7 +46,7 @@ def test_seqscans(env: PgCompare, scale: int, rows: int, iters: int, workers: in
             # Verify that the table is larger than shared_buffers
             cur.execute(
                 """
-            select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('t') as tbl_ize
+            select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('t') as tbl_size
             from pg_settings where name = 'shared_buffers'
             """
             )
diff --git a/test_runner/regress/test_branch_and_gc.py b/test_runner/regress/test_branch_and_gc.py
index dfbf956568..cc807b7ff3 100644
--- a/test_runner/regress/test_branch_and_gc.py
+++ b/test_runner/regress/test_branch_and_gc.py
@@ -84,6 +84,7 @@ def test_branch_and_gc(neon_simple_env: NeonEnv):
 
     # Set the GC horizon so that lsn1 is inside the horizon, which means
     # we can create a new branch starting from lsn1.
+    pageserver_http_client.timeline_checkpoint(tenant, timeline_main)
     pageserver_http_client.timeline_gc(tenant, timeline_main, lsn2 - lsn1 + 1024)
 
     env.neon_cli.create_branch(
@@ -156,6 +157,7 @@ def test_branch_creation_before_gc(neon_simple_env: NeonEnv):
     # branch creation task but the individual timeline GC iteration happens *after*
     # the branch creation task.
     pageserver_http_client.configure_failpoints(("before-timeline-gc", "sleep(2000)"))
+    pageserver_http_client.timeline_checkpoint(tenant, b0)
 
     def do_gc():
         pageserver_http_client.timeline_gc(tenant, b0, 0)
diff --git a/test_runner/regress/test_branch_behind.py b/test_runner/regress/test_branch_behind.py
index a841e3ced2..d19f6a7d39 100644
--- a/test_runner/regress/test_branch_behind.py
+++ b/test_runner/regress/test_branch_behind.py
@@ -109,6 +109,7 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder):
 
     # check that we cannot create branch based on garbage collected data
     with env.pageserver.http_client() as pageserver_http:
+        pageserver_http.timeline_checkpoint(env.initial_tenant, timeline)
         gc_result = pageserver_http.timeline_gc(env.initial_tenant, timeline, 0)
         print_gc_result(gc_result)
 
diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py
index 71964f622f..05d5788028 100644
--- a/test_runner/regress/test_broken_timeline.py
+++ b/test_runner/regress/test_broken_timeline.py
@@ -15,7 +15,7 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder):
 
     env.pageserver.allowed_errors.extend(
         [
-            ".*Failed to load delta layer.*",
+            ".*Failed to reconstruct the page.*",
             ".*could not find data for key.*",
             ".*is not active. Current state: Broken.*",
             ".*will not become active. Current state: Broken.*",
@@ -87,9 +87,9 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder):
         f"As expected, compute startup failed eagerly for timeline with corrupt metadata: {err}"
     )
 
-    # Second timeline has no ancestors, only the metadata file and no layer files.
-    # That is checked explicitly in the pageserver, and causes the tenant to be marked
-    # as broken.
+    # Second timeline has no ancestors, only the metadata file and no layer files locally,
+    # and we don't have the remote storage enabled. It is loaded into memory, but getting
+    # the basebackup from it will fail.
     with pytest.raises(
         Exception, match=f"Tenant {tenant2} will not become active. Current state: Broken"
     ) as err:
@@ -97,8 +97,9 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder):
     log.info(f"As expected, compute startup failed for timeline with missing layers: {err}")
 
     # Third timeline will also fail during basebackup, because the layer file is corrupt.
+    # It will fail when we try to read (and reconstruct) a page from it, ergo the error message.
     # (We don't check layer file contents on startup, when loading the timeline)
-    with pytest.raises(Exception, match="Failed to load delta layer") as err:
+    with pytest.raises(Exception, match="Failed to reconstruct the page") as err:
         pg3.start()
     log.info(
         f"As expected, compute startup failed for timeline {tenant3}/{timeline3} with corrupt layers: {err}"
diff --git a/test_runner/regress/test_compute_ctl.py b/test_runner/regress/test_compute_ctl.py
index 74ee2a89d4..f973bd8e60 100644
--- a/test_runner/regress/test_compute_ctl.py
+++ b/test_runner/regress/test_compute_ctl.py
@@ -193,8 +193,8 @@ def test_sync_safekeepers_logs(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
             timeout=10,
         )
     except TimeoutExpired as exc:
-        ctl_logs = exc.stderr.decode("utf-8")
-        log.info("compute_ctl output:\n" + ctl_logs)
+        ctl_logs = (exc.stderr or b"").decode("utf-8")
+        log.info("compute_ctl output:\n{ctl_logs}")
 
     with ExternalProcessManager(Path(pgdata) / "postmaster.pid"):
         start = "starting safekeepers syncing"
@@ -240,7 +240,7 @@ class ExternalProcessManager:
         with self.pid_file:
             try:
                 os.kill(self.pid, signal.SIGTERM)
-            except os.OsError as e:
+            except OSError as e:
                 if not self.path.is_file():
                     return
                 log.info(f"Failed to kill {self.pid}, but the pidfile remains: {e}")
diff --git a/test_runner/regress/test_config.py b/test_runner/regress/test_config.py
old mode 100644
new mode 100755
diff --git a/test_runner/regress/test_gc_aggressive.py b/test_runner/regress/test_gc_aggressive.py
index 332bef225f..5f052bf81a 100644
--- a/test_runner/regress/test_gc_aggressive.py
+++ b/test_runner/regress/test_gc_aggressive.py
@@ -2,9 +2,17 @@ import asyncio
 import concurrent.futures
 import random
 
+import pytest
 from fixtures.log_helper import log
-from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, Postgres
-from fixtures.types import TimelineId
+from fixtures.metrics import parse_metrics
+from fixtures.neon_fixtures import (
+    NeonEnv,
+    NeonEnvBuilder,
+    Postgres,
+    RemoteStorageKind,
+    wait_for_last_flush_lsn,
+)
+from fixtures.types import TenantId, TimelineId
 from fixtures.utils import query_scalar
 
 # Test configuration
@@ -35,11 +43,13 @@ async def gc(env: NeonEnv, timeline: TimelineId):
 
     loop = asyncio.get_running_loop()
 
+    def do_gc():
+        pageserver_http.timeline_checkpoint(env.initial_tenant, timeline)
+        pageserver_http.timeline_gc(env.initial_tenant, timeline, 0)
+
     with concurrent.futures.ThreadPoolExecutor() as pool:
         while updates_performed < updates_to_perform:
-            await loop.run_in_executor(
-                pool, lambda: pageserver_http.timeline_gc(env.initial_tenant, timeline, 0)
-            )
+            await loop.run_in_executor(pool, do_gc)
 
 
 # At the same time, run UPDATEs and GC
@@ -87,3 +97,81 @@ def test_gc_aggressive(neon_env_builder: NeonEnvBuilder):
         r = cur.fetchone()
         assert r is not None
         assert r == (num_rows, updates_to_perform)
+
+
+#
+@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
+def test_gc_index_upload(neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind):
+
+    # Disable time-based pitr, we will use LSN-based thresholds in the manual GC calls
+    neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}"
+
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=remote_storage_kind,
+        test_name="test_gc_index_upload",
+    )
+
+    env = neon_env_builder.init_start()
+    env.neon_cli.create_branch("test_gc_index_upload", "main")
+    pg = env.postgres.create_start("test_gc_index_upload")
+
+    pageserver_http = env.pageserver.http_client()
+
+    pg_conn = pg.connect()
+    cur = pg_conn.cursor()
+
+    tenant_id = TenantId(query_scalar(cur, "SHOW neon.tenant_id"))
+    timeline_id = TimelineId(query_scalar(cur, "SHOW neon.timeline_id"))
+
+    cur.execute("CREATE TABLE foo (id int, counter int, t text)")
+    cur.execute(
+        """
+        INSERT INTO foo
+        SELECT g, 0, 'long string to consume some space' || g
+        FROM generate_series(1, 100000) g
+        """
+    )
+
+    # Helper function that gets the number of given kind of remote ops from the metrics
+    def get_num_remote_ops(file_kind: str, op_kind: str) -> int:
+        ps_metrics = parse_metrics(env.pageserver.http_client().get_metrics(), "pageserver")
+        total = 0.0
+        for sample in ps_metrics.query_all(
+            name="pageserver_remote_operation_seconds_count",
+            filter={
+                "tenant_id": str(tenant_id),
+                "timeline_id": str(timeline_id),
+                "file_kind": str(file_kind),
+                "op_kind": str(op_kind),
+            },
+        ):
+            total += sample[2]
+        return int(total)
+
+    # Sanity check that the metric works
+    wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id)
+    pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
+    pageserver_http.timeline_gc(tenant_id, timeline_id, 10000)
+    before = get_num_remote_ops("index", "upload")
+    assert before > 0
+
+    # Run many cycles of GC. Then check that the number of index files
+    # uploads didn't grow much. In particular we don't want to re-upload the
+    # index file on every GC iteration, when it has no work to do.
+    #
+    # On each iteration, we use a slightly smaller GC horizon, so that the GC
+    # at least needs to check if it has work to do.
+    for i in range(100):
+        cur.execute("INSERT INTO foo VALUES (0, 0, 'foo')")
+        pageserver_http.timeline_gc(tenant_id, timeline_id, 10000 - i * 32)
+        num_index_uploads = get_num_remote_ops("index", "upload")
+
+        # Also make sure that a no-op compaction doesn't upload the index
+        # file unnecessarily.
+        pageserver_http.timeline_compact(tenant_id, timeline_id)
+
+        log.info(f"{num_index_uploads} index uploads after GC iteration {i}")
+
+    after = num_index_uploads
+    log.info(f"{after-before} new index uploads during test")
+    assert after - before < 5
diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py
index 1a99d13a0b..0388e24e98 100644
--- a/test_runner/regress/test_import.py
+++ b/test_runner/regress/test_import.py
@@ -53,10 +53,10 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build
     unpacked_base = os.path.join(basebackup_dir, "unpacked-base")
     corrupt_base_tar = os.path.join(unpacked_base, "corrupt-base.tar")
     os.mkdir(unpacked_base, 0o750)
-    subprocess_capture(str(test_output_dir), ["tar", "-xf", base_tar, "-C", unpacked_base])
+    subprocess_capture(test_output_dir, ["tar", "-xf", base_tar, "-C", unpacked_base])
     os.remove(os.path.join(unpacked_base, "global/pg_control"))
     subprocess_capture(
-        str(test_output_dir),
+        test_output_dir,
         ["tar", "-cf", "corrupt-base.tar"] + os.listdir(unpacked_base),
         cwd=unpacked_base,
     )
@@ -306,6 +306,7 @@ def _import(
 
     # Check that gc works
     pageserver_http = env.pageserver.http_client()
+    pageserver_http.timeline_checkpoint(tenant, timeline)
     pageserver_http.timeline_gc(tenant, timeline, 0)
 
     return tar_output_file
diff --git a/test_runner/regress/test_metric_collection.py b/test_runner/regress/test_metric_collection.py
new file mode 100644
index 0000000000..d1fcab7a62
--- /dev/null
+++ b/test_runner/regress/test_metric_collection.py
@@ -0,0 +1,163 @@
+import time
+
+import pytest
+from fixtures.log_helper import log
+from fixtures.metrics import parse_metrics
+from fixtures.neon_fixtures import (
+    NeonEnvBuilder,
+    PortDistributor,
+    RemoteStorageKind,
+    wait_for_last_flush_lsn,
+)
+from fixtures.types import TenantId, TimelineId
+from fixtures.utils import query_scalar
+from pytest_httpserver import HTTPServer
+from werkzeug.wrappers.request import Request
+from werkzeug.wrappers.response import Response
+
+
+@pytest.fixture(scope="session")
+def httpserver_listen_address(port_distributor: PortDistributor):
+    port = port_distributor.get_port()
+    return ("localhost", port)
+
+
+initial_tenant = TenantId.generate()
+remote_uploaded = 0
+checks = {
+    "written_size": lambda value: value > 0,
+    "resident_size": lambda value: value >= 0,
+    # >= 0 check here is to avoid race condition when we receive metrics before
+    # remote_uploaded is updated
+    "remote_storage_size": lambda value: value > 0 if remote_uploaded > 0 else value >= 0,
+    # logical size may lag behind the actual size, so allow 0 here
+    "timeline_logical_size": lambda value: value >= 0,
+}
+
+metric_kinds_checked = set([])
+
+
+#
+# verify that metrics look minilally sane
+#
+def metrics_handler(request: Request) -> Response:
+    if request.json is None:
+        return Response(status=400)
+
+    events = request.json["events"]
+    log.info("received events:")
+    log.info(events)
+
+    for event in events:
+        assert event["tenant_id"] == str(
+            initial_tenant
+        ), "Expecting metrics only from the initial tenant"
+        metric_name = event["metric"]
+
+        check = checks.get(metric_name)
+        # calm down mypy
+        if check is not None:
+            assert check(event["value"]), f"{metric_name} isn't valid"
+            global metric_kinds_checked
+            metric_kinds_checked.add(metric_name)
+
+    return Response(status=200)
+
+
+@pytest.mark.parametrize(
+    "remote_storage_kind", [RemoteStorageKind.NOOP, RemoteStorageKind.LOCAL_FS]
+)
+def test_metric_collection(
+    httpserver: HTTPServer,
+    neon_env_builder: NeonEnvBuilder,
+    httpserver_listen_address,
+    remote_storage_kind: RemoteStorageKind,
+):
+    (host, port) = httpserver_listen_address
+    metric_collection_endpoint = f"http://{host}:{port}/billing/api/v1/usage_events"
+
+    # Require collecting metrics frequently, since we change
+    # the timeline and want something to be logged about it.
+    #
+    # Disable time-based pitr, we will use the manual GC calls
+    # to trigger remote storage operations in a controlled way
+    neon_env_builder.pageserver_config_override = (
+        f"""
+        metric_collection_interval="1s"
+        metric_collection_endpoint="{metric_collection_endpoint}"
+    """
+        + "tenant_config={pitr_interval = '0 sec'}"
+    )
+
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=remote_storage_kind,
+        test_name="test_metric_collection",
+    )
+
+    log.info(f"test_metric_collection endpoint is {metric_collection_endpoint}")
+
+    # Set initial tenant of the test, that we expect the logs from
+    global initial_tenant
+    initial_tenant = neon_env_builder.initial_tenant
+    # mock http server that returns OK for the metrics
+    httpserver.expect_request("/billing/api/v1/usage_events", method="POST").respond_with_handler(
+        metrics_handler
+    )
+
+    # spin up neon,  after http server is ready
+    env = neon_env_builder.init_start()
+    # Order of fixtures shutdown is not specified, and if http server gets down
+    # before pageserver, pageserver log might contain such errors in the end.
+    env.pageserver.allowed_errors.append(".*metrics endpoint refused the sent metrics*")
+    env.neon_cli.create_branch("test_metric_collection")
+    pg = env.postgres.create_start("test_metric_collection")
+
+    pg_conn = pg.connect()
+    cur = pg_conn.cursor()
+
+    tenant_id = TenantId(query_scalar(cur, "SHOW neon.tenant_id"))
+    timeline_id = TimelineId(query_scalar(cur, "SHOW neon.timeline_id"))
+
+    cur.execute("CREATE TABLE foo (id int, counter int, t text)")
+    cur.execute(
+        """
+        INSERT INTO foo
+        SELECT g, 0, 'long string to consume some space' || g
+        FROM generate_series(1, 100000) g
+        """
+    )
+
+    # Helper function that gets the number of given kind of remote ops from the metrics
+    def get_num_remote_ops(file_kind: str, op_kind: str) -> int:
+        ps_metrics = parse_metrics(env.pageserver.http_client().get_metrics(), "pageserver")
+        total = 0.0
+        for sample in ps_metrics.query_all(
+            name="pageserver_remote_operation_seconds_count",
+            filter={
+                "tenant_id": str(tenant_id),
+                "timeline_id": str(timeline_id),
+                "file_kind": str(file_kind),
+                "op_kind": str(op_kind),
+            },
+        ):
+            total += sample[2]
+        return int(total)
+
+    # upload some data to remote storage
+    if remote_storage_kind == RemoteStorageKind.LOCAL_FS:
+        wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id)
+        pageserver_http = env.pageserver.http_client()
+        pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
+        pageserver_http.timeline_gc(tenant_id, timeline_id, 10000)
+        global remote_uploaded
+        remote_uploaded = get_num_remote_ops("index", "upload")
+        assert remote_uploaded > 0
+
+    # wait longer than collecting interval and check that all requests are served
+    time.sleep(3)
+    httpserver.check()
+    global metric_kinds_checked, checks
+    expected_checks = set(checks.keys())
+    assert len(metric_kinds_checked) == len(
+        checks
+    ), f"Expected to receive and check all kind of metrics, but {expected_checks - metric_kinds_checked} got uncovered"
diff --git a/test_runner/regress/test_neon_local_cli.py b/test_runner/regress/test_neon_local_cli.py
new file mode 100644
index 0000000000..bd0f550ba5
--- /dev/null
+++ b/test_runner/regress/test_neon_local_cli.py
@@ -0,0 +1,17 @@
+from fixtures.neon_fixtures import NeonEnvBuilder, PortDistributor
+
+
+# Test that neon cli is able to start and stop all processes with the user defaults.
+# Repeats the example from README.md as close as it can
+def test_neon_cli_basics(neon_env_builder: NeonEnvBuilder, port_distributor: PortDistributor):
+    env = neon_env_builder.init_configs()
+    # Skipping the init step that creates a local tenant in Pytest tests
+    try:
+        env.neon_cli.start()
+        env.neon_cli.create_tenant(tenant_id=env.initial_tenant, set_default=True)
+        env.neon_cli.pg_start(node_name="main", port=port_distributor.get_port())
+
+        env.neon_cli.create_branch(new_branch_name="migration_check")
+        env.neon_cli.pg_start(node_name="migration_check", port=port_distributor.get_port())
+    finally:
+        env.neon_cli.stop()
diff --git a/test_runner/regress/test_old_request_lsn.py b/test_runner/regress/test_old_request_lsn.py
index 3e387bb6cc..9885a811e1 100644
--- a/test_runner/regress/test_old_request_lsn.py
+++ b/test_runner/regress/test_old_request_lsn.py
@@ -45,7 +45,7 @@ def test_old_request_lsn(neon_env_builder: NeonEnvBuilder):
     # will cause GetPage requests.
     cur.execute(
         """
-        select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_ize
+        select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_size
         from pg_settings where name = 'shared_buffers'
     """
     )
@@ -59,6 +59,7 @@ def test_old_request_lsn(neon_env_builder: NeonEnvBuilder):
     # Make a lot of updates on a single row, generating a lot of WAL. Trigger
     # garbage collections so that the page server will remove old page versions.
     for i in range(10):
+        pageserver_http.timeline_checkpoint(env.initial_tenant, timeline)
         gc_result = pageserver_http.timeline_gc(env.initial_tenant, timeline, 0)
         print_gc_result(gc_result)
 
diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py
new file mode 100644
index 0000000000..184dc13888
--- /dev/null
+++ b/test_runner/regress/test_ondemand_download.py
@@ -0,0 +1,440 @@
+# It's possible to run any regular test with the local fs remote storage via
+# env ZENITH_PAGESERVER_OVERRIDES="remote_storage={local_path='/tmp/neon_zzz/'}" poetry ......
+
+from pathlib import Path
+
+import pytest
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import (
+    NeonEnvBuilder,
+    RemoteStorageKind,
+    assert_tenant_status,
+    available_remote_storages,
+    wait_for_last_record_lsn,
+    wait_for_sk_commit_lsn_to_reach_remote_storage,
+    wait_for_upload,
+    wait_until,
+)
+from fixtures.types import Lsn
+from fixtures.utils import query_scalar
+
+
+def get_num_downloaded_layers(client, tenant_id, timeline_id):
+    value = client.get_metric_value(
+        f'pageserver_remote_operation_seconds_count{{file_kind="layer",op_kind="download",status="success",tenant_id="{tenant_id}",timeline_id="{timeline_id}"}}'
+    )
+    if value is None:
+        return 0
+    return int(value)
+
+
+#
+# If you have a large relation, check that the pageserver downloads parts of it as
+# require by queries.
+#
+@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
+def test_ondemand_download_large_rel(
+    neon_env_builder: NeonEnvBuilder,
+    remote_storage_kind: RemoteStorageKind,
+):
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=remote_storage_kind,
+        test_name="test_ondemand_download_large_rel",
+    )
+
+    ##### First start, insert secret data and upload it to the remote storage
+    env = neon_env_builder.init_start()
+
+    # Override defaults, to create more layers
+    tenant, _ = env.neon_cli.create_tenant(
+        conf={
+            # disable background GC
+            "gc_period": "10 m",
+            "gc_horizon": f"{10 * 1024 ** 3}",  # 10 GB
+            # small checkpoint distance to create more delta layer files
+            "checkpoint_distance": f"{10 * 1024 ** 2}",  # 10 MB
+            "compaction_threshold": "3",
+            "compaction_target_size": f"{10 * 1024 ** 2}",  # 10 MB
+        }
+    )
+    env.initial_tenant = tenant
+
+    pg = env.postgres.create_start("main")
+
+    client = env.pageserver.http_client()
+
+    tenant_id = pg.safe_psql("show neon.tenant_id")[0][0]
+    timeline_id = pg.safe_psql("show neon.timeline_id")[0][0]
+
+    # We want to make sure that the data is large enough that the keyspace is partitioned.
+    num_rows = 1000000
+
+    with pg.cursor() as cur:
+        # data loading may take a while, so increase statement timeout
+        cur.execute("SET statement_timeout='300s'")
+        cur.execute(
+            f"""CREATE TABLE tbl AS SELECT g as id, 'long string to consume some space' || g
+        from generate_series(1,{num_rows}) g"""
+        )
+        cur.execute("CREATE INDEX ON tbl (id)")
+        cur.execute("VACUUM tbl")
+
+        current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
+
+    # wait until pageserver receives that data
+    wait_for_last_record_lsn(client, tenant_id, timeline_id, current_lsn)
+
+    # run checkpoint manually to be sure that data landed in remote storage
+    client.timeline_checkpoint(tenant_id, timeline_id)
+
+    # wait until pageserver successfully uploaded a checkpoint to remote storage
+    wait_for_upload(client, tenant_id, timeline_id, current_lsn)
+    log.info("uploads have finished")
+
+    ##### Stop the first pageserver instance, erase all its data
+    pg.stop()
+    env.pageserver.stop()
+
+    # remove all the layer files
+    for layer in (Path(env.repo_dir) / "tenants").glob("*/timelines/*/*-*_*"):
+        log.info(f"unlinking layer {layer}")
+        layer.unlink()
+
+    ##### Second start, restore the data and ensure it's the same
+    env.pageserver.start()
+
+    pg.start()
+    before_downloads = get_num_downloaded_layers(client, tenant_id, timeline_id)
+
+    # Probe in the middle of the table. There's a high chance that the beginning
+    # and end of the table was stored together in the same layer files with data
+    # from other tables, and with the entry that stores the size of the
+    # relation, so they are likely already downloaded. But the middle of the
+    # table should not have been needed by anything yet.
+    with pg.cursor() as cur:
+        assert query_scalar(cur, "select count(*) from tbl where id = 500000") == 1
+
+    after_downloads = get_num_downloaded_layers(client, tenant_id, timeline_id)
+    log.info(f"layers downloaded before {before_downloads} and after {after_downloads}")
+    assert after_downloads > before_downloads
+
+
+#
+# If you have a relation with a long history of updates, the pageserver downloads the layer
+# files containing the history as needed by timetravel queries.
+#
+@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
+def test_ondemand_download_timetravel(
+    neon_env_builder: NeonEnvBuilder,
+    remote_storage_kind: RemoteStorageKind,
+):
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=remote_storage_kind,
+        test_name="test_ondemand_download_timetravel",
+    )
+
+    ##### First start, insert data and upload it to the remote storage
+    env = neon_env_builder.init_start()
+
+    # Override defaults, to create more layers
+    tenant, _ = env.neon_cli.create_tenant(
+        conf={
+            # Disable background GC & compaction
+            # We don't want GC, that would break the assertion about num downloads.
+            # We don't want background compaction, we force a compaction every time we do explicit checkpoint.
+            "gc_period": "0s",
+            "compaction_period": "0s",
+            # small checkpoint distance to create more delta layer files
+            "checkpoint_distance": f"{1 * 1024 ** 2}",  # 1 MB
+            "compaction_threshold": "1",
+            "image_creation_threshold": "1",
+            "compaction_target_size": f"{1 * 1024 ** 2}",  # 1 MB
+        }
+    )
+    env.initial_tenant = tenant
+
+    pg = env.postgres.create_start("main")
+
+    client = env.pageserver.http_client()
+
+    tenant_id = pg.safe_psql("show neon.tenant_id")[0][0]
+    timeline_id = pg.safe_psql("show neon.timeline_id")[0][0]
+
+    lsns = []
+
+    table_len = 10000
+    with pg.cursor() as cur:
+        cur.execute(
+            f"""
+        CREATE TABLE testtab(id serial primary key, checkpoint_number int, data text);
+        INSERT INTO testtab (checkpoint_number, data) SELECT 0, 'data' FROM generate_series(1, {table_len});
+        """
+        )
+        current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
+    # wait until pageserver receives that data
+    wait_for_last_record_lsn(client, tenant_id, timeline_id, current_lsn)
+    # run checkpoint manually to be sure that data landed in remote storage
+    client.timeline_checkpoint(tenant_id, timeline_id)
+    lsns.append((0, current_lsn))
+
+    for checkpoint_number in range(1, 20):
+        with pg.cursor() as cur:
+            cur.execute(f"UPDATE testtab SET checkpoint_number = {checkpoint_number}")
+            current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
+        lsns.append((checkpoint_number, current_lsn))
+
+        # wait until pageserver receives that data
+        wait_for_last_record_lsn(client, tenant_id, timeline_id, current_lsn)
+
+        # run checkpoint manually to be sure that data landed in remote storage
+        client.timeline_checkpoint(tenant_id, timeline_id)
+
+    ##### Stop the first pageserver instance, erase all its data
+    env.postgres.stop_all()
+
+    # wait until pageserver has successfully uploaded all the data to remote storage
+    wait_for_sk_commit_lsn_to_reach_remote_storage(
+        tenant_id, timeline_id, env.safekeepers, env.pageserver
+    )
+
+    def get_api_current_physical_size():
+        d = client.timeline_detail(tenant_id, timeline_id)
+        return d["current_physical_size"]
+
+    def get_resident_physical_size():
+        return client.get_timeline_metric(
+            tenant_id, timeline_id, "pageserver_resident_physical_size"
+        )
+
+    filled_current_physical = get_api_current_physical_size()
+    log.info(filled_current_physical)
+    filled_size = get_resident_physical_size()
+    log.info(filled_size)
+    assert filled_current_physical == filled_size, "we don't yet do layer eviction"
+
+    env.pageserver.stop()
+
+    # remove all the layer files
+    for layer in (Path(env.repo_dir) / "tenants").glob("*/timelines/*/*-*_*"):
+        log.info(f"unlinking layer {layer}")
+        layer.unlink()
+
+    ##### Second start, restore the data and ensure it's the same
+    env.pageserver.start()
+
+    wait_until(10, 0.2, lambda: assert_tenant_status(client, tenant_id, "Active"))
+
+    # The current_physical_size reports the sum of layers loaded in the layer
+    # map, regardless of where the layer files are located. So even though we
+    # just removed the local files, they still count towards
+    # current_physical_size because they are loaded as `RemoteLayer`s.
+    assert filled_current_physical == get_api_current_physical_size()
+
+    # Run queries at different points in time
+    num_layers_downloaded = [0]
+    resident_size = [get_resident_physical_size()]
+    for (checkpoint_number, lsn) in lsns:
+        pg_old = env.postgres.create_start(
+            branch_name="main", node_name=f"test_old_lsn_{checkpoint_number}", lsn=lsn
+        )
+        with pg_old.cursor() as cur:
+            # assert query_scalar(cur, f"select count(*) from testtab where checkpoint_number={checkpoint_number}") == 100000
+            assert (
+                query_scalar(
+                    cur,
+                    f"select count(*) from testtab where checkpoint_number<>{checkpoint_number}",
+                )
+                == 0
+            )
+            assert (
+                query_scalar(
+                    cur,
+                    f"select count(*) from testtab where checkpoint_number={checkpoint_number}",
+                )
+                == table_len
+            )
+
+        after_downloads = get_num_downloaded_layers(client, tenant_id, timeline_id)
+        num_layers_downloaded.append(after_downloads)
+        log.info(f"num_layers_downloaded[-1]={num_layers_downloaded[-1]}")
+
+        # Check that on each query, we need to download at least one more layer file. However in
+        # practice, thanks to compaction and the fact that some requests need to download
+        # more history, some points-in-time are covered by earlier downloads already. But
+        # in broad strokes, as we query more points-in-time, more layers need to be downloaded.
+        #
+        # Do a fuzzy check on that, by checking that after each point-in-time, we have downloaded
+        # more files than we had three iterations ago.
+        log.info(f"layers downloaded after checkpoint {checkpoint_number}: {after_downloads}")
+        if len(num_layers_downloaded) > 4:
+            assert after_downloads > num_layers_downloaded[len(num_layers_downloaded) - 4]
+
+        # Likewise, assert that the resident_physical_size metric grows as layers are downloaded
+        resident_size.append(get_resident_physical_size())
+        log.info(f"resident_size[-1]={resident_size[-1]}")
+        if len(resident_size) > 4:
+            assert resident_size[-1] > resident_size[len(resident_size) - 4]
+
+        # current_physical_size reports the total size of all layer files, whether
+        # they are present only in the remote storage, only locally, or both.
+        # It should not change.
+        assert filled_current_physical == get_api_current_physical_size()
+
+
+#
+# Ensure that the `download_remote_layers` API works
+#
+@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
+def test_download_remote_layers_api(
+    neon_env_builder: NeonEnvBuilder,
+    remote_storage_kind: RemoteStorageKind,
+):
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=remote_storage_kind,
+        test_name="test_download_remote_layers_api",
+    )
+
+    ##### First start, insert data and upload it to the remote storage
+    env = neon_env_builder.init_start()
+
+    # Override defaults, to create more layers
+    tenant, _ = env.neon_cli.create_tenant(
+        conf={
+            # Disable background GC & compaction
+            # We don't want GC, that would break the assertion about num downloads.
+            # We don't want background compaction, we force a compaction every time we do explicit checkpoint.
+            "gc_period": "0s",
+            "compaction_period": "0s",
+            # small checkpoint distance to create more delta layer files
+            "checkpoint_distance": f"{1 * 1024 ** 2}",  # 1 MB
+            "compaction_threshold": "1",
+            "image_creation_threshold": "1",
+            "compaction_target_size": f"{1 * 1024 ** 2}",  # 1 MB
+        }
+    )
+    env.initial_tenant = tenant
+
+    pg = env.postgres.create_start("main")
+
+    client = env.pageserver.http_client()
+
+    tenant_id = pg.safe_psql("show neon.tenant_id")[0][0]
+    timeline_id = pg.safe_psql("show neon.timeline_id")[0][0]
+
+    table_len = 10000
+    with pg.cursor() as cur:
+        cur.execute(
+            f"""
+        CREATE TABLE testtab(id serial primary key, checkpoint_number int, data text);
+        INSERT INTO testtab (checkpoint_number, data) SELECT 0, 'data' FROM generate_series(1, {table_len});
+        """
+        )
+
+    env.postgres.stop_all()
+
+    wait_for_sk_commit_lsn_to_reach_remote_storage(
+        tenant_id, timeline_id, env.safekeepers, env.pageserver
+    )
+
+    def get_api_current_physical_size():
+        d = client.timeline_detail(tenant_id, timeline_id)
+        return d["current_physical_size"]
+
+    def get_resident_physical_size():
+        return client.get_timeline_metric(
+            tenant_id, timeline_id, "pageserver_resident_physical_size"
+        )
+
+    filled_current_physical = get_api_current_physical_size()
+    log.info(filled_current_physical)
+    filled_size = get_resident_physical_size()
+    log.info(filled_size)
+    assert filled_current_physical == filled_size, "we don't yet do layer eviction"
+
+    env.pageserver.stop()
+
+    # remove all the layer files
+    # XXX only delete some of the layer files, to show that it really just downloads all the layers
+    for layer in (Path(env.repo_dir) / "tenants").glob("*/timelines/*/*-*_*"):
+        log.info(f"unlinking layer {layer}")
+        layer.unlink()
+
+    # Shut down safekeepers before starting the pageserver.
+    # If we don't, the tenant's walreceiver handler will trigger the
+    # the logical size computation task, and that downloads layes,
+    # which makes our assertions on size fail.
+    for sk in env.safekeepers:
+        sk.stop(immediate=True)
+
+    ##### Second start, restore the data and ensure it's the same
+    env.pageserver.start(extra_env_vars={"FAILPOINTS": "remote-storage-download-pre-rename=return"})
+    env.pageserver.allowed_errors.extend(
+        [
+            f".*download_all_remote_layers.*{tenant_id}.*{timeline_id}.*layer download failed.*remote-storage-download-pre-rename failpoint",
+            f".*initial size calculation.*{tenant_id}.*{timeline_id}.*Failed to calculate logical size",
+        ]
+    )
+
+    wait_until(10, 0.2, lambda: assert_tenant_status(client, tenant_id, "Active"))
+
+    ###### Phase 1: exercise download error code path
+    assert (
+        filled_current_physical == get_api_current_physical_size()
+    ), "current_physical_size is sum of loaded layer sizes, independent of whether local or remote"
+    post_unlink_size = get_resident_physical_size()
+    log.info(post_unlink_size)
+    assert (
+        post_unlink_size < filled_size
+    ), "we just deleted layers and didn't cause anything to re-download them yet"
+    assert filled_size - post_unlink_size > 5 * (
+        1024**2
+    ), "we may be downloading some layers as part of tenant activation"
+
+    # issue downloads that we know will fail
+    info = client.timeline_download_remote_layers(
+        tenant_id, timeline_id, errors_ok=True, at_least_one_download=False
+    )
+    log.info(f"info={info}")
+    assert info["state"] == "Completed"
+    assert info["total_layer_count"] > 0
+    assert info["successful_download_count"] == 0
+    assert (
+        info["failed_download_count"] > 0
+    )  # can't assert == total_layer_count because attach + tenant status downloads some layers
+    assert (
+        info["total_layer_count"]
+        == info["successful_download_count"] + info["failed_download_count"]
+    )
+    assert get_api_current_physical_size() == filled_current_physical
+    assert (
+        get_resident_physical_size() == post_unlink_size
+    ), "didn't download anything new due to failpoint"
+    # would be nice to assert that the layers in the layer map are still RemoteLayer
+
+    ##### Retry, this time without failpoints
+    client.configure_failpoints(("remote-storage-download-pre-rename", "off"))
+    info = client.timeline_download_remote_layers(tenant_id, timeline_id, errors_ok=False)
+    log.info(f"info={info}")
+
+    assert info["state"] == "Completed"
+    assert info["total_layer_count"] > 0
+    assert info["successful_download_count"] > 0
+    assert info["failed_download_count"] == 0
+    assert (
+        info["total_layer_count"]
+        == info["successful_download_count"] + info["failed_download_count"]
+    )
+
+    refilled_size = get_resident_physical_size()
+    log.info(refilled_size)
+
+    assert filled_size == refilled_size, "we redownloaded all the layers"
+    assert get_api_current_physical_size() == filled_current_physical
+
+    for sk in env.safekeepers:
+        sk.start()
+
+    # ensure that all the data is back
+    pg_old = env.postgres.create_start(branch_name="main")
+    with pg_old.cursor() as cur:
+        assert query_scalar(cur, "select count(*) from testtab") == table_len
diff --git a/test_runner/regress/test_pageserver_restart.py b/test_runner/regress/test_pageserver_restart.py
index e48815906b..6388e979e5 100644
--- a/test_runner/regress/test_pageserver_restart.py
+++ b/test_runner/regress/test_pageserver_restart.py
@@ -32,7 +32,7 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder):
     # Verify that the table is larger than shared_buffers
     cur.execute(
         """
-        select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_ize
+        select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_size
         from pg_settings where name = 'shared_buffers'
     """
     )
@@ -115,7 +115,7 @@ def test_pageserver_chaos(neon_env_builder: NeonEnvBuilder):
             # Verify that the table is larger than shared_buffers
             cur.execute(
                 """
-            select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_ize
+            select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_size
             from pg_settings where name = 'shared_buffers'
             """
             )
diff --git a/test_runner/regress/test_pitr_gc.py b/test_runner/regress/test_pitr_gc.py
index d8b7256577..fe4fbc0927 100644
--- a/test_runner/regress/test_pitr_gc.py
+++ b/test_runner/regress/test_pitr_gc.py
@@ -52,6 +52,7 @@ def test_pitr_gc(neon_env_builder: NeonEnvBuilder):
 
     # run GC
     with env.pageserver.http_client() as pageserver_http:
+        pageserver_http.timeline_checkpoint(env.initial_tenant, timeline)
         pageserver_http.timeline_compact(env.initial_tenant, timeline)
         # perform aggressive GC. Data still should be kept because of the PITR setting.
         gc_result = pageserver_http.timeline_gc(env.initial_tenant, timeline, 0)
diff --git a/test_runner/regress/test_proxy.py b/test_runner/regress/test_proxy.py
index eab9505fbb..e13ba51f4b 100644
--- a/test_runner/regress/test_proxy.py
+++ b/test_runner/regress/test_proxy.py
@@ -63,7 +63,11 @@ async def test_psql_session_id(vanilla_pg: VanillaPostgres, link_proxy: NeonProx
                         "port": local_vanilla_pg.default_options["port"],
                         "dbname": local_vanilla_pg.default_options["dbname"],
                         "user": pg_user,
-                        "project": "irrelevant",
+                        "aux": {
+                            "project_id": "project",
+                            "endpoint_id": "endpoint",
+                            "branch_id": "branch",
+                        },
                     }
                 },
             }
@@ -71,6 +75,7 @@ async def test_psql_session_id(vanilla_pg: VanillaPostgres, link_proxy: NeonProx
 
         log.info("sending session activation message")
         psql = await PSQL(host=link_proxy.host, port=link_proxy.mgmt_port).run(db_info)
+        assert psql.stdout is not None
         out = (await psql.stdout.read()).decode("utf-8").strip()
         assert out == "ok"
 
@@ -122,3 +127,33 @@ def test_auth_errors(static_proxy: NeonProxy):
     # Finally, check that the user can connect
     with static_proxy.connect(user="pinocchio", password="magic", options="project=irrelevant"):
         pass
+
+
+def test_forward_params_to_client(static_proxy: NeonProxy):
+    # A subset of parameters (GUCs) which postgres
+    # sends to the client during connection setup.
+    # Unfortunately, `GUC_REPORT` can't be queried.
+    # Proxy *should* forward them, otherwise client library
+    # might misbehave (e.g. parse timestamps incorrectly).
+    reported_params_subset = [
+        "client_encoding",
+        "integer_datetimes",
+        "is_superuser",
+        "server_encoding",
+        "server_version",
+        "session_authorization",
+        "standard_conforming_strings",
+    ]
+
+    query = """
+        select name, setting
+        from pg_catalog.pg_settings
+        where name = any(%s)
+    """
+
+    with static_proxy.connect(options="project=irrelevant") as conn:
+        with conn.cursor() as cur:
+            cur.execute(query, (reported_params_subset,))
+            for name, value in cur.fetchall():
+                # Check that proxy has forwarded this parameter.
+                assert conn.get_parameter_status(name) == value
diff --git a/test_runner/regress/test_recovery.py b/test_runner/regress/test_recovery.py
index 1e93958e98..09644eaaa1 100644
--- a/test_runner/regress/test_recovery.py
+++ b/test_runner/regress/test_recovery.py
@@ -12,11 +12,9 @@ def test_pageserver_recovery(neon_env_builder: NeonEnvBuilder):
     # Override default checkpointer settings to run it more often
     neon_env_builder.pageserver_config_override = "tenant_config={checkpoint_distance = 1048576}"
 
-    env = neon_env_builder.init()
+    env = neon_env_builder.init_start()
     env.pageserver.is_testing_enabled_or_skip()
 
-    neon_env_builder.start()
-
     # These warnings are expected, when the pageserver is restarted abruptly
     env.pageserver.allowed_errors.append(".*found future delta layer.*")
     env.pageserver.allowed_errors.append(".*found future image layer.*")
diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py
index 7152bc8b6a..82bf741a8f 100644
--- a/test_runner/regress/test_remote_storage.py
+++ b/test_runner/regress/test_remote_storage.py
@@ -2,11 +2,11 @@
 # env NEON_PAGESERVER_OVERRIDES="remote_storage={local_path='/tmp/neon_zzz/'}" poetry ......
 
 import os
-import re
 import shutil
 import threading
 import time
 from pathlib import Path
+from typing import Dict, List, Tuple
 
 import pytest
 from fixtures.log_helper import log
@@ -14,7 +14,6 @@ from fixtures.neon_fixtures import (
     NeonEnvBuilder,
     PageserverApiException,
     RemoteStorageKind,
-    assert_no_in_progress_downloads_for_tenant,
     available_remote_storages,
     wait_for_last_flush_lsn,
     wait_for_last_record_lsn,
@@ -56,10 +55,15 @@ def test_remote_storage_backup_and_restore(
         test_name="test_remote_storage_backup_and_restore",
     )
 
-    data_id = 1
-    data_secret = "very secret secret"
+    # Exercise retry code path by making all uploads and downloads fail for the
+    # first time. The retries print INFO-messages to the log; we will check
+    # that they are present after the test.
+    neon_env_builder.pageserver_config_override = "test_remote_failures=1"
 
-    ##### First start, insert secret data and upload it to the remote storage
+    data_id = 1
+    data = "just some data"
+
+    ##### First start, insert data and upload it to the remote storage
     env = neon_env_builder.init_start()
 
     # FIXME: Is this expected?
@@ -76,6 +80,7 @@ def test_remote_storage_backup_and_restore(
     env.pageserver.allowed_errors.append(
         ".*Cannot attach tenant .*?, local tenant directory already exists.*"
     )
+    env.pageserver.allowed_errors.append(".*simulated failure of remote operation.*")
 
     pageserver_http = env.pageserver.http_client()
     pg = env.postgres.create_start("main")
@@ -87,22 +92,12 @@ def test_remote_storage_backup_and_restore(
 
     checkpoint_numbers = range(1, 3)
 
-    # On the first iteration, exercise retry code path by making the uploads
-    # fail for the first 3 times
-    action = "3*return->off"
-    pageserver_http.configure_failpoints(
-        [
-            ("before-upload-layer", action),
-            ("before-upload-index", action),
-        ]
-    )
-
     for checkpoint_number in checkpoint_numbers:
         with pg.cursor() as cur:
             cur.execute(
                 f"""
-                CREATE TABLE t{checkpoint_number}(id int primary key, secret text);
-                INSERT INTO t{checkpoint_number} VALUES ({data_id}, '{data_secret}|{checkpoint_number}');
+                CREATE TABLE t{checkpoint_number}(id int primary key, data text);
+                INSERT INTO t{checkpoint_number} VALUES ({data_id}, '{data}|{checkpoint_number}');
             """
             )
             current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
@@ -118,6 +113,14 @@ def test_remote_storage_backup_and_restore(
         wait_for_upload(client, tenant_id, timeline_id, current_lsn)
         log.info(f"upload of checkpoint {checkpoint_number} is done")
 
+    # Check that we had to retry the uploads
+    assert env.pageserver.log_contains(
+        ".*failed to perform remote task UploadLayer.*, will retry.*"
+    )
+    assert env.pageserver.log_contains(
+        ".*failed to perform remote task UploadMetadata.*, will retry.*"
+    )
+
     ##### Stop the first pageserver instance, erase all its data
     env.postgres.stop_all()
     env.pageserver.stop()
@@ -129,36 +132,53 @@ def test_remote_storage_backup_and_restore(
     ##### Second start, restore the data and ensure it's the same
     env.pageserver.start()
 
-    # Introduce failpoint in download
-    pageserver_http.configure_failpoints(("remote-storage-download-pre-rename", "return"))
-
+    # Introduce failpoint in list remote timelines code path to make tenant_attach fail.
+    # This is before the failures injected by test_remote_failures, so it's a permanent error.
+    pageserver_http.configure_failpoints(("storage-sync-list-remote-timelines", "return"))
+    env.pageserver.allowed_errors.append(
+        ".*error attaching tenant: storage-sync-list-remote-timelines",
+    )
+    # Attach it. This HTTP request will succeed and launch a
+    # background task to load the tenant. In that background task,
+    # listing the remote timelines will fail because of the failpoint,
+    # and the tenant will be marked as Broken.
     client.tenant_attach(tenant_id)
-
-    # is there a better way to assert that failpoint triggered?
     wait_until_tenant_state(pageserver_http, tenant_id, "Broken", 15)
 
-    # assert cannot attach timeline that is scheduled for download
-    # FIXME implement layer download retries
+    # Ensure that even though the tenant is broken, we can't attach it again.
     with pytest.raises(Exception, match=f"tenant {tenant_id} already exists, state: Broken"):
         client.tenant_attach(tenant_id)
 
-    tenant_status = client.tenant_status(tenant_id)
-    log.info("Tenant status with active failpoint: %s", tenant_status)
-    # FIXME implement layer download retries
-    # assert tenant_status["has_in_progress_downloads"] is True
-
-    # trigger temporary download files removal
+    # Restart again, this implicitly clears the failpoint.
+    # test_remote_failures=1 remains active, though, as it's in the pageserver config.
+    # This means that any of the remote client operations after restart will exercise the
+    # retry code path.
+    #
+    # The initiated attach operation should survive the restart, and continue from where it was.
     env.pageserver.stop()
+    layer_download_failed_regex = (
+        r"download.*[0-9A-F]+-[0-9A-F]+.*open a download stream for layer.*simulated failure"
+    )
+    assert not env.pageserver.log_contains(
+        layer_download_failed_regex
+    ), "we shouldn't have tried any layer downloads yet since list remote timelines has a failpoint"
     env.pageserver.start()
 
-    # ensure that an initiated attach operation survives pageserver restart
+    # Ensure that the pageserver remembers that the tenant was attaching, by
+    # trying to attach it again. It should fail.
     with pytest.raises(Exception, match=f"tenant {tenant_id} already exists, state:"):
         client.tenant_attach(tenant_id)
-    log.info("waiting for timeline redownload")
+    log.info("waiting for tenant to become active. this should be quick with on-demand download")
+
+    def tenant_active():
+        all_states = client.tenant_list()
+        [tenant] = [t for t in all_states if TenantId(t["id"]) == tenant_id]
+        assert tenant["state"] == "Active"
+
     wait_until(
-        number_of_iterations=20,
+        number_of_iterations=5,
         interval=1,
-        func=lambda: assert_no_in_progress_downloads_for_tenant(client, tenant_id),
+        func=tenant_active,
     )
 
     detail = client.timeline_detail(tenant_id, timeline_id)
@@ -167,14 +187,18 @@ def test_remote_storage_backup_and_restore(
         Lsn(detail["last_record_lsn"]) >= current_lsn
     ), "current db Lsn should should not be less than the one stored on remote storage"
 
+    log.info("select some data, this will cause layers to be downloaded")
     pg = env.postgres.create_start("main")
     with pg.cursor() as cur:
         for checkpoint_number in checkpoint_numbers:
             assert (
-                query_scalar(cur, f"SELECT secret FROM t{checkpoint_number} WHERE id = {data_id};")
-                == f"{data_secret}|{checkpoint_number}"
+                query_scalar(cur, f"SELECT data FROM t{checkpoint_number} WHERE id = {data_id};")
+                == f"{data}|{checkpoint_number}"
             )
 
+    log.info("ensure that we neede to retry downloads due to test_remote_failures=1")
+    assert env.pageserver.log_contains(layer_download_failed_regex)
+
 
 # Exercises the upload queue retry code paths.
 # - Use failpoints to cause all storage ops to fail
@@ -247,14 +271,15 @@ def test_remote_storage_upload_queue_retries(
         wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id)
 
     def get_queued_count(file_kind, op_kind):
-        metrics = client.get_metrics()
-        matches = re.search(
-            f'^pageserver_remote_upload_queue_unfinished_tasks{{file_kind="{file_kind}",op_kind="{op_kind}",tenant_id="{tenant_id}",timeline_id="{timeline_id}"}} (\\S+)$',
-            metrics,
-            re.MULTILINE,
+        val = client.get_remote_timeline_client_metric(
+            "pageserver_remote_timeline_client_calls_unfinished",
+            tenant_id,
+            timeline_id,
+            file_kind,
+            op_kind,
         )
-        assert matches
-        return int(matches[1])
+        assert val is not None, "expecting metric to be present"
+        return int(val)
 
     # create some layers & wait for uploads to finish
     overwrite_data_and_wait_for_it_to_arrive_at_pageserver("a")
@@ -334,7 +359,6 @@ def test_remote_storage_upload_queue_retries(
     def tenant_active():
         all_states = client.tenant_list()
         [tenant] = [t for t in all_states if TenantId(t["id"]) == tenant_id]
-        assert tenant["has_in_progress_downloads"] is False
         assert tenant["state"] == "Active"
 
     wait_until(30, 1, tenant_active)
@@ -345,6 +369,168 @@ def test_remote_storage_upload_queue_retries(
         assert query_scalar(cur, "SELECT COUNT(*) FROM foo WHERE val = 'd'") == 10000
 
 
+@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
+def test_remote_timeline_client_calls_started_metric(
+    neon_env_builder: NeonEnvBuilder,
+    remote_storage_kind: RemoteStorageKind,
+):
+
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=remote_storage_kind,
+        test_name="test_remote_timeline_client_metrics",
+    )
+
+    env = neon_env_builder.init_start()
+
+    # create tenant with config that will determinstically allow
+    # compaction and gc
+    tenant_id, timeline_id = env.neon_cli.create_tenant(
+        conf={
+            # small checkpointing and compaction targets to ensure we generate many upload operations
+            "checkpoint_distance": f"{128 * 1024}",
+            "compaction_threshold": "1",
+            "compaction_target_size": f"{128 * 1024}",
+            # no PITR horizon, we specify the horizon when we request on-demand GC
+            "pitr_interval": "0s",
+            # disable background compaction and GC. We invoke it manually when we want it to happen.
+            "gc_period": "0s",
+            "compaction_period": "0s",
+            # don't create image layers, that causes just noise
+            "image_creation_threshold": "10000",
+        }
+    )
+
+    client = env.pageserver.http_client()
+
+    pg = env.postgres.create_start("main", tenant_id=tenant_id)
+
+    pg.safe_psql("CREATE TABLE foo (id INTEGER PRIMARY KEY, val text)")
+
+    def overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data):
+        # create initial set of layers & upload them with failpoints configured
+        pg.safe_psql_many(
+            [
+                f"""
+               INSERT INTO foo (id, val)
+               SELECT g, '{data}'
+               FROM generate_series(1, 10000) g
+               ON CONFLICT (id) DO UPDATE
+               SET val = EXCLUDED.val
+               """,
+                # to ensure that GC can actually remove some layers
+                "VACUUM foo",
+            ]
+        )
+        wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id)
+
+    def get_queued_count(file_kind, op_kind):
+        val = client.get_remote_timeline_client_metric(
+            "pageserver_remote_timeline_client_calls_unfinished",
+            tenant_id,
+            timeline_id,
+            file_kind,
+            op_kind,
+        )
+        if val is None:
+            return val
+        return int(val)
+
+    def wait_upload_queue_empty():
+        wait_until(2, 1, lambda: get_queued_count(file_kind="layer", op_kind="upload") == 0)
+        wait_until(2, 1, lambda: get_queued_count(file_kind="index", op_kind="upload") == 0)
+        wait_until(2, 1, lambda: get_queued_count(file_kind="layer", op_kind="delete") == 0)
+
+    calls_started: Dict[Tuple[str, str], List[int]] = {
+        ("layer", "upload"): [0],
+        ("index", "upload"): [0],
+        ("layer", "delete"): [0],
+    }
+
+    def fetch_calls_started():
+        for (file_kind, op_kind), observations in calls_started.items():
+            val = client.get_remote_timeline_client_metric(
+                "pageserver_remote_timeline_client_calls_started_count",
+                tenant_id,
+                timeline_id,
+                file_kind,
+                op_kind,
+            )
+            assert val is not None, f"expecting metric to be present: {file_kind} {op_kind}"
+            val = int(val)
+            observations.append(val)
+
+    def ensure_calls_started_grew():
+        for (file_kind, op_kind), observations in calls_started.items():
+            log.info(f"ensure_calls_started_grew: {file_kind} {op_kind}: {observations}")
+            assert all(
+                x < y for x, y in zip(observations, observations[1:])
+            ), f"observations for {file_kind} {op_kind} did not grow monotonically: {observations}"
+
+    def churn(data_pass1, data_pass2):
+        overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data_pass1)
+        client.timeline_checkpoint(tenant_id, timeline_id)
+        client.timeline_compact(tenant_id, timeline_id)
+        overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data_pass2)
+        client.timeline_checkpoint(tenant_id, timeline_id)
+        client.timeline_compact(tenant_id, timeline_id)
+        gc_result = client.timeline_gc(tenant_id, timeline_id, 0)
+        print_gc_result(gc_result)
+        assert gc_result["layers_removed"] > 0
+
+    # create some layers & wait for uploads to finish
+    churn("a", "b")
+
+    wait_upload_queue_empty()
+
+    # ensure that we updated the calls_started metric
+    fetch_calls_started()
+    ensure_calls_started_grew()
+
+    # more churn to cause more operations
+    churn("c", "d")
+
+    # ensure that the calls_started metric continued to be updated
+    fetch_calls_started()
+    ensure_calls_started_grew()
+
+    ### now we exercise the download path
+    calls_started.clear()
+    calls_started.update(
+        {
+            ("index", "download"): [0],
+            ("layer", "download"): [0],
+        }
+    )
+
+    env.pageserver.stop(immediate=True)
+    env.postgres.stop_all()
+
+    dir_to_clear = Path(env.repo_dir) / "tenants"
+    shutil.rmtree(dir_to_clear)
+    os.mkdir(dir_to_clear)
+
+    env.pageserver.start()
+    client = env.pageserver.http_client()
+
+    client.tenant_attach(tenant_id)
+
+    def tenant_active():
+        all_states = client.tenant_list()
+        [tenant] = [t for t in all_states if TenantId(t["id"]) == tenant_id]
+        assert tenant["state"] == "Active"
+
+    wait_until(30, 1, tenant_active)
+
+    log.info("restarting postgres to validate")
+    pg = env.postgres.create_start("main", tenant_id=tenant_id)
+    with pg.cursor() as cur:
+        assert query_scalar(cur, "SELECT COUNT(*) FROM foo WHERE val = 'd'") == 10000
+
+    # ensure that we updated the calls_started download metric
+    fetch_calls_started()
+    ensure_calls_started_grew()
+
+
 # Test that we correctly handle timeline with layers stuck in upload queue
 @pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
 def test_timeline_deletion_with_files_stuck_in_upload_queue(
@@ -378,14 +564,14 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue(
     client = env.pageserver.http_client()
 
     def get_queued_count(file_kind, op_kind):
-        metrics = client.get_metrics()
-        matches = re.search(
-            f'^pageserver_remote_upload_queue_unfinished_tasks{{file_kind="{file_kind}",op_kind="{op_kind}",tenant_id="{tenant_id}",timeline_id="{timeline_id}"}} (\\S+)$',
-            metrics,
-            re.MULTILINE,
+        val = client.get_remote_timeline_client_metric(
+            "pageserver_remote_timeline_client_calls_unfinished",
+            tenant_id,
+            timeline_id,
+            file_kind,
+            op_kind,
         )
-        assert matches
-        return int(matches[1])
+        return int(val) if val is not None else val
 
     pg = env.postgres.create_start("main", tenant_id=tenant_id)
 
@@ -436,8 +622,8 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue(
 
     assert not timeline_path.exists()
 
-    # timeline deletion should kill ongoing uploads
-    assert get_queued_count(file_kind="index", op_kind="upload") == 0
+    # timeline deletion should kill ongoing uploads, so, the metric will be gone
+    assert get_queued_count(file_kind="index", op_kind="upload") is None
 
     # timeline deletion should be unblocking checkpoint ops
     checkpoint_thread.join(2.0)
diff --git a/test_runner/regress/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py
index 6d621fbb77..29cdcb18ce 100644
--- a/test_runner/regress/test_tenant_conf.py
+++ b/test_runner/regress/test_tenant_conf.py
@@ -59,7 +59,7 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}"""
                     "gc_horizon": 67108864,
                     "gc_period": 100,
                     "image_creation_threshold": 3,
-                    "pitr_interval": 2592000,
+                    "pitr_interval": 604800,  # 7 days
                 }.items()
             )
 
@@ -79,7 +79,7 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}"""
                     "gc_horizon": 67108864,
                     "gc_period": 30,
                     "image_creation_threshold": 3,
-                    "pitr_interval": 2592000,
+                    "pitr_interval": 604800,
                 }.items()
             )
 
@@ -107,7 +107,7 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}"""
                     "gc_horizon": 67108864,
                     "gc_period": 80,
                     "image_creation_threshold": 3,
-                    "pitr_interval": 2592000,
+                    "pitr_interval": 604800,
                 }.items()
             )
 
@@ -130,7 +130,7 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}"""
                     "gc_horizon": 67108864,
                     "gc_period": 80,
                     "image_creation_threshold": 3,
-                    "pitr_interval": 2592000,
+                    "pitr_interval": 604800,
                 }.items()
             )
 
diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py
index 59811c565c..db5bb679f2 100644
--- a/test_runner/regress/test_tenant_detach.py
+++ b/test_runner/regress/test_tenant_detach.py
@@ -1,9 +1,13 @@
+import asyncio
+import random
 import time
 from threading import Thread
 
+import asyncpg
 import pytest
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
+    NeonEnv,
     NeonEnvBuilder,
     PageserverApiException,
     PageserverHttpClient,
@@ -12,6 +16,7 @@ from fixtures.neon_fixtures import (
     available_remote_storages,
     wait_for_last_record_lsn,
     wait_for_upload,
+    wait_until,
     wait_until_tenant_state,
 )
 from fixtures.types import Lsn, TenantId, TimelineId
@@ -24,6 +29,7 @@ def do_gc_target(
     """Hack to unblock main, see https://github.com/neondatabase/neon/issues/2211"""
     try:
         log.info("sending gc http request")
+        pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
         pageserver_http.timeline_gc(tenant_id, timeline_id, 0)
     except Exception as e:
         log.error("do_gc failed: %s", e)
@@ -31,6 +37,202 @@ def do_gc_target(
         log.info("gc http thread returning")
 
 
+# Basic detach and re-attach test
+@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
+def test_tenant_reattach(
+    neon_env_builder: NeonEnvBuilder,
+    remote_storage_kind: RemoteStorageKind,
+):
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=remote_storage_kind,
+        test_name="test_tenant_reattach",
+    )
+
+    # Exercise retry code path by making all uploads and downloads fail for the
+    # first time. The retries print INFO-messages to the log; we will check
+    # that they are present after the test.
+    neon_env_builder.pageserver_config_override = "test_remote_failures=1"
+
+    env = neon_env_builder.init_start()
+    pageserver_http = env.pageserver.http_client()
+
+    # create new nenant
+    tenant_id, timeline_id = env.neon_cli.create_tenant()
+
+    pg = env.postgres.create_start("main", tenant_id=tenant_id)
+    with pg.cursor() as cur:
+        cur.execute("CREATE TABLE t(key int primary key, value text)")
+        cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'")
+        current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
+
+    # Wait for the all data to be processed by the pageserver and uploaded in remote storage
+    wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, current_lsn)
+    pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
+    wait_for_upload(pageserver_http, tenant_id, timeline_id, current_lsn)
+
+    # Check that we had to retry the uploads
+    assert env.pageserver.log_contains(
+        ".*failed to perform remote task UploadLayer.*, will retry.*"
+    )
+    assert env.pageserver.log_contains(
+        ".*failed to perform remote task UploadMetadata.*, will retry.*"
+    )
+
+    pageserver_http.tenant_detach(tenant_id)
+    pageserver_http.tenant_attach(tenant_id)
+
+    with pg.cursor() as cur:
+        assert query_scalar(cur, "SELECT count(*) FROM t") == 100000
+
+    # Check that we had to retry the downloads
+    assert env.pageserver.log_contains(".*list prefixes.*failed, will retry.*")
+    assert env.pageserver.log_contains(".*download.*failed, will retry.*")
+
+
+num_connections = 10
+num_rows = 100000
+updates_to_perform = 0
+
+updates_started = 0
+updates_finished = 0
+
+
+# Run random UPDATEs on test table. On failure, try again.
+async def update_table(pg_conn: asyncpg.Connection):
+    global updates_started, updates_finished, updates_to_perform
+
+    while updates_started < updates_to_perform or updates_to_perform == 0:
+        updates_started += 1
+        id = random.randrange(1, num_rows)
+
+        # Loop to retry until the UPDATE succeeds
+        while True:
+            try:
+                await pg_conn.fetchrow(f"UPDATE t SET counter = counter + 1 WHERE id = {id}")
+                updates_finished += 1
+                if updates_finished % 1000 == 0:
+                    log.info(f"update {updates_finished} / {updates_to_perform}")
+                break
+            except asyncpg.PostgresError as e:
+                # Received error from Postgres. Log it, sleep a little, and continue
+                log.info(f"UPDATE error: {e}")
+                await asyncio.sleep(0.1)
+
+
+async def sleep_and_reattach(pageserver_http: PageserverHttpClient, tenant_id: TenantId):
+    global updates_started, updates_finished, updates_to_perform
+
+    # Wait until we have performed some updates
+    wait_until(20, 0.5, lambda: updates_finished > 500)
+
+    log.info("Detaching tenant")
+    pageserver_http.tenant_detach(tenant_id)
+    await asyncio.sleep(1)
+    log.info("Re-attaching tenant")
+    pageserver_http.tenant_attach(tenant_id)
+    log.info("Re-attach finished")
+
+    # Continue with 5000 more updates
+    updates_to_perform = updates_started + 5000
+
+
+# async guts of test_tenant_reattach_while_bysy test
+async def reattach_while_busy(
+    env: NeonEnv, pg: Postgres, pageserver_http: PageserverHttpClient, tenant_id: TenantId
+):
+    workers = []
+    for worker_id in range(num_connections):
+        pg_conn = await pg.connect_async()
+        workers.append(asyncio.create_task(update_table(pg_conn)))
+
+    workers.append(asyncio.create_task(sleep_and_reattach(pageserver_http, tenant_id)))
+    await asyncio.gather(*workers)
+
+    assert updates_finished == updates_to_perform
+
+
+# Detach and re-attach tenant, while compute is busy running queries.
+#
+# Some of the queries may fail, in the window that the tenant has been
+# detached but not yet re-attached. But Postgres itself should keep
+# running, and when we retry the queries, they should start working
+# after the attach has finished.
+
+# FIXME:
+#
+# This is pretty unstable at the moment. I've seen it fail with a warning like this:
+#
+# AssertionError: assert not ['2023-01-05T13:09:40.708303Z  WARN remote_upload{tenant=c3fc41f6cf29a7626b90316e3518cd4b timeline=7978246f85faa71ab03...1282b/000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001716699-0000000001736681"\n']
+#
+# (https://neon-github-public-dev.s3.amazonaws.com/reports/pr-3232/debug/3846817847/index.html#suites/f9eba3cfdb71aa6e2b54f6466222829b/470fc62b5db7d7d7/)
+# I believe that failure happened because there is a race condition
+# between detach and starting remote upload tasks:
+#
+# 1. detach_timeline calls task_mgr::shutdown_tasks(), sending shutdown
+#    signal to all in-progress tasks associated with the tenant.
+# 2. Just after shutdown_tasks() has collected the list of tasks,
+#    a new remote-upload task is spawned.
+#
+# See https://github.com/neondatabase/neon/issues/3273
+#
+#
+# I also saw this failure:
+#
+# test_runner/regress/test_tenant_detach.py:194: in test_tenant_reattach_while_busy
+#     asyncio.run(reattach_while_busy(env, pg, pageserver_http, tenant_id))
+# /home/nonroot/.pyenv/versions/3.9.2/lib/python3.9/asyncio/runners.py:44: in run
+#     return loop.run_until_complete(main)
+# /home/nonroot/.pyenv/versions/3.9.2/lib/python3.9/asyncio/base_events.py:642: in run_until_complete
+#     return future.result()
+# test_runner/regress/test_tenant_detach.py:151: in reattach_while_busy
+#     assert updates_finished == updates_to_perform
+# E   assert 5010 == 10010
+# E     +5010
+# E     -10010
+#
+# I don't know what's causing that...
+@pytest.mark.skip(reason="fixme")
+@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
+def test_tenant_reattach_while_busy(
+    neon_env_builder: NeonEnvBuilder,
+    remote_storage_kind: RemoteStorageKind,
+):
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=remote_storage_kind,
+        test_name="test_tenant_reattach_while_busy",
+    )
+    env = neon_env_builder.init_start()
+
+    # Attempts to connect from compute to pageserver while the tenant is
+    # temporarily detached produces these errors in the pageserver log.
+    env.pageserver.allowed_errors.append(".*Tenant .* not found in the local state.*")
+    env.pageserver.allowed_errors.append(
+        ".*Tenant .* will not become active\\. Current state: Stopping.*"
+    )
+
+    pageserver_http = env.pageserver.http_client()
+
+    # create new nenant
+    tenant_id, timeline_id = env.neon_cli.create_tenant(
+        # Create layers aggressively
+        conf={"checkpoint_distance": "100000"}
+    )
+
+    pg = env.postgres.create_start("main", tenant_id=tenant_id)
+
+    cur = pg.connect().cursor()
+
+    cur.execute("CREATE TABLE t(id int primary key, counter int)")
+    cur.execute(f"INSERT INTO t SELECT generate_series(1,{num_rows}), 0")
+
+    # Run the test
+    asyncio.run(reattach_while_busy(env, pg, pageserver_http, tenant_id))
+
+    # Verify table contents
+    assert query_scalar(cur, "SELECT count(*) FROM t") == num_rows
+    assert query_scalar(cur, "SELECT sum(counter) FROM t") == updates_to_perform
+
+
 def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
     env = neon_env_builder.init_start()
     pageserver_http = env.pageserver.http_client()
@@ -417,7 +619,7 @@ def test_ignore_while_attaching(
     pageserver_http.tenant_attach(tenant_id)
     # Run ignore on the task, thereby cancelling the attach.
     # XXX This should take priority over attach, i.e., it should cancel the attach task.
-    # But neither the failpoint, nor the proper storage_sync2 download functions,
+    # But neither the failpoint, nor the proper storage_sync download functions,
     # are sensitive to task_mgr::shutdown.
     # This problem is tracked in https://github.com/neondatabase/neon/issues/2996 .
     # So, for now, effectively, this ignore here will block until attach task completes.
diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py
index 081fd0fc2f..1b58937e2a 100644
--- a/test_runner/regress/test_tenant_relocation.py
+++ b/test_runner/regress/test_tenant_relocation.py
@@ -13,12 +13,15 @@ from fixtures.neon_fixtures import (
     PageserverHttpClient,
     PortDistributor,
     Postgres,
-    assert_no_in_progress_downloads_for_tenant,
+    assert_tenant_status,
+    tenant_exists,
     wait_for_last_record_lsn,
     wait_for_upload,
+    wait_until,
+    wait_while,
 )
 from fixtures.types import Lsn, TenantId, TimelineId
-from fixtures.utils import query_scalar, start_in_background, subprocess_capture, wait_until
+from fixtures.utils import query_scalar, start_in_background, subprocess_capture
 
 
 def assert_abs_margin_ratio(a: float, b: float, margin_ratio: float):
@@ -406,17 +409,13 @@ def test_tenant_relocation(
             # call to attach timeline to new pageserver
             new_pageserver_http.tenant_attach(tenant_id)
 
-            # check that it shows that download is in progress
+            # wait for tenant to finish attaching
             tenant_status = new_pageserver_http.tenant_status(tenant_id=tenant_id)
-            assert tenant_status.get("has_in_progress_downloads"), tenant_status
-
-            # wait until tenant is downloaded
+            assert tenant_status["state"] in ["Attaching", "Active"]
             wait_until(
                 number_of_iterations=10,
                 interval=1,
-                func=lambda: assert_no_in_progress_downloads_for_tenant(
-                    new_pageserver_http, tenant_id
-                ),
+                func=lambda: assert_tenant_status(new_pageserver_http, tenant_id, "Active"),
             )
 
             check_timeline_attached(
@@ -459,9 +458,15 @@ def test_tenant_relocation(
 
         # detach tenant from old pageserver before we check
         # that all the data is there to be sure that old pageserver
-        # is no longer involved, and if it is, we will see the errors
+        # is no longer involved, and if it is, we will see the error
         pageserver_http.tenant_detach(tenant_id)
 
+        # Wait a little, so that the detach operation has time to finish.
+        wait_while(
+            number_of_iterations=100,
+            interval=1,
+            func=lambda: tenant_exists(pageserver_http, tenant_id),
+        )
         post_migration_check(pg_main, 500500, old_local_path_main)
         post_migration_check(pg_second, 1001000, old_local_path_second)
 
diff --git a/test_runner/regress/test_tenant_tasks.py b/test_runner/regress/test_tenant_tasks.py
index ddae1a67ff..4eba4ce942 100644
--- a/test_runner/regress/test_tenant_tasks.py
+++ b/test_runner/regress/test_tenant_tasks.py
@@ -20,44 +20,48 @@ def test_tenant_tasks(neon_env_builder: NeonEnvBuilder):
         matching = [t for t in all_states if TenantId(t["id"]) == tenant]
         return get_only_element(matching)["state"]
 
-    def get_metric_value(name):
-        metrics = client.get_metrics()
-        relevant = [line for line in metrics.splitlines() if line.startswith(name)]
-        if len(relevant) == 0:
-            return 0
-        line = get_only_element(relevant)
-        value = line.lstrip(name).strip()
-        return int(value)
-
     def delete_all_timelines(tenant: TenantId):
         timelines = [TimelineId(t["timeline_id"]) for t in client.timeline_list(tenant)]
         for t in timelines:
             client.timeline_delete(tenant, t)
 
+    def assert_active(tenant):
+        assert get_state(tenant) == "Active"
+
     # Create tenant, start compute
     tenant, _ = env.neon_cli.create_tenant()
     env.neon_cli.create_timeline(name, tenant_id=tenant)
     pg = env.postgres.create_start(name, tenant_id=tenant)
+    assert (
+        get_state(tenant) == "Active"
+    ), "Pageserver should activate a tenant and start background jobs if timelines are loaded"
 
     # Stop compute
     pg.stop()
 
-    # Delete all timelines on all tenants
+    # Delete all timelines on all tenants.
+    #
+    # FIXME: we used to check that the background jobs are stopped when all timelines
+    # are removed, but we don't stop them anymore. Not sure if this test still makes sense
+    # or we should just remove it.
     for tenant_info in client.tenant_list():
         tenant_id = TenantId(tenant_info["id"])
         delete_all_timelines(tenant_id)
+        wait_until(10, 0.2, lambda: assert_active(tenant_id))
 
     # Assert that all tasks finish quickly after tenant is detached
-    assert get_metric_value('pageserver_tenant_task_events{event="start"}') > 0
+    task_starts = client.get_metric_value('pageserver_tenant_task_events{event="start"}')
+    assert task_starts is not None
+    assert int(task_starts) > 0
     client.tenant_detach(tenant)
     client.tenant_detach(env.initial_tenant)
 
     def assert_tasks_finish():
-        tasks_started = get_metric_value('pageserver_tenant_task_events{event="start"}')
-        tasks_ended = get_metric_value('pageserver_tenant_task_events{event="stop"}')
-        tasks_panicked = get_metric_value('pageserver_tenant_task_events{event="panic"}')
+        tasks_started = client.get_metric_value('pageserver_tenant_task_events{event="start"}')
+        tasks_ended = client.get_metric_value('pageserver_tenant_task_events{event="stop"}')
+        tasks_panicked = client.get_metric_value('pageserver_tenant_task_events{event="panic"}')
         log.info(f"started {tasks_started}, ended {tasks_ended}, panicked {tasks_panicked}")
         assert tasks_started == tasks_ended
-        assert tasks_panicked == 0
+        assert tasks_panicked is None or int(tasks_panicked) == 0
 
     wait_until(10, 0.2, assert_tasks_finish)
diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py
index 0b20afefc3..9477ae3c25 100644
--- a/test_runner/regress/test_tenants.py
+++ b/test_runner/regress/test_tenants.py
@@ -7,7 +7,11 @@ from typing import List
 
 import pytest
 from fixtures.log_helper import log
-from fixtures.metrics import PAGESERVER_PER_TENANT_METRICS, parse_metrics
+from fixtures.metrics import (
+    PAGESERVER_PER_TENANT_METRICS,
+    PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS,
+    parse_metrics,
+)
 from fixtures.neon_fixtures import (
     NeonEnv,
     NeonEnvBuilder,
@@ -157,9 +161,21 @@ def test_metrics_normal_work(neon_env_builder: NeonEnvBuilder):
         )
 
 
-def test_pageserver_metrics_removed_after_detach(neon_env_builder: NeonEnvBuilder):
+@pytest.mark.parametrize(
+    "remote_storage_kind",
+    # exercise both the code paths where remote_storage=None and remote_storage=Some(...)
+    [RemoteStorageKind.NOOP, RemoteStorageKind.MOCK_S3],
+)
+def test_pageserver_metrics_removed_after_detach(
+    neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
+):
     """Tests that when a tenant is detached, the tenant specific metrics are not left behind"""
 
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=remote_storage_kind,
+        test_name="test_pageserver_metrics_removed_after_detach",
+    )
+
     neon_env_builder.num_safekeepers = 3
 
     env = neon_env_builder.init_start()
@@ -192,7 +208,11 @@ def test_pageserver_metrics_removed_after_detach(neon_env_builder: NeonEnvBuilde
 
     for tenant in [tenant_1, tenant_2]:
         pre_detach_samples = set([x.name for x in get_ps_metric_samples_for_tenant(tenant)])
-        assert pre_detach_samples == set(PAGESERVER_PER_TENANT_METRICS)
+        expected = set(PAGESERVER_PER_TENANT_METRICS)
+        if remote_storage_kind == RemoteStorageKind.NOOP:
+            # if there's no remote storage configured, we don't expose the remote timeline client metrics
+            expected -= set(PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS)
+        assert pre_detach_samples == expected
 
         env.pageserver.http_client().tenant_detach(tenant)
 
diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py
index afc413f3e3..6da6a4d446 100644
--- a/test_runner/regress/test_tenants_with_remote_storage.py
+++ b/test_runner/regress/test_tenants_with_remote_storage.py
@@ -21,9 +21,10 @@ from fixtures.neon_fixtures import (
     NeonEnvBuilder,
     Postgres,
     RemoteStorageKind,
-    assert_no_in_progress_downloads_for_tenant,
+    assert_tenant_status,
     available_remote_storages,
     wait_for_last_record_lsn,
+    wait_for_sk_commit_lsn_to_reach_remote_storage,
     wait_for_upload,
 )
 from fixtures.types import Lsn, TenantId, TimelineId
@@ -120,6 +121,11 @@ def test_tenants_attached_after_download(
     data_id = 1
     data_secret = "very secret secret"
 
+    # Exercise retry code path by making all uploads and downloads fail for the
+    # first time. The retries print INFO-messages to the log; we will check
+    # that they are present after the test.
+    neon_env_builder.pageserver_config_override = "test_remote_failures=1"
+
     ##### First start, insert secret data and upload it to the remote storage
     env = neon_env_builder.init_start()
 
@@ -158,26 +164,19 @@ def test_tenants_attached_after_download(
         wait_for_upload(client, tenant_id, timeline_id, current_lsn)
         log.info(f"upload of checkpoint {checkpoint_number} is done")
 
+    # Check that we had to retry the uploads
+    assert env.pageserver.log_contains(
+        ".*failed to perform remote task UploadLayer.*, will retry.*"
+    )
+    assert env.pageserver.log_contains(
+        ".*failed to perform remote task UploadMetadata.*, will retry.*"
+    )
+
     ##### Stop the pageserver, erase its layer file to force it being downloaded from S3
     env.postgres.stop_all()
 
-    sk_commit_lsns = [
-        sk.http_client().timeline_status(tenant_id, timeline_id).commit_lsn
-        for sk in env.safekeepers
-    ]
-    log.info("wait for pageserver to process all the WAL")
-    wait_for_last_record_lsn(client, tenant_id, timeline_id, max(sk_commit_lsns))
-    log.info("wait for it to reach remote storage")
-    pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
-    wait_for_upload(client, tenant_id, timeline_id, max(sk_commit_lsns))
-    log.info("latest safekeeper_commit_lsn reached remote storage")
-
-    detail_before = client.timeline_detail(
-        tenant_id, timeline_id, include_non_incremental_physical_size=True
-    )
-    assert (
-        detail_before["current_physical_size_non_incremental"]
-        == detail_before["current_physical_size"]
+    wait_for_sk_commit_lsn_to_reach_remote_storage(
+        tenant_id, timeline_id, env.safekeepers, env.pageserver
     )
 
     env.pageserver.stop()
@@ -193,13 +192,16 @@ def test_tenants_attached_after_download(
     assert local_layer_deleted, f"Found no local layer files to delete in directory {timeline_dir}"
 
     ##### Start the pageserver, forcing it to download the layer file and load the timeline into memory
+    # FIXME: just starting the pageserver no longer downloads the
+    # layer files. Do we want to force download, or maybe run some
+    # queries, or is it enough that it starts up without layer files?
     env.pageserver.start()
     client = env.pageserver.http_client()
 
     wait_until(
         number_of_iterations=5,
         interval=1,
-        func=lambda: assert_no_in_progress_downloads_for_tenant(client, tenant_id),
+        func=lambda: assert_tenant_status(client, tenant_id, "Active"),
     )
 
     restored_timelines = client.timeline_list(tenant_id)
@@ -211,11 +213,8 @@ def test_tenants_attached_after_download(
         timeline_id
     ), f"Tenant {tenant_id} should have its old timeline {timeline_id} restored from the remote storage"
 
-    # Check that the physical size matches after re-downloading
-    detail_after = client.timeline_detail(
-        tenant_id, timeline_id, include_non_incremental_physical_size=True
-    )
-    assert detail_before["current_physical_size"] == detail_after["current_physical_size"]
+    # Check that we had to retry the downloads
+    assert env.pageserver.log_contains(".*download .* succeeded after 1 retries.*")
 
 
 @pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
@@ -230,7 +229,7 @@ def test_tenant_upgrades_index_json_from_v0(
         "timeline_layers":[
             "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"
         ],
-        "missing_layers":[],
+        "missing_layers":["This should not fail as its not used anymore"],
         "disk_consistent_lsn":"0/16960E8",
         "metadata_bytes":[]
     }"""
@@ -262,7 +261,6 @@ def test_tenant_upgrades_index_json_from_v0(
     wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, current_lsn)
     pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
     wait_for_upload(pageserver_http, tenant_id, timeline_id, current_lsn)
-
     env.postgres.stop_all()
     env.pageserver.stop()
 
@@ -275,7 +273,10 @@ def test_tenant_upgrades_index_json_from_v0(
         # keep the deserialized for later inspection
         orig_index_part = json.load(timeline_file)
 
-        v0_index_part = {key: orig_index_part[key] for key in v0_skeleton}
+        v0_index_part = {
+            key: orig_index_part[key]
+            for key in v0_skeleton.keys() - ["missing_layers"]  # pgserver doesn't have it anymore
+        }
 
         timeline_file.seek(0)
         json.dump(v0_index_part, timeline_file)
@@ -287,7 +288,7 @@ def test_tenant_upgrades_index_json_from_v0(
     wait_until(
         number_of_iterations=5,
         interval=1,
-        func=lambda: assert_no_in_progress_downloads_for_tenant(pageserver_http, tenant_id),
+        func=lambda: assert_tenant_status(pageserver_http, tenant_id, "Active"),
     )
 
     pg = env.postgres.create_start("main")
@@ -307,7 +308,7 @@ def test_tenant_upgrades_index_json_from_v0(
     # make sure the file has been upgraded back to how it started
     index_part = local_fs_index_part(env, tenant_id, timeline_id)
     assert index_part["version"] == orig_index_part["version"]
-    assert index_part["missing_layers"] == orig_index_part["missing_layers"]
+    assert "missing_layers" not in index_part.keys()
 
     # expect one more layer because of the forced checkpoint
     assert len(index_part["timeline_layers"]) == len(orig_index_part["timeline_layers"]) + 1
@@ -394,7 +395,7 @@ def test_tenant_ignores_backup_file(
     wait_until(
         number_of_iterations=5,
         interval=1,
-        func=lambda: assert_no_in_progress_downloads_for_tenant(pageserver_http, tenant_id),
+        func=lambda: assert_tenant_status(pageserver_http, tenant_id, "Active"),
     )
 
     pg = env.postgres.create_start("main")
@@ -474,14 +475,15 @@ def test_tenant_redownloads_truncated_file_on_startup(
     index_part = local_fs_index_part(env, tenant_id, timeline_id)
     assert index_part["layer_metadata"][path.name]["file_size"] == expected_size
 
-    ##### Start the pageserver, forcing it to download the layer file and load the timeline into memory
+    ## Start the pageserver. It will notice that the file size doesn't match, and
+    ## rename away the local file. It will be re-downloaded when it's needed.
     env.pageserver.start()
     client = env.pageserver.http_client()
 
     wait_until(
         number_of_iterations=5,
         interval=1,
-        func=lambda: assert_no_in_progress_downloads_for_tenant(client, tenant_id),
+        func=lambda: assert_tenant_status(client, tenant_id, "Active"),
     )
 
     restored_timelines = client.timeline_list(tenant_id)
@@ -493,6 +495,10 @@ def test_tenant_redownloads_truncated_file_on_startup(
         timeline_id
     ), f"Tenant {tenant_id} should have its old timeline {timeline_id} restored from the remote storage"
 
+    # Request non-incremental logical size. Calculating it needs the layer file that
+    # we corrupted, forcing it to be redownloaded.
+    client.timeline_detail(tenant_id, timeline_id, include_non_incremental_logical_size=True)
+
     assert os.stat(path).st_size == expected_size, "truncated layer should had been re-downloaded"
 
     # the remote side of local_layer_truncated
diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py
index cef1f365cd..3b41cc5c90 100644
--- a/test_runner/regress/test_timeline_size.py
+++ b/test_runner/regress/test_timeline_size.py
@@ -1,22 +1,28 @@
 import math
+import queue
 import random
 import re
+import threading
 import time
 from contextlib import closing
 from pathlib import Path
 
 import psycopg2.errors
 import psycopg2.extras
+import pytest
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     NeonEnv,
     NeonEnvBuilder,
+    PageserverApiException,
     PageserverHttpClient,
     PgBin,
     PortDistributor,
     Postgres,
     VanillaPostgres,
+    assert_tenant_status,
     wait_for_last_flush_lsn,
+    wait_until,
 )
 from fixtures.types import TenantId, TimelineId
 from fixtures.utils import get_timeline_dir_size
@@ -213,6 +219,89 @@ def test_timeline_size_quota(neon_env_builder: NeonEnvBuilder):
     ), "after the WAL is streamed, current_logical_size is expected to be calculated and to be equal its non-incremental value"
 
 
+@pytest.mark.parametrize("deletion_method", ["tenant_detach", "timeline_delete"])
+def test_timeline_initial_logical_size_calculation_cancellation(
+    neon_env_builder: NeonEnvBuilder, deletion_method: str
+):
+    env = neon_env_builder.init_start()
+    client = env.pageserver.http_client()
+
+    tenant_id, timeline_id = env.neon_cli.create_tenant()
+
+    # load in some data
+    pg = env.postgres.create_start("main", tenant_id=tenant_id)
+    pg.safe_psql_many(
+        [
+            "CREATE TABLE foo (x INTEGER)",
+            "INSERT INTO foo SELECT g FROM generate_series(1, 10000) g",
+        ]
+    )
+    wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id)
+    pg.stop()
+
+    # restart with failpoint inside initial size calculation task
+    env.pageserver.stop()
+    env.pageserver.start(
+        extra_env_vars={"FAILPOINTS": "timeline-calculate-logical-size-pause=pause"}
+    )
+
+    def tenant_active():
+        all_states = client.tenant_list()
+        [tenant] = [t for t in all_states if TenantId(t["id"]) == tenant_id]
+        assert tenant["state"] == "Active"
+
+    wait_until(30, 1, tenant_active)
+
+    # kick off initial size calculation task (the response we get here is the estimated size)
+    def assert_size_calculation_not_done():
+        details = client.timeline_detail(
+            tenant_id, timeline_id, include_non_incremental_logical_size=True
+        )
+        assert details["current_logical_size"] != details["current_logical_size_non_incremental"]
+
+    assert_size_calculation_not_done()
+    # ensure we're really stuck
+    time.sleep(5)
+    assert_size_calculation_not_done()
+
+    log.info(
+        f"try to delete the timeline using {deletion_method}, this should cancel size computation tasks and wait for them to finish"
+    )
+    delete_timeline_success: queue.Queue[bool] = queue.Queue(maxsize=1)
+
+    def delete_timeline_thread_fn():
+        try:
+            if deletion_method == "tenant_detach":
+                client.tenant_detach(tenant_id)
+            elif deletion_method == "timeline_delete":
+                client.timeline_delete(tenant_id, timeline_id)
+            delete_timeline_success.put(True)
+        except PageserverApiException:
+            delete_timeline_success.put(False)
+            raise
+
+    delete_timeline_thread = threading.Thread(target=delete_timeline_thread_fn)
+    delete_timeline_thread.start()
+    # give it some time to settle in the state where it waits for size computation task
+    time.sleep(5)
+    if not delete_timeline_success.empty():
+        assert (
+            False
+        ), f"test is broken, the {deletion_method} should be stuck waiting for size computation task, got result {delete_timeline_success.get()}"
+
+    log.info(
+        "resume the size calculation. The failpoint checks that the timeline directory still exists."
+    )
+    client.configure_failpoints(("timeline-calculate-logical-size-check-dir-exists", "return"))
+    client.configure_failpoints(("timeline-calculate-logical-size-pause", "off"))
+
+    log.info("wait for delete timeline thread to finish and assert that it succeeded")
+    assert delete_timeline_success.get()
+
+    # if the implementation is incorrect, the teardown would complain about an error log
+    # message emitted by the code behind failpoint "timeline-calculate-logical-size-check-dir-exists"
+
+
 def test_timeline_physical_size_init(neon_simple_env: NeonEnv):
     env = neon_simple_env
     new_timeline_id = env.neon_cli.create_branch("test_timeline_physical_size_init")
@@ -233,7 +322,17 @@ def test_timeline_physical_size_init(neon_simple_env: NeonEnv):
     env.pageserver.stop()
     env.pageserver.start()
 
-    assert_physical_size(env, env.initial_tenant, new_timeline_id)
+    # Wait for the tenant to be loaded
+    client = env.pageserver.http_client()
+    wait_until(
+        number_of_iterations=5,
+        interval=1,
+        func=lambda: assert_tenant_status(client, env.initial_tenant, "Active"),
+    )
+
+    assert_physical_size_invariants(
+        get_physical_size_values(env, env.initial_tenant, new_timeline_id)
+    )
 
 
 def test_timeline_physical_size_post_checkpoint(neon_simple_env: NeonEnv):
@@ -254,7 +353,9 @@ def test_timeline_physical_size_post_checkpoint(neon_simple_env: NeonEnv):
     wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id)
     pageserver_http.timeline_checkpoint(env.initial_tenant, new_timeline_id)
 
-    assert_physical_size(env, env.initial_tenant, new_timeline_id)
+    assert_physical_size_invariants(
+        get_physical_size_values(env, env.initial_tenant, new_timeline_id)
+    )
 
 
 def test_timeline_physical_size_post_compaction(neon_env_builder: NeonEnvBuilder):
@@ -289,7 +390,9 @@ def test_timeline_physical_size_post_compaction(neon_env_builder: NeonEnvBuilder
     pageserver_http.timeline_checkpoint(env.initial_tenant, new_timeline_id)
     pageserver_http.timeline_compact(env.initial_tenant, new_timeline_id)
 
-    assert_physical_size(env, env.initial_tenant, new_timeline_id)
+    assert_physical_size_invariants(
+        get_physical_size_values(env, env.initial_tenant, new_timeline_id)
+    )
 
 
 def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder):
@@ -326,10 +429,11 @@ def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder):
 
     wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id)
     pageserver_http.timeline_checkpoint(env.initial_tenant, new_timeline_id)
-
     pageserver_http.timeline_gc(env.initial_tenant, new_timeline_id, gc_horizon=None)
 
-    assert_physical_size(env, env.initial_tenant, new_timeline_id)
+    assert_physical_size_invariants(
+        get_physical_size_values(env, env.initial_tenant, new_timeline_id)
+    )
 
 
 # The timeline logical and physical sizes are also exposed as prometheus metrics.
@@ -362,7 +466,7 @@ def test_timeline_size_metrics(
     # get the metrics and parse the metric for the current timeline's physical size
     metrics = env.pageserver.http_client().get_metrics()
     matches = re.search(
-        f'^pageserver_current_physical_size{{tenant_id="{env.initial_tenant}",timeline_id="{new_timeline_id}"}} (\\S+)$',
+        f'^pageserver_resident_physical_size{{tenant_id="{env.initial_tenant}",timeline_id="{new_timeline_id}"}} (\\S+)$',
         metrics,
         re.MULTILINE,
     )
@@ -421,11 +525,12 @@ def test_tenant_physical_size(neon_simple_env: NeonEnv):
 
     tenant, timeline = env.neon_cli.create_tenant()
 
-    def get_timeline_physical_size(timeline: TimelineId):
-        res = client.timeline_detail(tenant, timeline, include_non_incremental_physical_size=True)
-        return res["current_physical_size_non_incremental"]
+    def get_timeline_resident_physical_size(timeline: TimelineId):
+        sizes = get_physical_size_values(env, tenant, timeline)
+        assert_physical_size_invariants(sizes)
+        return sizes.prometheus_resident_physical
 
-    timeline_total_size = get_timeline_physical_size(timeline)
+    timeline_total_resident_physical_size = get_timeline_resident_physical_size(timeline)
     for i in range(10):
         n_rows = random.randint(100, 1000)
 
@@ -442,22 +547,54 @@ def test_tenant_physical_size(neon_simple_env: NeonEnv):
         wait_for_last_flush_lsn(env, pg, tenant, timeline)
         pageserver_http.timeline_checkpoint(tenant, timeline)
 
-        timeline_total_size += get_timeline_physical_size(timeline)
+        timeline_total_resident_physical_size += get_timeline_resident_physical_size(timeline)
 
         pg.stop()
 
-    tenant_physical_size = int(client.tenant_status(tenant_id=tenant)["current_physical_size"])
-    assert tenant_physical_size == timeline_total_size
+    # ensure that tenant_status current_physical size reports sum of timeline current_physical_size
+    tenant_current_physical_size = int(
+        client.tenant_status(tenant_id=tenant)["current_physical_size"]
+    )
+    assert tenant_current_physical_size == sum(
+        [tl["current_physical_size"] for tl in client.timeline_list(tenant_id=tenant)]
+    )
+    # since we don't do layer eviction, current_physical_size is identical to resident physical size
+    assert timeline_total_resident_physical_size == tenant_current_physical_size
 
 
-def assert_physical_size(env: NeonEnv, tenant_id: TenantId, timeline_id: TimelineId):
-    """Check the current physical size returned from timeline API
-    matches the total physical size of the timeline on disk"""
+class TimelinePhysicalSizeValues:
+    api_current_physical: int
+    prometheus_resident_physical: int
+    python_timelinedir_layerfiles_physical: int
+
+
+def get_physical_size_values(
+    env: NeonEnv, tenant_id: TenantId, timeline_id: TimelineId
+) -> TimelinePhysicalSizeValues:
+    res = TimelinePhysicalSizeValues()
+
     client = env.pageserver.http_client()
-    res = client.timeline_detail(tenant_id, timeline_id, include_non_incremental_physical_size=True)
+
+    res.prometheus_resident_physical = client.get_timeline_metric(
+        tenant_id, timeline_id, "pageserver_resident_physical_size"
+    )
+
+    detail = client.timeline_detail(
+        tenant_id, timeline_id, include_timeline_dir_layer_file_size_sum=True
+    )
+    res.api_current_physical = detail["current_physical_size"]
+
     timeline_path = env.timeline_dir(tenant_id, timeline_id)
-    assert res["current_physical_size"] == res["current_physical_size_non_incremental"]
-    assert res["current_physical_size"] == get_timeline_dir_size(timeline_path)
+    res.python_timelinedir_layerfiles_physical = get_timeline_dir_size(timeline_path)
+
+    return res
+
+
+def assert_physical_size_invariants(sizes: TimelinePhysicalSizeValues):
+    # resident phyiscal size is defined as
+    assert sizes.python_timelinedir_layerfiles_physical == sizes.prometheus_resident_physical
+    # we don't do layer eviction, so, all layers are resident
+    assert sizes.api_current_physical == sizes.prometheus_resident_physical
 
 
 # Timeline logical size initialization is an asynchronous background task that runs once,
diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py
index 3b72aba422..72d27c3aba 100644
--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -585,17 +585,23 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Re
         if elapsed > wait_lsn_timeout:
             raise RuntimeError("Timed out waiting for WAL redo")
 
-        pageserver_lsn = Lsn(
-            env.pageserver.http_client().timeline_detail(tenant_id, timeline_id)["last_record_lsn"]
-        )
-        lag = last_lsn - pageserver_lsn
+        tenant_status = ps_cli.tenant_status(tenant_id)
+        if tenant_status["state"] == "Loading":
+            log.debug(f"Tenant {tenant_id} is still loading, retrying")
+        else:
+            pageserver_lsn = Lsn(
+                env.pageserver.http_client().timeline_detail(tenant_id, timeline_id)[
+                    "last_record_lsn"
+                ]
+            )
+            lag = last_lsn - pageserver_lsn
 
-        if time.time() > last_debug_print + 10 or lag <= 0:
-            last_debug_print = time.time()
-            log.info(f"Pageserver last_record_lsn={pageserver_lsn}; lag is {lag / 1024}kb")
+            if time.time() > last_debug_print + 10 or lag <= 0:
+                last_debug_print = time.time()
+                log.info(f"Pageserver last_record_lsn={pageserver_lsn}; lag is {lag / 1024}kb")
 
-        if lag <= 0:
-            break
+                if lag <= 0:
+                    break
 
         time.sleep(1)
 
@@ -883,9 +889,12 @@ class SafekeeperEnv:
             raise Exception(f"Failed to start safekepeer as {cmd}, reason: {e}")
 
     def get_safekeeper_connstrs(self):
+        assert self.safekeepers is not None, "safekeepers are not initialized"
         return ",".join([sk_proc.args[2] for sk_proc in self.safekeepers])
 
     def create_postgres(self):
+        assert self.tenant_id is not None, "tenant_id is not initialized"
+        assert self.timeline_id is not None, "tenant_id is not initialized"
         pgdata_dir = os.path.join(self.repo_dir, "proposer_pgdata")
         pg = ProposerPostgres(
             pgdata_dir,
@@ -1096,7 +1105,6 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
     env.pageserver.allowed_errors.extend(
         [
             ".*Failed to process query for timeline .*: Timeline .* was not found in global map.*",
-            ".*end streaming to Some.*",
         ]
     )
 
diff --git a/test_runner/regress/test_walredo_not_left_behind_on_detach.py b/test_runner/regress/test_walredo_not_left_behind_on_detach.py
index aaaa8893a5..24045e2eb7 100644
--- a/test_runner/regress/test_walredo_not_left_behind_on_detach.py
+++ b/test_runner/regress/test_walredo_not_left_behind_on_detach.py
@@ -65,7 +65,7 @@ def test_walredo_not_left_behind_on_detach(neon_env_builder: NeonEnvBuilder):
     # Verify that the table is larger than shared_buffers
     cur.execute(
         """
-        select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_ize
+        select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_size
         from pg_settings where name = 'shared_buffers'
     """
     )
diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml
index de9a26513d..3aff839b81 100644
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -13,9 +13,9 @@ publish = false
 
 ### BEGIN HAKARI SECTION
 [dependencies]
-ahash = { version = "0.7", features = ["std"] }
 anyhow = { version = "1", features = ["backtrace", "std"] }
 bytes = { version = "1", features = ["serde", "std"] }
+chrono = { version = "0.4", default-features = false, features = ["clock", "iana-time-zone", "serde", "std", "winapi"] }
 clap = { version = "4", features = ["color", "derive", "error-context", "help", "std", "string", "suggestions", "usage"] }
 crossbeam-utils = { version = "0.8", features = ["once_cell", "std"] }
 either = { version = "1", features = ["use_std"] }
@@ -36,12 +36,11 @@ prost = { version = "0.11", features = ["prost-derive", "std"] }
 rand = { version = "0.8", features = ["alloc", "getrandom", "libc", "rand_chacha", "rand_hc", "small_rng", "std", "std_rng"] }
 regex = { version = "1", features = ["aho-corasick", "memchr", "perf", "perf-cache", "perf-dfa", "perf-inline", "perf-literal", "std", "unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] }
 regex-syntax = { version = "0.6", features = ["unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] }
-reqwest = { version = "0.11", default-features = false, features = ["__rustls", "__tls", "blocking", "default-tls", "hyper-rustls", "hyper-tls", "json", "native-tls-crate", "rustls", "rustls-pemfile", "rustls-tls", "rustls-tls-webpki-roots", "serde_json", "tokio-native-tls", "tokio-rustls", "webpki-roots"] }
 scopeguard = { version = "1", features = ["use_std"] }
 serde = { version = "1", features = ["alloc", "derive", "serde_derive", "std"] }
+serde_json = { version = "1", features = ["raw_value", "std"] }
 socket2 = { version = "0.4", default-features = false, features = ["all"] }
-stable_deref_trait = { version = "1", features = ["alloc", "std"] }
-tokio = { version = "1", features = ["bytes", "fs", "io-std", "io-util", "libc", "macros", "memchr", "mio", "net", "num_cpus", "once_cell", "process", "rt", "rt-multi-thread", "signal-hook-registry", "socket2", "sync", "time", "tokio-macros"] }
+tokio = { version = "1", features = ["bytes", "fs", "io-std", "io-util", "libc", "macros", "memchr", "mio", "net", "num_cpus", "process", "rt", "rt-multi-thread", "signal-hook-registry", "socket2", "sync", "time", "tokio-macros"] }
 tokio-util = { version = "0.7", features = ["codec", "io", "io-util", "tracing"] }
 tower = { version = "0.4", features = ["__common", "balance", "buffer", "discover", "futures-core", "futures-util", "indexmap", "limit", "load", "log", "make", "pin-project", "pin-project-lite", "rand", "ready-cache", "retry", "slab", "timeout", "tokio", "tokio-util", "tracing", "util"] }
 tracing = { version = "0.1", features = ["attributes", "log", "std", "tracing-attributes"] }
@@ -49,7 +48,6 @@ tracing-core = { version = "0.1", features = ["once_cell", "std"] }
 url = { version = "2", features = ["serde"] }
 
 [build-dependencies]
-ahash = { version = "0.7", features = ["std"] }
 anyhow = { version = "1", features = ["backtrace", "std"] }
 bytes = { version = "1", features = ["serde", "std"] }
 either = { version = "1", features = ["use_std"] }