replace take_mut usage with std::mem::replace + assignment

introduce the Broken state, using take_mut::take_with_recover
clarifications around the QueueUninitialized error
2026-05-25 09:00:37 +00:00 · 2023-05-04 18:12:12 +02:00 · 2023-05-04 18:04:45 +02:00 · 2023-05-04 17:08:02 +02:00 · 2023-05-04 12:03:46 +02:00 · 2023-05-03 19:46:55 +02:00
144 changed files with 6355 additions and 1339 deletions
--- a/.github/ansible/prod.us-east-1.hosts.yaml
+++ b/.github/ansible/prod.us-east-1.hosts.yaml
@@ -0,0 +1,50 @@
+storage:
+  vars:
+    bucket_name: neon-prod-storage-us-east-1
+    bucket_region: us-east-1
+    console_mgmt_base_url: http://neon-internal-api.aws.neon.tech
+    broker_endpoint: http://storage-broker-lb.theta.us-east-1.internal.aws.neon.tech:50051
+    pageserver_config_stub:
+      pg_distrib_dir: /usr/local
+      metric_collection_endpoint: http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events
+      metric_collection_interval: 10min
+      disk_usage_based_eviction:
+        max_usage_pct: 85 # TODO: decrease to 80 after all pageservers are below 80
+        min_avail_bytes: 0
+        period: "10s"
+      tenant_config:
+        eviction_policy:
+          kind: "LayerAccessThreshold"
+          period: "10m"
+          threshold: &default_eviction_threshold "24h"
+        evictions_low_residence_duration_metric_threshold: *default_eviction_threshold
+      remote_storage:
+        bucket_name: "{{ bucket_name }}"
+        bucket_region: "{{ bucket_region }}"
+        prefix_in_bucket: "pageserver/v1"
+    safekeeper_s3_prefix: safekeeper/v1/wal
+    hostname_suffix: ""
+    remote_user: ssm-user
+    ansible_aws_ssm_region: us-east-1
+    ansible_aws_ssm_bucket_name: neon-prod-storage-us-east-1
+    console_region_id: aws-us-east-1
+    sentry_environment: production
+
+  children:
+    pageservers:
+      hosts:
+        pageserver-0.us-east-1.aws.neon.tech:
+          ansible_host: i-085222088b0d2e0c7
+        pageserver-1.us-east-1.aws.neon.tech:
+          ansible_host: i-0969d4f684d23a21e
+        pageserver-2.us-east-1.aws.neon.tech:
+          ansible_host: i-05dee87895da58dad
+
+    safekeepers:
+      hosts:
+        safekeeper-0.us-east-1.aws.neon.tech:
+          ansible_host: i-04ce739e88793d864
+        safekeeper-1.us-east-1.aws.neon.tech:
+          ansible_host: i-0e9e6c9227fb81410
+        safekeeper-2.us-east-1.aws.neon.tech:
+          ansible_host: i-072f4dd86a327d52f
--- a/.github/ansible/prod.us-west-2.hosts.yaml
+++ b/.github/ansible/prod.us-west-2.hosts.yaml
@@ -41,6 +41,14 @@ storage:
          ansible_host: i-051642d372c0a4f32
        pageserver-3.us-west-2.aws.neon.tech:
          ansible_host: i-00c3844beb9ad1c6b
+        pageserver-4.us-west-2.aws.neon.tech:
+          ansible_host: i-013263dd1c239adcc
+        pageserver-5.us-west-2.aws.neon.tech:
+          ansible_host: i-00ca6417c7bf96820
+        pageserver-6.us-west-2.aws.neon.tech:
+          ansible_host: i-01cdf7d2bc1433b6a
+        pageserver-7.us-west-2.aws.neon.tech:
+          ansible_host: i-02eec9b40617db5bc

    safekeepers:
      hosts:
@@ -50,4 +58,15 @@ storage:
          ansible_host: i-074682f9d3c712e7c
        safekeeper-2.us-west-2.aws.neon.tech:
          ansible_host: i-042b7efb1729d7966
-
+        safekeeper-3.us-west-2.aws.neon.tech:
+          ansible_host: i-089f6b9ef426dff76
+        safekeeper-4.us-west-2.aws.neon.tech:
+          ansible_host: i-0fe6bf912c4710c82
+        safekeeper-5.us-west-2.aws.neon.tech:
+          ansible_host: i-0a83c1c46d2b4e409
+        safekeeper-6.us-west-2.aws.neon.tech:
+          ansible_host: i-0fef5317b8fdc9f8d
+        safekeeper-7.us-west-2.aws.neon.tech:
+          ansible_host: i-0be739190d4289bf9
+        safekeeper-8.us-west-2.aws.neon.tech:
+          ansible_host: i-00e851803669e5cfe
--- a/.github/ansible/staging.eu-central-1.hosts.yaml
+++ b/.github/ansible/staging.eu-central-1.hosts.yaml
@@ -0,0 +1,47 @@
+storage:
+  vars:
+    bucket_name: neon-dev-storage-eu-central-1
+    bucket_region: eu-central-1
+    # We only register/update storage in one preview console and manually copy to other instances
+    console_mgmt_base_url: http://neon-internal-api.helium.aws.neon.build
+    broker_endpoint: http://storage-broker-lb.alpha.eu-central-1.internal.aws.neon.build:50051
+    pageserver_config_stub:
+      pg_distrib_dir: /usr/local
+      metric_collection_endpoint: http://neon-internal-api.helium.aws.neon.build/billing/api/v1/usage_events
+      metric_collection_interval: 10min
+      disk_usage_based_eviction:
+        max_usage_pct: 80
+        min_avail_bytes: 0
+        period: "10s"
+      tenant_config:
+        eviction_policy:
+          kind: "LayerAccessThreshold"
+          period: "20m"
+          threshold: &default_eviction_threshold "20m"
+        evictions_low_residence_duration_metric_threshold: *default_eviction_threshold
+      remote_storage:
+        bucket_name: "{{ bucket_name }}"
+        bucket_region: "{{ bucket_region }}"
+        prefix_in_bucket: "pageserver/v1"
+    safekeeper_s3_prefix: safekeeper/v1/wal
+    hostname_suffix: ""
+    remote_user: ssm-user
+    ansible_aws_ssm_region: eu-central-1
+    ansible_aws_ssm_bucket_name: neon-dev-storage-eu-central-1
+    console_region_id: aws-eu-central-1
+    sentry_environment: staging
+
+  children:
+    pageservers:
+      hosts:
+        pageserver-0.eu-central-1.aws.neon.build:
+          ansible_host: i-011f93ec26cfba2d4
+
+    safekeepers:
+      hosts:
+        safekeeper-0.eu-central-1.aws.neon.build:
+          ansible_host: i-0ff026d27babf8ddd
+        safekeeper-1.eu-central-1.aws.neon.build:
+          ansible_host: i-03983a49ee54725d9
+        safekeeper-2.eu-central-1.aws.neon.build:
+          ansible_host: i-0bd025ecdb61b0db3
--- a/.github/ansible/staging.eu-west-1.hosts.yaml
+++ b/.github/ansible/staging.eu-west-1.hosts.yaml
@@ -35,6 +35,8 @@ storage:
      hosts:
        pageserver-0.eu-west-1.aws.neon.build:
          ansible_host: i-01d496c5041c7f34c
+        pageserver-1.eu-west-1.aws.neon.build:
+          ansible_host: i-0e8013e239ce3928c

    safekeepers:
      hosts:
@@ -44,3 +46,15 @@ storage:
          ansible_host: i-06969ee1bf2958bfc
        safekeeper-2.eu-west-1.aws.neon.build:
          ansible_host: i-087892e9625984a0b
+        safekeeper-3.eu-west-1.aws.neon.build:
+          ansible_host: i-0a6f91660e99e8891
+        safekeeper-4.eu-west-1.aws.neon.build:
+          ansible_host: i-0012e309e28e7c249
+        safekeeper-5.eu-west-1.aws.neon.build:
+          ansible_host: i-085a2b1193287b32e
+        safekeeper-6.eu-west-1.aws.neon.build:
+          ansible_host: i-0c713248465ed0fbd
+        safekeeper-7.eu-west-1.aws.neon.build:
+          ansible_host: i-02ad231aed2a80b7a
+        safekeeper-8.eu-west-1.aws.neon.build:
+          ansible_host: i-0dbbd8ffef66efda8
--- a/.github/ansible/staging.us-east-2.hosts.yaml
+++ b/.github/ansible/staging.us-east-2.hosts.yaml
@@ -48,9 +48,9 @@ storage:
      hosts:
        safekeeper-0.us-east-2.aws.neon.build:
          ansible_host: i-027662bd552bf5db0
-        safekeeper-1.us-east-2.aws.neon.build:
-          ansible_host: i-0171efc3604a7b907
        safekeeper-2.us-east-2.aws.neon.build:
          ansible_host: i-0de0b03a51676a6ce
+        safekeeper-3.us-east-2.aws.neon.build:
+          ansible_host: i-05f8ba2cda243bd18
        safekeeper-99.us-east-2.aws.neon.build:
          ansible_host: i-0d61b6a2ea32028d5
--- a/.github/helm-values/dev-eu-central-1-alpha.neon-storage-broker.yaml
+++ b/.github/helm-values/dev-eu-central-1-alpha.neon-storage-broker.yaml
@@ -0,0 +1,52 @@
+# Helm chart values for neon-storage-broker
+podLabels:
+  neon_env: staging
+  neon_service: storage-broker
+
+# Use L4 LB
+service:
+  # service.annotations -- Annotations to add to the service
+  annotations:
+    service.beta.kubernetes.io/aws-load-balancer-type: external  # use newer AWS Load Balancer Controller
+    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
+    service.beta.kubernetes.io/aws-load-balancer-scheme: internal  # deploy LB to private subnet
+    # assign service to this name at external-dns
+    external-dns.alpha.kubernetes.io/hostname: storage-broker-lb.alpha.eu-central-1.internal.aws.neon.build
+  # service.type -- Service type
+  type: LoadBalancer
+  # service.port -- broker listen port
+  port: 50051
+
+ingress:
+  enabled: false
+
+metrics:
+  enabled: false
+
+extraManifests:
+  - apiVersion: operator.victoriametrics.com/v1beta1
+    kind: VMServiceScrape
+    metadata:
+      name: "{{ include \"neon-storage-broker.fullname\" . }}"
+      labels:
+        helm.sh/chart: neon-storage-broker-{{ .Chart.Version }}
+        app.kubernetes.io/name: neon-storage-broker
+        app.kubernetes.io/instance: neon-storage-broker
+        app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
+        app.kubernetes.io/managed-by: Helm
+      namespace: "{{ .Release.Namespace }}"
+    spec:
+      selector:
+        matchLabels:
+          app.kubernetes.io/name: "neon-storage-broker"
+      endpoints:
+        - port: broker
+          path: /metrics
+          interval: 10s
+          scrapeTimeout: 10s
+      namespaceSelector:
+        matchNames:
+          - "{{ .Release.Namespace }}"
+
+settings:
+  sentryEnvironment: "staging"
--- a/.github/helm-values/dev-eu-central-1-alpha.pg-sni-router.yaml
+++ b/.github/helm-values/dev-eu-central-1-alpha.pg-sni-router.yaml
@@ -0,0 +1,19 @@
+useCertManager: true
+
+replicaCount: 3
+
+exposedService:
+  # exposedService.port -- Exposed Service proxy port
+  port: 4432
+  annotations:
+    external-dns.alpha.kubernetes.io/hostname: "*.snirouter.alpha.eu-central-1.internal.aws.neon.build"
+
+settings:
+  domain: "*.snirouter.alpha.eu-central-1.internal.aws.neon.build"
+  sentryEnvironment: "staging"
+
+imagePullSecrets:
+  - name: docker-hub-neon
+
+metrics:
+  enabled: false
--- a/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml
+++ b/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml
@@ -23,6 +23,7 @@ settings:
  authBackend: "console"
  authEndpoint: "http://neon-internal-api.aws.neon.build/management/api/v2"
  domain: "*.eu-west-1.aws.neon.build"
+  otelExporterOtlpEndpoint: "https://otel-collector.zeta.eu-west-1.internal.aws.neon.build"
  sentryEnvironment: "staging"
  wssPort: 8443
  metricCollectionEndpoint: "http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events"
--- a/.github/helm-values/dev-eu-west-1-zeta.pg-sni-router.yaml
+++ b/.github/helm-values/dev-eu-west-1-zeta.pg-sni-router.yaml
@@ -0,0 +1,19 @@
+useCertManager: true
+
+replicaCount: 3
+
+exposedService:
+  # exposedService.port -- Exposed Service proxy port
+  port: 4432
+  annotations:
+    external-dns.alpha.kubernetes.io/hostname: "*.snirouter.zeta.eu-west-1.internal.aws.neon.build"
+
+settings:
+  domain: "*.snirouter.zeta.eu-west-1.internal.aws.neon.build"
+  sentryEnvironment: "staging"
+
+imagePullSecrets:
+  - name: docker-hub-neon
+
+metrics:
+  enabled: false
--- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml
+++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml
@@ -9,6 +9,7 @@ settings:
  authEndpoint: "https://console.stage.neon.tech/authenticate_proxy_request/"
  uri: "https://console.stage.neon.tech/psql_session/"
  domain: "pg.neon.build"
+  otelExporterOtlpEndpoint: "https://otel-collector.beta.us-east-2.internal.aws.neon.build"
  sentryEnvironment: "staging"
  metricCollectionEndpoint: "http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events"
  metricCollectionInterval: "1min"
--- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml
+++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml
@@ -24,6 +24,7 @@ settings:
  authBackend: "console"
  authEndpoint: "http://neon-internal-api.aws.neon.build/management/api/v2"
  domain: "*.cloud.stage.neon.tech"
+  otelExporterOtlpEndpoint: "https://otel-collector.beta.us-east-2.internal.aws.neon.build"
  sentryEnvironment: "staging"
  wssPort: 8443
  metricCollectionEndpoint: "http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events"
--- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml
+++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml
@@ -25,6 +25,7 @@ settings:
  authEndpoint: "http://neon-internal-api.aws.neon.build/management/api/v2"
  domain: "*.us-east-2.aws.neon.build"
  extraDomains: ["*.us-east-2.postgres.zenith.tech", "*.us-east-2.retooldb-staging.com"]
+  otelExporterOtlpEndpoint: "https://otel-collector.beta.us-east-2.internal.aws.neon.build"
  sentryEnvironment: "staging"
  wssPort: 8443
  metricCollectionEndpoint: "http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events"
--- a/.github/helm-values/dev-us-east-2-beta.pg-sni-router.yaml
+++ b/.github/helm-values/dev-us-east-2-beta.pg-sni-router.yaml
@@ -0,0 +1,19 @@
+useCertManager: true
+
+replicaCount: 3
+
+exposedService:
+  # exposedService.port -- Exposed Service proxy port
+  port: 4432
+  annotations:
+    external-dns.alpha.kubernetes.io/hostname: "*.snirouter.beta.us-east-2.internal.aws.neon.build"
+
+settings:
+  domain: "*.snirouter.beta.us-east-2.internal.aws.neon.build"
+  sentryEnvironment: "staging"
+
+imagePullSecrets:
+  - name: docker-hub-neon
+
+metrics:
+  enabled: false
--- a/.github/helm-values/preview-template.neon-proxy-scram.yaml
+++ b/.github/helm-values/preview-template.neon-proxy-scram.yaml
@@ -0,0 +1,67 @@
+# Helm chart values for neon-proxy-scram.
+# This is a YAML-formatted file.
+
+deploymentStrategy:
+  type: RollingUpdate
+  rollingUpdate:
+    maxSurge: 100%
+    maxUnavailable: 50%
+
+image:
+  repository: neondatabase/neon
+
+settings:
+  authBackend: "console"
+  authEndpoint: "http://neon-internal-api.${PREVIEW_NAME}.aws.neon.build/management/api/v2"
+  domain: "*.cloud.${PREVIEW_NAME}.aws.neon.build"
+  sentryEnvironment: "staging"
+  wssPort: 8443
+  metricCollectionEndpoint: "http://neon-internal-api.${PREVIEW_NAME}.aws.neon.build/billing/api/v1/usage_events"
+  metricCollectionInterval: "1min"
+
+# -- Additional labels for neon-proxy pods
+podLabels:
+  neon_service: proxy-scram
+  neon_env: test
+  neon_region: ${PREVIEW_NAME}.eu-central-1
+
+
+exposedService:
+  annotations:
+    service.beta.kubernetes.io/aws-load-balancer-type: external
+    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
+    service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
+    external-dns.alpha.kubernetes.io/hostname: cloud.${PREVIEW_NAME}.aws.neon.build
+  httpsPort: 443
+
+#metrics:
+#  enabled: true
+#  serviceMonitor:
+#    enabled: true
+#    selector:
+#      release: kube-prometheus-stack
+
+extraManifests:
+  - apiVersion: operator.victoriametrics.com/v1beta1
+    kind: VMServiceScrape
+    metadata:
+      name: "{{ include \"neon-proxy.fullname\" . }}"
+      labels:
+        helm.sh/chart: neon-proxy-{{ .Chart.Version }}
+        app.kubernetes.io/name: neon-proxy
+        app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}"
+        app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
+        app.kubernetes.io/managed-by: Helm
+      namespace: "{{ .Release.Namespace }}"
+    spec:
+      selector:
+        matchLabels:
+          app.kubernetes.io/name: "neon-proxy"
+      endpoints:
+        - port: http
+          path: /metrics
+          interval: 10s
+          scrapeTimeout: 10s
+      namespaceSelector:
+        matchNames:
+          - "{{ .Release.Namespace }}"
--- a/.github/helm-values/prod-ap-southeast-1-epsilon.pg-sni-router.yaml
+++ b/.github/helm-values/prod-ap-southeast-1-epsilon.pg-sni-router.yaml
@@ -0,0 +1,19 @@
+useCertManager: true
+
+replicaCount: 3
+
+exposedService:
+  # exposedService.port -- Exposed Service proxy port
+  port: 4432
+  annotations:
+    external-dns.alpha.kubernetes.io/hostname: "*.snirouter.epsilon.ap-southeast-1.internal.aws.neon.tech"
+
+settings:
+  domain: "*.snirouter.epsilon.ap-southeast-1.internal.aws.neon.tech"
+  sentryEnvironment: "production"
+
+imagePullSecrets:
+  - name: docker-hub-neon
+
+metrics:
+  enabled: false
--- a/.github/helm-values/prod-eu-central-1-gamma.pg-sni-router.yaml
+++ b/.github/helm-values/prod-eu-central-1-gamma.pg-sni-router.yaml
@@ -0,0 +1,19 @@
+useCertManager: true
+
+replicaCount: 3
+
+exposedService:
+  # exposedService.port -- Exposed Service proxy port
+  port: 4432
+  annotations:
+    external-dns.alpha.kubernetes.io/hostname: "*.snirouter.gamma.eu-central-1.internal.aws.neon.tech"
+
+settings:
+  domain: "*.snirouter.gamma.eu-central-1.internal.aws.neon.tech"
+  sentryEnvironment: "production"
+
+imagePullSecrets:
+  - name: docker-hub-neon
+
+metrics:
+  enabled: false
--- a/.github/helm-values/prod-us-east-1-theta.neon-proxy-scram.yaml
+++ b/.github/helm-values/prod-us-east-1-theta.neon-proxy-scram.yaml
@@ -0,0 +1,69 @@
+# Helm chart values for neon-proxy-scram.
+# This is a YAML-formatted file.
+
+deploymentStrategy:
+  type: RollingUpdate
+  rollingUpdate:
+    maxSurge: 100%
+    maxUnavailable: 50%
+
+# Delay the kill signal by 5 minutes (5 * 60)
+# The pod(s) will stay in Terminating, keeps the existing connections
+# but doesn't receive new ones
+containerLifecycle:
+  preStop:
+    exec:
+      command: ["/bin/sh", "-c", "sleep 300"]
+terminationGracePeriodSeconds: 604800
+
+image:
+  repository: neondatabase/neon
+
+settings:
+  authBackend: "console"
+  authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2"
+  domain: "*.us-east-1.aws.neon.tech"
+  # *.us-east-1.retooldb.com hasn't been delegated yet.
+  extraDomains: ["*.us-east-1.postgres.vercel-storage.com"]
+  sentryEnvironment: "production"
+  wssPort: 8443
+  metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events"
+  metricCollectionInterval: "10min"
+
+podLabels:
+  neon_service: proxy-scram
+  neon_env: prod
+  neon_region: us-east-1
+
+exposedService:
+  annotations:
+    service.beta.kubernetes.io/aws-load-balancer-type: external
+    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
+    service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
+    external-dns.alpha.kubernetes.io/hostname: us-east-1.aws.neon.tech
+  httpsPort: 443
+
+extraManifests:
+  - apiVersion: operator.victoriametrics.com/v1beta1
+    kind: VMServiceScrape
+    metadata:
+      name: "{{ include \"neon-proxy.fullname\" . }}"
+      labels:
+        helm.sh/chart: neon-proxy-{{ .Chart.Version }}
+        app.kubernetes.io/name: neon-proxy
+        app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}"
+        app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
+        app.kubernetes.io/managed-by: Helm
+      namespace: "{{ .Release.Namespace }}"
+    spec:
+      selector:
+        matchLabels:
+          app.kubernetes.io/name: "neon-proxy"
+      endpoints:
+        - port: http
+          path: /metrics
+          interval: 10s
+          scrapeTimeout: 10s
+      namespaceSelector:
+        matchNames:
+          - "{{ .Release.Namespace }}"
--- a/.github/helm-values/prod-us-east-1-theta.neon-storage-broker.yaml
+++ b/.github/helm-values/prod-us-east-1-theta.neon-storage-broker.yaml
@@ -0,0 +1,52 @@
+# Helm chart values for neon-storage-broker
+podLabels:
+  neon_env: production
+  neon_service: storage-broker
+
+# Use L4 LB
+service:
+  # service.annotations -- Annotations to add to the service
+  annotations:
+    service.beta.kubernetes.io/aws-load-balancer-type: external  # use newer AWS Load Balancer Controller
+    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
+    service.beta.kubernetes.io/aws-load-balancer-scheme: internal  # deploy LB to private subnet
+    # assign service to this name at external-dns
+    external-dns.alpha.kubernetes.io/hostname: storage-broker-lb.theta.us-east-1.internal.aws.neon.tech
+  # service.type -- Service type
+  type: LoadBalancer
+  # service.port -- broker listen port
+  port: 50051
+
+ingress:
+  enabled: false
+
+metrics:
+  enabled: false
+
+extraManifests:
+  - apiVersion: operator.victoriametrics.com/v1beta1
+    kind: VMServiceScrape
+    metadata:
+      name: "{{ include \"neon-storage-broker.fullname\" . }}"
+      labels:
+        helm.sh/chart: neon-storage-broker-{{ .Chart.Version }}
+        app.kubernetes.io/name: neon-storage-broker
+        app.kubernetes.io/instance: neon-storage-broker
+        app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
+        app.kubernetes.io/managed-by: Helm
+      namespace: "{{ .Release.Namespace }}"
+    spec:
+      selector:
+        matchLabels:
+          app.kubernetes.io/name: "neon-storage-broker"
+      endpoints:
+        - port: broker
+          path: /metrics
+          interval: 10s
+          scrapeTimeout: 10s
+      namespaceSelector:
+        matchNames:
+          - "{{ .Release.Namespace }}"
+
+settings:
+  sentryEnvironment: "production"
--- a/.github/helm-values/prod-us-east-1-theta.pg-sni-router.yaml
+++ b/.github/helm-values/prod-us-east-1-theta.pg-sni-router.yaml
@@ -0,0 +1,19 @@
+useCertManager: true
+
+replicaCount: 3
+
+exposedService:
+  # exposedService.port -- Exposed Service proxy port
+  port: 4432
+  annotations:
+    external-dns.alpha.kubernetes.io/hostname: "*.snirouter.theta.us-east-1.internal.aws.neon.tech"
+
+settings:
+  domain: "*.snirouter.theta.us-east-1.internal.aws.neon.tech"
+  sentryEnvironment: "production"
+
+imagePullSecrets:
+  - name: docker-hub-neon
+
+metrics:
+  enabled: false
--- a/.github/helm-values/prod-us-east-2-delta.pg-sni-router.yaml
+++ b/.github/helm-values/prod-us-east-2-delta.pg-sni-router.yaml
@@ -0,0 +1,19 @@
+useCertManager: true
+
+replicaCount: 3
+
+exposedService:
+  # exposedService.port -- Exposed Service proxy port
+  port: 4432
+  annotations:
+    external-dns.alpha.kubernetes.io/hostname: "*.snirouter.delta.us-east-2.internal.aws.neon.tech"
+
+settings:
+  domain: "*.snirouter.delta.us-east-2.internal.aws.neon.tech"
+  sentryEnvironment: "production"
+
+imagePullSecrets:
+  - name: docker-hub-neon
+
+metrics:
+  enabled: false
--- a/.github/helm-values/prod-us-west-2-eta.pg-sni-router.yaml
+++ b/.github/helm-values/prod-us-west-2-eta.pg-sni-router.yaml
@@ -0,0 +1,19 @@
+useCertManager: true
+
+replicaCount: 3
+
+exposedService:
+  # exposedService.port -- Exposed Service proxy port
+  port: 4432
+  annotations:
+    external-dns.alpha.kubernetes.io/hostname: "*.snirouter.eta.us-west-2.internal.aws.neon.tech"
+
+settings:
+  domain: "*.snirouter.eta.us-west-2.internal.aws.neon.tech"
+  sentryEnvironment: "production"
+
+imagePullSecrets:
+  - name: docker-hub-neon
+
+metrics:
+  enabled: false
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -111,8 +111,21 @@ jobs:
      - name: Get postgres headers
        run: make postgres-headers -j$(nproc)

-      - name: Run cargo clippy
-        run: ./run_clippy.sh
+      # cargo hack runs the given cargo subcommand (clippy in this case) for all feature combinations.
+      # This will catch compiler & clippy warnings in all feature combinations.
+      # TODO: use cargo hack for build and test as well, but that's quite expensive.
+      # NB: keep clippy args in sync with ./run_clippy.sh
+      - run: |
+          CLIPPY_COMMON_ARGS="$( source .neon_clippy_args; echo "$CLIPPY_COMMON_ARGS")"
+          if [ "$CLIPPY_COMMON_ARGS" = "" ]; then
+            echo "No clippy args found in .neon_clippy_args"
+            exit 1
+          fi
+          echo "CLIPPY_COMMON_ARGS=${CLIPPY_COMMON_ARGS}" >> $GITHUB_ENV
+      - name: Run cargo clippy (debug)
+        run: cargo hack --feature-powerset clippy $CLIPPY_COMMON_ARGS
+      - name: Run cargo clippy (release)
+        run: cargo hack --feature-powerset clippy --release $CLIPPY_COMMON_ARGS

      # Use `${{ !cancelled() }}` to run quck tests after the longer clippy run
      - name: Check formatting
@@ -405,10 +418,7 @@ jobs:
      - uses: actions/github-script@v6
        if: >
          !cancelled() &&
-          github.event_name == 'pull_request' && (
-            steps.create-allure-report-debug.outputs.report-url ||
-            steps.create-allure-report-release.outputs.report-url
-          )
+          github.event_name == 'pull_request'
        with:
          # Retry script for 5XX server errors: https://github.com/actions/github-script#retries
          retries: 5
@@ -541,7 +551,7 @@ jobs:
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
      options: --init
-    needs: [ push-docker-hub, tag ]
+    needs: [ promote-images, tag ]
    steps:
      - name: Set PR's status to pending and request a remote CI test
        run: |
@@ -584,8 +594,7 @@ jobs:
  neon-image:
    runs-on: [ self-hosted, gen3, large ]
    needs: [ tag ]
-    # https://github.com/GoogleContainerTools/kaniko/issues/2005
-    container: gcr.io/kaniko-project/executor:v1.7.0-debug
+    container: gcr.io/kaniko-project/executor:v1.9.2-debug
    defaults:
      run:
        shell: sh -eu {0}
@@ -597,11 +606,32 @@ jobs:
          submodules: true
          fetch-depth: 0

-      - name: Configure ECR login
-        run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
+      - name: Configure ECR and Docker Hub login
+        run: |
+          DOCKERHUB_AUTH=$(echo -n "${{ secrets.NEON_DOCKERHUB_USERNAME }}:${{ secrets.NEON_DOCKERHUB_PASSWORD }}" | base64)
+          echo "::add-mask::${DOCKERHUB_AUTH}"
+
+          cat <<-EOF > /kaniko/.docker/config.json
+            {
+              "auths": {
+                "https://index.docker.io/v1/": {
+                  "auth": "${DOCKERHUB_AUTH}"
+                }
+              },
+              "credHelpers": {
+                "369495373322.dkr.ecr.eu-central-1.amazonaws.com": "ecr-login"
+              }
+            }
+          EOF

      - name: Kaniko build neon
-        run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
+        run:
+          /kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true
+                           --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
+                           --context .
+                           --build-arg GIT_VERSION=${{ github.sha }}
+                           --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
+                           --destination neondatabase/neon:${{needs.tag.outputs.build-tag}}

      # Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied
      - name: Cleanup ECR folder
@@ -652,7 +682,7 @@ jobs:
  compute-tools-image:
    runs-on: [ self-hosted, gen3, large ]
    needs: [ tag ]
-    container: gcr.io/kaniko-project/executor:v1.7.0-debug
+    container: gcr.io/kaniko-project/executor:v1.9.2-debug
    defaults:
      run:
        shell: sh -eu {0}
@@ -661,18 +691,41 @@ jobs:
      - name: Checkout
        uses: actions/checkout@v1 # v3 won't work with kaniko

-      - name: Configure ECR login
-        run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
+      - name: Configure ECR and Docker Hub login
+        run: |
+          DOCKERHUB_AUTH=$(echo -n "${{ secrets.NEON_DOCKERHUB_USERNAME }}:${{ secrets.NEON_DOCKERHUB_PASSWORD }}" | base64)
+          echo "::add-mask::${DOCKERHUB_AUTH}"
+
+          cat <<-EOF > /kaniko/.docker/config.json
+            {
+              "auths": {
+                "https://index.docker.io/v1/": {
+                  "auth": "${DOCKERHUB_AUTH}"
+                }
+              },
+              "credHelpers": {
+                "369495373322.dkr.ecr.eu-central-1.amazonaws.com": "ecr-login"
+              }
+            }
+          EOF

      - name: Kaniko build compute tools
-        run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-tools --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}}
+        run:
+          /kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true
+                           --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
+                           --context .
+                           --build-arg GIT_VERSION=${{ github.sha }}
+                           --dockerfile Dockerfile.compute-tools
+                           --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}}
+                           --destination neondatabase/compute-tools:${{needs.tag.outputs.build-tag}}

+      # Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied
      - name: Cleanup ECR folder
        run: rm -rf ~/.ecr

  compute-node-image:
    runs-on: [ self-hosted, gen3, large ]
-    container: gcr.io/kaniko-project/executor:v1.7.0-debug
+    container: gcr.io/kaniko-project/executor:v1.9.2-debug
    needs: [ tag ]
    strategy:
      fail-fast: false
@@ -689,12 +742,36 @@ jobs:
          submodules: true
          fetch-depth: 0

-      - name: Configure ECR login
-        run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
+      - name: Configure ECR and Docker Hub login
+        run: |
+          DOCKERHUB_AUTH=$(echo -n "${{ secrets.NEON_DOCKERHUB_USERNAME }}:${{ secrets.NEON_DOCKERHUB_PASSWORD }}" | base64)
+          echo "::add-mask::${DOCKERHUB_AUTH}"
+
+          cat <<-EOF > /kaniko/.docker/config.json
+            {
+              "auths": {
+                "https://index.docker.io/v1/": {
+                  "auth": "${DOCKERHUB_AUTH}"
+                }
+              },
+              "credHelpers": {
+                "369495373322.dkr.ecr.eu-central-1.amazonaws.com": "ecr-login"
+              }
+            }
+          EOF

      - name: Kaniko build compute node with extensions
-        run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache  --context . --build-arg GIT_VERSION=${{ github.sha }} --build-arg PG_VERSION=${{ matrix.version }} --dockerfile Dockerfile.compute-node --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
+        run:
+          /kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true
+                           --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
+                           --context .
+                           --build-arg GIT_VERSION=${{ github.sha }}
+                           --build-arg PG_VERSION=${{ matrix.version }}
+                           --dockerfile Dockerfile.compute-node
+                           --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
+                           --destination neondatabase/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}

+      # Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied
      - name: Cleanup ECR folder
        run: rm -rf ~/.ecr

@@ -786,13 +863,11 @@ jobs:
    runs-on: [ self-hosted, gen3, small ]
    needs: [ tag, test-images, vm-compute-node-image ]
    container: golang:1.19-bullseye
-    if: github.event_name != 'workflow_dispatch'
+    # Don't add an if-condition here.
+    # This job must always run because other jobs depend on it and must not be skipped.

    steps:
      - name: Install Crane & ECR helper
-        if: |
-          (github.ref_name == 'main' || github.ref_name == 'release') &&
-          github.event_name != 'workflow_dispatch'
        run: |
          go install github.com/google/go-containerregistry/cmd/crane@31786c6cbb82d6ec4fb8eb79cd9387905130534e # v0.11.0
          go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@69c85dc22db6511932bbf119e1a0cc5c90c69a7f # v0.6.0
@@ -802,10 +877,15 @@ jobs:
          mkdir /github/home/.docker/
          echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json

+      - name: Copy vm-compute-node images to Docker Hub
+        run: |
+          crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} vm-compute-node-v14
+          crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} vm-compute-node-v15
+
      - name: Add latest tag to images
        if: |
          (github.ref_name == 'main' || github.ref_name == 'release') &&
-          github.event_name != 'workflow_dispatch'
+           github.event_name != 'workflow_dispatch'
        run: |
          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} latest
          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} latest
@@ -814,50 +894,10 @@ jobs:
          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest

-      - name: Cleanup ECR folder
-        run: rm -rf ~/.ecr
-
-  push-docker-hub:
-    runs-on: [ self-hosted, dev, x64 ]
-    needs: [ promote-images, tag ]
-    container: golang:1.19-bullseye
-
-    steps:
-      - name: Install Crane & ECR helper
-        run: |
-          go install github.com/google/go-containerregistry/cmd/crane@31786c6cbb82d6ec4fb8eb79cd9387905130534e # v0.11.0
-          go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@69c85dc22db6511932bbf119e1a0cc5c90c69a7f # v0.6.0
-
-      - name: Configure ECR login
-        run: |
-          mkdir /github/home/.docker/
-          echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json
-
-      - name: Pull neon image from ECR
-        run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} neon
-
-      - name: Pull compute tools image from ECR
-        run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} compute-tools
-
-      - name: Pull compute node v14 image from ECR
-        run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} compute-node-v14
-
-      - name: Pull vm compute node v14 image from ECR
-        run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} vm-compute-node-v14
-
-      - name: Pull compute node v15 image from ECR
-        run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} compute-node-v15
-
-      - name: Pull vm compute node v15 image from ECR
-        run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} vm-compute-node-v15
-
-      - name: Pull rust image from ECR
-        run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned rust
-
      - name: Push images to production ECR
        if: |
          (github.ref_name == 'main' || github.ref_name == 'release') &&
-          github.event_name != 'workflow_dispatch'
+           github.event_name != 'workflow_dispatch'
        run: |
          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/neon:latest
          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest
@@ -872,28 +912,12 @@ jobs:
          echo "" > /github/home/.docker/config.json
          crane auth login -u ${{ secrets.NEON_DOCKERHUB_USERNAME }} -p ${{ secrets.NEON_DOCKERHUB_PASSWORD }} index.docker.io

-      - name: Push neon image to Docker Hub
-        run: crane push neon neondatabase/neon:${{needs.tag.outputs.build-tag}}
+      - name: Push vm-compute-node to Docker Hub
+        run: |
+          crane push vm-compute-node-v14 neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}}
+          crane push vm-compute-node-v15 neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}}

-      - name: Push compute tools image to Docker Hub
-        run: crane push compute-tools neondatabase/compute-tools:${{needs.tag.outputs.build-tag}}
-
-      - name: Push compute node v14 image to Docker Hub
-        run: crane push compute-node-v14 neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}}
-
-      - name: Push vm compute node v14 image to Docker Hub
-        run: crane push vm-compute-node-v14 neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}}
-
-      - name: Push compute node v15 image to Docker Hub
-        run: crane push compute-node-v15 neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}}
-
-      - name: Push vm compute node v15 image to Docker Hub
-        run: crane push vm-compute-node-v15 neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}}
-
-      - name: Push rust image to Docker Hub
-        run: crane push rust neondatabase/rust:pinned
-
-      - name: Add latest tag to images in Docker Hub
+      - name: Push latest tags to Docker Hub
        if: |
          (github.ref_name == 'main' || github.ref_name == 'release') &&
          github.event_name != 'workflow_dispatch'
@@ -913,7 +937,7 @@ jobs:
    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
    # We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version.
    # If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly
-    needs: [ push-docker-hub, tag, regress-tests ]
+    needs: [ promote-images, tag, regress-tests ]
    if: |
      contains(github.event.pull_request.labels.*.name, 'deploy-test-storage') &&
      github.event_name != 'workflow_dispatch'
@@ -947,7 +971,7 @@ jobs:
  deploy:
    runs-on: [ self-hosted, gen3, small ]
    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
-    needs: [ push-docker-hub, tag, regress-tests ]
+    needs: [ promote-images, tag, regress-tests ]
    if: ( github.ref_name == 'main' || github.ref_name == 'release' ) && github.event_name != 'workflow_dispatch'
    steps:
      - name: Fix git ownership
@@ -984,7 +1008,7 @@ jobs:
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      options: --init
-    needs: [ push-docker-hub, tag, regress-tests ]
+    needs: [ promote-images, tag, regress-tests ]
    if: github.ref_name == 'release' && github.event_name != 'workflow_dispatch'
    steps:
      - name: Promote compatibility snapshot for the release
--- a/.github/workflows/deploy-dev.yml
+++ b/.github/workflows/deploy-dev.yml
@@ -27,6 +27,11 @@ on:
        required: true
        type: boolean
        default: true
+      deployPgSniRouter:
+        description: 'Deploy pg-sni-router'
+        required: true
+        type: boolean
+        default: true

 env:
  AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
@@ -48,7 +53,8 @@ jobs:
        shell: bash
    strategy:
      matrix:
-        target_region: [ eu-west-1, us-east-2 ]
+        # TODO(sergey): Fix storage deploy in eu-central-1
+        target_region: [ eu-west-1, us-east-2]
    environment:
      name: dev-${{ matrix.target_region }}
    steps:
@@ -133,6 +139,53 @@ jobs:
  
      - name: Cleanup helm folder
        run: rm -rf ~/.cache
+
+  deploy-preview-proxy-new:
+    runs-on: [ self-hosted, gen3, small ]
+    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
+    if: inputs.deployProxy
+    defaults:
+      run:
+        shell: bash
+    strategy:
+      matrix:
+        include:
+          - target_region:  eu-central-1
+            target_cluster: dev-eu-central-1-alpha
+    environment:
+      name: dev-${{ matrix.target_region }}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+        with:
+          submodules: true
+          fetch-depth: 0
+          ref: ${{ inputs.branch }}
+  
+      - name: Configure AWS Credentials
+        uses: aws-actions/configure-aws-credentials@v1-node16
+        with:
+          role-to-assume: arn:aws:iam::369495373322:role/github-runner
+          aws-region: eu-central-1
+          role-skip-session-tagging: true
+          role-duration-seconds: 1800
+  
+      - name: Configure environment
+        run: |
+          helm repo add neondatabase https://neondatabase.github.io/helm-charts
+          aws --region ${{ matrix.target_region }} eks update-kubeconfig --name  ${{ matrix.target_cluster }}
+  
+      - name: Re-deploy preview proxies
+        run: |
+          DOCKER_TAG=${{ inputs.dockerTag }}
+          for PREVIEW_NAME in helium argon krypton xenon radon oganesson hydrogen nitrogen oxygen fluorine chlorine; do
+            export PREVIEW_NAME
+            envsubst <.github/helm-values/preview-template.neon-proxy-scram.yaml >preview-${PREVIEW_NAME}.neon-proxy-scram.yaml
+            helm upgrade neon-proxy-scram-${PREVIEW_NAME} neondatabase/neon-proxy --namespace neon-proxy-${PREVIEW_NAME} --create-namespace --install --atomic -f preview-${PREVIEW_NAME}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
+          done
+
+      - name: Cleanup helm folder
+        run: rm -rf ~/.cache
  
  deploy-storage-broker-new:
    runs-on: [ self-hosted, gen3, small ]
@@ -148,6 +201,8 @@ jobs:
            target_cluster: dev-us-east-2-beta
          - target_region:  eu-west-1
            target_cluster: dev-eu-west-1-zeta
+          - target_region:  eu-central-1
+            target_cluster: dev-eu-central-1-alpha
    environment:
      name: dev-${{ matrix.target_region }}
    steps:
@@ -177,3 +232,49 @@ jobs:
  
      - name: Cleanup helm folder
        run: rm -rf ~/.cache
+
+  deploy-pg-sni-router:
+    runs-on: [ self-hosted, gen3, small ]
+    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
+    if: inputs.deployPgSniRouter
+    defaults:
+      run:
+        shell: bash
+    strategy:
+      matrix:
+        include:
+          - target_region:  us-east-2
+            target_cluster: dev-us-east-2-beta
+          - target_region:  eu-west-1
+            target_cluster: dev-eu-west-1-zeta
+          - target_region:  eu-central-1
+            target_cluster: dev-eu-central-1-alpha
+    environment:
+      name: dev-${{ matrix.target_region }}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+        with:
+          submodules: true
+          fetch-depth: 0
+          ref: ${{ inputs.branch }}
+  
+      - name: Configure AWS Credentials
+        uses: aws-actions/configure-aws-credentials@v1-node16
+        with:
+          role-to-assume: arn:aws:iam::369495373322:role/github-runner
+          aws-region: eu-central-1
+          role-skip-session-tagging: true
+          role-duration-seconds: 1800
+  
+      - name: Configure environment
+        run: |
+          helm repo add neondatabase https://neondatabase.github.io/helm-charts
+          aws --region ${{ matrix.target_region }} eks update-kubeconfig --name  ${{ matrix.target_cluster }}
+  
+      - name: Deploy pg-sni-router
+        run:
+          helm upgrade neon-pg-sni-router neondatabase/neon-pg-sni-router --namespace neon-pg-sni-router --create-namespace --install --debug --atomic -f .github/helm-values/${{ matrix.target_cluster }}.pg-sni-router.yaml --set image.tag=${{ inputs.dockerTag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 15m0s
+  
+      - name: Cleanup helm folder
+        run: rm -rf ~/.cache
--- a/.github/workflows/deploy-prod.yml
+++ b/.github/workflows/deploy-prod.yml
@@ -27,6 +27,11 @@ on:
        required: true
        type: boolean
        default: true
+      deployPgSniRouter:
+        description: 'Deploy pg-sni-router'
+        required: true
+        type: boolean
+        default: true
      disclamerAcknowledged:
        description: 'I confirm that there is an emergency and I can not use regular release workflow'
        required: true
@@ -49,7 +54,7 @@ jobs:
        shell: bash
    strategy:
      matrix:
-        target_region: [ us-east-2, us-west-2, eu-central-1, ap-southeast-1 ]
+        target_region: [ us-east-2, us-west-2, eu-central-1, ap-southeast-1, us-east-1 ]
    environment:
      name: prod-${{ matrix.target_region }}
    steps:
@@ -97,6 +102,10 @@ jobs:
            target_cluster: prod-ap-southeast-1-epsilon
            deploy_link_proxy: false
            deploy_legacy_scram_proxy: false
+          - target_region: us-east-1
+            target_cluster: prod-us-east-1-theta
+            deploy_link_proxy: false
+            deploy_legacy_scram_proxy: false
    environment:
      name: prod-${{ matrix.target_region }}
    steps:
@@ -147,6 +156,8 @@ jobs:
            target_cluster: prod-eu-central-1-gamma
          - target_region: ap-southeast-1
            target_cluster: prod-ap-southeast-1-epsilon
+          - target_region: us-east-1
+            target_cluster: prod-us-east-1-theta
    environment:
      name: prod-${{ matrix.target_region }}
    steps:
@@ -165,3 +176,42 @@ jobs:
      - name: Deploy storage-broker
        run:
          helm upgrade neon-storage-broker-lb neondatabase/neon-storage-broker --namespace neon-storage-broker-lb --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${{ inputs.dockerTag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s
+
+  deploy-pg-sni-router:
+    runs-on: prod
+    container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
+    if: inputs.deployPgSniRouter && inputs.disclamerAcknowledged
+    defaults:
+      run:
+        shell: bash
+    strategy:
+      matrix:
+        include:
+          - target_region:  us-east-2
+            target_cluster: prod-us-east-2-delta
+          - target_region:  us-west-2
+            target_cluster: prod-us-west-2-eta
+          - target_region: eu-central-1
+            target_cluster: prod-eu-central-1-gamma
+          - target_region: ap-southeast-1
+            target_cluster: prod-ap-southeast-1-epsilon
+          - target_region: us-east-1
+            target_cluster: prod-us-east-1-theta
+    environment:
+      name: prod-${{ matrix.target_region }}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+        with:
+          submodules: true
+          fetch-depth: 0
+          ref: ${{ inputs.branch }}
+  
+      - name: Configure environment
+        run: |
+          helm repo add neondatabase https://neondatabase.github.io/helm-charts
+          aws --region ${{ matrix.target_region }} eks update-kubeconfig --name  ${{ matrix.target_cluster }}
+  
+      - name: Deploy pg-sni-router
+        run:
+          helm upgrade neon-pg-sni-router neondatabase/neon-pg-sni-router --namespace neon-pg-sni-router --create-namespace --install --debug --atomic -f .github/helm-values/${{ matrix.target_cluster }}.pg-sni-router.yaml --set image.tag=${{ inputs.dockerTag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 15m0s
--- a/.neon_clippy_args
+++ b/.neon_clippy_args
@@ -0,0 +1,4 @@
+# * `-A unknown_lints` – do not warn about unknown lint suppressions
+#                        that people with newer toolchains might use
+# * `-D warnings`      - fail on any warnings (`cargo` returns non-zero exit status)
+export CLIPPY_COMMON_ARGS="--locked --workspace --all-targets -- -A unknown_lints -D warnings"
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1574,6 +1574,21 @@ version = "1.0.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"

+[[package]]
+name = "foreign-types"
+version = "0.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1"
+dependencies = [
+ "foreign-types-shared",
+]
+
+[[package]]
+name = "foreign-types-shared"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b"
+
 [[package]]
 name = "form_urlencoded"
 version = "1.1.0"
@@ -1756,9 +1771,9 @@ checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b"

 [[package]]
 name = "h2"
-version = "0.3.17"
+version = "0.3.18"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "66b91535aa35fea1523ad1b86cb6b53c28e0ae566ba4a460f4457e936cad7c6f"
+checksum = "17f8a914c2987b688368b5138aa05321db91f4090cf26118185672ad588bce21"
 dependencies = [
 "bytes",
 "fnv",
@@ -2361,6 +2376,24 @@ version = "0.8.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a"

+[[package]]
+name = "native-tls"
+version = "0.2.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "07226173c32f2926027b63cce4bcd8076c3552846cbe7925f3aaffeac0a3b92e"
+dependencies = [
+ "lazy_static",
+ "libc",
+ "log",
+ "openssl",
+ "openssl-probe",
+ "openssl-sys",
+ "schannel",
+ "security-framework",
+ "security-framework-sys",
+ "tempfile",
+]
+
 [[package]]
 name = "nix"
 version = "0.26.2"
@@ -2483,12 +2516,50 @@ version = "11.1.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575"

+[[package]]
+name = "openssl"
+version = "0.10.52"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "01b8574602df80f7b85fdfc5392fa884a4e3b3f4f35402c070ab34c3d3f78d56"
+dependencies = [
+ "bitflags",
+ "cfg-if",
+ "foreign-types",
+ "libc",
+ "once_cell",
+ "openssl-macros",
+ "openssl-sys",
+]
+
+[[package]]
+name = "openssl-macros"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.15",
+]
+
 [[package]]
 name = "openssl-probe"
 version = "0.1.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf"

+[[package]]
+name = "openssl-sys"
+version = "0.9.87"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8e17f59264b2809d77ae94f0e1ebabc434773f370d6ca667bd223ea10e06cc7e"
+dependencies = [
+ "cc",
+ "libc",
+ "pkg-config",
+ "vcpkg",
+]
+
 [[package]]
 name = "opentelemetry"
 version = "0.18.0"
@@ -2681,6 +2752,7 @@ dependencies = [
 "tenant_size_model",
 "thiserror",
 "tokio",
+ "tokio-io-timeout",
 "tokio-postgres",
 "tokio-tar",
 "tokio-util",
@@ -2815,6 +2887,12 @@ version = "0.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184"

+[[package]]
+name = "pkg-config"
+version = "0.3.26"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6ac9a59f73473f1b8d852421e59e64809f025994837ef743615c6d0c5b305160"
+
 [[package]]
 name = "plotters"
 version = "0.3.4"
@@ -2846,7 +2924,7 @@ dependencies = [
 [[package]]
 name = "postgres"
 version = "0.19.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=43e6db254a97fdecbce33d8bc0890accfd74495e#43e6db254a97fdecbce33d8bc0890accfd74495e"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=0bc41d8503c092b040142214aac3cf7d11d0c19f#0bc41d8503c092b040142214aac3cf7d11d0c19f"
 dependencies = [
 "bytes",
 "fallible-iterator",
@@ -2856,10 +2934,21 @@ dependencies = [
 "tokio-postgres",
 ]

+[[package]]
+name = "postgres-native-tls"
+version = "0.5.0"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=0bc41d8503c092b040142214aac3cf7d11d0c19f#0bc41d8503c092b040142214aac3cf7d11d0c19f"
+dependencies = [
+ "native-tls",
+ "tokio",
+ "tokio-native-tls",
+ "tokio-postgres",
+]
+
 [[package]]
 name = "postgres-protocol"
 version = "0.6.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=43e6db254a97fdecbce33d8bc0890accfd74495e#43e6db254a97fdecbce33d8bc0890accfd74495e"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=0bc41d8503c092b040142214aac3cf7d11d0c19f#0bc41d8503c092b040142214aac3cf7d11d0c19f"
 dependencies = [
 "base64 0.20.0",
 "byteorder",
@@ -2877,7 +2966,7 @@ dependencies = [
 [[package]]
 name = "postgres-types"
 version = "0.2.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=43e6db254a97fdecbce33d8bc0890accfd74495e#43e6db254a97fdecbce33d8bc0890accfd74495e"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=0bc41d8503c092b040142214aac3cf7d11d0c19f#0bc41d8503c092b040142214aac3cf7d11d0c19f"
 dependencies = [
 "bytes",
 "fallible-iterator",
@@ -2958,7 +3047,6 @@ dependencies = [
 "pin-project-lite",
 "postgres-protocol",
 "rand",
- "serde",
 "thiserror",
 "tokio",
 "tracing",
@@ -3109,10 +3197,12 @@ dependencies = [
 "itertools",
 "md5",
 "metrics",
+ "native-tls",
 "once_cell",
 "opentelemetry",
 "parking_lot",
 "pin-project-lite",
+ "postgres-native-tls",
 "postgres_backend",
 "pq_proto",
 "prometheus",
@@ -3567,6 +3657,7 @@ dependencies = [
 "const_format",
 "crc32c",
 "fs2",
+ "futures",
 "git-version",
 "hex",
 "humantime",
@@ -3581,7 +3672,9 @@ dependencies = [
 "pq_proto",
 "regex",
 "remote_storage",
+ "reqwest",
 "safekeeper_api",
+ "scopeguard",
 "serde",
 "serde_json",
 "serde_with",
@@ -3868,8 +3961,7 @@ dependencies = [
 [[package]]
 name = "sharded-slab"
 version = "0.1.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "900fba806f70c630b0a382d0d825e17a0f19fcd059a2ade1ff237bcddf446b31"
+source = "git+https://github.com/neondatabase/sharded-slab.git?rev=98d16753ab01c61f0a028de44167307a00efea00#98d16753ab01c61f0a028de44167307a00efea00"
 dependencies = [
 "lazy_static",
 ]
@@ -4319,10 +4411,20 @@ dependencies = [
 "syn 2.0.15",
 ]

+[[package]]
+name = "tokio-native-tls"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bbae76ab933c85776efabc971569dd6119c580d8f5d448769dec1764bf796ef2"
+dependencies = [
+ "native-tls",
+ "tokio",
+]
+
 [[package]]
 name = "tokio-postgres"
 version = "0.7.7"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=43e6db254a97fdecbce33d8bc0890accfd74495e#43e6db254a97fdecbce33d8bc0890accfd74495e"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=0bc41d8503c092b040142214aac3cf7d11d0c19f#0bc41d8503c092b040142214aac3cf7d11d0c19f"
 dependencies = [
 "async-trait",
 "byteorder",
@@ -4629,6 +4731,16 @@ dependencies = [
 "valuable",
 ]

+[[package]]
+name = "tracing-error"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d686ec1c0f384b1277f097b2f279a2ecc11afe8c133c1aabf036a27cb4cd206e"
+dependencies = [
+ "tracing",
+ "tracing-subscriber",
+]
+
 [[package]]
 name = "tracing-futures"
 version = "0.2.5"
@@ -4854,6 +4966,7 @@ dependencies = [
 "bincode",
 "byteorder",
 "bytes",
+ "chrono",
 "criterion",
 "futures",
 "heapless",
@@ -4865,6 +4978,7 @@ dependencies = [
 "nix",
 "once_cell",
 "pin-project-lite",
+ "pq_proto",
 "rand",
 "regex",
 "routerify",
@@ -4879,6 +4993,7 @@ dependencies = [
 "thiserror",
 "tokio",
 "tracing",
+ "tracing-error",
 "tracing-subscriber",
 "url",
 "uuid",
@@ -4901,6 +5016,12 @@ version = "0.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d"

+[[package]]
+name = "vcpkg"
+version = "0.2.15"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426"
+
 [[package]]
 name = "version_check"
 version = "0.9.4"
@@ -5279,13 +5400,11 @@ name = "workspace_hack"
 version = "0.1.0"
 dependencies = [
 "anyhow",
- "byteorder",
 "bytes",
 "chrono",
 "clap 4.2.2",
 "clap_builder",
 "crossbeam-utils",
- "digest",
 "either",
 "fail",
 "futures",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -62,6 +62,7 @@ jsonwebtoken = "8"
 libc = "0.2"
 md5 = "0.7.0"
 memoffset = "0.8"
+native-tls = "0.2"
 nix = "0.26"
 notify = "5.0.0"
 num_cpus = "1.15"
@@ -110,6 +111,7 @@ toml = "0.7"
 toml_edit = "0.19"
 tonic = {version = "0.9", features = ["tls", "tls-roots"]}
 tracing = "0.1"
+tracing-error = "0.2.0"
 tracing-opentelemetry = "0.18.0"
 tracing-subscriber = { version = "0.3", features = ["env-filter"] }
 url = "2.2"
@@ -123,10 +125,11 @@ env_logger = "0.10"
 log = "0.4"

 ## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
-postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
-postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
+postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="0bc41d8503c092b040142214aac3cf7d11d0c19f" }
+postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="0bc41d8503c092b040142214aac3cf7d11d0c19f" }
+postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="0bc41d8503c092b040142214aac3cf7d11d0c19f" }
+postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="0bc41d8503c092b040142214aac3cf7d11d0c19f" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="0bc41d8503c092b040142214aac3cf7d11d0c19f" }
 tokio-tar = { git = "https://github.com/neondatabase/tokio-tar.git", rev="404df61437de0feef49ba2ccdbdd94eb8ad6e142" }

 ## Other git libraries
@@ -158,10 +161,16 @@ rstest = "0.17"
 tempfile = "3.4"
 tonic-build = "0.9"

+[patch.crates-io]
+
 # This is only needed for proxy's tests.
 # TODO: we should probably fork `tokio-postgres-rustls` instead.
-[patch.crates-io]
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="0bc41d8503c092b040142214aac3cf7d11d0c19f" }
+
+# Changes the MAX_THREADS limit from 4096 to 32768.
+# This is a temporary workaround for using tracing from many threads in the safekeeper code,
+# until the async safekeepers patch is merged into main.
+sharded-slab = { git = "https://github.com/neondatabase/sharded-slab.git", rev="98d16753ab01c61f0a028de44167307a00efea00" }

 ################# Binary contents sections

--- a/11
+++ b/11
@@ -44,7 +44,15 @@ COPY --chown=nonroot . .
 # Show build caching stats to check if it was used in the end.
 # Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats.
 RUN set -e \
-&& mold -run cargo build --bin pageserver --bin pageserver_binutils --bin draw_timeline_dir --bin safekeeper --bin storage_broker --bin proxy --locked --release \
+    && mold -run cargo build  \
+      --bin pg_sni_router  \
+      --bin pageserver  \
+      --bin pageserver_binutils  \
+      --bin draw_timeline_dir \
+      --bin safekeeper  \
+      --bin storage_broker  \
+      --bin proxy  \
+      --locked --release \
    && cachepot -s

 # Build final image
@@ -63,6 +71,7 @@ RUN set -e \
    && useradd -d /data neon \
    && chown -R neon:neon /data

+COPY --from=build --chown=neon:neon /home/nonroot/target/release/pg_sni_router       /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver          /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver_binutils /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/draw_timeline_dir   /usr/local/bin
--- a/Dockerfile.vm-compute-node
+++ b/Dockerfile.vm-compute-node
@@ -54,7 +54,7 @@ RUN set -e \

 RUN set -e \
 	&& echo "::sysinit:cgconfigparser -l /etc/cgconfig.conf -s 1664" >> /etc/inittab \
-	&& CONNSTR="dbname=neondb user=cloud_admin sslmode=disable" \
+	&& CONNSTR="dbname=postgres user=cloud_admin sslmode=disable" \
 	&& ARGS="--auto-restart --cgroup=neon-postgres --pgconnstr=\"$CONNSTR\"" \
 	&& echo "::respawn:su vm-informant -c '/usr/local/bin/vm-informant $ARGS'" >> /etc/inittab

--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -73,7 +73,7 @@ fn main() -> Result<()> {
    // Try to use just 'postgres' if no path is provided
    let pgbin = matches.get_one::<String>("pgbin").unwrap();

-    let mut spec = None;
+    let spec;
    let mut live_config_allowed = false;
    match spec_json {
        // First, try to get cluster spec from the cli argument
@@ -89,9 +89,13 @@ fn main() -> Result<()> {
            } else if let Some(id) = compute_id {
                if let Some(cp_base) = control_plane_uri {
                    live_config_allowed = true;
-                    if let Ok(s) = get_spec_from_control_plane(cp_base, id) {
-                        spec = Some(s);
-                    }
+                    spec = match get_spec_from_control_plane(cp_base, id) {
+                        Ok(s) => s,
+                        Err(e) => {
+                            error!("cannot get response from control plane: {}", e);
+                            panic!("neither spec nor confirmation that compute is in the Empty state was received");
+                        }
+                    };
                } else {
                    panic!("must specify both --control-plane-uri and --compute-id or none");
                }
@@ -114,7 +118,6 @@ fn main() -> Result<()> {
        spec_set = false;
    }
    let compute_node = ComputeNode {
-        start_time: Utc::now(),
        connstr: Url::parse(connstr).context("cannot parse connstr as a URL")?,
        pgdata: pgdata.to_string(),
        pgbin: pgbin.to_string(),
@@ -147,6 +150,17 @@ fn main() -> Result<()> {
    let mut state = compute.state.lock().unwrap();
    let pspec = state.pspec.as_ref().expect("spec must be set");
    let startup_tracing_context = pspec.spec.startup_tracing_context.clone();
+
+    // Record for how long we slept waiting for the spec.
+    state.metrics.wait_for_spec_ms = Utc::now()
+        .signed_duration_since(state.start_time)
+        .to_std()
+        .unwrap()
+        .as_millis() as u64;
+    // Reset start time to the actual start of the configuration, so that
+    // total startup time is properly measured at the end.
+    state.start_time = Utc::now();
+
    state.status = ComputeStatus::Init;
    compute.state_changed.notify_all();
    drop(state);
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -38,7 +38,6 @@ use crate::spec::*;

 /// Compute node info shared across several `compute_ctl` threads.
 pub struct ComputeNode {
-    pub start_time: DateTime<Utc>,
    // Url type maintains proper escaping
    pub connstr: url::Url,
    pub pgdata: String,
@@ -66,6 +65,7 @@ pub struct ComputeNode {

 #[derive(Clone, Debug)]
 pub struct ComputeState {
+    pub start_time: DateTime<Utc>,
    pub status: ComputeStatus,
    /// Timestamp of the last Postgres activity
    pub last_active: DateTime<Utc>,
@@ -77,6 +77,7 @@ pub struct ComputeState {
 impl ComputeState {
    pub fn new() -> Self {
        Self {
+            start_time: Utc::now(),
            status: ComputeStatus::Empty,
            last_active: Utc::now(),
            error: None,
@@ -248,18 +249,63 @@ impl ComputeNode {
    /// safekeepers sync, basebackup, etc.
    #[instrument(skip(self, compute_state))]
    pub fn prepare_pgdata(&self, compute_state: &ComputeState) -> Result<()> {
+        #[derive(Clone)]
+        enum Replication {
+            Primary,
+            Static { lsn: Lsn },
+            HotStandby,
+        }
+
        let pspec = compute_state.pspec.as_ref().expect("spec must be set");
+        let spec = &pspec.spec;
        let pgdata_path = Path::new(&self.pgdata);

+        let hot_replica = if let Some(option) = spec.cluster.settings.find_ref("hot_standby") {
+            if let Some(value) = &option.value {
+                anyhow::ensure!(option.vartype == "bool");
+                matches!(value.as_str(), "on" | "yes" | "true")
+            } else {
+                false
+            }
+        } else {
+            false
+        };
+
+        let replication = if hot_replica {
+            Replication::HotStandby
+        } else if let Some(lsn) = spec.cluster.settings.find("recovery_target_lsn") {
+            Replication::Static {
+                lsn: Lsn::from_str(&lsn)?,
+            }
+        } else {
+            Replication::Primary
+        };
+
        // Remove/create an empty pgdata directory and put configuration there.
        self.create_pgdata()?;
        config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &pspec.spec)?;

-        info!("starting safekeepers syncing");
-        let lsn = self
-            .sync_safekeepers(pspec.storage_auth_token.clone())
-            .with_context(|| "failed to sync safekeepers")?;
-        info!("safekeepers synced at LSN {}", lsn);
+        // Syncing safekeepers is only safe with primary nodes: if a primary
+        // is already connected it will be kicked out, so a secondary (standby)
+        // cannot sync safekeepers.
+        let lsn = match &replication {
+            Replication::Primary => {
+                info!("starting safekeepers syncing");
+                let lsn = self
+                    .sync_safekeepers(pspec.storage_auth_token.clone())
+                    .with_context(|| "failed to sync safekeepers")?;
+                info!("safekeepers synced at LSN {}", lsn);
+                lsn
+            }
+            Replication::Static { lsn } => {
+                info!("Starting read-only node at static LSN {}", lsn);
+                *lsn
+            }
+            Replication::HotStandby => {
+                info!("Initializing standby from latest Pageserver LSN");
+                Lsn(0)
+            }
+        };

        info!(
            "getting basebackup@{} from pageserver {}",
@@ -275,6 +321,13 @@ impl ComputeNode {
        // Update pg_hba.conf received with basebackup.
        update_pg_hba(pgdata_path)?;

+        match &replication {
+            Replication::Primary | Replication::Static { .. } => {}
+            Replication::HotStandby => {
+                add_standby_signal(pgdata_path)?;
+            }
+        }
+
        Ok(())
    }

@@ -425,7 +478,7 @@ impl ComputeNode {
                .unwrap()
                .as_millis() as u64;
            state.metrics.total_startup_ms = startup_end_time
-                .signed_duration_since(self.start_time)
+                .signed_duration_since(compute_state.start_time)
                .to_std()
                .unwrap()
                .as_millis() as u64;
--- a/compute_tools/src/http/api.rs
+++ b/compute_tools/src/http/api.rs
@@ -18,6 +18,7 @@ use tracing_utils::http::OtelName;

 fn status_response_from_state(state: &ComputeState) -> ComputeStatusResponse {
    ComputeStatusResponse {
+        start_time: state.start_time,
        tenant: state
            .pspec
            .as_ref()
--- a/compute_tools/src/http/openapi_spec.yaml
+++ b/compute_tools/src/http/openapi_spec.yaml
@@ -152,11 +152,14 @@ components:
      type: object
      description: Compute startup metrics.
      required:
+        - wait_for_spec_ms
        - sync_safekeepers_ms
        - basebackup_ms
        - config_ms
        - total_startup_ms
      properties:
+        wait_for_spec_ms:
+          type: integer
        sync_safekeepers_ms:
          type: integer
        basebackup_ms:
@@ -181,6 +184,13 @@ components:
        - status
        - last_active
      properties:
+        start_time:
+          type: string
+          description: |
+            Time when compute was started. If compute was initially started in the `empty`
+            state and later provided with a valid spec, `start_time` is reset to the
+            moment when the spec was received.
+          example: "2022-10-12T07:20:50.52Z"
        status:
          $ref: '#/components/schemas/ComputeStatus'
        last_active:
--- a/compute_tools/src/pg_helpers.rs
+++ b/compute_tools/src/pg_helpers.rs
@@ -94,6 +94,7 @@ impl PgOptionsSerialize for GenericOptions {

 pub trait GenericOptionsSearch {
    fn find(&self, name: &str) -> Option<String>;
+    fn find_ref(&self, name: &str) -> Option<&GenericOption>;
 }

 impl GenericOptionsSearch for GenericOptions {
@@ -103,6 +104,12 @@ impl GenericOptionsSearch for GenericOptions {
        let op = ops.iter().find(|s| s.name == name)?;
        op.value.clone()
    }
+
+    /// Look up an option by name, returning a reference to it
+    fn find_ref(&self, name: &str) -> Option<&GenericOption> {
+        let ops = self.as_ref()?;
+        ops.iter().find(|s| s.name == name)
+    }
 }

 pub trait RoleExt {
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -1,45 +1,121 @@
+use std::fs::File;
 use std::path::Path;
 use std::str::FromStr;

 use anyhow::{anyhow, bail, Result};
 use postgres::config::Config;
 use postgres::{Client, NoTls};
-use tracing::{info, info_span, instrument, span_enabled, warn, Level};
+use reqwest::StatusCode;
+use tracing::{error, info, info_span, instrument, span_enabled, warn, Level};

 use crate::config;
 use crate::params::PG_HBA_ALL_MD5;
 use crate::pg_helpers::*;

-use compute_api::responses::ControlPlaneSpecResponse;
+use compute_api::responses::{ControlPlaneComputeStatus, ControlPlaneSpecResponse};
 use compute_api::spec::{ComputeSpec, Database, PgIdent, Role};

+// Perform a spec request to the control plane and return its response. On
+// error, return a bool flag indicating whether it makes sense to retry the
+// request, together with a string holding the error message.
+fn do_control_plane_request(
+    uri: &str,
+    jwt: &str,
+) -> Result<ControlPlaneSpecResponse, (bool, String)> {
+    let resp = reqwest::blocking::Client::new()
+        .get(uri)
+        .header("Authorization", jwt)
+        .send()
+        .map_err(|e| {
+            (
+                true,
+                format!("could not perform spec request to control plane: {}", e),
+            )
+        })?;
+
+    match resp.status() {
+        StatusCode::OK => match resp.json::<ControlPlaneSpecResponse>() {
+            Ok(spec_resp) => Ok(spec_resp),
+            Err(e) => Err((
+                true,
+                format!("could not deserialize control plane response: {}", e),
+            )),
+        },
+        StatusCode::SERVICE_UNAVAILABLE => {
+            Err((true, "control plane is temporarily unavailable".to_string()))
+        }
+        StatusCode::BAD_GATEWAY => {
+            // We have a problem with intermittent 502 errors now
+            // https://github.com/neondatabase/cloud/issues/2353
+            // It's fine to retry GET request in this case.
+            Err((true, "control plane request failed with 502".to_string()))
+        }
+        // Another code, likely 500 or 404, means that compute is unknown to the control plane
+        // or some internal failure happened. Doesn't make much sense to retry in this case.
+        _ => Err((
+            false,
+            format!(
+                "unexpected control plane response status code: {}",
+                resp.status()
+            ),
+        )),
+    }
+}
+
 /// Request spec from the control-plane by compute_id. If `NEON_CONSOLE_JWT`
 /// env variable is set, it will be used for authorization.
-pub fn get_spec_from_control_plane(base_uri: &str, compute_id: &str) -> Result<ComputeSpec> {
+pub fn get_spec_from_control_plane(
+    base_uri: &str,
+    compute_id: &str,
+) -> Result<Option<ComputeSpec>> {
    let cp_uri = format!("{base_uri}/management/api/v2/computes/{compute_id}/spec");
-    let jwt: String = match std::env::var("NEON_CONSOLE_JWT") {
+    let jwt: String = match std::env::var("NEON_CONTROL_PLANE_TOKEN") {
        Ok(v) => v,
        Err(_) => "".to_string(),
    };
+    let mut attempt = 1;
+    let mut spec: Result<Option<ComputeSpec>> = Ok(None);
+
    info!("getting spec from control plane: {}", cp_uri);

-    // TODO: check the response. We should distinguish cases when it's
-    // - network error, then retry
-    // - no spec for compute yet, then wait
-    // - compute id is unknown or any other error, then bail out
-    let resp: ControlPlaneSpecResponse = reqwest::blocking::Client::new()
-        .get(cp_uri)
-        .header("Authorization", jwt)
-        .send()
-        .map_err(|e| anyhow!("could not send spec request to control plane: {}", e))?
-        .json()
-        .map_err(|e| anyhow!("could not get compute spec from control plane: {}", e))?;
+    // Do 3 attempts to get spec from the control plane using the following logic:
+    // - network error, 502/503, or unparsable response -> then retry
+    // - compute id is unknown or any other error -> bail out
+    // - no spec for compute yet (Empty state) -> return Ok(None)
+    // - got spec -> return Ok(Some(spec))
+    while attempt < 4 {
+        spec = match do_control_plane_request(&cp_uri, &jwt) {
+            Ok(spec_resp) => match spec_resp.status {
+                ControlPlaneComputeStatus::Empty => Ok(None),
+                ControlPlaneComputeStatus::Attached => {
+                    if let Some(spec) = spec_resp.spec {
+                        Ok(Some(spec))
+                    } else {
+                        bail!("compute is attached, but spec is empty")
+                    }
+                }
+            },
+            Err((retry, msg)) => {
+                if retry {
+                    Err(anyhow!(msg))
+                } else {
+                    bail!(msg);
+                }
+            }
+        };

-    if let Some(spec) = resp.spec {
-        Ok(spec)
-    } else {
-        bail!("could not get compute spec from control plane")
+        if let Err(e) = &spec {
+            error!("attempt {} to get spec failed with: {}", attempt, e);
+        } else {
+            return spec;
+        }
+
+        attempt += 1;
+        std::thread::sleep(std::time::Duration::from_millis(100));
    }
+
+    // All attempts failed, return error.
+    spec
 }

 /// It takes cluster specification and does the following:
@@ -70,6 +146,21 @@ pub fn update_pg_hba(pgdata_path: &Path) -> Result<()> {
    Ok(())
 }

+/// Create a `standby.signal` file in `pgdata_path`, unless one already exists.
+pub fn add_standby_signal(pgdata_path: &Path) -> Result<()> {
+    // XXX: consider making it a part of spec.json
+    info!("adding standby.signal");
+    let signalfile = pgdata_path.join("standby.signal");
+
+    if !signalfile.exists() {
+        File::create(signalfile)?;
+        info!("created standby.signal");
+    } else {
+        info!("reused pre-existing standby.signal");
+    }
+    Ok(())
+}
+
 /// Given a cluster spec json and open transaction it handles roles creation,
 /// deletion and update.
 #[instrument(skip_all)]
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -8,6 +8,7 @@
 use anyhow::{anyhow, bail, Context, Result};
 use clap::{value_parser, Arg, ArgAction, ArgMatches, Command};
 use control_plane::endpoint::ComputeControlPlane;
+use control_plane::endpoint::ComputeMode;
 use control_plane::local_env::LocalEnv;
 use control_plane::pageserver::PageServerNode;
 use control_plane::safekeeper::SafekeeperNode;
@@ -474,7 +475,14 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
            env.register_branch_mapping(name.to_string(), tenant_id, timeline_id)?;

            println!("Creating endpoint for imported timeline ...");
-            cplane.new_endpoint(tenant_id, name, timeline_id, None, None, pg_version)?;
+            cplane.new_endpoint(
+                tenant_id,
+                name,
+                timeline_id,
+                None,
+                pg_version,
+                ComputeMode::Primary,
+            )?;
            println!("Done");
        }
        Some(("branch", branch_match)) => {
@@ -560,20 +568,20 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                .iter()
                .filter(|(_, endpoint)| endpoint.tenant_id == tenant_id)
            {
-                let lsn_str = match endpoint.lsn {
-                    None => {
-                        // -> primary endpoint
+                let lsn_str = match endpoint.mode {
+                    ComputeMode::Static(lsn) => {
+                        // -> read-only endpoint
+                        // Use the node's LSN.
+                        lsn.to_string()
+                    }
+                    _ => {
+                        // -> primary endpoint or hot replica
                        // Use the LSN at the end of the timeline.
                        timeline_infos
                            .get(&endpoint.timeline_id)
                            .map(|bi| bi.last_record_lsn.to_string())
                            .unwrap_or_else(|| "?".to_string())
                    }
-                    Some(lsn) => {
-                        // -> read-only endpoint
-                        // Use the endpoint's LSN.
-                        lsn.to_string()
-                    }
                };

                let branch_name = timeline_name_mappings
@@ -619,7 +627,19 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                .copied()
                .context("Failed to parse postgres version from the argument string")?;

-            cplane.new_endpoint(tenant_id, &endpoint_id, timeline_id, lsn, port, pg_version)?;
+            let hot_standby = sub_args
+                .get_one::<bool>("hot-standby")
+                .copied()
+                .unwrap_or(false);
+
+            let mode = match (lsn, hot_standby) {
+                (Some(lsn), false) => ComputeMode::Static(lsn),
+                (None, true) => ComputeMode::Replica,
+                (None, false) => ComputeMode::Primary,
+                (Some(_), true) => anyhow::bail!("cannot specify both lsn and hot-standby"),
+            };
+
+            cplane.new_endpoint(tenant_id, &endpoint_id, timeline_id, port, pg_version, mode)?;
        }
        "start" => {
            let port: Option<u16> = sub_args.get_one::<u16>("port").copied();
@@ -637,7 +657,21 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                None
            };

+            let hot_standby = sub_args
+                .get_one::<bool>("hot-standby")
+                .copied()
+                .unwrap_or(false);
+
            if let Some(endpoint) = endpoint {
+                match (&endpoint.mode, hot_standby) {
+                    (ComputeMode::Static(_), true) => {
+                        bail!("Cannot start a node in hot standby mode when it is already configured as a static replica")
+                    }
+                    (ComputeMode::Primary, true) => {
+                        bail!("Cannot start a node as a hot standby replica, it is already configured as primary node")
+                    }
+                    _ => {}
+                }
                println!("Starting existing endpoint {endpoint_id}...");
                endpoint.start(&auth_token)?;
            } else {
@@ -659,6 +693,14 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                    .get_one::<u32>("pg-version")
                    .copied()
                    .context("Failed to `pg-version` from the argument string")?;
+
+                let mode = match (lsn, hot_standby) {
+                    (Some(lsn), false) => ComputeMode::Static(lsn),
+                    (None, true) => ComputeMode::Replica,
+                    (None, false) => ComputeMode::Primary,
+                    (Some(_), true) => anyhow::bail!("cannot specify both lsn and hot-standby"),
+                };
+
                // when used with custom port this results in non obvious behaviour
                // port is remembered from first start command, i e
                // start --port X
@@ -670,9 +712,9 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                    tenant_id,
                    endpoint_id,
                    timeline_id,
-                    lsn,
                    port,
                    pg_version,
+                    mode,
                )?;
                ep.start(&auth_token)?;
            }
@@ -928,6 +970,12 @@ fn cli() -> Command {
        .help("Specify Lsn on the timeline to start from. By default, end of the timeline would be used.")
        .required(false);

+    let hot_standby_arg = Arg::new("hot-standby")
+        .value_parser(value_parser!(bool))
+        .long("hot-standby")
+        .help("If set, the node will be a hot replica on the specified timeline")
+        .required(false);
+
    Command::new("Neon CLI")
        .arg_required_else_help(true)
        .version(GIT_VERSION)
@@ -1052,6 +1100,7 @@ fn cli() -> Command {
                            .long("config-only")
                            .required(false))
                    .arg(pg_version_arg.clone())
+                    .arg(hot_standby_arg.clone())
                )
                .subcommand(Command::new("start")
                    .about("Start postgres.\n If the endpoint doesn't exist yet, it is created.")
@@ -1062,6 +1111,7 @@ fn cli() -> Command {
                    .arg(lsn_arg)
                    .arg(port_arg)
                    .arg(pg_version_arg)
+                    .arg(hot_standby_arg)
                )
                .subcommand(
                    Command::new("stop")
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -11,15 +11,31 @@ use std::sync::Arc;
 use std::time::Duration;

 use anyhow::{Context, Result};
+use serde::{Deserialize, Serialize};
+use serde_with::{serde_as, DisplayFromStr};
 use utils::{
    id::{TenantId, TimelineId},
    lsn::Lsn,
 };

-use crate::local_env::{LocalEnv, DEFAULT_PG_VERSION};
+use crate::local_env::LocalEnv;
 use crate::pageserver::PageServerNode;
 use crate::postgresql_conf::PostgresConf;

+// Contents of an endpoint.json file
+#[serde_as]
+#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
+pub struct EndpointConf {
+    name: String,
+    #[serde_as(as = "DisplayFromStr")]
+    tenant_id: TenantId,
+    #[serde_as(as = "DisplayFromStr")]
+    timeline_id: TimelineId,
+    mode: ComputeMode,
+    port: u16,
+    pg_version: u32,
+}
+
 //
 // ComputeControlPlane
 //
@@ -68,23 +84,34 @@ impl ComputeControlPlane {
        tenant_id: TenantId,
        name: &str,
        timeline_id: TimelineId,
-        lsn: Option<Lsn>,
        port: Option<u16>,
        pg_version: u32,
+        mode: ComputeMode,
    ) -> Result<Arc<Endpoint>> {
        let port = port.unwrap_or_else(|| self.get_port());
+
        let ep = Arc::new(Endpoint {
            name: name.to_owned(),
            address: SocketAddr::new("127.0.0.1".parse().unwrap(), port),
            env: self.env.clone(),
            pageserver: Arc::clone(&self.pageserver),
            timeline_id,
-            lsn,
+            mode,
            tenant_id,
            pg_version,
        });
-
        ep.create_pgdata()?;
+        std::fs::write(
+            ep.endpoint_path().join("endpoint.json"),
+            serde_json::to_string_pretty(&EndpointConf {
+                name: name.to_string(),
+                tenant_id,
+                timeline_id,
+                mode,
+                port,
+                pg_version,
+            })?,
+        )?;
        ep.setup_pg_conf()?;

        self.endpoints.insert(ep.name.clone(), Arc::clone(&ep));
@@ -95,6 +122,19 @@ impl ComputeControlPlane {

 ///////////////////////////////////////////////////////////////////////////////

+#[serde_as]
+#[derive(Serialize, Deserialize, Debug, Clone, Copy, Eq, PartialEq)]
+pub enum ComputeMode {
+    // Regular read-write node
+    Primary,
+    // Node pinned to a specific LSN, which is written out as recovery_target_lsn
+    Static(#[serde_as(as = "DisplayFromStr")] Lsn),
+    // Hot standby; read-only replica.
+    // Future versions may want to distinguish between replicas with hot standby
+    // feedback and other kinds of replication configurations.
+    Replica,
+}
+
 #[derive(Debug)]
 pub struct Endpoint {
    /// used as the directory name
@@ -102,7 +142,7 @@ pub struct Endpoint {
    pub tenant_id: TenantId,
    pub timeline_id: TimelineId,
    // Some(lsn) if this is a read-only endpoint anchored at 'lsn'. None for the primary.
-    pub lsn: Option<Lsn>,
+    pub mode: ComputeMode,

    // port and address of the Postgres server
    pub address: SocketAddr,
@@ -131,42 +171,20 @@ impl Endpoint {
        let fname = entry.file_name();
        let name = fname.to_str().unwrap().to_string();

-        // Read config file into memory
-        let cfg_path = entry.path().join("pgdata").join("postgresql.conf");
-        let cfg_path_str = cfg_path.to_string_lossy();
-        let mut conf_file = File::open(&cfg_path)
-            .with_context(|| format!("failed to open config file in {}", cfg_path_str))?;
-        let conf = PostgresConf::read(&mut conf_file)
-            .with_context(|| format!("failed to read config file in {}", cfg_path_str))?;
-
-        // Read a few options from the config file
-        let context = format!("in config file {}", cfg_path_str);
-        let port: u16 = conf.parse_field("port", &context)?;
-        let timeline_id: TimelineId = conf.parse_field("neon.timeline_id", &context)?;
-        let tenant_id: TenantId = conf.parse_field("neon.tenant_id", &context)?;
-
-        // Read postgres version from PG_VERSION file to determine which postgres version binary to use.
-        // If it doesn't exist, assume broken data directory and use default pg version.
-        let pg_version_path = entry.path().join("PG_VERSION");
-
-        let pg_version_str =
-            fs::read_to_string(pg_version_path).unwrap_or_else(|_| DEFAULT_PG_VERSION.to_string());
-        let pg_version = u32::from_str(&pg_version_str)?;
-
-        // parse recovery_target_lsn, if any
-        let recovery_target_lsn: Option<Lsn> =
-            conf.parse_field_optional("recovery_target_lsn", &context)?;
+        // Read the endpoint.json file
+        let conf: EndpointConf =
+            serde_json::from_slice(&std::fs::read(entry.path().join("endpoint.json"))?)?;

        // ok now
        Ok(Endpoint {
-            address: SocketAddr::new("127.0.0.1".parse().unwrap(), port),
+            address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.port),
            name,
            env: env.clone(),
            pageserver: Arc::clone(pageserver),
-            timeline_id,
-            lsn: recovery_target_lsn,
-            tenant_id,
-            pg_version,
+            timeline_id: conf.timeline_id,
+            mode: conf.mode,
+            tenant_id: conf.tenant_id,
+            pg_version: conf.pg_version,
        })
    }

@@ -299,50 +317,83 @@ impl Endpoint {
        conf.append("neon.pageserver_connstring", &pageserver_connstr);
        conf.append("neon.tenant_id", &self.tenant_id.to_string());
        conf.append("neon.timeline_id", &self.timeline_id.to_string());
-        if let Some(lsn) = self.lsn {
-            conf.append("recovery_target_lsn", &lsn.to_string());
-        }

        conf.append_line("");
-        // Configure backpressure
-        // - Replication write lag depends on how fast the walreceiver can process incoming WAL.
-        //   This lag determines latency of get_page_at_lsn. Speed of applying WAL is about 10MB/sec,
-        //   so to avoid expiration of 1 minute timeout, this lag should not be larger than 600MB.
-        //   Actually latency should be much smaller (better if < 1sec). But we assume that recently
-        //   updates pages are not requested from pageserver.
-        // - Replication flush lag depends on speed of persisting data by checkpointer (creation of
-        //   delta/image layers) and advancing disk_consistent_lsn. Safekeepers are able to
-        //   remove/archive WAL only beyond disk_consistent_lsn. Too large a lag can cause long
-        //   recovery time (in case of pageserver crash) and disk space overflow at safekeepers.
-        // - Replication apply lag depends on speed of uploading changes to S3 by uploader thread.
-        //   To be able to restore database in case of pageserver node crash, safekeeper should not
-        //   remove WAL beyond this point. Too large lag can cause space exhaustion in safekeepers
-        //   (if they are not able to upload WAL to S3).
-        conf.append("max_replication_write_lag", "15MB");
-        conf.append("max_replication_flush_lag", "10GB");
+        // Replication-related configurations, such as WAL sending
+        match &self.mode {
+            ComputeMode::Primary => {
+                // Configure backpressure
+                // - Replication write lag depends on how fast the walreceiver can process incoming WAL.
+                //   This lag determines latency of get_page_at_lsn. Speed of applying WAL is about 10MB/sec,
+                //   so to avoid expiration of 1 minute timeout, this lag should not be larger than 600MB.
+                //   Actually latency should be much smaller (better if < 1sec). But we assume that recently
+                //   updates pages are not requested from pageserver.
+                // - Replication flush lag depends on speed of persisting data by checkpointer (creation of
+                //   delta/image layers) and advancing disk_consistent_lsn. Safekeepers are able to
+                //   remove/archive WAL only beyond disk_consistent_lsn. Too large a lag can cause long
+                //   recovery time (in case of pageserver crash) and disk space overflow at safekeepers.
+                // - Replication apply lag depends on speed of uploading changes to S3 by uploader thread.
+                //   To be able to restore database in case of pageserver node crash, safekeeper should not
+                //   remove WAL beyond this point. Too large lag can cause space exhaustion in safekeepers
+                //   (if they are not able to upload WAL to S3).
+                conf.append("max_replication_write_lag", "15MB");
+                conf.append("max_replication_flush_lag", "10GB");

-        if !self.env.safekeepers.is_empty() {
-            // Configure Postgres to connect to the safekeepers
-            conf.append("synchronous_standby_names", "walproposer");
+                if !self.env.safekeepers.is_empty() {
+                    // Configure Postgres to connect to the safekeepers
+                    conf.append("synchronous_standby_names", "walproposer");

-            let safekeepers = self
-                .env
-                .safekeepers
-                .iter()
-                .map(|sk| format!("localhost:{}", sk.pg_port))
-                .collect::<Vec<String>>()
-                .join(",");
-            conf.append("neon.safekeepers", &safekeepers);
-        } else {
-            // We only use setup without safekeepers for tests,
-            // and don't care about data durability on pageserver,
-            // so set more relaxed synchronous_commit.
-            conf.append("synchronous_commit", "remote_write");
+                    let safekeepers = self
+                        .env
+                        .safekeepers
+                        .iter()
+                        .map(|sk| format!("localhost:{}", sk.pg_port))
+                        .collect::<Vec<String>>()
+                        .join(",");
+                    conf.append("neon.safekeepers", &safekeepers);
+                } else {
+                    // We only use setup without safekeepers for tests,
+                    // and don't care about data durability on pageserver,
+                    // so set more relaxed synchronous_commit.
+                    conf.append("synchronous_commit", "remote_write");

-            // Configure the node to stream WAL directly to the pageserver
-            // This isn't really a supported configuration, but can be useful for
-            // testing.
-            conf.append("synchronous_standby_names", "pageserver");
+                    // Configure the node to stream WAL directly to the pageserver
+                    // This isn't really a supported configuration, but can be useful for
+                    // testing.
+                    conf.append("synchronous_standby_names", "pageserver");
+                }
+            }
+            ComputeMode::Static(lsn) => {
+                conf.append("recovery_target_lsn", &lsn.to_string());
+            }
+            ComputeMode::Replica => {
+                assert!(!self.env.safekeepers.is_empty());
+
+                // TODO: use future host field from safekeeper spec
+                // Pass the list of safekeepers to the replica so that it can connect to any of them,
+                // whichever is available.
+                let sk_ports = self
+                    .env
+                    .safekeepers
+                    .iter()
+                    .map(|x| x.pg_port.to_string())
+                    .collect::<Vec<_>>()
+                    .join(",");
+                let sk_hosts = vec!["localhost"; self.env.safekeepers.len()].join(",");
+
+                let connstr = format!(
+                    "host={} port={} options='-c timeline_id={} tenant_id={}' application_name=replica replication=true",
+                    sk_hosts,
+                    sk_ports,
+                    &self.timeline_id.to_string(),
+                    &self.tenant_id.to_string(),
+                );
+
+                let slot_name = format!("repl_{}_", self.timeline_id);
+                conf.append("primary_conninfo", connstr.as_str());
+                conf.append("primary_slot_name", slot_name.as_str());
+                conf.append("hot_standby", "on");
+            }
        }

        let mut file = File::create(self.pgdata().join("postgresql.conf"))?;
@@ -355,21 +406,27 @@ impl Endpoint {
    }

    fn load_basebackup(&self, auth_token: &Option<String>) -> Result<()> {
-        let backup_lsn = if let Some(lsn) = self.lsn {
-            Some(lsn)
-        } else if !self.env.safekeepers.is_empty() {
-            // LSN 0 means that it is bootstrap and we need to download just
-            // latest data from the pageserver. That is a bit clumsy but whole bootstrap
-            // procedure evolves quite actively right now, so let's think about it again
-            // when things would be more stable (TODO).
-            let lsn = self.sync_safekeepers(auth_token, self.pg_version)?;
-            if lsn == Lsn(0) {
-                None
-            } else {
-                Some(lsn)
+        let backup_lsn = match &self.mode {
+            ComputeMode::Primary => {
+                if !self.env.safekeepers.is_empty() {
+                    // LSN 0 means that this is a bootstrap and we need to download just the
+                    // latest data from the pageserver. That is a bit clumsy, but the whole bootstrap
+                    // procedure is evolving quite actively right now, so let's think about it again
+                    // when things are more stable (TODO).
+                    let lsn = self.sync_safekeepers(auth_token, self.pg_version)?;
+                    if lsn == Lsn(0) {
+                        None
+                    } else {
+                        Some(lsn)
+                    }
+                } else {
+                    None
+                }
+            }
+            ComputeMode::Static(lsn) => Some(*lsn),
+            ComputeMode::Replica => {
+                None // Take the latest snapshot available to start with
            }
-        } else {
-            None
        };

        self.do_basebackup(backup_lsn)?;
@@ -466,7 +523,7 @@ impl Endpoint {
        // 3. Load basebackup
        self.load_basebackup(auth_token)?;

-        if self.lsn.is_some() {
+        if self.mode != ComputeMode::Primary {
            File::create(self.pgdata().join("standby.signal"))?;
        }

--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -359,8 +359,8 @@ impl PageServerNode {
                .transpose()
                .context("Failed to parse 'trace_read_requests' as bool")?,
            eviction_policy: settings
-                .get("eviction_policy")
-                .map(|x| serde_json::from_str(x))
+                .remove("eviction_policy")
+                .map(serde_json::from_str)
                .transpose()
                .context("Failed to parse 'eviction_policy' json")?,
            min_resident_size_override: settings
--- a/control_plane/src/postgresql_conf.rs
+++ b/control_plane/src/postgresql_conf.rs
@@ -13,7 +13,7 @@ use std::io::BufRead;
 use std::str::FromStr;

 /// In-memory representation of a postgresql.conf file
-#[derive(Default)]
+#[derive(Default, Debug)]
 pub struct PostgresConf {
    lines: Vec<String>,
    hash: HashMap<String, String>,
--- a/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json
+++ b/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json
@@ -28,11 +28,6 @@
                "value": "replica",
                "vartype": "enum"
            },
-            {
-                "name": "hot_standby",
-                "value": "on",
-                "vartype": "bool"
-            },
            {
                "name": "wal_log_hints",
                "value": "on",
--- a/libs/compute_api/src/responses.rs
+++ b/libs/compute_api/src/responses.rs
@@ -14,6 +14,7 @@ pub struct GenericAPIError {
 #[derive(Serialize, Debug)]
 #[serde(rename_all = "snake_case")]
 pub struct ComputeStatusResponse {
+    pub start_time: DateTime<Utc>,
    pub tenant: Option<String>,
    pub timeline: Option<String>,
    pub status: ComputeStatus,
@@ -63,6 +64,7 @@ where
 /// Response of the /metrics.json API
 #[derive(Clone, Debug, Default, Serialize)]
 pub struct ComputeMetrics {
+    pub wait_for_spec_ms: u64,
    pub sync_safekeepers_ms: u64,
    pub basebackup_ms: u64,
    pub config_ms: u64,
@@ -75,4 +77,16 @@ pub struct ComputeMetrics {
 #[derive(Deserialize, Debug)]
 pub struct ControlPlaneSpecResponse {
    pub spec: Option<ComputeSpec>,
+    pub status: ControlPlaneComputeStatus,
+}
+
+#[derive(Deserialize, Clone, Copy, Debug, PartialEq, Eq)]
+#[serde(rename_all = "snake_case")]
+pub enum ControlPlaneComputeStatus {
+    /// Compute is known to control-plane, but it's not
+    /// yet attached to any timeline / endpoint.
+    Empty,
+    /// Compute is attached to some timeline / endpoint and
+    /// should be able to start with provided spec.
+    Attached,
 }
--- a/libs/postgres_backend/src/lib.rs
+++ b/libs/postgres_backend/src/lib.rs
@@ -50,11 +50,14 @@ impl QueryError {
    }
 }

+/// Returns true if the given error is a normal consequence of a network issue,
+/// or the client closing the connection. These errors can happen during normal
+/// operations, and don't indicate a bug in our code.
 pub fn is_expected_io_error(e: &io::Error) -> bool {
    use io::ErrorKind::*;
    matches!(
        e.kind(),
-        ConnectionRefused | ConnectionAborted | ConnectionReset | TimedOut
+        BrokenPipe | ConnectionRefused | ConnectionAborted | ConnectionReset | TimedOut
    )
 }

--- a/libs/postgres_ffi/src/lib.rs
+++ b/libs/postgres_ffi/src/lib.rs
@@ -95,10 +95,13 @@ pub fn generate_wal_segment(
    segno: u64,
    system_id: u64,
    pg_version: u32,
+    lsn: Lsn,
 ) -> Result<Bytes, SerializeError> {
+    assert_eq!(segno, lsn.segment_number(WAL_SEGMENT_SIZE));
+
    match pg_version {
-        14 => v14::xlog_utils::generate_wal_segment(segno, system_id),
-        15 => v15::xlog_utils::generate_wal_segment(segno, system_id),
+        14 => v14::xlog_utils::generate_wal_segment(segno, system_id, lsn),
+        15 => v15::xlog_utils::generate_wal_segment(segno, system_id, lsn),
        _ => Err(SerializeError::BadInput),
    }
 }
--- a/libs/postgres_ffi/src/pg_constants.rs
+++ b/libs/postgres_ffi/src/pg_constants.rs
@@ -195,6 +195,7 @@ pub const FIRST_NORMAL_OBJECT_ID: u32 = 16384;

 pub const XLOG_CHECKPOINT_SHUTDOWN: u8 = 0x00;
 pub const XLOG_CHECKPOINT_ONLINE: u8 = 0x10;
+pub const XLP_FIRST_IS_CONTRECORD: u16 = 0x0001;
 pub const XLP_LONG_HEADER: u16 = 0x0002;

 /* From fsm_internals.h */
--- a/libs/postgres_ffi/src/xlog_utils.rs
+++ b/libs/postgres_ffi/src/xlog_utils.rs
@@ -270,6 +270,11 @@ impl XLogPageHeaderData {
        use utils::bin_ser::LeSer;
        XLogPageHeaderData::des_from(&mut buf.reader())
    }
+
+    pub fn encode(&self) -> Result<Bytes, SerializeError> {
+        use utils::bin_ser::LeSer;
+        self.ser().map(|b| b.into())
+    }
 }

 impl XLogLongPageHeaderData {
@@ -328,22 +333,32 @@ impl CheckPoint {
    }
 }

-//
-// Generate new, empty WAL segment.
-// We need this segment to start compute node.
-//
-pub fn generate_wal_segment(segno: u64, system_id: u64) -> Result<Bytes, SerializeError> {
+/// Generate a new, empty WAL segment, with correct block headers at the first
+/// page of the segment and at the page that contains the given LSN.
+/// We need this segment to start a compute node.
+pub fn generate_wal_segment(segno: u64, system_id: u64, lsn: Lsn) -> Result<Bytes, SerializeError> {
    let mut seg_buf = BytesMut::with_capacity(WAL_SEGMENT_SIZE);

    let pageaddr = XLogSegNoOffsetToRecPtr(segno, 0, WAL_SEGMENT_SIZE);
+
+    let page_off = lsn.block_offset();
+    let seg_off = lsn.segment_offset(WAL_SEGMENT_SIZE);
+
+    let first_page_only = seg_off < XLOG_BLCKSZ;
+    let (shdr_rem_len, infoflags) = if first_page_only {
+        (seg_off, pg_constants::XLP_FIRST_IS_CONTRECORD)
+    } else {
+        (0, 0)
+    };
+
    let hdr = XLogLongPageHeaderData {
        std: {
            XLogPageHeaderData {
                xlp_magic: XLOG_PAGE_MAGIC as u16,
-                xlp_info: pg_constants::XLP_LONG_HEADER,
+                xlp_info: pg_constants::XLP_LONG_HEADER | infoflags,
                xlp_tli: PG_TLI,
                xlp_pageaddr: pageaddr,
-                xlp_rem_len: 0,
+                xlp_rem_len: shdr_rem_len as u32,
                ..Default::default() // Put 0 in padding fields.
            }
        },
@@ -357,6 +372,33 @@ pub fn generate_wal_segment(segno: u64, system_id: u64) -> Result<Bytes, Seriali

    //zero out the rest of the file
    seg_buf.resize(WAL_SEGMENT_SIZE, 0);
+
+    if !first_page_only {
+        let block_offset = lsn.page_offset_in_segment(WAL_SEGMENT_SIZE) as usize;
+        let header = XLogPageHeaderData {
+            xlp_magic: XLOG_PAGE_MAGIC as u16,
+            xlp_info: if page_off >= pg_constants::SIZE_OF_PAGE_HEADER as u64 {
+                pg_constants::XLP_FIRST_IS_CONTRECORD
+            } else {
+                0
+            },
+            xlp_tli: PG_TLI,
+            xlp_pageaddr: lsn.page_lsn().0,
+            xlp_rem_len: if page_off >= pg_constants::SIZE_OF_PAGE_HEADER as u64 {
+                page_off as u32
+            } else {
+                0u32
+            },
+            ..Default::default() // Put 0 in padding fields.
+        };
+        let hdr_bytes = header.encode()?;
+
+        debug_assert!(seg_buf.len() > block_offset + hdr_bytes.len());
+        debug_assert_ne!(block_offset, 0);
+
+        seg_buf[block_offset..block_offset + hdr_bytes.len()].copy_from_slice(&hdr_bytes[..]);
+    }
+
    Ok(seg_buf.freeze())
 }

--- a/libs/postgres_ffi/wal_craft/src/lib.rs
+++ b/libs/postgres_ffi/wal_craft/src/lib.rs
@@ -1,15 +1,13 @@
-use anyhow::*;
-use core::time::Duration;
+use anyhow::{bail, ensure};
 use log::*;
 use postgres::types::PgLsn;
 use postgres::Client;
 use postgres_ffi::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ};
 use postgres_ffi::{XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD};
 use std::cmp::Ordering;
-use std::fs;
 use std::path::{Path, PathBuf};
-use std::process::{Command, Stdio};
-use std::time::Instant;
+use std::process::Command;
+use std::time::{Duration, Instant};
 use tempfile::{tempdir, TempDir};

 #[derive(Debug, Clone, PartialEq, Eq)]
@@ -56,7 +54,7 @@ impl Conf {
        self.datadir.join("pg_wal")
    }

-    fn new_pg_command(&self, command: impl AsRef<Path>) -> Result<Command> {
+    fn new_pg_command(&self, command: impl AsRef<Path>) -> anyhow::Result<Command> {
        let path = self.pg_bin_dir()?.join(command);
        ensure!(path.exists(), "Command {:?} does not exist", path);
        let mut cmd = Command::new(path);
@@ -66,7 +64,7 @@ impl Conf {
        Ok(cmd)
    }

-    pub fn initdb(&self) -> Result<()> {
+    pub fn initdb(&self) -> anyhow::Result<()> {
        if let Some(parent) = self.datadir.parent() {
            info!("Pre-creating parent directory {:?}", parent);
            // Tests may be run concurrently and there may be a race to create `test_output/`.
@@ -80,7 +78,7 @@ impl Conf {
        let output = self
            .new_pg_command("initdb")?
            .arg("-D")
-            .arg(self.datadir.as_os_str())
+            .arg(&self.datadir)
            .args(["-U", "postgres", "--no-instructions", "--no-sync"])
            .output()?;
        debug!("initdb output: {:?}", output);
@@ -93,26 +91,18 @@ impl Conf {
        Ok(())
    }

-    pub fn start_server(&self) -> Result<PostgresServer> {
+    pub fn start_server(&self) -> anyhow::Result<PostgresServer> {
        info!("Starting Postgres server in {:?}", self.datadir);
-        let log_file = fs::File::create(self.datadir.join("pg.log")).with_context(|| {
-            format!(
-                "Failed to create pg.log file in directory {}",
-                self.datadir.display()
-            )
-        })?;
        let unix_socket_dir = tempdir()?; // We need a directory with a short name for Unix socket (up to 108 symbols)
        let unix_socket_dir_path = unix_socket_dir.path().to_owned();
        let server_process = self
            .new_pg_command("postgres")?
            .args(["-c", "listen_addresses="])
            .arg("-k")
-            .arg(unix_socket_dir_path.as_os_str())
+            .arg(&unix_socket_dir_path)
            .arg("-D")
-            .arg(self.datadir.as_os_str())
-            .args(["-c", "logging_collector=on"]) // stderr will mess up with tests output
+            .arg(&self.datadir)
            .args(REQUIRED_POSTGRES_CONFIG.iter().flat_map(|cfg| ["-c", cfg]))
-            .stderr(Stdio::from(log_file))
            .spawn()?;
        let server = PostgresServer {
            process: server_process,
@@ -121,7 +111,7 @@ impl Conf {
                let mut c = postgres::Config::new();
                c.host_path(&unix_socket_dir_path);
                c.user("postgres");
-                c.connect_timeout(Duration::from_millis(1000));
+                c.connect_timeout(Duration::from_millis(10000));
                c
            },
        };
@@ -132,7 +122,7 @@ impl Conf {
        &self,
        first_segment_name: &str,
        last_segment_name: &str,
-    ) -> Result<std::process::Output> {
+    ) -> anyhow::Result<std::process::Output> {
        let first_segment_file = self.datadir.join(first_segment_name);
        let last_segment_file = self.datadir.join(last_segment_name);
        info!(
@@ -142,10 +132,7 @@ impl Conf {
        );
        let output = self
            .new_pg_command("pg_waldump")?
-            .args([
-                &first_segment_file.as_os_str(),
-                &last_segment_file.as_os_str(),
-            ])
+            .args([&first_segment_file, &last_segment_file])
            .output()?;
        debug!("waldump output: {:?}", output);
        Ok(output)
@@ -153,10 +140,9 @@ impl Conf {
 }

 impl PostgresServer {
-    pub fn connect_with_timeout(&self) -> Result<Client> {
+    pub fn connect_with_timeout(&self) -> anyhow::Result<Client> {
        let retry_until = Instant::now() + *self.client_config.get_connect_timeout().unwrap();
        while Instant::now() < retry_until {
-            use std::result::Result::Ok;
            if let Ok(client) = self.client_config.connect(postgres::NoTls) {
                return Ok(client);
            }
@@ -173,7 +159,6 @@ impl PostgresServer {

 impl Drop for PostgresServer {
    fn drop(&mut self) {
-        use std::result::Result::Ok;
        match self.process.try_wait() {
            Ok(Some(_)) => return,
            Ok(None) => {
@@ -188,12 +173,12 @@ impl Drop for PostgresServer {
 }

 pub trait PostgresClientExt: postgres::GenericClient {
-    fn pg_current_wal_insert_lsn(&mut self) -> Result<PgLsn> {
+    fn pg_current_wal_insert_lsn(&mut self) -> anyhow::Result<PgLsn> {
        Ok(self
            .query_one("SELECT pg_current_wal_insert_lsn()", &[])?
            .get(0))
    }
-    fn pg_current_wal_flush_lsn(&mut self) -> Result<PgLsn> {
+    fn pg_current_wal_flush_lsn(&mut self) -> anyhow::Result<PgLsn> {
        Ok(self
            .query_one("SELECT pg_current_wal_flush_lsn()", &[])?
            .get(0))
@@ -202,7 +187,7 @@ pub trait PostgresClientExt: postgres::GenericClient {

 impl<C: postgres::GenericClient> PostgresClientExt for C {}

-pub fn ensure_server_config(client: &mut impl postgres::GenericClient) -> Result<()> {
+pub fn ensure_server_config(client: &mut impl postgres::GenericClient) -> anyhow::Result<()> {
    client.execute("create extension if not exists neon_test_utils", &[])?;

    let wal_keep_size: String = client.query_one("SHOW wal_keep_size", &[])?.get(0);
@@ -236,13 +221,13 @@ pub trait Crafter {
    /// * A vector of some valid "interesting" intermediate LSNs which one may start reading from.
    ///   May include or exclude Lsn(0) and the end-of-wal.
    /// * The expected end-of-wal LSN.
-    fn craft(client: &mut impl postgres::GenericClient) -> Result<(Vec<PgLsn>, PgLsn)>;
+    fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec<PgLsn>, PgLsn)>;
 }

 fn craft_internal<C: postgres::GenericClient>(
    client: &mut C,
-    f: impl Fn(&mut C, PgLsn) -> Result<(Vec<PgLsn>, Option<PgLsn>)>,
-) -> Result<(Vec<PgLsn>, PgLsn)> {
+    f: impl Fn(&mut C, PgLsn) -> anyhow::Result<(Vec<PgLsn>, Option<PgLsn>)>,
+) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
    ensure_server_config(client)?;

    let initial_lsn = client.pg_current_wal_insert_lsn()?;
@@ -274,7 +259,7 @@ fn craft_internal<C: postgres::GenericClient>(
 pub struct Simple;
 impl Crafter for Simple {
    const NAME: &'static str = "simple";
-    fn craft(client: &mut impl postgres::GenericClient) -> Result<(Vec<PgLsn>, PgLsn)> {
+    fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
        craft_internal(client, |client, _| {
            client.execute("CREATE table t(x int)", &[])?;
            Ok((Vec::new(), None))
@@ -285,7 +270,7 @@ impl Crafter for Simple {
 pub struct LastWalRecordXlogSwitch;
 impl Crafter for LastWalRecordXlogSwitch {
    const NAME: &'static str = "last_wal_record_xlog_switch";
-    fn craft(client: &mut impl postgres::GenericClient) -> Result<(Vec<PgLsn>, PgLsn)> {
+    fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
        // Do not use generate_internal because here we end up with flush_lsn exactly on
        // the segment boundary and insert_lsn after the initial page header, which is unusual.
        ensure_server_config(client)?;
@@ -307,7 +292,7 @@ impl Crafter for LastWalRecordXlogSwitch {
 pub struct LastWalRecordXlogSwitchEndsOnPageBoundary;
 impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary {
    const NAME: &'static str = "last_wal_record_xlog_switch_ends_on_page_boundary";
-    fn craft(client: &mut impl postgres::GenericClient) -> Result<(Vec<PgLsn>, PgLsn)> {
+    fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
        // Do not use generate_internal because here we end up with flush_lsn exactly on
        // the segment boundary and insert_lsn after the initial page header, which is unusual.
        ensure_server_config(client)?;
@@ -374,7 +359,7 @@ impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary {
 fn craft_single_logical_message(
    client: &mut impl postgres::GenericClient,
    transactional: bool,
-) -> Result<(Vec<PgLsn>, PgLsn)> {
+) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
    craft_internal(client, |client, initial_lsn| {
        ensure!(
            initial_lsn < PgLsn::from(0x0200_0000 - 1024 * 1024),
@@ -416,7 +401,7 @@ fn craft_single_logical_message(
 pub struct WalRecordCrossingSegmentFollowedBySmallOne;
 impl Crafter for WalRecordCrossingSegmentFollowedBySmallOne {
    const NAME: &'static str = "wal_record_crossing_segment_followed_by_small_one";
-    fn craft(client: &mut impl postgres::GenericClient) -> Result<(Vec<PgLsn>, PgLsn)> {
+    fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
        craft_single_logical_message(client, true)
    }
 }
@@ -424,7 +409,7 @@ impl Crafter for WalRecordCrossingSegmentFollowedBySmallOne {
 pub struct LastWalRecordCrossingSegment;
 impl Crafter for LastWalRecordCrossingSegment {
    const NAME: &'static str = "last_wal_record_crossing_segment";
-    fn craft(client: &mut impl postgres::GenericClient) -> Result<(Vec<PgLsn>, PgLsn)> {
+    fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
        craft_single_logical_message(client, false)
    }
 }
--- a/libs/pq_proto/Cargo.toml
+++ b/libs/pq_proto/Cargo.toml
@@ -10,7 +10,6 @@ byteorder.workspace = true
 pin-project-lite.workspace = true
 postgres-protocol.workspace = true
 rand.workspace = true
-serde.workspace = true
 tokio.workspace = true
 tracing.workspace = true
 thiserror.workspace = true
--- a/libs/pq_proto/src/lib.rs
+++ b/libs/pq_proto/src/lib.rs
@@ -6,15 +6,10 @@ pub mod framed;

 use byteorder::{BigEndian, ReadBytesExt};
 use bytes::{Buf, BufMut, Bytes, BytesMut};
-use postgres_protocol::PG_EPOCH;
-use serde::{Deserialize, Serialize};
-use std::{
-    borrow::Cow,
-    collections::HashMap,
-    fmt, io, str,
-    time::{Duration, SystemTime},
-};
-use tracing::{trace, warn};
+use std::{borrow::Cow, collections::HashMap, fmt, io, str};
+
+// re-export for use in utils pageserver_feedback.rs
+pub use postgres_protocol::PG_EPOCH;

 pub type Oid = u32;
 pub type SystemId = u64;
@@ -664,7 +659,7 @@ fn write_cstr(s: impl AsRef<[u8]>, buf: &mut BytesMut) -> Result<(), ProtocolErr
 }

 /// Read cstring from buf, advancing it.
-fn read_cstr(buf: &mut Bytes) -> Result<Bytes, ProtocolError> {
+pub fn read_cstr(buf: &mut Bytes) -> Result<Bytes, ProtocolError> {
    let pos = buf
        .iter()
        .position(|x| *x == 0)
@@ -939,175 +934,10 @@ impl<'a> BeMessage<'a> {
    }
 }

-/// Feedback pageserver sends to safekeeper and safekeeper resends to compute.
-/// Serialized in custom flexible key/value format. In replication protocol, it
-/// is marked with NEON_STATUS_UPDATE_TAG_BYTE to differentiate from postgres
-/// Standby status update / Hot standby feedback messages.
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
-pub struct PageserverFeedback {
-    /// Last known size of the timeline. Used to enforce timeline size limit.
-    pub current_timeline_size: u64,
-    /// LSN last received and ingested by the pageserver.
-    pub last_received_lsn: u64,
-    /// LSN up to which data is persisted by the pageserver to its local disc.
-    pub disk_consistent_lsn: u64,
-    /// LSN up to which data is persisted by the pageserver on s3; safekeepers
-    /// consider WAL before it can be removed.
-    pub remote_consistent_lsn: u64,
-    pub replytime: SystemTime,
-}
-
-// NOTE: Do not forget to increment this number when adding new fields to PageserverFeedback.
-// Do not remove previously available fields because this might be backwards incompatible.
-pub const PAGESERVER_FEEDBACK_FIELDS_NUMBER: u8 = 5;
-
-impl PageserverFeedback {
-    pub fn empty() -> PageserverFeedback {
-        PageserverFeedback {
-            current_timeline_size: 0,
-            last_received_lsn: 0,
-            remote_consistent_lsn: 0,
-            disk_consistent_lsn: 0,
-            replytime: SystemTime::now(),
-        }
-    }
-
-    // Serialize PageserverFeedback using custom format
-    // to support protocol extensibility.
-    //
-    // Following layout is used:
-    // char - number of key-value pairs that follow.
-    //
-    // key-value pairs:
-    // null-terminated string - key,
-    // uint32 - value length in bytes
-    // value itself
-    //
-    // TODO: change serialized fields names once all computes migrate to rename.
-    pub fn serialize(&self, buf: &mut BytesMut) {
-        buf.put_u8(PAGESERVER_FEEDBACK_FIELDS_NUMBER); // # of keys
-        buf.put_slice(b"current_timeline_size\0");
-        buf.put_i32(8);
-        buf.put_u64(self.current_timeline_size);
-
-        buf.put_slice(b"ps_writelsn\0");
-        buf.put_i32(8);
-        buf.put_u64(self.last_received_lsn);
-        buf.put_slice(b"ps_flushlsn\0");
-        buf.put_i32(8);
-        buf.put_u64(self.disk_consistent_lsn);
-        buf.put_slice(b"ps_applylsn\0");
-        buf.put_i32(8);
-        buf.put_u64(self.remote_consistent_lsn);
-
-        let timestamp = self
-            .replytime
-            .duration_since(*PG_EPOCH)
-            .expect("failed to serialize pg_replytime earlier than PG_EPOCH")
-            .as_micros() as i64;
-
-        buf.put_slice(b"ps_replytime\0");
-        buf.put_i32(8);
-        buf.put_i64(timestamp);
-    }
-
-    // Deserialize PageserverFeedback message
-    // TODO: change serialized fields names once all computes migrate to rename.
-    pub fn parse(mut buf: Bytes) -> PageserverFeedback {
-        let mut rf = PageserverFeedback::empty();
-        let nfields = buf.get_u8();
-        for _ in 0..nfields {
-            let key = read_cstr(&mut buf).unwrap();
-            match key.as_ref() {
-                b"current_timeline_size" => {
-                    let len = buf.get_i32();
-                    assert_eq!(len, 8);
-                    rf.current_timeline_size = buf.get_u64();
-                }
-                b"ps_writelsn" => {
-                    let len = buf.get_i32();
-                    assert_eq!(len, 8);
-                    rf.last_received_lsn = buf.get_u64();
-                }
-                b"ps_flushlsn" => {
-                    let len = buf.get_i32();
-                    assert_eq!(len, 8);
-                    rf.disk_consistent_lsn = buf.get_u64();
-                }
-                b"ps_applylsn" => {
-                    let len = buf.get_i32();
-                    assert_eq!(len, 8);
-                    rf.remote_consistent_lsn = buf.get_u64();
-                }
-                b"ps_replytime" => {
-                    let len = buf.get_i32();
-                    assert_eq!(len, 8);
-                    let raw_time = buf.get_i64();
-                    if raw_time > 0 {
-                        rf.replytime = *PG_EPOCH + Duration::from_micros(raw_time as u64);
-                    } else {
-                        rf.replytime = *PG_EPOCH - Duration::from_micros(-raw_time as u64);
-                    }
-                }
-                _ => {
-                    let len = buf.get_i32();
-                    warn!(
-                        "PageserverFeedback parse. unknown key {} of len {len}. Skip it.",
-                        String::from_utf8_lossy(key.as_ref())
-                    );
-                    buf.advance(len as usize);
-                }
-            }
-        }
-        trace!("PageserverFeedback parsed is {:?}", rf);
-        rf
-    }
-}
-
 #[cfg(test)]
 mod tests {
    use super::*;

-    #[test]
-    fn test_replication_feedback_serialization() {
-        let mut rf = PageserverFeedback::empty();
-        // Fill rf with some values
-        rf.current_timeline_size = 12345678;
-        // Set rounded time to be able to compare it with deserialized value,
-        // because it is rounded up to microseconds during serialization.
-        rf.replytime = *PG_EPOCH + Duration::from_secs(100_000_000);
-        let mut data = BytesMut::new();
-        rf.serialize(&mut data);
-
-        let rf_parsed = PageserverFeedback::parse(data.freeze());
-        assert_eq!(rf, rf_parsed);
-    }
-
-    #[test]
-    fn test_replication_feedback_unknown_key() {
-        let mut rf = PageserverFeedback::empty();
-        // Fill rf with some values
-        rf.current_timeline_size = 12345678;
-        // Set rounded time to be able to compare it with deserialized value,
-        // because it is rounded up to microseconds during serialization.
-        rf.replytime = *PG_EPOCH + Duration::from_secs(100_000_000);
-        let mut data = BytesMut::new();
-        rf.serialize(&mut data);
-
-        // Add an extra field to the buffer and adjust number of keys
-        if let Some(first) = data.first_mut() {
-            *first = PAGESERVER_FEEDBACK_FIELDS_NUMBER + 1;
-        }
-
-        data.put_slice(b"new_field_one\0");
-        data.put_i32(8);
-        data.put_u64(42);
-
-        // Parse serialized data and check that new field is not parsed
-        let rf_parsed = PageserverFeedback::parse(data.freeze());
-        assert_eq!(rf, rf_parsed);
-    }
-
    #[test]
    fn test_startup_message_params_options_escaped() {
        fn split_options(params: &StartupMessageParams) -> Vec<Cow<'_, str>> {
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -128,6 +128,15 @@ impl RemoteStorage for LocalFs {
        // We need this dance with sort of durable rename (without fsyncs)
        // to prevent partial uploads. This was really hit when pageserver shutdown
        // cancelled the upload and partial file was left on the fs
+        // NOTE: Because the temp file suffix is always the same, this operation is racy.
+        // Two concurrent operations can lead to the following sequence:
+        // T1: write(temp)
+        // T2: write(temp) -> overwrites the content
+        // T1: rename(temp, dst) -> succeeds
+        // T2: rename(temp, dst) -> fails, temp no longer exists
+        // This can be solved by supplying a unique temp suffix every time, but this situation
+        // is not normal in the first place; the error can help (and helped at least once)
+        // to discover bugs in upper-level synchronization.
        let temp_file_path =
            path_with_suffix_extension(&target_file_path, LOCAL_FS_TEMP_FILE_SUFFIX);
        let mut destination = io::BufWriter::new(
--- a/libs/remote_storage/tests/pagination_tests.rs
+++ b/libs/remote_storage/tests/pagination_tests.rs
@@ -99,7 +99,11 @@ struct S3WithTestBlobs {
 #[async_trait::async_trait]
 impl AsyncTestContext for MaybeEnabledS3 {
    async fn setup() -> Self {
-        utils::logging::init(utils::logging::LogFormat::Test).expect("logging init failed");
+        utils::logging::init(
+            utils::logging::LogFormat::Test,
+            utils::logging::TracingErrorLayerEnablement::Disabled,
+        )
+        .expect("logging init failed");
        if env::var(ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME).is_err() {
            info!(
                "`{}` env variable is not set, skipping the test",
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -11,6 +11,7 @@ async-trait.workspace = true
 anyhow.workspace = true
 bincode.workspace = true
 bytes.workspace = true
+chrono.workspace = true
 heapless.workspace = true
 hex = { workspace = true, features = ["serde"] }
 hyper = { workspace = true, features = ["full"] }
@@ -27,7 +28,8 @@ signal-hook.workspace = true
 thiserror.workspace = true
 tokio.workspace = true
 tracing.workspace = true
-tracing-subscriber = { workspace = true, features = ["json"] }
+tracing-error.workspace = true
+tracing-subscriber = { workspace = true, features = ["json", "registry"] }
 rand.workspace = true
 serde_with.workspace = true
 strum.workspace = true
@@ -35,6 +37,7 @@ strum_macros.workspace = true
 url.workspace = true
 uuid.workspace = true

+pq_proto.workspace = true
 metrics.workspace = true
 workspace_hack.workspace = true

--- a/libs/utils/src/http/endpoint.rs
+++ b/libs/utils/src/http/endpoint.rs
@@ -76,6 +76,7 @@ where

        let log_quietly = method == Method::GET;
        async move {
+            let cancellation_guard = RequestCancelled::warn_when_dropped_without_responding();
            if log_quietly {
                debug!("Handling request");
            } else {
@@ -87,7 +88,11 @@ where
            // Usage of the error handler also means that we expect only the `ApiError` errors to be raised in this call.
            //
            // Panics are not handled separately, there's a `tracing_panic_hook` from another module to do that globally.
-            match (self.0)(request).await {
+            let res = (self.0)(request).await;
+
+            cancellation_guard.disarm();
+
+            match res {
                Ok(response) => {
                    let response_status = response.status();
                    if log_quietly && response_status.is_success() {
@@ -105,6 +110,40 @@ where
    }
 }

+/// Drop guard that logs a WARN if dropped before [`RequestCancelled::disarm`] is called (unless panicking).
+struct RequestCancelled {
+    warn: Option<tracing::Span>,
+}
+
+impl RequestCancelled {
+    /// Create an armed guard that captures [`tracing::Span::current`] for the warning message.
+    fn warn_when_dropped_without_responding() -> Self {
+        RequestCancelled {
+            warn: Some(tracing::Span::current()),
+        }
+    }
+
+    /// Consume the guard, clearing the stored span so that dropping it logs nothing.
+    fn disarm(mut self) {
+        self.warn = None;
+    }
+}
+
+impl Drop for RequestCancelled {
+    fn drop(&mut self) {
+        if std::thread::panicking() {
+            // We are unwinding because of a panic; assume the drop is not due to cancellation.
+        } else if let Some(span) = self.warn.take() {
+            // The span has all of the info already, but the outer `.instrument(span)` has already
+            // been dropped, so we need to manually re-enter it for this message.
+            //
+            // This is what the instrument would do before polling, so it is fine.
+            let _g = span.entered();
+            warn!("request was dropped before completing");
+        }
+    }
+}
+
 async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
    SERVE_METRICS_COUNT.inc();

--- a/libs/utils/src/http/json.rs
+++ b/libs/utils/src/http/json.rs
@@ -1,9 +1,7 @@
-use std::fmt::Display;
-
 use anyhow::Context;
 use bytes::Buf;
 use hyper::{header, Body, Request, Response, StatusCode};
-use serde::{Deserialize, Serialize, Serializer};
+use serde::{Deserialize, Serialize};

 use super::error::ApiError;

@@ -33,12 +31,3 @@ pub fn json_response<T: Serialize>(
        .map_err(|e| ApiError::InternalServerError(e.into()))?;
    Ok(response)
 }
-
-/// Serialize through Display trait.
-pub fn display_serialize<S, F>(z: &F, s: S) -> Result<S::Ok, S::Error>
-where
-    S: Serializer,
-    F: Display,
-{
-    s.serialize_str(&format!("{}", z))
-}
--- a/libs/utils/src/id.rs
+++ b/libs/utils/src/id.rs
@@ -265,6 +265,26 @@ impl fmt::Display for TenantTimelineId {
    }
 }

+impl FromStr for TenantTimelineId {
+    type Err = anyhow::Error;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        let mut parts = s.split('/');
+        let tenant_id = parts
+            .next()
+            .ok_or_else(|| anyhow::anyhow!("TenantTimelineId must contain tenant_id"))?
+            .parse()?;
+        let timeline_id = parts
+            .next()
+            .ok_or_else(|| anyhow::anyhow!("TenantTimelineId must contain timeline_id"))?
+            .parse()?;
+        if parts.next().is_some() {
+            anyhow::bail!("TenantTimelineId must contain only tenant_id and timeline_id");
+        }
+        Ok(TenantTimelineId::new(tenant_id, timeline_id))
+    }
+}
+
 // Unique ID of a storage node (safekeeper or pageserver). Supposed to be issued
 // by the console.
 #[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Hash, Debug, Serialize, Deserialize)]
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -54,6 +54,10 @@ pub mod measured_stream;
 pub mod serde_percent;
 pub mod serde_regex;

+pub mod pageserver_feedback;
+
+pub mod tracing_span_assert;
+
 /// use with fail::cfg("$name", "return(2000)")
 #[macro_export]
 macro_rules! failpoint_sleep_millis_async {
--- a/libs/utils/src/logging.rs
+++ b/libs/utils/src/logging.rs
@@ -1,6 +1,7 @@
 use std::str::FromStr;

 use anyhow::Context;
+use once_cell::sync::Lazy;
 use strum_macros::{EnumString, EnumVariantNames};

 #[derive(EnumString, EnumVariantNames, Eq, PartialEq, Debug, Clone, Copy)]
@@ -23,24 +24,81 @@ impl LogFormat {
    }
 }

-pub fn init(log_format: LogFormat) -> anyhow::Result<()> {
-    let default_filter_str = "info";
+static TRACING_EVENT_COUNT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
+    metrics::register_int_counter_vec!(
+        "libmetrics_tracing_event_count",
+        "Number of tracing events, by level",
+        &["level"]
+    )
+    .expect("failed to define metric")
+});

+struct TracingEventCountLayer(&'static metrics::IntCounterVec);
+
+impl<S> tracing_subscriber::layer::Layer<S> for TracingEventCountLayer
+where
+    S: tracing::Subscriber,
+{
+    fn on_event(
+        &self,
+        event: &tracing::Event<'_>,
+        _ctx: tracing_subscriber::layer::Context<'_, S>,
+    ) {
+        let level = event.metadata().level();
+        let level = match *level {
+            tracing::Level::ERROR => "error",
+            tracing::Level::WARN => "warn",
+            tracing::Level::INFO => "info",
+            tracing::Level::DEBUG => "debug",
+            tracing::Level::TRACE => "trace",
+        };
+        self.0.with_label_values(&[level]).inc();
+    }
+}
+
+/// Whether to add the `tracing_error` crate's `ErrorLayer`
+/// to the global tracing subscriber.
+///
+pub enum TracingErrorLayerEnablement {
+    /// Do not add the `ErrorLayer`.
+    Disabled,
+    /// Add the `ErrorLayer` with the filter specified by RUST_LOG, defaulting to `info` if `RUST_LOG` is unset.
+    EnableWithRustLogFilter,
+}
+
+pub fn init(
+    log_format: LogFormat,
+    tracing_error_layer_enablement: TracingErrorLayerEnablement,
+) -> anyhow::Result<()> {
    // We fall back to printing all spans at info-level or above if
    // the RUST_LOG environment variable is not set.
-    let env_filter = tracing_subscriber::EnvFilter::try_from_default_env()
-        .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new(default_filter_str));
+    let rust_log_env_filter = || {
+        tracing_subscriber::EnvFilter::try_from_default_env()
+            .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new("info"))
+    };

-    let base_logger = tracing_subscriber::fmt()
-        .with_env_filter(env_filter)
-        .with_target(false)
-        .with_ansi(atty::is(atty::Stream::Stdout))
-        .with_writer(std::io::stdout);
-
-    match log_format {
-        LogFormat::Json => base_logger.json().init(),
-        LogFormat::Plain => base_logger.init(),
-        LogFormat::Test => base_logger.with_test_writer().init(),
+    // NB: the order of the with() calls does not matter.
+    // See https://docs.rs/tracing-subscriber/0.3.16/tracing_subscriber/layer/index.html#per-layer-filtering
+    use tracing_subscriber::prelude::*;
+    let r = tracing_subscriber::registry();
+    let r = r.with({
+        let log_layer = tracing_subscriber::fmt::layer()
+            .with_target(false)
+            .with_ansi(atty::is(atty::Stream::Stdout))
+            .with_writer(std::io::stdout);
+        let log_layer = match log_format {
+            LogFormat::Json => log_layer.json().boxed(),
+            LogFormat::Plain => log_layer.boxed(),
+            LogFormat::Test => log_layer.with_test_writer().boxed(),
+        };
+        log_layer.with_filter(rust_log_env_filter())
+    });
+    let r = r.with(TracingEventCountLayer(&TRACING_EVENT_COUNT).with_filter(rust_log_env_filter()));
+    match tracing_error_layer_enablement {
+        TracingErrorLayerEnablement::EnableWithRustLogFilter => r
+            .with(tracing_error::ErrorLayer::default().with_filter(rust_log_env_filter()))
+            .init(),
+        TracingErrorLayerEnablement::Disabled => r.init(),
    }

    Ok(())
@@ -157,3 +215,33 @@ impl std::fmt::Debug for PrettyLocation<'_, '_> {
        <Self as std::fmt::Display>::fmt(self, f)
    }
 }
+
+#[cfg(test)]
+mod tests {
+    use metrics::{core::Opts, IntCounterVec};
+
+    use super::TracingEventCountLayer;
+
+    #[test]
+    fn tracing_event_count_metric() {
+        let counter_vec =
+            IntCounterVec::new(Opts::new("testmetric", "testhelp"), &["level"]).unwrap();
+        let counter_vec = Box::leak(Box::new(counter_vec)); // make it 'static
+        let layer = TracingEventCountLayer(counter_vec);
+        use tracing_subscriber::prelude::*;
+
+        tracing::subscriber::with_default(tracing_subscriber::registry().with(layer), || {
+            tracing::trace!("foo");
+            tracing::debug!("foo");
+            tracing::info!("foo");
+            tracing::warn!("foo");
+            tracing::error!("foo");
+        });
+
+        assert_eq!(counter_vec.with_label_values(&["trace"]).get(), 1);
+        assert_eq!(counter_vec.with_label_values(&["debug"]).get(), 1);
+        assert_eq!(counter_vec.with_label_values(&["info"]).get(), 1);
+        assert_eq!(counter_vec.with_label_values(&["warn"]).get(), 1);
+        assert_eq!(counter_vec.with_label_values(&["error"]).get(), 1);
+    }
+}
--- a/libs/utils/src/lsn.rs
+++ b/libs/utils/src/lsn.rs
@@ -62,29 +62,48 @@ impl Lsn {
    }

    /// Compute the offset into a segment
+    #[inline]
    pub fn segment_offset(self, seg_sz: usize) -> usize {
        (self.0 % seg_sz as u64) as usize
    }

    /// Compute LSN of the segment start.
+    #[inline]
    pub fn segment_lsn(self, seg_sz: usize) -> Lsn {
        Lsn(self.0 - (self.0 % seg_sz as u64))
    }

    /// Compute the segment number
+    #[inline]
    pub fn segment_number(self, seg_sz: usize) -> u64 {
        self.0 / seg_sz as u64
    }

    /// Compute the offset into a block
+    #[inline]
    pub fn block_offset(self) -> u64 {
        const BLCKSZ: u64 = XLOG_BLCKSZ as u64;
        self.0 % BLCKSZ
    }

+    /// Compute the LSN of the start of the block (page) that contains this
+    /// Lsn, i.e. this Lsn rounded down to a block boundary
+    #[inline]
+    pub fn page_lsn(self) -> Lsn {
+        Lsn(self.0 - self.block_offset())
+    }
+
+    /// Compute the byte offset, counted from the start of this Lsn's segment,
+    /// of the block (page) that contains this Lsn
+    #[inline]
+    pub fn page_offset_in_segment(self, seg_sz: usize) -> u64 {
+        (self.0 - self.block_offset()) - self.segment_lsn(seg_sz).0
+    }
+
    /// Compute the bytes remaining in this block
    ///
    /// If the LSN is already at the block boundary, it will return `XLOG_BLCKSZ`.
+    #[inline]
    pub fn remaining_in_block(self) -> u64 {
        const BLCKSZ: u64 = XLOG_BLCKSZ as u64;
        BLCKSZ - (self.0 % BLCKSZ)
--- a/libs/utils/src/pageserver_feedback.rs
+++ b/libs/utils/src/pageserver_feedback.rs
@@ -0,0 +1,214 @@
+use std::time::{Duration, SystemTime};
+
+use bytes::{Buf, BufMut, Bytes, BytesMut};
+use pq_proto::{read_cstr, PG_EPOCH};
+use serde::{Deserialize, Serialize};
+use serde_with::{serde_as, DisplayFromStr};
+use tracing::{trace, warn};
+
+use crate::lsn::Lsn;
+
+/// Feedback pageserver sends to safekeeper and safekeeper resends to compute.
+/// Serialized in custom flexible key/value format. In replication protocol, it
+/// is marked with NEON_STATUS_UPDATE_TAG_BYTE to differentiate from postgres
+/// Standby status update / Hot standby feedback messages.
+///
+/// serde Serialize is used only for human readable dump to json (e.g. in
+/// safekeepers debug_dump).
+#[serde_as]
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
+pub struct PageserverFeedback {
+    /// Last known size of the timeline. Used to enforce timeline size limit.
+    pub current_timeline_size: u64,
+    /// LSN last received and ingested by the pageserver. Controls backpressure.
+    #[serde_as(as = "DisplayFromStr")]
+    pub last_received_lsn: Lsn,
+    /// LSN up to which data is persisted by the pageserver to its local disc.
+    /// Controls backpressure.
+    #[serde_as(as = "DisplayFromStr")]
+    pub disk_consistent_lsn: Lsn,
+    /// LSN up to which data is persisted by the pageserver on s3; safekeepers
+    /// consider WAL before it can be removed.
+    #[serde_as(as = "DisplayFromStr")]
+    pub remote_consistent_lsn: Lsn,
+    // Serialize with RFC3339 format.
+    #[serde(with = "serde_systemtime")]
+    pub replytime: SystemTime,
+}
+
+// NOTE: Do not forget to increment this number when adding new fields to PageserverFeedback.
+// Do not remove previously available fields because this might be backwards incompatible.
+pub const PAGESERVER_FEEDBACK_FIELDS_NUMBER: u8 = 5;
+
+impl PageserverFeedback {
+    pub fn empty() -> PageserverFeedback {
+        PageserverFeedback {
+            current_timeline_size: 0,
+            last_received_lsn: Lsn::INVALID,
+            remote_consistent_lsn: Lsn::INVALID,
+            disk_consistent_lsn: Lsn::INVALID,
+            replytime: *PG_EPOCH,
+        }
+    }
+
+    // Serialize PageserverFeedback using custom format
+    // to support protocol extensibility.
+    //
+    // Following layout is used:
+    // char - number of key-value pairs that follow.
+    //
+    // key-value pairs:
+    // null-terminated string - key,
+    // uint32 - value length in bytes
+    // value itself
+    //
+    // TODO: change serialized fields names once all computes migrate to rename.
+    pub fn serialize(&self, buf: &mut BytesMut) {
+        buf.put_u8(PAGESERVER_FEEDBACK_FIELDS_NUMBER); // # of keys
+        buf.put_slice(b"current_timeline_size\0");
+        buf.put_i32(8);
+        buf.put_u64(self.current_timeline_size);
+
+        buf.put_slice(b"ps_writelsn\0");
+        buf.put_i32(8);
+        buf.put_u64(self.last_received_lsn.0);
+        buf.put_slice(b"ps_flushlsn\0");
+        buf.put_i32(8);
+        buf.put_u64(self.disk_consistent_lsn.0);
+        buf.put_slice(b"ps_applylsn\0");
+        buf.put_i32(8);
+        buf.put_u64(self.remote_consistent_lsn.0);
+
+        let timestamp = self
+            .replytime
+            .duration_since(*PG_EPOCH)
+            .expect("failed to serialize pg_replytime earlier than PG_EPOCH")
+            .as_micros() as i64;
+
+        buf.put_slice(b"ps_replytime\0");
+        buf.put_i32(8);
+        buf.put_i64(timestamp);
+    }
+
+    // Deserialize a PageserverFeedback message. Panics on malformed input (see unwrap/assert_eq).
+    // TODO: change serialized fields names once all computes migrate to rename.
+    pub fn parse(mut buf: Bytes) -> PageserverFeedback {
+        let mut rf = PageserverFeedback::empty();
+        let nfields = buf.get_u8();
+        for _ in 0..nfields {
+            let key = read_cstr(&mut buf).unwrap();
+            match key.as_ref() {
+                b"current_timeline_size" => {
+                    let len = buf.get_i32();
+                    assert_eq!(len, 8);
+                    rf.current_timeline_size = buf.get_u64();
+                }
+                b"ps_writelsn" => {
+                    let len = buf.get_i32();
+                    assert_eq!(len, 8);
+                    rf.last_received_lsn = Lsn(buf.get_u64());
+                }
+                b"ps_flushlsn" => {
+                    let len = buf.get_i32();
+                    assert_eq!(len, 8);
+                    rf.disk_consistent_lsn = Lsn(buf.get_u64());
+                }
+                b"ps_applylsn" => {
+                    let len = buf.get_i32();
+                    assert_eq!(len, 8);
+                    rf.remote_consistent_lsn = Lsn(buf.get_u64());
+                }
+                b"ps_replytime" => {
+                    let len = buf.get_i32();
+                    assert_eq!(len, 8);
+                    let raw_time = buf.get_i64();
+                    if raw_time > 0 {
+                        rf.replytime = *PG_EPOCH + Duration::from_micros(raw_time as u64);
+                    } else {
+                        rf.replytime = *PG_EPOCH - Duration::from_micros(raw_time.unsigned_abs());
+                    }
+                }
+                _ => {
+                    let len = buf.get_i32();
+                    warn!(
+                        "PageserverFeedback parse. unknown key {} of len {len}. Skip it.",
+                        String::from_utf8_lossy(key.as_ref())
+                    );
+                    buf.advance(len as usize);
+                }
+            }
+        }
+        trace!("PageserverFeedback parsed is {:?}", rf);
+        rf
+    }
+}
+
+mod serde_systemtime {
+    use std::time::SystemTime;
+
+    use chrono::{DateTime, Utc};
+    use serde::{Deserialize, Deserializer, Serializer};
+
+    pub fn serialize<S>(ts: &SystemTime, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: Serializer,
+    {
+        let chrono_dt: DateTime<Utc> = (*ts).into();
+        serializer.serialize_str(&chrono_dt.to_rfc3339())
+    }
+
+    pub fn deserialize<'de, D>(deserializer: D) -> Result<SystemTime, D::Error>
+    where
+        D: Deserializer<'de>,
+    {
+        let time: String = Deserialize::deserialize(deserializer)?;
+        Ok(DateTime::parse_from_rfc3339(&time)
+            .map_err(serde::de::Error::custom)?
+            .into())
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_replication_feedback_serialization() {
+        let mut rf = PageserverFeedback::empty();
+        // Fill rf with some values
+        rf.current_timeline_size = 12345678;
+        // Set rounded time to be able to compare it with deserialized value,
+        // because it is rounded up to microseconds during serialization.
+        rf.replytime = *PG_EPOCH + Duration::from_secs(100_000_000);
+        let mut data = BytesMut::new();
+        rf.serialize(&mut data);
+
+        let rf_parsed = PageserverFeedback::parse(data.freeze());
+        assert_eq!(rf, rf_parsed);
+    }
+
+    #[test]
+    fn test_replication_feedback_unknown_key() {
+        let mut rf = PageserverFeedback::empty();
+        // Fill rf with some values
+        rf.current_timeline_size = 12345678;
+        // Set rounded time to be able to compare it with deserialized value,
+        // because it is rounded up to microseconds during serialization.
+        rf.replytime = *PG_EPOCH + Duration::from_secs(100_000_000);
+        let mut data = BytesMut::new();
+        rf.serialize(&mut data);
+
+        // Add an extra field to the buffer and adjust number of keys
+        if let Some(first) = data.first_mut() {
+            *first = PAGESERVER_FEEDBACK_FIELDS_NUMBER + 1;
+        }
+
+        data.put_slice(b"new_field_one\0");
+        data.put_i32(8);
+        data.put_u64(42);
+
+        // Parse serialized data and check that new field is not parsed
+        let rf_parsed = PageserverFeedback::parse(data.freeze());
+        assert_eq!(rf, rf_parsed);
+    }
+}
--- a/libs/utils/src/tracing_span_assert.rs
+++ b/libs/utils/src/tracing_span_assert.rs
@@ -0,0 +1,287 @@
+//! Assert that the current [`tracing::Span`] has a given set of fields.
+//!
+//! # Usage
+//!
+//! ```
+//! use tracing_subscriber::prelude::*;
+//! let registry = tracing_subscriber::registry()
+//!    .with(tracing_error::ErrorLayer::default());
+//!
+//! // Register the registry as the global subscriber.
+//! // In this example, we'll only use it as a thread-local subscriber.
+//! let _guard = tracing::subscriber::set_default(registry);
+//!
+//! // Then, in the main code:
+//!
+//! let span = tracing::info_span!("TestSpan", test_id = 1);
+//! let _guard = span.enter();
+//!
+//! // ... down the call stack
+//!
+//! use utils::tracing_span_assert::{check_fields_present, MultiNameExtractor};
+//! let extractor = MultiNameExtractor::new("TestExtractor", ["test", "test_id"]);
+//! match check_fields_present([&extractor]) {
+//!    Ok(()) => {},
+//!    Err(missing) => {
+//!        panic!("Missing fields: {:?}", missing.into_iter().map(|f| f.name() ).collect::<Vec<_>>());
+//!    }
+//! }
+//! ```
+//!
+//! Recommended reading: <https://docs.rs/tracing-subscriber/0.3.16/tracing_subscriber/layer/index.html#per-layer-filtering>
+//!
+
+use std::{
+    collections::HashSet,
+    fmt::{self},
+    hash::{Hash, Hasher},
+};
+
+pub enum ExtractionResult {
+    Present,
+    Absent,
+}
+
+pub trait Extractor: Send + Sync + std::fmt::Debug {
+    fn name(&self) -> &str;
+    fn extract(&self, fields: &tracing::field::FieldSet) -> ExtractionResult;
+}
+
+#[derive(Debug)]
+pub struct MultiNameExtractor<const L: usize> {
+    name: &'static str,
+    field_names: [&'static str; L],
+}
+
+impl<const L: usize> MultiNameExtractor<L> {
+    pub fn new(name: &'static str, field_names: [&'static str; L]) -> MultiNameExtractor<L> {
+        MultiNameExtractor { name, field_names }
+    }
+}
+impl<const L: usize> Extractor for MultiNameExtractor<L> {
+    fn name(&self) -> &str {
+        self.name
+    }
+    fn extract(&self, fields: &tracing::field::FieldSet) -> ExtractionResult {
+        if fields.iter().any(|f| self.field_names.contains(&f.name())) {
+            ExtractionResult::Present
+        } else {
+            ExtractionResult::Absent
+        }
+    }
+}
+
+struct MemoryIdentity<'a>(&'a dyn Extractor);
+
+impl<'a> MemoryIdentity<'a> {
+    fn as_ptr(&self) -> *const () {
+        self.0 as *const _ as *const ()
+    }
+}
+impl<'a> PartialEq for MemoryIdentity<'a> {
+    fn eq(&self, other: &Self) -> bool {
+        self.as_ptr() == other.as_ptr()
+    }
+}
+impl<'a> Eq for MemoryIdentity<'a> {}
+impl<'a> Hash for MemoryIdentity<'a> {
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        self.as_ptr().hash(state);
+    }
+}
+impl<'a> fmt::Debug for MemoryIdentity<'a> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{:p}: {}", self.as_ptr(), self.0.name())
+    }
+}
+
+/// Check the captured span trace for the given fields; on failure, return the extractors that matched none.
+pub fn check_fields_present<const L: usize>(
+    must_be_present: [&dyn Extractor; L],
+) -> Result<(), Vec<&dyn Extractor>> {
+    let mut missing: HashSet<MemoryIdentity> =
+        HashSet::from_iter(must_be_present.into_iter().map(|r| MemoryIdentity(r)));
+    let trace = tracing_error::SpanTrace::capture();
+    trace.with_spans(|md, _formatted_fields| {
+        missing.retain(|extractor| match extractor.0.extract(md.fields()) {
+            ExtractionResult::Present => false,
+            ExtractionResult::Absent => true,
+        });
+        !missing.is_empty() // continue walking up until we've found all missing
+    });
+    if missing.is_empty() {
+        Ok(())
+    } else {
+        Err(missing.into_iter().map(|mi| mi.0).collect())
+    }
+}
+
+#[cfg(test)]
+mod tests {
+
+    use tracing_subscriber::prelude::*;
+
+    use super::*;
+
+    struct Setup {
+        _current_thread_subscriber_guard: tracing::subscriber::DefaultGuard,
+        tenant_extractor: MultiNameExtractor<2>,
+        timeline_extractor: MultiNameExtractor<2>,
+    }
+
+    fn setup_current_thread() -> Setup {
+        let tenant_extractor = MultiNameExtractor::new("TenantId", ["tenant_id", "tenant"]);
+        let timeline_extractor = MultiNameExtractor::new("TimelineId", ["timeline_id", "timeline"]);
+
+        let registry = tracing_subscriber::registry()
+            .with(tracing_subscriber::fmt::layer())
+            .with(tracing_error::ErrorLayer::default());
+
+        let guard = tracing::subscriber::set_default(registry);
+
+        Setup {
+            _current_thread_subscriber_guard: guard,
+            tenant_extractor,
+            timeline_extractor,
+        }
+    }
+
+    fn assert_missing(missing: Vec<&dyn Extractor>, expected: Vec<&dyn Extractor>) {
+        let missing: HashSet<MemoryIdentity> =
+            HashSet::from_iter(missing.into_iter().map(MemoryIdentity));
+        let expected: HashSet<MemoryIdentity> =
+            HashSet::from_iter(expected.into_iter().map(MemoryIdentity));
+        assert_eq!(missing, expected);
+    }
+
+    #[test]
+    fn positive_one_level() {
+        let setup = setup_current_thread();
+        let span = tracing::info_span!("root", tenant_id = "tenant-1", timeline_id = "timeline-1");
+        let _guard = span.enter();
+        check_fields_present([&setup.tenant_extractor, &setup.timeline_extractor]).unwrap();
+    }
+
+    #[test]
+    fn negative_one_level() {
+        let setup = setup_current_thread();
+        let span = tracing::info_span!("root", timeline_id = "timeline-1");
+        let _guard = span.enter();
+        let missing =
+            check_fields_present([&setup.tenant_extractor, &setup.timeline_extractor]).unwrap_err();
+        assert_missing(missing, vec![&setup.tenant_extractor]);
+    }
+
+    #[test]
+    fn positive_multiple_levels() {
+        let setup = setup_current_thread();
+
+        let span = tracing::info_span!("root");
+        let _guard = span.enter();
+
+        let span = tracing::info_span!("child", tenant_id = "tenant-1");
+        let _guard = span.enter();
+
+        let span = tracing::info_span!("grandchild", timeline_id = "timeline-1");
+        let _guard = span.enter();
+
+        check_fields_present([&setup.tenant_extractor, &setup.timeline_extractor]).unwrap();
+    }
+
+    #[test]
+    fn negative_multiple_levels() {
+        let setup = setup_current_thread();
+
+        let span = tracing::info_span!("root");
+        let _guard = span.enter();
+
+        let span = tracing::info_span!("child", timeline_id = "timeline-1");
+        let _guard = span.enter();
+
+        let missing = check_fields_present([&setup.tenant_extractor]).unwrap_err();
+        assert_missing(missing, vec![&setup.tenant_extractor]);
+    }
+
+    #[test]
+    fn positive_subset_one_level() {
+        let setup = setup_current_thread();
+        let span = tracing::info_span!("root", tenant_id = "tenant-1", timeline_id = "timeline-1");
+        let _guard = span.enter();
+        check_fields_present([&setup.tenant_extractor]).unwrap();
+    }
+
+    #[test]
+    fn positive_subset_multiple_levels() {
+        let setup = setup_current_thread();
+
+        let span = tracing::info_span!("root");
+        let _guard = span.enter();
+
+        let span = tracing::info_span!("child", tenant_id = "tenant-1");
+        let _guard = span.enter();
+
+        let span = tracing::info_span!("grandchild", timeline_id = "timeline-1");
+        let _guard = span.enter();
+
+        check_fields_present([&setup.tenant_extractor]).unwrap();
+    }
+
+    #[test]
+    fn negative_subset_one_level() {
+        let setup = setup_current_thread();
+        let span = tracing::info_span!("root", timeline_id = "timeline-1");
+        let _guard = span.enter();
+        let missing = check_fields_present([&setup.tenant_extractor]).unwrap_err();
+        assert_missing(missing, vec![&setup.tenant_extractor]);
+    }
+
+    #[test]
+    fn negative_subset_multiple_levels() {
+        let setup = setup_current_thread();
+
+        let span = tracing::info_span!("root");
+        let _guard = span.enter();
+
+        let span = tracing::info_span!("child", timeline_id = "timeline-1");
+        let _guard = span.enter();
+
+        let missing = check_fields_present([&setup.tenant_extractor]).unwrap_err();
+        assert_missing(missing, vec![&setup.tenant_extractor]);
+    }
+
+    #[test]
+    fn tracing_error_subscriber_not_set_up() {
+        // no setup
+
+        let span = tracing::info_span!("foo", e = "some value");
+        let _guard = span.enter();
+
+        let extractor = MultiNameExtractor::new("E", ["e"]);
+        let missing = check_fields_present([&extractor]).unwrap_err();
+        assert_missing(missing, vec![&extractor]);
+    }
+
+    #[test]
+    #[should_panic]
+    fn panics_if_tracing_error_subscriber_has_wrong_filter() {
+        let r = tracing_subscriber::registry().with({
+            tracing_error::ErrorLayer::default().with_filter(
+                tracing_subscriber::filter::dynamic_filter_fn(|md, _| {
+                    if md.is_span() && *md.level() == tracing::Level::INFO {
+                        return false;
+                    }
+                    true
+                }),
+            )
+        });
+
+        let _guard = tracing::subscriber::set_default(r);
+
+        let span = tracing::info_span!("foo", e = "some value");
+        let _guard = span.enter();
+
+        let extractor = MultiNameExtractor::new("E", ["e"]);
+        let missing = check_fields_present([&extractor]).unwrap_err();
+        assert_missing(missing, vec![&extractor]);
+    }
+}
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -52,6 +52,7 @@ sync_wrapper.workspace = true
 tokio-tar.workspace = true
 thiserror.workspace = true
 tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] }
+tokio-io-timeout.workspace = true
 tokio-postgres.workspace = true
 tokio-util.workspace = true
 toml_edit = { workspace = true, features = [ "serde" ] }
--- a/pageserver/benches/bench_layer_map.rs
+++ b/pageserver/benches/bench_layer_map.rs
@@ -13,7 +13,7 @@ use std::time::Instant;

 use utils::lsn::Lsn;

-use criterion::{criterion_group, criterion_main, Criterion};
+use criterion::{black_box, criterion_group, criterion_main, Criterion};

 fn build_layer_map(filename_dump: PathBuf) -> LayerMap<LayerDescriptor> {
    let mut layer_map = LayerMap::<LayerDescriptor>::default();
@@ -33,7 +33,7 @@ fn build_layer_map(filename_dump: PathBuf) -> LayerMap<LayerDescriptor> {
        min_lsn = min(min_lsn, lsn_range.start);
        max_lsn = max(max_lsn, Lsn(lsn_range.end.0 - 1));

-        updates.insert_historic(Arc::new(layer)).unwrap();
+        updates.insert_historic(Arc::new(layer));
    }

    println!("min: {min_lsn}, max: {max_lsn}");
@@ -114,7 +114,7 @@ fn bench_from_captest_env(c: &mut Criterion) {
    c.bench_function("captest_uniform_queries", |b| {
        b.iter(|| {
            for q in queries.clone().into_iter() {
-                layer_map.search(q.0, q.1);
+                black_box(layer_map.search(q.0, q.1));
            }
        });
    });
@@ -122,11 +122,11 @@ fn bench_from_captest_env(c: &mut Criterion) {
    // test with a key that corresponds to the RelDir entry. See pgdatadir_mapping.rs.
    c.bench_function("captest_rel_dir_query", |b| {
        b.iter(|| {
-            let result = layer_map.search(
+            let result = black_box(layer_map.search(
                Key::from_hex("000000067F00008000000000000000000001").unwrap(),
                // This LSN is higher than any of the LSNs in the tree
                Lsn::from_str("D0/80208AE1").unwrap(),
-            );
+            ));
            result.unwrap();
        });
    });
@@ -183,7 +183,7 @@ fn bench_from_real_project(c: &mut Criterion) {
    group.bench_function("uniform_queries", |b| {
        b.iter(|| {
            for q in queries.clone().into_iter() {
-                layer_map.search(q.0, q.1);
+                black_box(layer_map.search(q.0, q.1));
            }
        });
    });
@@ -215,7 +215,7 @@ fn bench_sequential(c: &mut Criterion) {
            is_incremental: false,
            short_id: format!("Layer {}", i),
        };
-        updates.insert_historic(Arc::new(layer)).unwrap();
+        updates.insert_historic(Arc::new(layer));
    }
    updates.flush();
    println!("Finished layer map init in {:?}", now.elapsed());
@@ -232,7 +232,7 @@ fn bench_sequential(c: &mut Criterion) {
    group.bench_function("uniform_queries", |b| {
        b.iter(|| {
            for q in queries.clone().into_iter() {
-                layer_map.search(q.0, q.1);
+                black_box(layer_map.search(q.0, q.1));
            }
        });
    });
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -463,9 +463,13 @@ where
        let wal_file_path = format!("pg_wal/{}", wal_file_name);
        let header = new_tar_header(&wal_file_path, WAL_SEGMENT_SIZE as u64)?;

-        let wal_seg =
-            postgres_ffi::generate_wal_segment(segno, system_identifier, self.timeline.pg_version)
-                .map_err(|e| anyhow!(e).context("Failed generating wal segment"))?;
+        let wal_seg = postgres_ffi::generate_wal_segment(
+            segno,
+            system_identifier,
+            self.timeline.pg_version,
+            self.lsn,
+        )
+        .map_err(|e| anyhow!(e).context("Failed generating wal segment"))?;
        ensure!(wal_seg.len() == WAL_SEGMENT_SIZE);
        self.ar.append(&header, &wal_seg[..]).await?;
        Ok(())
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -25,6 +25,7 @@ use pageserver::{
    virtual_file,
 };
 use postgres_backend::AuthType;
+use utils::logging::TracingErrorLayerEnablement;
 use utils::signals::ShutdownSignals;
 use utils::{
    auth::JwtAuth, logging, project_git_version, sentry_init::init_sentry, signals::Signal,
@@ -86,8 +87,19 @@ fn main() -> anyhow::Result<()> {
        }
    };

-    // Initialize logging, which must be initialized before the custom panic hook is installed.
-    logging::init(conf.log_format)?;
+    // Initialize logging.
+    //
+    // It must be initialized before the custom panic hook is installed below.
+    //
+    // Regarding tracing_error enablement: at this time, we only use the
+    // tracing_error crate to debug_assert that log spans contain tenant and timeline ids.
+    // See `debug_assert_current_span_has_tenant_and_timeline_id` in the timeline module
+    let tracing_error_layer_enablement = if cfg!(debug_assertions) {
+        TracingErrorLayerEnablement::EnableWithRustLogFilter
+    } else {
+        TracingErrorLayerEnablement::Disabled
+    };
+    logging::init(conf.log_format, tracing_error_layer_enablement)?;

    // mind the order required here: 1. logging, 2. panic_hook, 3. sentry.
    // disarming this hook on pageserver, because we never tear down tracing.
@@ -226,6 +238,7 @@ fn start_pageserver(
    );
    set_build_info_metric(GIT_VERSION);
    set_launch_timestamp_metric(launch_ts);
+    pageserver::preinitialize_metrics();

    // If any failpoints were set from FAILPOINTS environment variable,
    // print them to the log for debugging purposes
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -520,6 +520,43 @@ paths:
              schema:
                $ref: "#/components/schemas/Error"

+  /v1/tenant/{tenant_id}/synthetic_size:
+    parameters:
+      - name: tenant_id
+        in: path
+        required: true
+        schema:
+          type: string
+          format: hex
+    get:
+      description: |
+        Calculate tenant's synthetic size
+      responses:
+        "200":
+          description: Tenant's synthetic size
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/SyntheticSizeResponse"
+        "401":
+          description: Unauthorized Error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/UnauthorizedError"
+        "403":
+          description: Forbidden Error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ForbiddenError"
+        "500":
+          description: Generic operation error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/Error"
+
  /v1/tenant/{tenant_id}/size:
    parameters:
      - name: tenant_id
@@ -948,6 +985,84 @@ components:
        latest_gc_cutoff_lsn:
          type: string
          format: hex
+
+    SyntheticSizeResponse:
+      type: object
+      required:
+        - id
+        - size
+        - segment_sizes
+        - inputs
+      properties:
+        id:
+          type: string
+          format: hex
+        size:
+          type: integer
+        segment_sizes:
+          type: array
+          items:
+            $ref: "#/components/schemas/SegmentSize"
+        inputs:
+          type: object
+          properties:
+            segments:
+              type: array
+              items:
+                $ref: "#/components/schemas/SegmentData"
+            timeline_inputs:
+              type: array
+              items:
+                $ref: "#/components/schemas/TimelineInput"
+
+    SegmentSize:
+      type: object
+      required:
+        - method
+        - accum_size
+      properties:
+        method:
+          type: string
+        accum_size:
+          type: integer
+
+    SegmentData:
+      type: object
+      required:
+        - segment
+      properties:
+        segment:
+          type: object
+          required:
+            - lsn
+          properties:
+            parent:
+              type: integer
+            lsn:
+              type: integer
+            size:
+              type: integer
+            needed:
+              type: boolean
+        timeline_id:
+          type: string
+          format: hex
+        kind:
+          type: string
+
+    TimelineInput:
+      type: object
+      required:
+        - timeline_id
+      properties:
+        ancestor_id:
+          type: string
+        ancestor_lsn:
+          type: string
+        timeline_id:
+          type: string
+          format: hex
+
    Error:
      type: object
      required:
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -143,6 +143,7 @@ impl From<crate::tenant::DeleteTimelineError> for ApiError {
            HasChildren => ApiError::BadRequest(anyhow::anyhow!(
                "Cannot delete timeline which has child timelines"
            )),
+            StopUploadQueue(e) => ApiError::InternalServerError(e.into()),
            Other(e) => ApiError::InternalServerError(e),
        }
    }
@@ -1201,6 +1202,37 @@ async fn handler_404(_: Request<Body>) -> Result<Response<Body>, ApiError> {
    )
 }

+#[cfg(feature = "testing")]
+async fn post_tracing_event_handler(mut r: Request<Body>) -> Result<Response<Body>, ApiError> {
+    #[derive(Debug, serde::Deserialize)]
+    #[serde(rename_all = "lowercase")]
+    enum Level {
+        Error,
+        Warn,
+        Info,
+        Debug,
+        Trace,
+    }
+    #[derive(Debug, serde::Deserialize)]
+    struct Request {
+        level: Level,
+        message: String,
+    }
+    let body: Request = json_request(&mut r)
+        .await
+        .map_err(|_| ApiError::BadRequest(anyhow::anyhow!("invalid JSON body")))?;
+
+    match body.level {
+        Level::Error => tracing::error!(?body.message),
+        Level::Warn => tracing::warn!(?body.message),
+        Level::Info => tracing::info!(?body.message),
+        Level::Debug => tracing::debug!(?body.message),
+        Level::Trace => tracing::trace!(?body.message),
+    }
+
+    json_response(StatusCode::OK, ())
+}
+
 pub fn make_router(
    conf: &'static PageServerConf,
    launch_ts: &'static LaunchTimestamp,
@@ -1341,5 +1373,9 @@ pub fn make_router(
            testing_api!("set tenant state to broken", handle_tenant_break),
        )
        .get("/v1/panic", |r| RequestSpan(always_panic_handler).handle(r))
+        .post(
+            "/v1/tracing/event",
+            testing_api!("emit a tracing event", post_tracing_event_handler),
+        )
        .any(handler_404))
 }
--- a/pageserver/src/import_datadir.rs
+++ b/pageserver/src/import_datadir.rs
@@ -114,7 +114,7 @@ async fn import_rel(
    path: &Path,
    spcoid: Oid,
    dboid: Oid,
-    reader: &mut (impl AsyncRead + Send + Sync + Unpin),
+    reader: &mut (impl AsyncRead + Unpin),
    len: usize,
    ctx: &RequestContext,
 ) -> anyhow::Result<()> {
@@ -200,7 +200,7 @@ async fn import_slru(
    modification: &mut DatadirModification<'_>,
    slru: SlruKind,
    path: &Path,
-    reader: &mut (impl AsyncRead + Send + Sync + Unpin),
+    reader: &mut (impl AsyncRead + Unpin),
    len: usize,
    ctx: &RequestContext,
 ) -> anyhow::Result<()> {
@@ -612,8 +612,8 @@ async fn import_file(
    Ok(None)
 }

-async fn read_all_bytes(reader: &mut (impl AsyncRead + Send + Sync + Unpin)) -> Result<Bytes> {
+async fn read_all_bytes(reader: &mut (impl AsyncRead + Unpin)) -> Result<Bytes> {
    let mut buf: Vec<u8> = vec![];
    reader.read_to_end(&mut buf).await?;
-    Ok(Bytes::copy_from_slice(&buf[..]))
+    Ok(Bytes::from(buf))
 }
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -44,6 +44,8 @@ pub const DELTA_FILE_MAGIC: u16 = 0x5A61;

 static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]);

+pub use crate::metrics::preinitialize_metrics;
+
 pub async fn shutdown_pageserver(exit_code: i32) {
    // Shut down the libpq endpoint task. This prevents new connections from
    // being accepted.
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -205,6 +205,15 @@ static EVICTIONS_WITH_LOW_RESIDENCE_DURATION: Lazy<IntCounterVec> = Lazy::new(||
    .expect("failed to define a metric")
 });

+pub static UNEXPECTED_ONDEMAND_DOWNLOADS: Lazy<IntCounter> = Lazy::new(|| {
+    register_int_counter!(
+        "pageserver_unexpected_ondemand_downloads_count",
+        "Number of unexpected on-demand downloads. \
+         We log more context for each increment, so, forgo any labels in this metric.",
+    )
+    .expect("failed to define a metric")
+});
+
 /// Each [`Timeline`]'s  [`EVICTIONS_WITH_LOW_RESIDENCE_DURATION`] metric.
 #[derive(Debug)]
 pub struct EvictionsWithLowResidenceDuration {
@@ -278,14 +287,33 @@ impl EvictionsWithLowResidenceDuration {
        let Some(_counter) = self.counter.take() else {
            return;
        };
-        EVICTIONS_WITH_LOW_RESIDENCE_DURATION
-            .remove_label_values(&[
-                tenant_id,
-                timeline_id,
-                self.data_source,
-                &Self::threshold_label_value(self.threshold),
-            ])
-            .expect("we own the metric, no-one else should remove it");
+
+        let threshold = Self::threshold_label_value(self.threshold);
+
+        let removed = EVICTIONS_WITH_LOW_RESIDENCE_DURATION.remove_label_values(&[
+            tenant_id,
+            timeline_id,
+            self.data_source,
+            &threshold,
+        ]);
+
+        match removed {
+            Err(e) => {
+                // this has been hit in staging as
+                // <https://neondatabase.sentry.io/issues/4142396994/>, but we don't know how.
+                // because we can be in the drop path already, don't risk:
+                // - "double-panic => illegal instruction" or
+                // - future "drop panic => abort"
+                //
+                // so just nag: (the error has the labels)
+                tracing::warn!("failed to remove EvictionsWithLowResidenceDuration, it was already removed? {e:#?}");
+            }
+            Ok(()) => {
+                // to help identify cases where we double-remove the same values, let's log all
+                // deletions?
+                tracing::info!("removed EvictionsWithLowResidenceDuration with {tenant_id}, {timeline_id}, {}, {threshold}", self.data_source);
+            }
+        }
    }
 }

@@ -350,11 +378,6 @@ pub static LIVE_CONNECTIONS_COUNT: Lazy<IntGaugeVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub static NUM_ONDISK_LAYERS: Lazy<IntGauge> = Lazy::new(|| {
-    register_int_gauge!("pageserver_ondisk_layers", "Number of layers on-disk")
-        .expect("failed to define a metric")
-});
-
 // remote storage metrics

 /// NB: increment _after_ recording the current value into [`REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST`].
@@ -385,6 +408,26 @@ static REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST: Lazy<HistogramVec> = Lazy::new
    .expect("failed to define a metric")
 });

+static REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "pageserver_remote_timeline_client_bytes_started",
+        "Incremented by the number of bytes associated with a remote timeline client operation. \
+         The increment happens when the operation is scheduled.",
+        &["tenant_id", "timeline_id", "file_kind", "op_kind"],
+    )
+    .expect("failed to define a metric")
+});
+
+static REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "pageserver_remote_timeline_client_bytes_finished",
+        "Incremented by the number of bytes associated with a remote timeline client operation. \
+         The increment happens when the operation finishes (regardless of success/failure/shutdown).",
+        &["tenant_id", "timeline_id", "file_kind", "op_kind"],
+    )
+    .expect("failed to define a metric")
+});
+
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 pub enum RemoteOpKind {
    Upload,
@@ -435,6 +478,56 @@ pub static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
    .expect("Failed to register tenant_task_events metric")
 });

+// walreceiver metrics
+
+pub static WALRECEIVER_STARTED_CONNECTIONS: Lazy<IntCounter> = Lazy::new(|| {
+    register_int_counter!(
+        "pageserver_walreceiver_started_connections_total",
+        "Number of started walreceiver connections"
+    )
+    .expect("failed to define a metric")
+});
+
+pub static WALRECEIVER_ACTIVE_MANAGERS: Lazy<IntGauge> = Lazy::new(|| {
+    register_int_gauge!(
+        "pageserver_walreceiver_active_managers",
+        "Number of active walreceiver managers"
+    )
+    .expect("failed to define a metric")
+});
+
+pub static WALRECEIVER_SWITCHES: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "pageserver_walreceiver_switches_total",
+        "Number of walreceiver manager change_connection calls",
+        &["reason"]
+    )
+    .expect("failed to define a metric")
+});
+
+pub static WALRECEIVER_BROKER_UPDATES: Lazy<IntCounter> = Lazy::new(|| {
+    register_int_counter!(
+        "pageserver_walreceiver_broker_updates_total",
+        "Number of received broker updates in walreceiver"
+    )
+    .expect("failed to define a metric")
+});
+
+pub static WALRECEIVER_CANDIDATES_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "pageserver_walreceiver_candidates_events_total",
+        "Number of walreceiver candidate events",
+        &["event"]
+    )
+    .expect("failed to define a metric")
+});
+
+pub static WALRECEIVER_CANDIDATES_ADDED: Lazy<IntCounter> =
+    Lazy::new(|| WALRECEIVER_CANDIDATES_EVENTS.with_label_values(&["add"]));
+
+pub static WALRECEIVER_CANDIDATES_REMOVED: Lazy<IntCounter> =
+    Lazy::new(|| WALRECEIVER_CANDIDATES_EVENTS.with_label_values(&["remove"]));
+
 // Metrics collected on WAL redo operations
 //
 // We collect the time spent in actual WAL redo ('redo'), and time waiting
@@ -739,6 +832,8 @@ pub struct RemoteTimelineClientMetrics {
    remote_operation_time: Mutex<HashMap<(&'static str, &'static str, &'static str), Histogram>>,
    calls_unfinished_gauge: Mutex<HashMap<(&'static str, &'static str), IntGauge>>,
    calls_started_hist: Mutex<HashMap<(&'static str, &'static str), Histogram>>,
+    bytes_started_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
+    bytes_finished_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
 }

 impl RemoteTimelineClientMetrics {
@@ -749,6 +844,8 @@ impl RemoteTimelineClientMetrics {
            remote_operation_time: Mutex::new(HashMap::default()),
            calls_unfinished_gauge: Mutex::new(HashMap::default()),
            calls_started_hist: Mutex::new(HashMap::default()),
+            bytes_started_counter: Mutex::new(HashMap::default()),
+            bytes_finished_counter: Mutex::new(HashMap::default()),
            remote_physical_size_gauge: Mutex::new(None),
        }
    }
@@ -787,6 +884,7 @@ impl RemoteTimelineClientMetrics {
        });
        metric.clone()
    }
+
    fn calls_unfinished_gauge(
        &self,
        file_kind: &RemoteOpFileKind,
@@ -828,32 +926,125 @@ impl RemoteTimelineClientMetrics {
        });
        metric.clone()
    }
+
+    fn bytes_started_counter(
+        &self,
+        file_kind: &RemoteOpFileKind,
+        op_kind: &RemoteOpKind,
+    ) -> IntCounter {
+        // XXX would be nice to have an upgradable RwLock
+        let mut guard = self.bytes_started_counter.lock().unwrap();
+        let key = (file_kind.as_str(), op_kind.as_str());
+        let metric = guard.entry(key).or_insert_with(move || {
+            REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER
+                .get_metric_with_label_values(&[
+                    &self.tenant_id.to_string(),
+                    &self.timeline_id.to_string(),
+                    key.0,
+                    key.1,
+                ])
+                .unwrap()
+        });
+        metric.clone()
+    }
+
+    fn bytes_finished_counter(
+        &self,
+        file_kind: &RemoteOpFileKind,
+        op_kind: &RemoteOpKind,
+    ) -> IntCounter {
+        // XXX would be nice to have an upgradable RwLock
+        let mut guard = self.bytes_finished_counter.lock().unwrap();
+        let key = (file_kind.as_str(), op_kind.as_str());
+        let metric = guard.entry(key).or_insert_with(move || {
+            REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER
+                .get_metric_with_label_values(&[
+                    &self.tenant_id.to_string(),
+                    &self.timeline_id.to_string(),
+                    key.0,
+                    key.1,
+                ])
+                .unwrap()
+        });
+        metric.clone()
+    }
+}
+
+#[cfg(test)]
+impl RemoteTimelineClientMetrics {
+    pub fn get_bytes_started_counter_value(
+        &self,
+        file_kind: &RemoteOpFileKind,
+        op_kind: &RemoteOpKind,
+    ) -> Option<u64> {
+        let guard = self.bytes_started_counter.lock().unwrap();
+        let key = (file_kind.as_str(), op_kind.as_str());
+        guard.get(&key).map(|counter| counter.get())
+    }
+
+    pub fn get_bytes_finished_counter_value(
+        &self,
+        file_kind: &RemoteOpFileKind,
+        op_kind: &RemoteOpKind,
+    ) -> Option<u64> {
+        let guard = self.bytes_finished_counter.lock().unwrap();
+        let key = (file_kind.as_str(), op_kind.as_str());
+        guard.get(&key).map(|counter| counter.get())
+    }
 }

 /// See [`RemoteTimelineClientMetrics::call_begin`].
 #[must_use]
-pub(crate) struct RemoteTimelineClientCallMetricGuard(Option<IntGauge>);
+pub(crate) struct RemoteTimelineClientCallMetricGuard {
+    /// Decremented on drop.
+    calls_unfinished_metric: Option<IntGauge>,
+    /// If Some(), this references the bytes_finished metric, and we increment it by the given `u64` on drop.
+    bytes_finished: Option<(IntCounter, u64)>,
+}

 impl RemoteTimelineClientCallMetricGuard {
-    /// Consume this guard object without decrementing the metric.
-    /// The caller vouches to do this manually, so that the prior increment of the gauge will cancel out.
+    /// Consume this guard object without performing the metric updates it would do on `drop()`.
+    /// The caller vouches to do the metric updates manually.
    pub fn will_decrement_manually(mut self) {
-        self.0 = None; // prevent drop() from decrementing
+        let RemoteTimelineClientCallMetricGuard {
+            calls_unfinished_metric,
+            bytes_finished,
+        } = &mut self;
+        calls_unfinished_metric.take();
+        bytes_finished.take();
    }
 }

 impl Drop for RemoteTimelineClientCallMetricGuard {
    fn drop(&mut self) {
-        if let RemoteTimelineClientCallMetricGuard(Some(guard)) = self {
+        let RemoteTimelineClientCallMetricGuard {
+            calls_unfinished_metric,
+            bytes_finished,
+        } = self;
+        if let Some(guard) = calls_unfinished_metric.take() {
            guard.dec();
        }
+        if let Some((bytes_finished_metric, value)) = bytes_finished {
+            bytes_finished_metric.inc_by(*value);
+        }
    }
 }

+/// The enum variants communicate to the [`RemoteTimelineClientMetrics`] whether to
+/// track the byte size of this call in applicable metric(s).
+pub(crate) enum RemoteTimelineClientMetricsCallTrackSize {
+    /// Do not account for this call's byte size in any metrics.
+    /// The `reason` field is there to make the call sites self-documenting
+    /// about why they don't need the metric.
+    DontTrackSize { reason: &'static str },
+    /// Track the byte size of the call in applicable metric(s).
+    Bytes(u64),
+}
+
 impl RemoteTimelineClientMetrics {
-    /// Increment the metrics that track ongoing calls to the remote timeline client instance.
+    /// Update the metrics that change when a call to the remote timeline client instance starts.
    ///
-    /// Drop the returned guard object once the operation is finished to decrement the values.
+    /// Drop the returned guard object once the operation is finished to update the corresponding metrics that track completions.
    /// Or, use [`RemoteTimelineClientCallMetricGuard::will_decrement_manually`] and [`call_end`] if that
    /// is more suitable.
    /// Never do both.
@@ -861,24 +1052,51 @@ impl RemoteTimelineClientMetrics {
        &self,
        file_kind: &RemoteOpFileKind,
        op_kind: &RemoteOpKind,
+        size: RemoteTimelineClientMetricsCallTrackSize,
    ) -> RemoteTimelineClientCallMetricGuard {
-        let unfinished_metric = self.calls_unfinished_gauge(file_kind, op_kind);
+        let calls_unfinished_metric = self.calls_unfinished_gauge(file_kind, op_kind);
        self.calls_started_hist(file_kind, op_kind)
-            .observe(unfinished_metric.get() as f64);
-        unfinished_metric.inc();
-        RemoteTimelineClientCallMetricGuard(Some(unfinished_metric))
+            .observe(calls_unfinished_metric.get() as f64);
+        calls_unfinished_metric.inc(); // NB: inc after the histogram, see comment on underlying metric
+
+        let bytes_finished = match size {
+            RemoteTimelineClientMetricsCallTrackSize::DontTrackSize { reason: _reason } => {
+                // nothing to do
+                None
+            }
+            RemoteTimelineClientMetricsCallTrackSize::Bytes(size) => {
+                self.bytes_started_counter(file_kind, op_kind).inc_by(size);
+                let finished_counter = self.bytes_finished_counter(file_kind, op_kind);
+                Some((finished_counter, size))
+            }
+        };
+        RemoteTimelineClientCallMetricGuard {
+            calls_unfinished_metric: Some(calls_unfinished_metric),
+            bytes_finished,
+        }
    }

-    /// Manually decrement the metric instead of using the guard object.
+    /// Manually update the metrics that track completions, instead of using the guard object.
    /// Using the guard object is generally preferable.
    /// See [`call_begin`] for more context.
-    pub(crate) fn call_end(&self, file_kind: &RemoteOpFileKind, op_kind: &RemoteOpKind) {
-        let unfinished_metric = self.calls_unfinished_gauge(file_kind, op_kind);
+    pub(crate) fn call_end(
+        &self,
+        file_kind: &RemoteOpFileKind,
+        op_kind: &RemoteOpKind,
+        size: RemoteTimelineClientMetricsCallTrackSize,
+    ) {
+        let calls_unfinished_metric = self.calls_unfinished_gauge(file_kind, op_kind);
        debug_assert!(
-            unfinished_metric.get() > 0,
+            calls_unfinished_metric.get() > 0,
            "begin and end should cancel out"
        );
-        unfinished_metric.dec();
+        calls_unfinished_metric.dec();
+        match size {
+            RemoteTimelineClientMetricsCallTrackSize::DontTrackSize { reason: _reason } => {}
+            RemoteTimelineClientMetricsCallTrackSize::Bytes(size) => {
+                self.bytes_finished_counter(file_kind, op_kind).inc_by(size);
+            }
+        }
    }
 }

@@ -891,6 +1109,8 @@ impl Drop for RemoteTimelineClientMetrics {
            remote_operation_time,
            calls_unfinished_gauge,
            calls_started_hist,
+            bytes_started_counter,
+            bytes_finished_counter,
        } = self;
        for ((a, b, c), _) in remote_operation_time.get_mut().unwrap().drain() {
            let _ = REMOTE_OPERATION_TIME.remove_label_values(&[tenant_id, timeline_id, a, b, c]);
@@ -911,6 +1131,22 @@ impl Drop for RemoteTimelineClientMetrics {
                b,
            ]);
        }
+        for ((a, b), _) in bytes_started_counter.get_mut().unwrap().drain() {
+            let _ = REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER.remove_label_values(&[
+                tenant_id,
+                timeline_id,
+                a,
+                b,
+            ]);
+        }
+        for ((a, b), _) in bytes_finished_counter.get_mut().unwrap().drain() {
+            let _ = REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER.remove_label_values(&[
+                tenant_id,
+                timeline_id,
+                a,
+                b,
+            ]);
+        }
        {
            let _ = remote_physical_size_gauge; // use to avoid 'unused' warning in desctructuring above
            let _ = REMOTE_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
@@ -974,3 +1210,10 @@ impl<F: Future<Output = Result<O, E>>, O, E> Future for MeasuredRemoteOp<F> {
        poll_result
    }
 }
+
+pub fn preinitialize_metrics() {
+    // We want to alert on this metric increasing.
+    // Initialize it eagerly, so that our alert rule can distinguish absence of the metric from metric value 0.
+    assert_eq!(UNEXPECTED_ONDEMAND_DOWNLOADS.get(), 0);
+    UNEXPECTED_ONDEMAND_DOWNLOADS.reset();
+}
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -20,7 +20,6 @@ use pageserver_api::models::{
    PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetPageResponse,
    PagestreamNblocksRequest, PagestreamNblocksResponse,
 };
-use postgres_backend::PostgresBackendTCP;
 use postgres_backend::{self, is_expected_io_error, AuthType, PostgresBackend, QueryError};
 use pq_proto::framed::ConnectionError;
 use pq_proto::FeStartupPacket;
@@ -32,6 +31,7 @@ use std::str;
 use std::str::FromStr;
 use std::sync::Arc;
 use std::time::Duration;
+use tokio::io::{AsyncRead, AsyncWrite};
 use tokio_util::io::StreamReader;
 use tracing::*;
 use utils::id::ConnectionId;
@@ -57,7 +57,10 @@ use crate::trace::Tracer;
 use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
 use postgres_ffi::BLCKSZ;

-fn copyin_stream(pgb: &mut PostgresBackendTCP) -> impl Stream<Item = io::Result<Bytes>> + '_ {
+fn copyin_stream<IO>(pgb: &mut PostgresBackend<IO>) -> impl Stream<Item = io::Result<Bytes>> + '_
+where
+    IO: AsyncRead + AsyncWrite + Unpin,
+{
    async_stream::try_stream! {
        loop {
            let msg = tokio::select! {
@@ -65,8 +68,8 @@ fn copyin_stream(pgb: &mut PostgresBackendTCP) -> impl Stream<Item = io::Result<

                _ = task_mgr::shutdown_watcher() => {
                    // We were requested to shut down.
-                    let msg = "pageserver is shutting down".to_string();
-                    let _ = pgb.write_message_noflush(&BeMessage::ErrorResponse(&msg, None));
+                    let msg = "pageserver is shutting down";
+                    let _ = pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, None));
                    Err(QueryError::Other(anyhow::anyhow!(msg)))
                }

@@ -125,7 +128,7 @@ fn copyin_stream(pgb: &mut PostgresBackendTCP) -> impl Stream<Item = io::Result<
 ///
 /// XXX: Currently, any trailing data after the EOF marker prints a warning.
 /// Perhaps it should be a hard error?
-async fn read_tar_eof(mut reader: (impl tokio::io::AsyncRead + Unpin)) -> anyhow::Result<()> {
+async fn read_tar_eof(mut reader: (impl AsyncRead + Unpin)) -> anyhow::Result<()> {
    use tokio::io::AsyncReadExt;
    let mut buf = [0u8; 512];

@@ -245,12 +248,23 @@ async fn page_service_conn_main(
        .set_nodelay(true)
        .context("could not set TCP_NODELAY")?;

+    let peer_addr = socket.peer_addr().context("get peer address")?;
+
+    // Set up a read timeout of 10 minutes. The exact value is rather arbitrary; the requirements are:
+    // - long enough for most valid compute connections
+    // - less than infinite to stop us from "leaking" connections to long-gone computes
+    //
+    // no write timeout is used, because the kernel is assumed to error writes after some time.
+    let mut socket = tokio_io_timeout::TimeoutReader::new(socket);
+    socket.set_timeout(Some(std::time::Duration::from_secs(60 * 10)));
+    let socket = std::pin::pin!(socket);
+
    // XXX: pgbackend.run() should take the connection_ctx,
    // and create a child per-query context when it invokes process_query.
    // But it's in a shared crate, so, we store connection_ctx inside PageServerHandler
    // and create the per-query context in process_query ourselves.
    let mut conn_handler = PageServerHandler::new(conf, auth, connection_ctx);
-    let pgbackend = PostgresBackend::new(socket, auth_type, None)?;
+    let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?;

    match pgbackend
        .run(&mut conn_handler, task_mgr::shutdown_watcher)
@@ -332,13 +346,16 @@ impl PageServerHandler {
    }

    #[instrument(skip(self, pgb, ctx))]
-    async fn handle_pagerequests(
+    async fn handle_pagerequests<IO>(
        &self,
-        pgb: &mut PostgresBackendTCP,
+        pgb: &mut PostgresBackend<IO>,
        tenant_id: TenantId,
        timeline_id: TimelineId,
        ctx: RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), QueryError>
+    where
+        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
+    {
        // NOTE: pagerequests handler exits when connection is closed,
        //       so there is no need to reset the association
        task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
@@ -381,7 +398,9 @@ impl PageServerHandler {
                Some(FeMessage::CopyData(bytes)) => bytes,
                Some(FeMessage::Terminate) => break,
                Some(m) => {
-                    anyhow::bail!("unexpected message: {m:?} during COPY");
+                    return Err(QueryError::Other(anyhow::anyhow!(
+                        "unexpected message: {m:?} during COPY"
+                    )));
                }
                None => break, // client disconnected
            };
@@ -436,16 +455,19 @@ impl PageServerHandler {

    #[allow(clippy::too_many_arguments)]
    #[instrument(skip(self, pgb, ctx))]
-    async fn handle_import_basebackup(
+    async fn handle_import_basebackup<IO>(
        &self,
-        pgb: &mut PostgresBackendTCP,
+        pgb: &mut PostgresBackend<IO>,
        tenant_id: TenantId,
        timeline_id: TimelineId,
        base_lsn: Lsn,
        _end_lsn: Lsn,
        pg_version: u32,
        ctx: RequestContext,
-    ) -> Result<(), QueryError> {
+    ) -> Result<(), QueryError>
+    where
+        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
+    {
        task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
        // Create empty timeline
        info!("creating new timeline");
@@ -486,15 +508,18 @@ impl PageServerHandler {
    }

    #[instrument(skip(self, pgb, ctx))]
-    async fn handle_import_wal(
+    async fn handle_import_wal<IO>(
        &self,
-        pgb: &mut PostgresBackendTCP,
+        pgb: &mut PostgresBackend<IO>,
        tenant_id: TenantId,
        timeline_id: TimelineId,
        start_lsn: Lsn,
        end_lsn: Lsn,
        ctx: RequestContext,
-    ) -> Result<(), QueryError> {
+    ) -> Result<(), QueryError>
+    where
+        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
+    {
        task_mgr::associate_with(Some(tenant_id), Some(timeline_id));

        let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?;
@@ -690,16 +715,21 @@ impl PageServerHandler {

    #[allow(clippy::too_many_arguments)]
    #[instrument(skip(self, pgb, ctx))]
-    async fn handle_basebackup_request(
+    async fn handle_basebackup_request<IO>(
        &mut self,
-        pgb: &mut PostgresBackendTCP,
+        pgb: &mut PostgresBackend<IO>,
        tenant_id: TenantId,
        timeline_id: TimelineId,
        lsn: Option<Lsn>,
        prev_lsn: Option<Lsn>,
        full_backup: bool,
        ctx: RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> anyhow::Result<()>
+    where
+        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
+    {
+        let started = std::time::Instant::now();
+
        // check that the timeline exists
        let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?;
        let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
@@ -712,6 +742,8 @@ impl PageServerHandler {
                .context("invalid basebackup lsn")?;
        }

+        let lsn_awaited_after = started.elapsed();
+
        // switch client to COPYOUT
        pgb.write_message_noflush(&BeMessage::CopyOutResponse)?;
        pgb.flush().await?;
@@ -732,7 +764,17 @@ impl PageServerHandler {

        pgb.write_message_noflush(&BeMessage::CopyDone)?;
        pgb.flush().await?;
-        info!("basebackup complete");
+
+        let basebackup_after = started
+            .elapsed()
+            .checked_sub(lsn_awaited_after)
+            .unwrap_or(Duration::ZERO);
+
+        info!(
+            lsn_await_millis = lsn_awaited_after.as_millis(),
+            basebackup_millis = basebackup_after.as_millis(),
+            "basebackup complete"
+        );

        Ok(())
    }
@@ -756,10 +798,13 @@ impl PageServerHandler {
 }

 #[async_trait::async_trait]
-impl postgres_backend::Handler<tokio::net::TcpStream> for PageServerHandler {
+impl<IO> postgres_backend::Handler<IO> for PageServerHandler
+where
+    IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
+{
    fn check_auth_jwt(
        &mut self,
-        _pgb: &mut PostgresBackendTCP,
+        _pgb: &mut PostgresBackend<IO>,
        jwt_response: &[u8],
    ) -> Result<(), QueryError> {
        // this unwrap is never triggered, because check_auth_jwt only called when auth_type is NeonJWT
@@ -787,7 +832,7 @@ impl postgres_backend::Handler<tokio::net::TcpStream> for PageServerHandler {

    fn startup(
        &mut self,
-        _pgb: &mut PostgresBackendTCP,
+        _pgb: &mut PostgresBackend<IO>,
        _sm: &FeStartupPacket,
    ) -> Result<(), QueryError> {
        Ok(())
@@ -795,7 +840,7 @@ impl postgres_backend::Handler<tokio::net::TcpStream> for PageServerHandler {

    async fn process_query(
        &mut self,
-        pgb: &mut PostgresBackendTCP,
+        pgb: &mut PostgresBackend<IO>,
        query_string: &str,
    ) -> Result<(), QueryError> {
        let ctx = self.connection_ctx.attached_child();
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -58,6 +58,7 @@ use crate::task_mgr::TaskKind;
 use crate::tenant::config::TenantConfOpt;
 use crate::tenant::metadata::load_metadata;
 use crate::tenant::remote_timeline_client::index::IndexPart;
+use crate::tenant::remote_timeline_client::MaybeDeletedIndexPart;
 use crate::tenant::storage_layer::DeltaLayer;
 use crate::tenant::storage_layer::ImageLayer;
 use crate::tenant::storage_layer::Layer;
@@ -118,6 +119,10 @@ pub struct Tenant {
    // Global pageserver config parameters
    pub conf: &'static PageServerConf,

+    /// Timestamp of when this `Tenant` value was created, used to measure activation delay, see:
+    /// <https://github.com/neondatabase/neon/issues/4025>
+    loading_started_at: Instant,
+
    state: watch::Sender<TenantState>,

    // Overridden tenant-specific config parameters.
@@ -267,10 +272,7 @@ impl UninitializedTimeline<'_> {
            .await
            .context("Failed to flush after basebackup import")?;

-        // Initialize without loading the layer map. We started with an empty layer map, and already
-        // updated it for the layers that we created during the import.
-        let mut timelines = self.owning_tenant.timelines.lock().unwrap();
-        self.initialize_with_lock(ctx, &mut timelines, false, true)
+        self.initialize(ctx)
    }

    fn raw_timeline(&self) -> anyhow::Result<&Arc<Timeline>> {
@@ -446,6 +448,8 @@ pub enum DeleteTimelineError {
    NotFound,
    #[error("HasChildren")]
    HasChildren,
+    #[error("stop upload queue: {0:#}")]
+    StopUploadQueue(#[from] remote_timeline_client::StopError),
    #[error(transparent)]
    Other(#[from] anyhow::Error),
 }
@@ -694,16 +698,9 @@ impl Tenant {
                        .await
                        .context("download index file")?;

-                    let remote_metadata = index_part.parse_metadata().context("parse metadata")?;
-
                    debug!("finished index part download");

-                    Result::<_, anyhow::Error>::Ok((
-                        timeline_id,
-                        client,
-                        index_part,
-                        remote_metadata,
-                    ))
+                    Result::<_, anyhow::Error>::Ok((timeline_id, client, index_part))
                }
                .map(move |res| {
                    res.with_context(|| format!("download index part for timeline {timeline_id}"))
@@ -712,17 +709,26 @@ impl Tenant {
            );
        }
        // Wait for all the download tasks to complete & collect results.
-        let mut remote_clients = HashMap::new();
-        let mut index_parts = HashMap::new();
+        let mut remote_index_and_client = HashMap::new();
        let mut timeline_ancestors = HashMap::new();
        while let Some(result) = part_downloads.join_next().await {
            // NB: we already added timeline_id as context to the error
            let result: Result<_, anyhow::Error> = result.context("joinset task join")?;
-            let (timeline_id, client, index_part, remote_metadata) = result?;
+            let (timeline_id, client, index_part) = result?;
            debug!("successfully downloaded index part for timeline {timeline_id}");
-            timeline_ancestors.insert(timeline_id, remote_metadata);
-            index_parts.insert(timeline_id, index_part);
-            remote_clients.insert(timeline_id, client);
+            match index_part {
+                MaybeDeletedIndexPart::IndexPart(index_part) => {
+                    timeline_ancestors.insert(
+                        timeline_id,
+                        index_part.parse_metadata().context("parse_metadata")?,
+                    );
+                    remote_index_and_client.insert(timeline_id, (index_part, client));
+                }
+                MaybeDeletedIndexPart::Deleted => {
+                    info!("timeline {} is deleted, skipping", timeline_id);
+                    continue;
+                }
+            }
        }

        // For every timeline, download the metadata file, scan the local directory,
@@ -730,12 +736,16 @@ impl Tenant {
        // layer file.
        let sorted_timelines = tree_sort_timelines(timeline_ancestors)?;
        for (timeline_id, remote_metadata) in sorted_timelines {
+            let (index_part, remote_client) = remote_index_and_client
+                .remove(&timeline_id)
+                .expect("just put it in above");
+
            // TODO again handle early failure
            self.load_remote_timeline(
                timeline_id,
-                index_parts.remove(&timeline_id).unwrap(),
+                index_part,
                remote_metadata,
-                remote_clients.remove(&timeline_id).unwrap(),
+                remote_client,
                &ctx,
            )
            .await
@@ -1041,21 +1051,12 @@ impl Tenant {
    /// Subroutine of `load_tenant`, to load an individual timeline
    ///
    /// NB: The parent is assumed to be already loaded!
-    #[instrument(skip(self, local_metadata, ctx), fields(timeline_id=%timeline_id))]
    async fn load_local_timeline(
        &self,
        timeline_id: TimelineId,
        local_metadata: TimelineMetadata,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
-        let ancestor = if let Some(ancestor_timeline_id) = local_metadata.ancestor_timeline() {
-            let ancestor_timeline = self.get_timeline(ancestor_timeline_id, false)
-            .with_context(|| anyhow::anyhow!("cannot find ancestor timeline {ancestor_timeline_id} for timeline {timeline_id}"))?;
-            Some(ancestor_timeline)
-        } else {
-            None
-        };
-
        let remote_client = self.remote_storage.as_ref().map(|remote_storage| {
            RemoteTimelineClient::new(
                remote_storage.clone(),
@@ -1068,6 +1069,29 @@ impl Tenant {
        let remote_startup_data = match &remote_client {
            Some(remote_client) => match remote_client.download_index_file().await {
                Ok(index_part) => {
+                    let index_part = match index_part {
+                        MaybeDeletedIndexPart::IndexPart(index_part) => index_part,
+                        MaybeDeletedIndexPart::Deleted => {
+                            // TODO: we won't reach here if remote storage gets de-configured after start of the deletion operation.
+                            // Example:
+                            //  start deletion operation
+                            //  finishes upload of index part
+                            //  pageserver crashes
+                            //  remote storage gets de-configured
+                            //  pageserver starts
+                            //
+                            // We don't really anticipate remote storage to be de-configured, so, for now, this is fine.
+                            // Also, maybe we'll remove that option entirely in the future, see https://github.com/neondatabase/neon/issues/4099.
+                            info!("is_deleted is set on remote, resuming removal of local data originally done by timeline deletion handler");
+                            std::fs::remove_dir_all(
+                                self.conf.timeline_path(&timeline_id, &self.tenant_id),
+                            )
+                            .context("remove_dir_all")?;
+
+                            return Ok(());
+                        }
+                    };
+
                    let remote_metadata = index_part.parse_metadata().context("parse_metadata")?;
                    Some(RemoteStartupData {
                        index_part,
@@ -1083,6 +1107,14 @@ impl Tenant {
            None => None,
        };

+        let ancestor = if let Some(ancestor_timeline_id) = local_metadata.ancestor_timeline() {
+            let ancestor_timeline = self.get_timeline(ancestor_timeline_id, false)
+            .with_context(|| anyhow::anyhow!("cannot find ancestor timeline {ancestor_timeline_id} for timeline {timeline_id}"))?;
+            Some(ancestor_timeline)
+        } else {
+            None
+        };
+
        self.timeline_init_and_sync(
            timeline_id,
            remote_client,
@@ -1370,9 +1402,37 @@ impl Tenant {
        timeline.walreceiver.stop().await;
        debug!("wal receiver shutdown confirmed");

+        // Prevent new uploads from starting.
+        if let Some(remote_client) = timeline.remote_client.as_ref() {
+            let res = remote_client.stop();
+            match &res {
+                Ok(()) => {}
+                Err(e) => match e {
+                    remote_timeline_client::StopError::QueueBroken => {
+                        // This happens if there's a panic inside above stop() call,
+                        // and we call stop() again after that.
+                        // The calling again can happen because we won't poison any
+                        // mutexes on the unwind path at the first panicking call.
+                    }
+                    remote_timeline_client::StopError::QueueUninitialized => {
+                        // This could happen if the timeline is Broken, e.g., because it failed to fetch IndexPart when it was loaded.
+                    }
+                },
+            }
+            res?;
+        }
+
+        // Stop & wait for the remaining timeline tasks, including upload tasks.
        info!("waiting for timeline tasks to shutdown");
        task_mgr::shutdown_tasks(None, Some(self.tenant_id), Some(timeline_id)).await;

+        // Mark timeline as deleted in S3 so we won't pick it up next time
+        // during attach or pageserver restart.
+        // See comment in persist_index_part_with_deleted_flag.
+        if let Some(remote_client) = timeline.remote_client.as_ref() {
+            remote_client.persist_index_part_with_deleted_flag().await?;
+        }
+
        {
            // Grab the layer_removal_cs lock, and actually perform the deletion.
            //
@@ -1396,8 +1456,17 @@ impl Tenant {
            //     by the caller.

            let local_timeline_directory = self.conf.timeline_path(&timeline_id, &self.tenant_id);
-            // XXX make this atomic so that, if we crash-mid-way, the timeline won't be picked up
-            // with some layers missing.
+
+            fail::fail_point!("timeline-delete-before-rm", |_| {
+                Err(anyhow::anyhow!("failpoint: timeline-delete-before-rm"))?
+            });
+
+            // NB: This need not be atomic because the deleted flag in the IndexPart
+            // will be observed during tenant/timeline load. The deletion will be resumed there.
+            //
+            // For configurations without remote storage, we tolerate that we're not crash-safe here.
+            // The timeline may come up Active but with missing layer files, in such setups.
+            // See https://github.com/neondatabase/neon/pull/3919#issuecomment-1531726720
            std::fs::remove_dir_all(&local_timeline_directory).with_context(|| {
                format!(
                    "Failed to remove local timeline directory '{}'",
@@ -1476,7 +1545,7 @@ impl Tenant {
                TenantState::Loading | TenantState::Attaching => {
                    *current_state = TenantState::Active;

-                    info!("Activating tenant {}", self.tenant_id);
+                    debug!(tenant_id = %self.tenant_id, "Activating tenant");

                    let timelines_accessor = self.timelines.lock().unwrap();
                    let not_broken_timelines = timelines_accessor
@@ -1487,12 +1556,17 @@ impl Tenant {
                    // down when they notice that the tenant is inactive.
                    tasks::start_background_loops(self.tenant_id);

+                    let mut activated_timelines = 0;
+                    let mut timelines_broken_during_activation = 0;
+
                    for timeline in not_broken_timelines {
                        match timeline
                            .activate(ctx)
                            .context("timeline activation for activating tenant")
                        {
-                            Ok(()) => {}
+                            Ok(()) => {
+                                activated_timelines += 1;
+                            }
                            Err(e) => {
                                error!(
                                    "Failed to activate timeline {}: {:#}",
@@ -1503,9 +1577,26 @@ impl Tenant {
                                    "failed to activate timeline {}: {}",
                                    timeline.timeline_id, e
                                ));
+
+                                timelines_broken_during_activation += 1;
                            }
                        }
                    }
+
+                    let elapsed = self.loading_started_at.elapsed();
+                    let total_timelines = timelines_accessor.len();
+
+                    // log a lot of stuff, because some tenants sometimes suffer from user-visible
+                    // times to activate. see https://github.com/neondatabase/neon/issues/4025
+                    info!(
+                        since_creation_millis = elapsed.as_millis(),
+                        tenant_id = %self.tenant_id,
+                        activated_timelines,
+                        timelines_broken_during_activation,
+                        total_timelines,
+                        post_state = <&'static str>::from(&*current_state),
+                        "activation attempt finished"
+                    );
                }
            }
        });
@@ -1812,6 +1903,9 @@ impl Tenant {
        Tenant {
            tenant_id,
            conf,
+            // using now here is a good enough approximation to catch tenants with really long
+            // activation times.
+            loading_started_at: Instant::now(),
            tenant_conf: Arc::new(RwLock::new(tenant_conf)),
            timelines: Mutex::new(HashMap::new()),
            gc_cs: tokio::sync::Mutex::new(()),
@@ -2326,8 +2420,6 @@ impl Tenant {
                )
            })?;

-        // Initialize the timeline without loading the layer map, because we already updated the layer
-        // map above, when we imported the datadir.
        let timeline = {
            let mut timelines = self.timelines.lock().unwrap();
            raw_timeline.initialize_with_lock(ctx, &mut timelines, false, true)?
@@ -2857,7 +2949,13 @@ pub mod harness {
            };

            LOG_HANDLE.get_or_init(|| {
-                logging::init(logging::LogFormat::Test).expect("Failed to init test logging")
+                logging::init(
+                    logging::LogFormat::Test,
+                    // enable it in case the tests exercise code paths that use
+                    // debug_assert_current_span_has_tenant_and_timeline_id
+                    logging::TracingErrorLayerEnablement::EnableWithRustLogFilter,
+                )
+                .expect("Failed to init test logging")
            });

            let repo_dir = PageServerConf::test_repo_dir(test_name);
--- a/pageserver/src/tenant/layer_map.rs
+++ b/pageserver/src/tenant/layer_map.rs
@@ -48,11 +48,10 @@ mod layer_coverage;

 use crate::context::RequestContext;
 use crate::keyspace::KeyPartitioning;
-use crate::metrics::NUM_ONDISK_LAYERS;
 use crate::repository::Key;
 use crate::tenant::storage_layer::InMemoryLayer;
 use crate::tenant::storage_layer::Layer;
-use anyhow::{bail, Result};
+use anyhow::Result;
 use std::collections::VecDeque;
 use std::ops::Range;
 use std::sync::Arc;
@@ -126,7 +125,7 @@ where
    ///
    /// Insert an on-disk layer.
    ///
-    pub fn insert_historic(&mut self, layer: Arc<L>) -> anyhow::Result<()> {
+    pub fn insert_historic(&mut self, layer: Arc<L>) {
        self.layer_map.insert_historic_noflush(layer)
    }

@@ -274,22 +273,16 @@ where
    ///
    /// Helper function for BatchedUpdates::insert_historic
    ///
-    pub(self) fn insert_historic_noflush(&mut self, layer: Arc<L>) -> anyhow::Result<()> {
-        let key = historic_layer_coverage::LayerKey::from(&*layer);
-        if self.historic.contains(&key) {
-            bail!(
-                "Attempt to insert duplicate layer {} in layer map",
-                layer.short_id()
-            );
-        }
-        self.historic.insert(key, Arc::clone(&layer));
+    pub(self) fn insert_historic_noflush(&mut self, layer: Arc<L>) {
+        // TODO: See #3869, resulting #4088, attempted fix and repro #4094
+        self.historic.insert(
+            historic_layer_coverage::LayerKey::from(&*layer),
+            Arc::clone(&layer),
+        );

        if Self::is_l0(&layer) {
            self.l0_delta_layers.push(layer);
        }
-
-        NUM_ONDISK_LAYERS.inc();
-        Ok(())
    }

    ///
@@ -314,8 +307,6 @@ where
                "failed to locate removed historic layer from l0_delta_layers"
            );
        }
-
-        NUM_ONDISK_LAYERS.dec();
    }

    pub(self) fn replace_historic_noflush(
@@ -843,7 +834,7 @@ mod tests {

            let expected_in_counts = (1, usize::from(expected_l0));

-            map.batch_update().insert_historic(remote.clone()).unwrap();
+            map.batch_update().insert_historic(remote.clone());
            assert_eq!(count_layer_in(&map, &remote), expected_in_counts);

            let replaced = map
--- a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs
+++ b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs
@@ -417,14 +417,6 @@ impl<Value: Clone> BufferedHistoricLayerCoverage<Value> {
        }
    }

-    pub fn contains(&self, layer_key: &LayerKey) -> bool {
-        match self.buffer.get(layer_key) {
-            Some(None) => false,                         // layer remove was buffered
-            Some(_) => true,                             // layer insert was buffered
-            None => self.layers.contains_key(layer_key), // no buffered ops for this layer
-        }
-    }
-
    pub fn insert(&mut self, layer_key: LayerKey, value: Value) {
        self.buffer.insert(layer_key, Some(value));
    }
--- a/pageserver/src/tenant/metadata.rs
+++ b/pageserver/src/tenant/metadata.rs
@@ -12,6 +12,7 @@ use std::io::Write;
 use anyhow::{bail, ensure, Context};
 use serde::{Deserialize, Serialize};
 use tracing::info_span;
+use utils::bin_ser::SerializeError;
 use utils::{
    bin_ser::BeSer,
    id::{TenantId, TimelineId},
@@ -182,7 +183,7 @@ impl TimelineMetadata {
        }
    }

-    pub fn to_bytes(&self) -> anyhow::Result<Vec<u8>> {
+    pub fn to_bytes(&self) -> Result<Vec<u8>, SerializeError> {
        let body_bytes = self.body.ser()?;
        let metadata_size = METADATA_HDR_SIZE + body_bytes.len();
        let hdr = TimelineMetadataHeader {
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -204,8 +204,11 @@ mod download;
 pub mod index;
 mod upload;

+use anyhow::Context;
+use chrono::Utc;
 // re-export these
 pub use download::{is_temp_download_file, list_remote_timelines};
+use scopeguard::ScopeGuard;

 use std::sync::atomic::{AtomicU32, Ordering};
 use std::sync::{Arc, Mutex};
@@ -213,13 +216,14 @@ use std::sync::{Arc, Mutex};
 use remote_storage::{DownloadError, GenericRemoteStorage};
 use std::ops::DerefMut;
 use tokio::runtime::Runtime;
-use tracing::{debug, info, warn};
+use tracing::{debug, error, info, warn};
 use tracing::{info_span, Instrument};
 use utils::lsn::Lsn;

 use crate::metrics::{
    MeasureRemoteOp, RemoteOpFileKind, RemoteOpKind, RemoteTimelineClientMetrics,
-    REMOTE_ONDEMAND_DOWNLOADED_BYTES, REMOTE_ONDEMAND_DOWNLOADED_LAYERS,
+    RemoteTimelineClientMetricsCallTrackSize, REMOTE_ONDEMAND_DOWNLOADED_BYTES,
+    REMOTE_ONDEMAND_DOWNLOADED_LAYERS,
 };
 use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
 use crate::{
@@ -252,6 +256,22 @@ const FAILED_DOWNLOAD_RETRIES: u32 = 10;
 // retries. Uploads and deletions are retried forever, though.
 const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3;

+pub enum MaybeDeletedIndexPart {
+    IndexPart(IndexPart),
+    Deleted,
+}
+
+/// Errors that can arise when calling [`RemoteTimelineClient::stop`].
+#[derive(Debug, thiserror::Error)]
+pub enum StopError {
+    /// Returned if the upload queue was never initialized.
+    /// See [`RemoteTimelineClient::init_upload_queue`] and [`RemoteTimelineClient::init_upload_queue_for_empty_remote`].
+    #[error("queue is not initialized")]
+    QueueUninitialized,
+    #[error("queue is broken")]
+    QueueBroken,
+}
+
 /// A client for accessing a timeline's data in remote storage.
 ///
 /// This takes care of managing the number of connections, and balancing them
@@ -335,6 +355,7 @@ impl RemoteTimelineClient {
    pub fn last_uploaded_consistent_lsn(&self) -> Option<Lsn> {
        match &*self.upload_queue.lock().unwrap() {
            UploadQueue::Uninitialized => None,
+            UploadQueue::Broken => None, // could we return something?
            UploadQueue::Initialized(q) => Some(q.last_uploaded_consistent_lsn),
            UploadQueue::Stopped(q) => Some(q.last_uploaded_consistent_lsn),
        }
@@ -366,12 +387,16 @@ impl RemoteTimelineClient {
    //

    /// Download index file
-    pub async fn download_index_file(&self) -> Result<IndexPart, DownloadError> {
-        let _unfinished_gauge_guard = self
-            .metrics
-            .call_begin(&RemoteOpFileKind::Index, &RemoteOpKind::Download);
+    pub async fn download_index_file(&self) -> Result<MaybeDeletedIndexPart, DownloadError> {
+        let _unfinished_gauge_guard = self.metrics.call_begin(
+            &RemoteOpFileKind::Index,
+            &RemoteOpKind::Download,
+            crate::metrics::RemoteTimelineClientMetricsCallTrackSize::DontTrackSize {
+                reason: "no need for a downloads gauge",
+            },
+        );

-        download::download_index_part(
+        let index_part = download::download_index_part(
            self.conf,
            &self.storage_impl,
            self.tenant_id,
@@ -384,7 +409,13 @@ impl RemoteTimelineClient {
            RemoteOpKind::Download,
            Arc::clone(&self.metrics),
        )
-        .await
+        .await?;
+
+        if index_part.deleted_at.is_some() {
+            Ok(MaybeDeletedIndexPart::Deleted)
+        } else {
+            Ok(MaybeDeletedIndexPart::IndexPart(index_part))
+        }
    }

    /// Download a (layer) file from `path`, into local filesystem.
@@ -398,9 +429,13 @@ impl RemoteTimelineClient {
        layer_metadata: &LayerFileMetadata,
    ) -> anyhow::Result<u64> {
        let downloaded_size = {
-            let _unfinished_gauge_guard = self
-                .metrics
-                .call_begin(&RemoteOpFileKind::Layer, &RemoteOpKind::Download);
+            let _unfinished_gauge_guard = self.metrics.call_begin(
+                &RemoteOpFileKind::Layer,
+                &RemoteOpKind::Download,
+                crate::metrics::RemoteTimelineClientMetricsCallTrackSize::DontTrackSize {
+                    reason: "no need for a downloads gauge",
+                },
+            );
            download::download_layer_file(
                self.conf,
                &self.storage_impl,
@@ -615,6 +650,95 @@ impl RemoteTimelineClient {
        Ok(())
    }

+    /// Set the `deleted_at` field in the remote index file.
+    ///
+    /// Fails if the upload queue is not in state `Stopped` or if `deleted_at` is already set.
+    /// If serializing or uploading the index part fails, the in-memory `deleted_at` ends up unset.
+    ///
+    /// The caller is responsible for calling `stop()` AND for waiting for any ongoing upload
+    /// tasks to finish after `stop()` has succeeded, see [`RemoteTimelineClient::stop`].
+    pub(crate) async fn persist_index_part_with_deleted_flag(
+        self: &Arc<Self>,
+    ) -> anyhow::Result<()> {
+        let index_part_with_deleted_at = {
+            let mut locked = self.upload_queue.lock().unwrap();
+
+            // We must be in the Stopped state because otherwise an in-progress
+            // index part upload could overwrite the file with one that is
+            // missing the deleted_at field that we are going to set below.
+            let stopped = match &mut *locked {
+                UploadQueue::Uninitialized | UploadQueue::Initialized(_) | UploadQueue::Broken => {
+                    anyhow::bail!(
+                        "upload queue must be in state Stopped, but is in state {}",
+                        locked.as_str()
+                    );
+                }
+                UploadQueue::Stopped(stopped) => stopped,
+            };
+
+            if let Some(deleted_at) = stopped.deleted_at.as_ref() {
+                anyhow::bail!("timeline is deleting, deleted_at: {:?}", deleted_at);
+            }
+            let deleted_at = Utc::now().naive_utc();
+            let mut index_part = IndexPart::new(
+                stopped.latest_files.clone(),
+                stopped.last_uploaded_consistent_lsn,
+                stopped
+                    .latest_metadata
+                    .to_bytes()
+                    .context("serialize metadata")?,
+            );
+            index_part.deleted_at = Some(deleted_at);
+            // Only set the flag after serialization succeeded, so an error above leaves no trace.
+            stopped.deleted_at = Some(deleted_at);
+            index_part
+        };
+        // Until the upload succeeds, any early exit (error, panic, drop) must reset deleted_at.
+        let undo_deleted_at = scopeguard::guard(Arc::clone(self), |self_clone| {
+            let mut locked = self_clone.upload_queue.lock().unwrap();
+            let stopped = match &mut *locked {
+                UploadQueue::Broken | UploadQueue::Uninitialized | UploadQueue::Initialized(_) => {
+                    unreachable!(
+                        "there's no way out of Stopped, and we checked it's Stopped above: {:?}",
+                        locked.as_str(),
+                    )
+                }
+                UploadQueue::Stopped(stopped) => stopped,
+            };
+            stopped.deleted_at = None;
+        });
+
+        #[cfg(feature = "testing")]
+        tokio::task::spawn_blocking({
+            let current = tracing::Span::current();
+            move || {
+                let _entered = current.entered();
+                tracing::info!(
+                    "at failpoint persist_index_part_with_deleted_flag_after_set_before_upload_pause"
+                );
+                fail::fail_point!(
+                    "persist_index_part_with_deleted_flag_after_set_before_upload_pause"
+                );
+            }
+        })
+        .await
+        .expect("spawn_blocking");
+
+        upload::upload_index_part(
+            self.conf,
+            &self.storage_impl,
+            self.tenant_id,
+            self.timeline_id,
+            &index_part_with_deleted_at,
+        )
+        .await?;
+
+        // Upload succeeded: defuse the guard so the deleted_at flag stays set.
+        ScopeGuard::into_inner(undo_deleted_at);
+
+        Ok(())
+    }
+
    ///
    /// Pick next tasks from the queue, and start as many of them as possible without violating
    /// the ordering constraints.
@@ -731,9 +855,19 @@ impl RemoteTimelineClient {
            // is cancellation safe, so we don't dare to do that. Hopefully, the
            // upload finishes or times out soon enough.
            if task_mgr::is_shutdown_requested() {
-                info!("upload task cancelled by shutdown request");
+                info!("upload task cancelled by shutdown request, stopping queue");
+                match self.stop() {
+                    Ok(()) => {}
+                    Err(StopError::QueueBroken) => {
+                        warn!("stop() observed upload queue as broken");
+                        // In this case, it's still ok to proceed with balancing out the metric and returning.
+                        // (The metric has nothing to do with the queue state itself).
+                    }
+                    Err(StopError::QueueUninitialized) => {
+                        unreachable!("we never launch an upload task if the queue is uninitialized, and once it is initialized, we never go back")
+                    }
+                }
                self.calls_unfinished_metric_end(&task.op);
-                self.stop();
                return;
            }

@@ -858,6 +992,10 @@ impl RemoteTimelineClient {
                    info!("another concurrent task already stopped the queue");
                    return;
                }, // nothing to do
+                UploadQueue::Broken => {
+                    warn!("the upload queue became broken while the task was running");
+                    return;
+                }
                UploadQueue::Initialized(qi) => { qi }
            };

@@ -886,11 +1024,32 @@ impl RemoteTimelineClient {
    fn calls_unfinished_metric_impl(
        &self,
        op: &UploadOp,
-    ) -> Option<(RemoteOpFileKind, RemoteOpKind)> {
+    ) -> Option<(
+        RemoteOpFileKind,
+        RemoteOpKind,
+        RemoteTimelineClientMetricsCallTrackSize,
+    )> {
+        use RemoteTimelineClientMetricsCallTrackSize::DontTrackSize;
        let res = match op {
-            UploadOp::UploadLayer(_, _) => (RemoteOpFileKind::Layer, RemoteOpKind::Upload),
-            UploadOp::UploadMetadata(_, _) => (RemoteOpFileKind::Index, RemoteOpKind::Upload),
-            UploadOp::Delete(file_kind, _) => (*file_kind, RemoteOpKind::Delete),
+            UploadOp::UploadLayer(_, m) => (
+                RemoteOpFileKind::Layer,
+                RemoteOpKind::Upload,
+                RemoteTimelineClientMetricsCallTrackSize::Bytes(m.file_size()),
+            ),
+            UploadOp::UploadMetadata(_, _) => (
+                RemoteOpFileKind::Index,
+                RemoteOpKind::Upload,
+                DontTrackSize {
+                    reason: "metadata uploads are tiny",
+                },
+            ),
+            UploadOp::Delete(file_kind, _) => (
+                *file_kind,
+                RemoteOpKind::Delete,
+                DontTrackSize {
+                    reason: "should we track deletes? positive or negative sign?",
+                },
+            ),
            UploadOp::Barrier(_) => {
                // we do not account these
                return None;
@@ -900,80 +1059,101 @@ impl RemoteTimelineClient {
    }

    fn calls_unfinished_metric_begin(&self, op: &UploadOp) {
-        let (file_kind, op_kind) = match self.calls_unfinished_metric_impl(op) {
+        let (file_kind, op_kind, track_bytes) = match self.calls_unfinished_metric_impl(op) {
            Some(x) => x,
            None => return,
        };
-        let guard = self.metrics.call_begin(&file_kind, &op_kind);
+        let guard = self.metrics.call_begin(&file_kind, &op_kind, track_bytes);
        guard.will_decrement_manually(); // in unfinished_ops_metric_end()
    }

    fn calls_unfinished_metric_end(&self, op: &UploadOp) {
-        let (file_kind, op_kind) = match self.calls_unfinished_metric_impl(op) {
+        let (file_kind, op_kind, track_bytes) = match self.calls_unfinished_metric_impl(op) {
            Some(x) => x,
            None => return,
        };
-        self.metrics.call_end(&file_kind, &op_kind);
+        self.metrics.call_end(&file_kind, &op_kind, track_bytes);
    }

-    fn stop(&self) {
+    /// Close the upload queue for new operations and cancel queued operations.
+    /// In-progress operations will still be running after this function returns.
+    /// Use `task_mgr::shutdown_tasks(None, Some(self.tenant_id), Some(self.timeline_id))`
+    /// to wait for them to complete, after calling this function.
+    pub fn stop(&self) -> Result<(), StopError> {
        // Whichever *task* for this RemoteTimelineClient grabs the mutex first will transition the queue
        // into stopped state, thereby dropping all off the queued *ops* which haven't become *tasks* yet.
        // The other *tasks* will come here and observe an already shut down queue and hence simply wrap up their business.
        let mut guard = self.upload_queue.lock().unwrap();
-        match &*guard {
-            UploadQueue::Uninitialized => panic!(
-                "callers are responsible for ensuring this is only called on initialized queue"
-            ),
+        // If any of the code below panics, the queue remains in the Broken state.
+        // If we're coming from the Initialized state, `queued_operations` is dropped
+        // during unwinding, because the old queue is owned by locals of this function.
+        // Any `wait_completion` call waiting on those queued operations
+        // will observe an error. That's exactly what we want.
+        // We don't need to care about in-progress operations because that responsibility
+        // lies with the caller. There's no point in the caller catching the panic and
+        // retrying the stop() call: the retry would just return StopError::QueueBroken.
+        let owned = std::mem::replace(&mut *guard, UploadQueue::Broken);
+        let res;
+        *guard = match owned {
+            UploadQueue::Broken => {
+                res = Err(StopError::QueueBroken);
+                owned
+            }
+            UploadQueue::Uninitialized => {
+                res = Err(StopError::QueueUninitialized);
+                owned
+            }
            UploadQueue::Stopped(_) => {
                // nothing to do
                info!("another concurrent task already shut down the queue");
+                res = Ok(());
+                owned
            }
            UploadQueue::Initialized(qi) => {
-                info!("shutting down upload queue");
-
-                // Replace the queue with the Stopped state, taking ownership of the old
-                // Initialized queue. We will do some checks on it, and then drop it.
-                let qi = {
-                    let last_uploaded_consistent_lsn = qi.last_uploaded_consistent_lsn;
-                    let upload_queue = std::mem::replace(
-                        &mut *guard,
-                        UploadQueue::Stopped(UploadQueueStopped {
-                            last_uploaded_consistent_lsn,
-                        }),
-                    );
-                    if let UploadQueue::Initialized(qi) = upload_queue {
-                        qi
-                    } else {
-                        unreachable!("we checked in the match above that it is Initialized");
-                    }
-                };
+                let UploadQueueInitialized {
+                    task_counter: _,
+                    latest_files,
+                    // XXX need to think about what it means if it's non-zero here
+                    latest_files_changes_since_metadata_upload_scheduled: _,
+                    latest_metadata,
+                    last_uploaded_consistent_lsn,
+                    num_inprogress_layer_uploads,
+                    num_inprogress_metadata_uploads,
+                    num_inprogress_deletions,
+                    inprogress_tasks,
+                    queued_operations,
+                } = qi;

                // consistency check
                assert_eq!(
-                    qi.num_inprogress_layer_uploads
-                        + qi.num_inprogress_metadata_uploads
-                        + qi.num_inprogress_deletions,
-                    qi.inprogress_tasks.len()
+                    num_inprogress_layer_uploads
+                        + num_inprogress_metadata_uploads
+                        + num_inprogress_deletions,
+                    inprogress_tasks.len()
                );

                // We don't need to do anything here for in-progress tasks. They will finish
                // on their own, decrement the unfinished-task counter themselves, and observe
                // that the queue is Stopped.
-                drop(qi.inprogress_tasks);
+                drop(inprogress_tasks);

                // Tear down queued ops
-                for op in qi.queued_operations.into_iter() {
+                for op in queued_operations.into_iter() {
                    self.calls_unfinished_metric_end(&op);
                    // Dropping UploadOp::Barrier() here will make wait_completion() return with an Err()
                    // which is exactly what we want to happen.
                    drop(op);
                }
-
-                // We're done.
-                drop(guard);
+                res = Ok(());
+                UploadQueue::Stopped(UploadQueueStopped {
+                    latest_files,
+                    last_uploaded_consistent_lsn,
+                    latest_metadata,
+                    deleted_at: None,
+                })
            }
-        }
+        };
+        res
    }
 }

@@ -981,11 +1161,19 @@ impl RemoteTimelineClient {
 mod tests {
    use super::*;
    use crate::{
-        tenant::harness::{TenantHarness, TIMELINE_ID},
+        context::RequestContext,
+        tenant::{
+            harness::{TenantHarness, TIMELINE_ID},
+            Tenant,
+        },
        DEFAULT_PG_VERSION,
    };
    use remote_storage::{RemoteStorageConfig, RemoteStorageKind};
-    use std::{collections::HashSet, path::Path};
+    use std::{
+        collections::HashSet,
+        path::{Path, PathBuf},
+    };
+    use tokio::runtime::EnterGuard;
    use utils::lsn::Lsn;

    pub(super) fn dummy_contents(name: &str) -> Vec<u8> {
@@ -1034,39 +1222,80 @@ mod tests {
        assert_eq!(found, expected);
    }

+    struct TestSetup {
+        runtime: &'static tokio::runtime::Runtime,
+        entered_runtime: EnterGuard<'static>,
+        harness: TenantHarness<'static>,
+        tenant: Arc<Tenant>,
+        tenant_ctx: RequestContext,
+        remote_fs_dir: PathBuf,
+        client: Arc<RemoteTimelineClient>,
+    }
+
+    impl TestSetup {
+        /// Creates the runtime, tenant, empty timeline and a LocalFs-backed client for one test.
+        fn new(test_name: &str) -> anyhow::Result<Self> {
+            // Use a leaked current-thread runtime; the leak yields the `'static` borrow we store
+            let runtime = Box::leak(Box::new(
+                tokio::runtime::Builder::new_current_thread()
+                    .enable_all()
+                    .build()?,
+            ));
+            let entered_runtime = runtime.enter();
+
+            let test_name = Box::leak(Box::new(format!("remote_timeline_client__{test_name}")));
+            let harness = TenantHarness::create(test_name)?;
+            let (tenant, ctx) = runtime.block_on(harness.load());
+            // create an empty timeline directory
+            let timeline =
+                tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?;
+            let _ = timeline.initialize(&ctx).unwrap();
+
+            let remote_fs_dir = harness.conf.workdir.join("remote_fs");
+            std::fs::create_dir_all(&remote_fs_dir)?;
+            let remote_fs_dir = std::fs::canonicalize(remote_fs_dir)?;
+
+            let storage_config = RemoteStorageConfig {
+                max_concurrent_syncs: std::num::NonZeroUsize::new(
+                    remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS,
+                )
+                .unwrap(),
+                max_sync_errors: std::num::NonZeroU32::new(
+                    remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS,
+                )
+                .unwrap(),
+                storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()),
+            };
+            let storage = GenericRemoteStorage::from_config(&storage_config)?;
+
+            let client = Arc::new(RemoteTimelineClient {
+                conf: harness.conf,
+                runtime,
+                tenant_id: harness.tenant_id,
+                timeline_id: TIMELINE_ID,
+                storage_impl: storage,
+                upload_queue: Mutex::new(UploadQueue::Uninitialized),
+                metrics: Arc::new(RemoteTimelineClientMetrics::new(
+                    &harness.tenant_id,
+                    &TIMELINE_ID,
+                )),
+            });
+
+            Ok(Self {
+                runtime,
+                entered_runtime,
+                harness,
+                tenant,
+                tenant_ctx: ctx,
+                remote_fs_dir,
+                client,
+            })
+        }
+    }
+
    // Test scheduling
    #[test]
    fn upload_scheduling() -> anyhow::Result<()> {
-        // Use a current-thread runtime in the test
-        let runtime = Box::leak(Box::new(
-            tokio::runtime::Builder::new_current_thread()
-                .enable_all()
-                .build()?,
-        ));
-        let _entered = runtime.enter();
-
-        let harness = TenantHarness::create("upload_scheduling")?;
-        let (tenant, ctx) = runtime.block_on(harness.load());
-        let _timeline =
-            tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?;
-        let timeline_path = harness.timeline_path(&TIMELINE_ID);
-
-        let remote_fs_dir = harness.conf.workdir.join("remote_fs");
-        std::fs::create_dir_all(remote_fs_dir)?;
-        let remote_fs_dir = std::fs::canonicalize(harness.conf.workdir.join("remote_fs"))?;
-
-        let storage_config = RemoteStorageConfig {
-            max_concurrent_syncs: std::num::NonZeroUsize::new(
-                remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS,
-            )
-            .unwrap(),
-            max_sync_errors: std::num::NonZeroU32::new(
-                remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS,
-            )
-            .unwrap(),
-            storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()),
-        };
-
        // Test outline:
        //
        // Schedule upload of a bunch of layers. Check that they are started immediately, not queued
@@ -1081,21 +1310,19 @@ mod tests {
        // Schedule another deletion. Check that it's launched immediately.
        // Schedule index upload. Check that it's queued

-        println!("workdir: {}", harness.conf.workdir.display());
-
-        let storage_impl = GenericRemoteStorage::from_config(&storage_config)?;
-        let client = Arc::new(RemoteTimelineClient {
-            conf: harness.conf,
+        let TestSetup {
            runtime,
-            tenant_id: harness.tenant_id,
-            timeline_id: TIMELINE_ID,
-            storage_impl,
-            upload_queue: Mutex::new(UploadQueue::Uninitialized),
-            metrics: Arc::new(RemoteTimelineClientMetrics::new(
-                &harness.tenant_id,
-                &TIMELINE_ID,
-            )),
-        });
+            entered_runtime: _entered_runtime,
+            harness,
+            tenant: _tenant,
+            tenant_ctx: _tenant_ctx,
+            remote_fs_dir,
+            client,
+        } = TestSetup::new("upload_scheduling").unwrap();
+
+        let timeline_path = harness.timeline_path(&TIMELINE_ID);
+
+        println!("workdir: {}", harness.conf.workdir.display());

        let remote_timeline_dir =
            remote_fs_dir.join(timeline_path.strip_prefix(&harness.conf.workdir)?);
@@ -1163,7 +1390,11 @@ mod tests {
        }

        // Download back the index.json, and check that the list of files is correct
-        let index_part = runtime.block_on(client.download_index_file())?;
+        let index_part = match runtime.block_on(client.download_index_file())? {
+            MaybeDeletedIndexPart::IndexPart(index_part) => index_part,
+            MaybeDeletedIndexPart::Deleted => panic!("unexpectedly got deleted index part"),
+        };
+
        assert_file_list(
            &index_part.timeline_layers,
            &[
@@ -1216,4 +1447,90 @@ mod tests {

        Ok(())
    }
+
+    #[test]
+    fn bytes_unfinished_gauge_for_layer_file_uploads() -> anyhow::Result<()> {
+        // Setup
+
+        let TestSetup {
+            runtime,
+            harness,
+            client,
+            ..
+        } = TestSetup::new("metrics")?;
+
+        let metadata = dummy_metadata(Lsn(0x10));
+        client.init_upload_queue_for_empty_remote(&metadata)?;
+
+        let timeline_path = harness.timeline_path(&TIMELINE_ID);
+
+        let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
+        let content_1 = dummy_contents("foo");
+        std::fs::write(
+            timeline_path.join(layer_file_name_1.file_name()),
+            &content_1,
+        )?;
+
+        #[derive(Debug, PartialEq)]
+        struct BytesStartedFinished {
+            started: Option<usize>,
+            finished: Option<usize>,
+        }
+        let get_bytes_started_stopped = || {
+            let started = client
+                .metrics
+                .get_bytes_started_counter_value(&RemoteOpFileKind::Layer, &RemoteOpKind::Upload)
+                .map(|v| v.try_into().unwrap());
+            let stopped = client
+                .metrics
+                .get_bytes_finished_counter_value(&RemoteOpFileKind::Layer, &RemoteOpKind::Upload)
+                .map(|v| v.try_into().unwrap());
+            BytesStartedFinished {
+                started,
+                finished: stopped,
+            }
+        };
+
+        // Test
+
+        let init = get_bytes_started_stopped();
+
+        client.schedule_layer_file_upload(
+            &layer_file_name_1,
+            &LayerFileMetadata::new(content_1.len() as u64),
+        )?;
+
+        let pre = get_bytes_started_stopped();
+
+        runtime.block_on(client.wait_completion())?;
+
+        let post = get_bytes_started_stopped();
+
+        // Validate
+
+        assert_eq!(
+            init,
+            BytesStartedFinished {
+                started: None,
+                finished: None
+            }
+        );
+        assert_eq!(
+            pre,
+            BytesStartedFinished {
+                started: Some(content_1.len()),
+                // assert that the _finished metric is created eagerly so that subtractions work on first sample
+                finished: Some(0),
+            }
+        );
+        assert_eq!(
+            post,
+            BytesStartedFinished {
+                started: Some(content_1.len()),
+                finished: Some(content_1.len())
+            }
+        );
+
+        Ok(())
+    }
 }
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -16,6 +16,7 @@ use tracing::{info, warn};

 use crate::config::PageServerConf;
 use crate::tenant::storage_layer::LayerFileName;
+use crate::tenant::timeline::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS};
 use remote_storage::{DownloadError, GenericRemoteStorage};
 use utils::crashsafe::path_with_suffix_extension;
@@ -43,6 +44,8 @@ pub async fn download_layer_file<'a>(
    layer_file_name: &'a LayerFileName,
    layer_metadata: &'a LayerFileMetadata,
 ) -> Result<u64, DownloadError> {
+    debug_assert_current_span_has_tenant_and_timeline_id();
+
    let timeline_path = conf.timeline_path(&timeline_id, &tenant_id);

    let local_path = timeline_path.join(layer_file_name.file_name());
@@ -154,7 +157,7 @@ pub async fn download_layer_file<'a>(
        .with_context(|| format!("Could not fsync layer file {}", local_path.display(),))
        .map_err(DownloadError::Other)?;

-    tracing::info!("download complete: {}", local_path.display());
+    tracing::debug!("download complete: {}", local_path.display());

    Ok(bytes_amount)
 }
--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -4,6 +4,7 @@

 use std::collections::{HashMap, HashSet};

+use chrono::NaiveDateTime;
 use serde::{Deserialize, Serialize};
 use serde_with::{serde_as, DisplayFromStr};

@@ -55,6 +56,10 @@ pub struct IndexPart {
    #[serde(default)]
    version: usize,

+    #[serde(default)]
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub deleted_at: Option<NaiveDateTime>,
+
    /// Layer names, which are stored on the remote storage.
    ///
    /// Additional metadata can might exist in `layer_metadata`.
@@ -78,7 +83,7 @@ impl IndexPart {
    /// used to understand later versions.
    ///
    /// Version is currently informative only.
-    const LATEST_VERSION: usize = 1;
+    const LATEST_VERSION: usize = 2;
    pub const FILE_NAME: &'static str = "index_part.json";

    pub fn new(
@@ -101,6 +106,7 @@ impl IndexPart {
            layer_metadata,
            disk_consistent_lsn,
            metadata_bytes,
+            deleted_at: None,
        }
    }

@@ -156,6 +162,7 @@ mod tests {
            ]),
            disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
            metadata_bytes: [113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
+            deleted_at: None,
        };

        let part = serde_json::from_str::<IndexPart>(example).unwrap();
@@ -192,6 +199,7 @@ mod tests {
            ]),
            disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
            metadata_bytes: [112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
+            deleted_at: None,
        };

        let part = serde_json::from_str::<IndexPart>(example).unwrap();
@@ -236,6 +244,7 @@ mod tests {
                0, 0,
            ]
            .to_vec(),
+            deleted_at: None,
        };

        let empty_layers_parsed = serde_json::from_str::<IndexPart>(empty_layers_json).unwrap();
--- a/pageserver/src/tenant/remote_timeline_client/upload.rs
+++ b/pageserver/src/tenant/remote_timeline_client/upload.rs
@@ -19,9 +19,12 @@ pub(super) async fn upload_index_part<'a>(
    timeline_id: TimelineId,
    index_part: &'a IndexPart,
 ) -> anyhow::Result<()> {
+    tracing::trace!("uploading new index part");
+
    fail_point!("before-upload-index", |_| {
        bail!("failpoint before-upload-index")
    });
+
    let index_part_bytes = serde_json::to_vec(&index_part)
        .context("Failed to serialize index part file into bytes")?;
    let index_part_size = index_part_bytes.len();
@@ -31,6 +34,7 @@ pub(super) async fn upload_index_part<'a>(
        .metadata_path(timeline_id, tenant_id)
        .with_file_name(IndexPart::FILE_NAME);
    let storage_path = conf.remote_path(&index_part_path)?;
+
    storage
        .upload_storage_object(Box::new(index_part_bytes), index_part_size, &storage_path)
        .await
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -48,7 +48,7 @@ use crate::tenant::{

 use crate::config::PageServerConf;
 use crate::keyspace::{KeyPartitioning, KeySpace};
-use crate::metrics::TimelineMetrics;
+use crate::metrics::{TimelineMetrics, UNEXPECTED_ONDEMAND_DOWNLOADS};
 use crate::pgdatadir_mapping::LsnForTimestamp;
 use crate::pgdatadir_mapping::{is_rel_fsm_block_key, is_rel_vm_block_key};
 use crate::pgdatadir_mapping::{BlockNumber, CalculateLogicalSizeError};
@@ -588,15 +588,25 @@ impl Timeline {

        let _timer = self.metrics.wait_lsn_time_histo.start_timer();

-        self.last_record_lsn.wait_for_timeout(lsn, self.conf.wait_lsn_timeout).await
-            .with_context(||
-                format!(
-                    "Timed out while waiting for WAL record at LSN {} to arrive, last_record_lsn {} disk consistent LSN={}",
-                    lsn, self.get_last_record_lsn(), self.get_disk_consistent_lsn()
-                )
-            )?;
-
-        Ok(())
+        match self
+            .last_record_lsn
+            .wait_for_timeout(lsn, self.conf.wait_lsn_timeout)
+            .await
+        {
+            Ok(()) => Ok(()),
+            seqwait_error => {
+                drop(_timer);
+                let walreceiver_status = self.walreceiver.status().await;
+                seqwait_error.with_context(|| format!(
+                    "Timed out while waiting for WAL record at LSN {} to arrive, last_record_lsn {} disk consistent LSN={}, {}",
+                    lsn,
+                    self.get_last_record_lsn(),
+                    self.get_disk_consistent_lsn(),
+                    walreceiver_status.map(|status| status.to_human_readable_string())
+                            .unwrap_or_else(|| "WalReceiver status: Not active".to_string()),
+                ))
+            }
+        }
    }

    /// Check that it is valid to request operations with that lsn.
@@ -936,6 +946,7 @@ impl Timeline {
        }
    }

+    #[instrument(skip_all, fields(tenant = %self.tenant_id, timeline = %self.timeline_id))]
    pub async fn download_layer(&self, layer_file_name: &str) -> anyhow::Result<Option<bool>> {
        let Some(layer) = self.find_layer(layer_file_name) else { return Ok(None) };
        let Some(remote_layer) = layer.downcast_remote_layer() else { return  Ok(Some(false)) };
@@ -1483,7 +1494,7 @@ impl Timeline {

                trace!("found layer {}", layer.path().display());
                total_physical_size += file_size;
-                updates.insert_historic(Arc::new(layer))?;
+                updates.insert_historic(Arc::new(layer));
                num_layers += 1;
            } else if let Some(deltafilename) = DeltaFileName::parse_str(&fname) {
                // Create a DeltaLayer struct for each delta file.
@@ -1515,7 +1526,7 @@ impl Timeline {

                trace!("found layer {}", layer.path().display());
                total_physical_size += file_size;
-                updates.insert_historic(Arc::new(layer))?;
+                updates.insert_historic(Arc::new(layer));
                num_layers += 1;
            } else if fname == METADATA_FILE_NAME || fname.ends_with(".old") {
                // ignore these
@@ -1589,7 +1600,7 @@ impl Timeline {
            // remote index file?
            // If so, rename_to_backup those files & replace their local layer with
            // a RemoteLayer in the layer map so that we re-download them on-demand.
-            if let Some(local_layer) = &local_layer {
+            if let Some(local_layer) = local_layer {
                let local_layer_path = local_layer
                    .local_path()
                    .expect("caller must ensure that local_layers only contains local layers");
@@ -1614,6 +1625,7 @@ impl Timeline {
                        anyhow::bail!("could not rename file {local_layer_path:?}: {err:?}");
                    } else {
                        self.metrics.resident_physical_size_gauge.sub(local_size);
+                        updates.remove_historic(local_layer);
                        // fall-through to adding the remote layer
                    }
                } else {
@@ -1649,11 +1661,7 @@ impl Timeline {
                    );
                    let remote_layer = Arc::new(remote_layer);

-                    if let Some(local_layer) = &local_layer {
-                        updates.replace_historic(local_layer, remote_layer)?;
-                    } else {
-                        updates.insert_historic(remote_layer)?;
-                    }
+                    updates.insert_historic(remote_layer);
                }
                LayerFileName::Delta(deltafilename) => {
                    // Create a RemoteLayer for the delta file.
@@ -1677,11 +1685,7 @@ impl Timeline {
                        LayerAccessStats::for_loading_layer(LayerResidenceStatus::Evicted),
                    );
                    let remote_layer = Arc::new(remote_layer);
-                    if let Some(local_layer) = &local_layer {
-                        updates.replace_historic(local_layer, remote_layer)?;
-                    } else {
-                        updates.insert_historic(remote_layer)?;
-                    }
+                    updates.insert_historic(remote_layer);
                }
            }
        }
@@ -2355,6 +2359,7 @@ impl Timeline {
                            id,
                            ctx.task_kind()
                        );
+                        UNEXPECTED_ONDEMAND_DOWNLOADS.inc();
                        timeline.download_remote_layer(remote_layer).await?;
                        continue 'layer_map_search;
                    }
@@ -2728,7 +2733,7 @@ impl Timeline {
            .write()
            .unwrap()
            .batch_update()
-            .insert_historic(Arc::new(new_delta))?;
+            .insert_historic(Arc::new(new_delta));

        // update the timeline's physical size
        let sz = new_delta_path.metadata()?.len();
@@ -2933,7 +2938,7 @@ impl Timeline {
            self.metrics
                .resident_physical_size_gauge
                .add(metadata.len());
-            updates.insert_historic(Arc::new(l))?;
+            updates.insert_historic(Arc::new(l));
        }
        updates.flush();
        drop(layers);
@@ -3366,7 +3371,7 @@ impl Timeline {

            new_layer_paths.insert(new_delta_path, LayerFileMetadata::new(metadata.len()));
            let x: Arc<dyn PersistentLayer + 'static> = Arc::new(l);
-            updates.insert_historic(x)?;
+            updates.insert_historic(x);
        }

        // Now that we have reshuffled the data to set of new delta layers, we can
@@ -3818,11 +3823,13 @@ impl Timeline {
    /// If the caller has a deadline or needs a timeout, they can simply stop polling:
    /// we're **cancellation-safe** because the download happens in a separate task_mgr task.
    /// So, the current download attempt will run to completion even if we stop polling.
-    #[instrument(skip_all, fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id, layer=%remote_layer.short_id()))]
+    #[instrument(skip_all, fields(layer=%remote_layer.short_id()))]
    pub async fn download_remote_layer(
        &self,
        remote_layer: Arc<RemoteLayer>,
    ) -> anyhow::Result<()> {
+        debug_assert_current_span_has_tenant_and_timeline_id();
+
        use std::sync::atomic::Ordering::Relaxed;

        let permit = match Arc::clone(&remote_layer.ongoing_download)
@@ -3866,6 +3873,8 @@ impl Timeline {
                    .await;

                if let Ok(size) = &result {
+                    info!("layer file download finished");
+
                    // XXX the temp file is still around in Err() case
                    // and consumes space until we clean up upon pageserver restart.
                    self_clone.metrics.resident_physical_size_gauge.add(*size);
@@ -3937,6 +3946,8 @@ impl Timeline {
                    updates.flush();
                    drop(layers);

+                    info!("on-demand download successful");
+
                    // Now that we've inserted the download into the layer map,
                    // close the semaphore. This will make other waiters for
                    // this download return Ok(()).
@@ -3944,7 +3955,7 @@ impl Timeline {
                    remote_layer.ongoing_download.close();
                } else {
                    // Keep semaphore open. We'll drop the permit at the end of the function.
-                    error!("on-demand download failed: {:?}", result.as_ref().unwrap_err());
+                    error!("layer file download failed: {:?}", result.as_ref().unwrap_err());
                }

                // Don't treat it as an error if the task that triggered the download
@@ -4255,3 +4266,36 @@ fn rename_to_backup(path: &Path) -> anyhow::Result<()> {

    bail!("couldn't find an unused backup number for {:?}", path)
 }
+
+#[cfg(not(debug_assertions))]
+#[inline]
+pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() {}
+
+#[cfg(debug_assertions)]
+#[inline]
+pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() {
+    use utils::tracing_span_assert;
+
+    pub static TENANT_ID_EXTRACTOR: once_cell::sync::Lazy<
+        tracing_span_assert::MultiNameExtractor<2>,
+    > = once_cell::sync::Lazy::new(|| {
+        tracing_span_assert::MultiNameExtractor::new("TenantId", ["tenant_id", "tenant"])
+    });
+
+    pub static TIMELINE_ID_EXTRACTOR: once_cell::sync::Lazy<
+        tracing_span_assert::MultiNameExtractor<2>,
+    > = once_cell::sync::Lazy::new(|| {
+        tracing_span_assert::MultiNameExtractor::new("TimelineId", ["timeline_id", "timeline"])
+    });
+
+    match tracing_span_assert::check_fields_present([
+        &*TENANT_ID_EXTRACTOR,
+        &*TIMELINE_ID_EXTRACTOR,
+    ]) {
+        Ok(()) => (),
+        Err(missing) => panic!(
+            "missing extractors: {:?}",
+            missing.into_iter().map(|e| e.name()).collect::<Vec<_>>()
+        ),
+    }
+}
--- a/pageserver/src/tenant/timeline/walreceiver.rs
+++ b/pageserver/src/tenant/timeline/walreceiver.rs
@@ -38,12 +38,14 @@ use std::sync::{Arc, Weak};
 use std::time::Duration;
 use storage_broker::BrokerClientChannel;
 use tokio::select;
-use tokio::sync::watch;
+use tokio::sync::{watch, RwLock};
 use tokio_util::sync::CancellationToken;
 use tracing::*;

 use utils::id::TenantTimelineId;

+use self::connection_manager::ConnectionManagerStatus;
+
 use super::Timeline;

 #[derive(Clone)]
@@ -63,6 +65,7 @@ pub struct WalReceiver {
    timeline_ref: Weak<Timeline>,
    conf: WalReceiverConf,
    started: AtomicBool,
+    manager_status: Arc<RwLock<Option<ConnectionManagerStatus>>>,
 }

 impl WalReceiver {
@@ -76,6 +79,7 @@ impl WalReceiver {
            timeline_ref,
            conf,
            started: AtomicBool::new(false),
+            manager_status: Arc::new(RwLock::new(None)),
        }
    }

@@ -96,8 +100,8 @@ impl WalReceiver {
        let timeline_id = timeline.timeline_id;
        let walreceiver_ctx =
            ctx.detached_child(TaskKind::WalReceiverManager, DownloadBehavior::Error);
-
        let wal_receiver_conf = self.conf.clone();
+        let loop_status = Arc::clone(&self.manager_status);
        task_mgr::spawn(
            WALRECEIVER_RUNTIME.handle(),
            TaskKind::WalReceiverManager,
@@ -115,24 +119,28 @@ impl WalReceiver {
                    select! {
                        _ = task_mgr::shutdown_watcher() => {
                            info!("WAL receiver shutdown requested, shutting down");
-                            connection_manager_state.shutdown().await;
-                            return Ok(());
+                            break;
                        },
                        loop_step_result = connection_manager_loop_step(
                            &mut broker_client,
                            &mut connection_manager_state,
                            &walreceiver_ctx,
+                            &loop_status,
                        ) => match loop_step_result {
                            ControlFlow::Continue(()) => continue,
                            ControlFlow::Break(()) => {
                                info!("Connection manager loop ended, shutting down");
-                                connection_manager_state.shutdown().await;
-                                return Ok(());
+                                break;
                            }
                        },
                    }
                }
-            }.instrument(info_span!(parent: None, "wal_connection_manager", tenant = %tenant_id, timeline = %timeline_id))
+
+                connection_manager_state.shutdown().await;
+                *loop_status.write().await = None;
+                Ok(())
+            }
+            .instrument(info_span!(parent: None, "wal_connection_manager", tenant = %tenant_id, timeline = %timeline_id))
        );

        self.started.store(true, atomic::Ordering::Release);
@@ -149,6 +157,10 @@ impl WalReceiver {
        .await;
        self.started.store(false, atomic::Ordering::Release);
    }
+
+    pub(super) async fn status(&self) -> Option<ConnectionManagerStatus> {
+        self.manager_status.read().await.clone()
+    }
 }

 /// A handle of an asynchronous task.
--- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
@@ -13,6 +13,10 @@ use std::{collections::HashMap, num::NonZeroU64, ops::ControlFlow, sync::Arc, ti

 use super::{TaskStateUpdate, WalReceiverConf};
 use crate::context::{DownloadBehavior, RequestContext};
+use crate::metrics::{
+    WALRECEIVER_ACTIVE_MANAGERS, WALRECEIVER_BROKER_UPDATES, WALRECEIVER_CANDIDATES_ADDED,
+    WALRECEIVER_CANDIDATES_REMOVED, WALRECEIVER_SWITCHES,
+};
 use crate::task_mgr::TaskKind;
 use crate::tenant::Timeline;
 use anyhow::Context;
@@ -24,6 +28,7 @@ use storage_broker::proto::SubscribeSafekeeperInfoRequest;
 use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId;
 use storage_broker::BrokerClientChannel;
 use storage_broker::Streaming;
+use tokio::sync::RwLock;
 use tokio::{select, sync::watch};
 use tracing::*;

@@ -43,6 +48,7 @@ pub(super) async fn connection_manager_loop_step(
    broker_client: &mut BrokerClientChannel,
    connection_manager_state: &mut ConnectionManagerState,
    ctx: &RequestContext,
+    manager_status: &RwLock<Option<ConnectionManagerStatus>>,
 ) -> ControlFlow<(), ()> {
    let mut timeline_state_updates = connection_manager_state
        .timeline
@@ -56,6 +62,11 @@ pub(super) async fn connection_manager_loop_step(
        }
    }

+    WALRECEIVER_ACTIVE_MANAGERS.inc();
+    scopeguard::defer! {
+        WALRECEIVER_ACTIVE_MANAGERS.dec();
+    }
+
    let id = TenantTimelineId {
        tenant_id: connection_manager_state.timeline.tenant_id,
        timeline_id: connection_manager_state.timeline.timeline_id,
@@ -180,6 +191,7 @@ pub(super) async fn connection_manager_loop_step(
                .change_connection(new_candidate, ctx)
                .await
        }
+        *manager_status.write().await = Some(connection_manager_state.manager_status());
    }
 }

@@ -267,6 +279,78 @@ pub(super) struct ConnectionManagerState {
    wal_stream_candidates: HashMap<NodeId, BrokerSkTimeline>,
 }

+/// Information about the connection manager's current connection and connection candidates.
+#[derive(Debug, Clone)]
+pub struct ConnectionManagerStatus {
+    existing_connection: Option<WalConnectionStatus>,
+    wal_stream_candidates: HashMap<NodeId, BrokerSkTimeline>,
+}
+
+impl ConnectionManagerStatus {
+    /// Generates a string describing the current connection status, suitable for logging.
+    pub fn to_human_readable_string(&self) -> String {
+        let mut resulting_string = "WalReceiver status".to_string();
+        match &self.existing_connection {
+            Some(connection) => {
+                if connection.has_processed_wal {
+                    resulting_string.push_str(&format!(
+                        " (update {}): streaming WAL from node {}, ",
+                        connection.latest_wal_update.format("%Y-%m-%d %H:%M:%S"),
+                        connection.node,
+                    ));
+
+                    match (connection.streaming_lsn, connection.commit_lsn) {
+                        (None, None) => resulting_string.push_str("no streaming data"),
+                        (None, Some(commit_lsn)) => {
+                            resulting_string.push_str(&format!("commit Lsn: {commit_lsn}"))
+                        }
+                        (Some(streaming_lsn), None) => {
+                            resulting_string.push_str(&format!("streaming Lsn: {streaming_lsn}"))
+                        }
+                        (Some(streaming_lsn), Some(commit_lsn)) => resulting_string.push_str(
+                            &format!("commit|streaming Lsn: {commit_lsn}|{streaming_lsn}"),
+                        ),
+                    }
+                } else if connection.is_connected {
+                    resulting_string.push_str(&format!(
+                        " (update {}): connecting to node {}",
+                        connection
+                            .latest_connection_update
+                            .format("%Y-%m-%d %H:%M:%S"),
+                        connection.node,
+                    ));
+                } else {
+                    resulting_string.push_str(&format!(
+                        " (update {}): initializing node {} connection",
+                        connection
+                            .latest_connection_update
+                            .format("%Y-%m-%d %H:%M:%S"),
+                        connection.node,
+                    ));
+                }
+            }
+            None => resulting_string.push_str(": disconnected"),
+        }
+
+        resulting_string.push_str(", safekeeper candidates (id|update_time|commit_lsn): [");
+        let mut candidates = self.wal_stream_candidates.iter().peekable();
+        while let Some((node_id, candidate_info)) = candidates.next() {
+            resulting_string.push_str(&format!(
+                "({}|{}|{})",
+                node_id,
+                candidate_info.latest_update.format("%H:%M:%S"),
+                Lsn(candidate_info.timeline.commit_lsn)
+            ));
+            if candidates.peek().is_some() {
+                resulting_string.push_str(", ");
+            }
+        }
+        resulting_string.push(']');
+
+        resulting_string
+    }
+}
+
 /// Current connection data.
 #[derive(Debug)]
 struct WalConnection {
@@ -293,14 +377,14 @@ struct NewCommittedWAL {
    discovered_at: NaiveDateTime,
 }

-#[derive(Debug)]
+#[derive(Debug, Clone, Copy)]
 struct RetryInfo {
    next_retry_at: Option<NaiveDateTime>,
    retry_duration_seconds: f64,
 }

 /// Data about the timeline to connect to, received from the broker.
-#[derive(Debug)]
+#[derive(Debug, Clone)]
 struct BrokerSkTimeline {
    timeline: SafekeeperTimelineInfo,
    /// Time at which the data was fetched from the broker last time, to track the stale data.
@@ -325,9 +409,14 @@ impl ConnectionManagerState {

    /// Shuts down the current connection (if any) and immediately starts another one with the given connection string.
    async fn change_connection(&mut self, new_sk: NewWalConnectionCandidate, ctx: &RequestContext) {
+        WALRECEIVER_SWITCHES
+            .with_label_values(&[new_sk.reason.name()])
+            .inc();
+
        self.drop_old_connection(true).await;

        let id = self.id;
+        let node_id = new_sk.safekeeper_id;
        let connect_timeout = self.conf.wal_connect_timeout;
        let timeline = Arc::clone(&self.timeline);
        let ctx = ctx.detached_child(
@@ -343,12 +432,13 @@ impl ConnectionManagerState {
                    cancellation,
                    connect_timeout,
                    ctx,
+                    node_id,
                )
                .await
                .context("walreceiver connection handling failure")
            }
            .instrument(
-                info_span!("walreceiver_connection", id = %id, node_id = %new_sk.safekeeper_id),
+                info_span!("walreceiver_connection", tenant_id = %id.tenant_id, timeline_id = %id.timeline_id, %node_id),
            )
        });

@@ -364,6 +454,7 @@ impl ConnectionManagerState {
                latest_wal_update: now,
                streaming_lsn: None,
                commit_lsn: None,
+                node: node_id,
            },
            connection_task: connection_handle,
            discovered_new_wal: None,
@@ -437,6 +528,8 @@ impl ConnectionManagerState {

    /// Adds another broker timeline into the state, if its more recent than the one already added there for the same key.
    fn register_timeline_update(&mut self, timeline_update: SafekeeperTimelineInfo) {
+        WALRECEIVER_BROKER_UPDATES.inc();
+
        let new_safekeeper_id = NodeId(timeline_update.safekeeper_id);
        let old_entry = self.wal_stream_candidates.insert(
            new_safekeeper_id,
@@ -448,6 +541,7 @@ impl ConnectionManagerState {

        if old_entry.is_none() {
            info!("New SK node was added: {new_safekeeper_id}");
+            WALRECEIVER_CANDIDATES_ADDED.inc();
        }
    }

@@ -716,6 +810,7 @@ impl ConnectionManagerState {
            for node_id in node_ids_to_remove {
                info!("Safekeeper node {node_id} did not send events for over {lagging_wal_timeout:?}, not retrying the connections");
                self.wal_connection_retries.remove(&node_id);
+                WALRECEIVER_CANDIDATES_REMOVED.inc();
            }
        }
    }
@@ -725,6 +820,13 @@ impl ConnectionManagerState {
            wal_connection.connection_task.shutdown().await;
        }
    }
+
+    fn manager_status(&self) -> ConnectionManagerStatus {
+        ConnectionManagerStatus {
+            existing_connection: self.wal_connection.as_ref().map(|conn| conn.status),
+            wal_stream_candidates: self.wal_stream_candidates.clone(),
+        }
+    }
 }

 #[derive(Debug)]
@@ -732,8 +834,6 @@ struct NewWalConnectionCandidate {
    safekeeper_id: NodeId,
    wal_source_connconf: PgConnectionConfig,
    availability_zone: Option<String>,
-    // This field is used in `derive(Debug)` only.
-    #[allow(dead_code)]
    reason: ReconnectReason,
 }

@@ -762,6 +862,18 @@ enum ReconnectReason {
    },
 }

+impl ReconnectReason {
+    fn name(&self) -> &'static str { // variant name; used as the WALRECEIVER_SWITCHES label
+        match self {
+            ReconnectReason::NoExistingConnection => "NoExistingConnection",
+            ReconnectReason::LaggingWal { .. } => "LaggingWal",
+            ReconnectReason::SwitchAvailabilityZone => "SwitchAvailabilityZone",
+            ReconnectReason::NoWalTimeout { .. } => "NoWalTimeout",
+            ReconnectReason::NoKeepAlives { .. } => "NoKeepAlives",
+        }
+    }
+}
+
 fn wal_stream_connection_config(
    TenantTimelineId {
        tenant_id,
@@ -867,6 +979,7 @@ mod tests {
            latest_wal_update: now,
            commit_lsn: Some(Lsn(current_lsn)),
            streaming_lsn: Some(Lsn(current_lsn)),
+            node: NodeId(1),
        };

        state.conf.max_lsn_wal_lag = NonZeroU64::new(100).unwrap();
@@ -1035,6 +1148,7 @@ mod tests {
            latest_wal_update: now,
            commit_lsn: Some(current_lsn),
            streaming_lsn: Some(current_lsn),
+            node: connected_sk_id,
        };

        state.wal_connection = Some(WalConnection {
@@ -1101,6 +1215,7 @@ mod tests {
            latest_wal_update: time_over_threshold,
            commit_lsn: Some(current_lsn),
            streaming_lsn: Some(current_lsn),
+            node: NodeId(1),
        };

        state.wal_connection = Some(WalConnection {
@@ -1164,6 +1279,7 @@ mod tests {
            latest_wal_update: time_over_threshold,
            commit_lsn: Some(current_lsn),
            streaming_lsn: Some(current_lsn),
+            node: NodeId(1),
        };

        state.wal_connection = Some(WalConnection {
@@ -1261,6 +1377,7 @@ mod tests {
            latest_wal_update: now,
            commit_lsn: Some(current_lsn),
            streaming_lsn: Some(current_lsn),
+            node: connected_sk_id,
        };

        state.wal_connection = Some(WalConnection {
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -24,8 +24,8 @@ use tokio_util::sync::CancellationToken;
 use tracing::{debug, error, info, trace, warn};

 use super::TaskStateUpdate;
-use crate::context::RequestContext;
 use crate::metrics::LIVE_CONNECTIONS_COUNT;
+use crate::{context::RequestContext, metrics::WALRECEIVER_STARTED_CONNECTIONS};
 use crate::{
    task_mgr,
    task_mgr::TaskKind,
@@ -37,8 +37,8 @@ use crate::{
 use postgres_backend::is_expected_io_error;
 use postgres_connection::PgConnectionConfig;
 use postgres_ffi::waldecoder::WalStreamDecoder;
-use pq_proto::PageserverFeedback;
-use utils::lsn::Lsn;
+use utils::pageserver_feedback::PageserverFeedback;
+use utils::{id::NodeId, lsn::Lsn};

 /// Status of the connection.
 #[derive(Debug, Clone, Copy)]
@@ -56,6 +56,8 @@ pub(super) struct WalConnectionStatus {
    pub streaming_lsn: Option<Lsn>,
    /// Latest commit_lsn received from the safekeeper. Can be zero if no message has been received yet.
    pub commit_lsn: Option<Lsn>,
+    /// The node it is connected to
+    pub node: NodeId,
 }

 /// Open a connection to the given safekeeper and receive WAL, sending back progress
@@ -67,7 +69,10 @@ pub(super) async fn handle_walreceiver_connection(
    cancellation: CancellationToken,
    connect_timeout: Duration,
    ctx: RequestContext,
+    node: NodeId,
 ) -> anyhow::Result<()> {
+    WALRECEIVER_STARTED_CONNECTIONS.inc();
+
    // Connect to the database in replication mode.
    info!("connecting to {wal_source_connconf:?}");

@@ -100,6 +105,7 @@ pub(super) async fn handle_walreceiver_connection(
        latest_wal_update: Utc::now().naive_utc(),
        streaming_lsn: None,
        commit_lsn: None,
+        node,
    };
    if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status)) {
        warn!("Wal connection event listener dropped right after connection init, aborting the connection: {e}");
@@ -122,7 +128,7 @@ pub(super) async fn handle_walreceiver_connection(
        false,
        async move {
            select! {
-                connection_result = connection => match connection_result{
+                connection_result = connection => match connection_result {
                    Ok(()) => info!("Walreceiver db connection closed"),
                    Err(connection_error) => {
                        if let Err(e) = ignore_expected_errors(connection_error) {
@@ -319,12 +325,12 @@ pub(super) async fn handle_walreceiver_connection(
                timeline.get_remote_consistent_lsn().unwrap_or(Lsn(0));

            // The last LSN we processed. It is not guaranteed to survive pageserver crash.
-            let last_received_lsn = u64::from(last_lsn);
+            let last_received_lsn = last_lsn;
            // `disk_consistent_lsn` is the LSN at which page server guarantees local persistence of all received data
-            let disk_consistent_lsn = u64::from(timeline.get_disk_consistent_lsn());
+            let disk_consistent_lsn = timeline.get_disk_consistent_lsn();
            // The last LSN that is synced to remote storage and is guaranteed to survive pageserver crash
            // Used by safekeepers to remove WAL preceding `remote_consistent_lsn`.
-            let remote_consistent_lsn = u64::from(timeline_remote_consistent_lsn);
+            let remote_consistent_lsn = timeline_remote_consistent_lsn;
            let ts = SystemTime::now();

            // Update the status about what we just received. This is shown in the mgmt API.
--- a/pageserver/src/tenant/upload_queue.rs
+++ b/pageserver/src/tenant/upload_queue.rs
@@ -7,6 +7,7 @@ use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
 use std::collections::{HashMap, VecDeque};
 use std::fmt::Debug;

+use chrono::NaiveDateTime;
 use std::sync::Arc;
 use tracing::info;

@@ -18,18 +19,20 @@ use utils::lsn::Lsn;
 // that many upload queues in a running pageserver, and most of them are initialized
 // anyway.
 #[allow(clippy::large_enum_variant)]
-pub(crate) enum UploadQueue {
+pub(super) enum UploadQueue {
    Uninitialized,
    Initialized(UploadQueueInitialized),
    Stopped(UploadQueueStopped),
+    Broken,
 }

 impl UploadQueue {
-    fn as_str(&self) -> &'static str {
+    pub fn as_str(&self) -> &'static str {
        match self {
            UploadQueue::Uninitialized => "Uninitialized",
            UploadQueue::Initialized(_) => "Initialized",
            UploadQueue::Stopped(_) => "Stopped",
+            UploadQueue::Broken => "Broken",
        }
    }
 }
@@ -75,8 +78,12 @@ pub(crate) struct UploadQueueInitialized {
    pub(crate) queued_operations: VecDeque<UploadOp>,
 }

-pub(crate) struct UploadQueueStopped {
-    pub(crate) last_uploaded_consistent_lsn: Lsn,
+pub(super) struct UploadQueueStopped {
+    pub(super) latest_files: HashMap<LayerFileName, LayerFileMetadata>,
+    pub(super) last_uploaded_consistent_lsn: Lsn,
+    pub(super) latest_metadata: TimelineMetadata,
+    /// If Some(), a call to `persist_index_part_with_deleted_flag` is ongoing or finished.
+    pub(super) deleted_at: Option<NaiveDateTime>,
 }

 impl UploadQueue {
@@ -86,7 +93,7 @@ impl UploadQueue {
    ) -> anyhow::Result<&mut UploadQueueInitialized> {
        match self {
            UploadQueue::Uninitialized => (),
-            UploadQueue::Initialized(_) | UploadQueue::Stopped(_) => {
+            UploadQueue::Initialized(_) | UploadQueue::Stopped(_) | UploadQueue::Broken => {
                anyhow::bail!("already initialized, state {}", self.as_str())
            }
        }
@@ -120,7 +127,7 @@ impl UploadQueue {
    ) -> anyhow::Result<&mut UploadQueueInitialized> {
        match self {
            UploadQueue::Uninitialized => (),
-            UploadQueue::Initialized(_) | UploadQueue::Stopped(_) => {
+            UploadQueue::Initialized(_) | UploadQueue::Stopped(_) | UploadQueue::Broken => {
                anyhow::bail!("already initialized, state {}", self.as_str())
            }
        }
@@ -170,7 +177,7 @@ impl UploadQueue {

    pub(crate) fn initialized_mut(&mut self) -> anyhow::Result<&mut UploadQueueInitialized> {
        match self {
-            UploadQueue::Uninitialized | UploadQueue::Stopped(_) => {
+            UploadQueue::Broken | UploadQueue::Uninitialized | UploadQueue::Stopped(_) => {
                anyhow::bail!("queue is in state {}", self.as_str())
            }
            UploadQueue::Initialized(x) => Ok(x),
--- a/pgxn/neon/file_cache.c
+++ b/pgxn/neon/file_cache.c
@@ -96,6 +96,8 @@ static shmem_request_hook_type prev_shmem_request_hook;
 #endif
 static int   lfc_shrinking_factor; /* power of two by which local cache size will be shrinked when lfc_free_space_watermark is reached */

+void FileCacheMonitorMain(Datum main_arg);
+
 static void
 lfc_shmem_startup(void)
 {
@@ -370,6 +372,73 @@ lfc_cache_contains(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno)
 	return found;
 }

+/*
+ * Evict a page (if present) from the local file cache; no-op if LFC is disabled
+ */
+void
+lfc_evict(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno)
+{
+	BufferTag tag;
+	FileCacheEntry* entry;
+	bool found;
+	int chunk_offs = blkno & (BLOCKS_PER_CHUNK-1);
+	uint32 hash;
+
+	if (lfc_size_limit == 0) /* fast exit if file cache is disabled */
+		return;
+
+	INIT_BUFFERTAG(tag, rnode, forkNum, (blkno & ~(BLOCKS_PER_CHUNK-1)));
+
+	hash = get_hash_value(lfc_hash, &tag);
+
+	LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
+	entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, &found);
+
+	if (!found)
+	{
+		/* nothing to do */
+		LWLockRelease(lfc_lock);
+		return;
+	}
+
+	/* remove the page from the cache */
+	entry->bitmap[chunk_offs >> 5] &= ~(1 << (chunk_offs & (32 - 1)));
+
+	/*
+	 * If the chunk has no live entries left, position it to be recycled
+	 * first. Only scan the whole bitmap when this word just became empty.
+	 */
+	if (entry->bitmap[chunk_offs >> 5] == 0)
+	{
+		bool has_remaining_pages = false;	/* stays false if all words are 0 */
+
+		for (int i = 0; i < (BLOCKS_PER_CHUNK / 32); i++) {
+			if (entry->bitmap[i] != 0)
+			{
+				has_remaining_pages = true;
+				break;
+			}
+		}
+
+		/*
+		 * Put the entry at the position that is first to be reclaimed when
+		 * we have no cached pages remaining in the chunk
+		 */
+		if (!has_remaining_pages)
+		{
+			dlist_delete(&entry->lru_node);
+			dlist_push_head(&lfc_ctl->lru, &entry->lru_node);
+		}
+	}
+
+	/*
+	 * Done: apart from repositioning now-empty chunks, we don't move chunks
+	 * in the LRU on eviction, because eviction isn't usage.
+	 */
+
+	LWLockRelease(lfc_lock);
+}
+
 /*
 * Try to read page from local cache.
 * Returns true if page is found in local cache.
@@ -528,7 +597,6 @@ lfc_write(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
 	LWLockRelease(lfc_lock);
 }

-
 /*
 * Record structure holding the to be exposed cache data.
 */
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -17,6 +17,8 @@
 #include "pagestore_client.h"
 #include "fmgr.h"
 #include "access/xlog.h"
+#include "access/xlogutils.h"
+#include "storage/buf_internals.h"

 #include "libpq-fe.h"
 #include "libpq/pqformat.h"
@@ -57,6 +59,8 @@ int			n_unflushed_requests = 0;
 int			flush_every_n_requests = 8;
 int			readahead_buffer_size = 128;

+bool	(*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id) = NULL;
+
 static void pageserver_flush(void);

 static bool
@@ -467,6 +471,8 @@ pg_init_libpagestore(void)
 		smgr_hook = smgr_neon;
 		smgr_init_hook = smgr_init_neon;
 		dbsize_hook = neon_dbsize;
+		old_redo_read_buffer_filter = redo_read_buffer_filter;
+		redo_read_buffer_filter = neon_redo_read_buffer_filter;
 	}
 	lfc_init();
 }
--- a/pgxn/neon/neon.c
+++ b/pgxn/neon/neon.c
@@ -24,6 +24,7 @@

 #include "neon.h"
 #include "walproposer.h"
+#include "pagestore_client.h"

 PG_MODULE_MAGIC;
 void		_PG_init(void);
--- a/pgxn/neon/neon.h
+++ b/pgxn/neon/neon.h
@@ -11,6 +11,7 @@

 #ifndef NEON_H
 #define NEON_H
+#include "access/xlogreader.h"

 /* GUCs */
 extern char *neon_auth_token;
@@ -20,4 +21,11 @@ extern char *neon_tenant;
 extern void pg_init_libpagestore(void);
 extern void pg_init_walproposer(void);

+/*
+ * Returns true if we shouldn't do REDO on that block in record indicated by
+ * block_id; false otherwise.
+ */
+extern bool neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id);
+extern bool	(*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id);
+
 #endif							/* NEON_H */
--- a/pgxn/neon/pagestore_client.h
+++ b/pgxn/neon/pagestore_client.h
@@ -207,6 +207,7 @@ extern void forget_cached_relsize(RelFileNode rnode, ForkNumber forknum);
 extern void lfc_write(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, char *buffer);
 extern bool lfc_read(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, char *buffer);
 extern bool lfc_cache_contains(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno);
+extern void lfc_evict(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno);
 extern void lfc_init(void);


--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -189,6 +189,7 @@ typedef struct PrfHashEntry {
 #define SH_DEFINE
 #define SH_DECLARE
 #include "lib/simplehash.h"
+#include "neon.h"

 /*
 * PrefetchState maintains the state of (prefetch) getPage@LSN requests.
@@ -1209,6 +1210,9 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, ch

 	if (ShutdownRequestPending)
 		return;
+	/* Don't log any pages if we're not allowed to do so. */
+	if (!XLogInsertAllowed())
+		return;

 	/*
 	 * Whenever a VM or FSM page is evicted, WAL-log it. FSM and (some) VM
@@ -1375,8 +1379,18 @@ neon_get_request_lsn(bool *latest, RelFileNode rnode, ForkNumber forknum, BlockN

 	if (RecoveryInProgress())
 	{
+		/*
+		 * We don't know if WAL has been generated but not yet replayed, so
+		 * we're conservative in our estimates about latest pages.
+		 */
 		*latest = false;
-		lsn = GetXLogReplayRecPtr(NULL);
+
+		/*
+		 * Get the last written LSN of this page.
+		 */
+		lsn = GetLastWrittenLSN(rnode, forknum, blkno);
+		lsn = nm_adjust_lsn(lsn);
+
 		elog(DEBUG1, "neon_get_request_lsn GetXLogReplayRecPtr %X/%X request lsn 0 ",
 			 (uint32) ((lsn) >> 32), (uint32) (lsn));
 	}
@@ -1559,6 +1573,15 @@ neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
 	/*
 	 * Newly created relation is empty, remember that in the relsize cache.
 	 *
+	 * Note that in REDO, this is called to make sure the relation fork exists,
+	 * but it does not truncate the relation. So, we can only update the
+	 * relsize if it didn't exist before.
+	 * 
+	 * Also, in redo, we must make sure to update the cached size of the
+	 * relation, as that is the primary source of truth for REDO's
+	 * file length considerations, and as file extension isn't (perfectly)
+	 * logged, we need to take care of that before we hit file size checks.
+	 *
 	 * FIXME: This is currently not just an optimization, but required for
 	 * correctness. Postgres can call smgrnblocks() on the newly-created
 	 * relation. Currently, we don't call SetLastWrittenLSN() when a new
@@ -1566,7 +1589,14 @@ neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
 	 * cache, we might call smgrnblocks() on the newly-created relation before
 	 * the creation WAL record hass been received by the page server.
 	 */
-	set_cached_relsize(reln->smgr_rnode.node, forkNum, 0);
+	if (isRedo)
+	{
+		update_cached_relsize(reln->smgr_rnode.node, forkNum, 0);
+		get_cached_relsize(reln->smgr_rnode.node, forkNum,
+						   &reln->smgr_cached_nblocks[forkNum]);
+	}
+	else
+		set_cached_relsize(reln->smgr_rnode.node, forkNum, 0);

 #ifdef DEBUG_COMPARE_LOCAL
 	if (IS_LOCAL_REL(reln))
@@ -1831,6 +1861,26 @@ neon_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
 		.blockNum = blkno,
 	};

+	/*
+	 * The redo process does not lock pages that it needs to replay but are
+	 * not in the shared buffers, so a concurrent process may request the
+	 * page after redo has decided it won't redo that page and updated the
+	 * LwLSN for that page.
+	 * If we're in hot standby we need to take care that we don't return
+	 * until after REDO has finished replaying up to that LwLSN, as the page
+	 * should have been locked up to that point.
+	 *
+	 * See also the description on neon_redo_read_buffer_filter below.
+	 *
+	 * NOTE: It is possible that the WAL redo process will still do IO due to
+	 * concurrent failed read IOs. Those IOs should never have a request_lsn
+	 * that is as large as the WAL record we're currently replaying, if it
+	 * weren't for the behaviour of the LwLsn cache that uses the highest
+	 * value of the LwLsn cache when the entry is not found. 
+	 */
+	if (RecoveryInProgress() && !(MyBackendType == B_STARTUP))
+		XLogWaitForReplayOf(request_lsn);
+
 	/*
 	 * Try to find prefetched page in the list of received pages.
 	 */
@@ -2584,3 +2634,143 @@ smgr_init_neon(void)
 	smgr_init_standard();
 	neon_init();
 }
+
+
+/*
+ * Return whether we can skip the redo for this block.
+ * 
+ * The conditions for skipping the IO are:
+ *
+ * - The block is not in the shared buffers, and
+ * - The block is not in the local file cache
+ *
+ * ... because any subsequent read of the page requires us to read
+ * the new version of the page from the PageServer. We do not
+ * check the local file cache; we instead evict the page from LFC: it
+ * is cheaper than going through the FS calls to read the page, and
+ * limits the number of lock operations used in the REDO process.
+ *
+ * We have one exception to the rules for skipping IO: We always apply
+ * changes to shared catalogs' pages. Although this is mostly out of caution,
+ * catalog updates usually result in backends rebuilding their catalog snapshot,
+ * which means it's quite likely the modified page is going to be used soon.
+ *
+ * It is important to note that skipping WAL redo for a page also means
+ * the page isn't locked by the redo process, as there is no Buffer
+ * being returned, nor is there a buffer descriptor to lock.
+ * This means that any IO that wants to read this block needs to wait
+ * for the WAL REDO process to finish processing the WAL record before
+ * it allows the system to start reading the block, as releasing the
+ * block early could lead to phantom reads.
+ *
+ * For example, REDO for a WAL record that modifies 3 blocks could skip
+ * the first block, wait for a lock on the second, and then modify the
+ * third block. Without skipping, all blocks would be locked and phantom
+ * reads would not occur, but with skipping, a concurrent process could
+ * read block 1 with post-REDO contents and read block 3 with pre-REDO
+ * contents, where with REDO locking it would wait on block 1 and see
+ * block 3 with post-REDO contents only.
+ */
+bool
+neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
+{
+	/* LSN stored as lwLsn when redo of the block is skipped */
+	XLogRecPtr	end_recptr = record->EndRecPtr;
+	/* LSN just before record->ReadRecPtr; stored as lwLsn when the block is redone */
+	XLogRecPtr	prev_end_recptr = record->ReadRecPtr - 1;
+	RelFileNode	rnode;
+	ForkNumber	forknum;
+	BlockNumber	blkno;
+	BufferTag	tag;
+	uint32		hash;
+	LWLock	   *partitionLock;
+	Buffer		buffer;
+	bool		no_redo_needed;
+	BlockNumber relsize;
+
+	/*
+	 * Chain to old_redo_read_buffer_filter (presumably the filter that was
+	 * installed before this one) when it is set: if it already decided to
+	 * skip this block, we skip it too.
+	 */
+	if (old_redo_read_buffer_filter && old_redo_read_buffer_filter(record, block_id))
+		return true;
+
+	/* Resolve block_id to the relation, fork and block number it refers to */
+#if PG_VERSION_NUM < 150000
+	if (!XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blkno))
+		elog(PANIC, "failed to locate backup block with ID %d", block_id);
+#else
+	XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blkno);
+#endif
+
+	/*
+	 * Out of an abundance of caution, we always run redo on shared catalogs,
+	 * regardless of whether the block is stored in shared buffers.
+	 * See also this function's top comment.
+	 */
+	if (!OidIsValid(rnode.dbNode))
+		return false;
+
+	INIT_BUFFERTAG(tag, rnode, forknum, blkno);
+	hash = BufTableHashCode(&tag);
+	partitionLock = BufMappingPartitionLock(hash);
+
+	/*
+	 * Lock the partition of shared_buffers so that it can't be updated
+	 * concurrently.
+	 */
+	LWLockAcquire(partitionLock, LW_SHARED);
+
+	/* Try to find the relevant buffer */
+	buffer = BufTableLookup(&tag, hash);
+
+	/* a negative lookup result is treated as "page is not in shared buffers" */
+	no_redo_needed = buffer < 0;
+
+	/* we don't have the buffer in memory, update lwLsn past this record */
+	if (no_redo_needed)
+	{
+		SetLastWrittenLSNForBlock(end_recptr, rnode, forknum, blkno);
+		/* drop any copy in the local file cache instead of checking it */
+		lfc_evict(rnode, forknum, blkno);
+	}
+	else
+	{
+		/* the buffer is in memory and is not skipped: lwLsn stops just before this record */
+		SetLastWrittenLSNForBlock(prev_end_recptr, rnode, forknum, blkno);
+	}
+
+	LWLockRelease(partitionLock);
+
+	/* Extend the relation if we know its size */
+	if (get_cached_relsize(rnode, forknum, &relsize))
+	{
+		/* the cached size only ever grows here; it must cover blkno */
+		if (relsize < blkno + 1)
+			update_cached_relsize(rnode, forknum, blkno + 1);
+	}
+	else
+	{
+		/*
+		 * Size was not cached. We populate the cache now, with the size of the
+		 * relation measured after this WAL record is applied.
+		 *
+		 * This length is later reused when we open the smgr to read the block,
+		 * which is fine and expected.
+		 */
+
+		NeonResponse *response;
+		NeonNblocksResponse *nbresponse;
+		/* nblocks request for this relation fork at end_recptr, with latest = false */
+		NeonNblocksRequest request = {
+			.req = (NeonRequest) {
+				.lsn = end_recptr,
+				.latest = false,
+				.tag = T_NeonNblocksRequest,
+			},
+			.rnode = rnode,
+			.forknum = forknum,
+		};
+
+		response = page_server_request(&request);
+
+		Assert(response->tag == T_NeonNblocksResponse);
+		nbresponse = (NeonNblocksResponse *) response;
+
+		/* the reported size is expected to already include the block being replayed */
+		Assert(nbresponse->n_blocks > blkno);
+
+		set_cached_relsize(rnode, forknum, nbresponse->n_blocks);
+
+		elog(SmgrTrace, "Set length to %d", nbresponse->n_blocks);
+	}
+
+	/* true tells the caller that redo of this block can be skipped */
+	return no_redo_needed;
+}
--- a/pgxn/neon/walproposer.c
+++ b/pgxn/neon/walproposer.c
@@ -1964,18 +1964,26 @@ CombineHotStanbyFeedbacks(HotStandbyFeedback * hs)
 	{
 		if (safekeeper[i].appendResponse.hs.ts != 0)
 		{
-			if (FullTransactionIdPrecedes(safekeeper[i].appendResponse.hs.xmin, hs->xmin))
+			HotStandbyFeedback *skhs = &safekeeper[i].appendResponse.hs;
+			if (FullTransactionIdIsNormal(skhs->xmin)
+				&& FullTransactionIdPrecedes(skhs->xmin, hs->xmin))
 			{
-				hs->xmin = safekeeper[i].appendResponse.hs.xmin;
-				hs->ts = safekeeper[i].appendResponse.hs.ts;
+				hs->xmin = skhs->xmin;
+				hs->ts = skhs->ts;
 			}
-			if (FullTransactionIdPrecedes(safekeeper[i].appendResponse.hs.catalog_xmin, hs->catalog_xmin))
+			/* track the minimum catalog_xmin independently of xmin */
+			if (FullTransactionIdIsNormal(skhs->catalog_xmin)
+				&& FullTransactionIdPrecedes(skhs->catalog_xmin, hs->catalog_xmin))
 			{
-				hs->catalog_xmin = safekeeper[i].appendResponse.hs.catalog_xmin;
-				hs->ts = safekeeper[i].appendResponse.hs.ts;
+				hs->catalog_xmin = skhs->catalog_xmin;
+				hs->ts = skhs->ts;
 			}
 		}
 	}
+
+	if (hs->xmin.value == ~0)
+		hs->xmin = InvalidFullTransactionId;
+	if (hs->catalog_xmin.value == ~0)
+		hs->catalog_xmin = InvalidFullTransactionId;
 }

 /*
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.4.1 and should not be changed by hand.
+# This file is automatically @generated by Poetry and should not be changed by hand.

 [[package]]
 name = "aiohttp"
@@ -968,14 +968,14 @@ testing = ["pre-commit"]

 [[package]]
 name = "flask"
-version = "2.1.3"
+version = "2.2.5"
 description = "A simple framework for building complex web applications."
 category = "main"
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "Flask-2.1.3-py3-none-any.whl", hash = "sha256:9013281a7402ad527f8fd56375164f3aa021ecfaff89bfe3825346c24f87e04c"},
-    {file = "Flask-2.1.3.tar.gz", hash = "sha256:15972e5017df0575c3d6c090ba168b6db90259e620ac8d7ea813a396bad5b6cb"},
+    {file = "Flask-2.2.5-py3-none-any.whl", hash = "sha256:58107ed83443e86067e41eff4631b058178191a355886f8e479e347fa1285fdf"},
+    {file = "Flask-2.2.5.tar.gz", hash = "sha256:edee9b0a7ff26621bd5a8c10ff484ae28737a2410d99b0bb9a6850c7fb977aa0"},
 ]

 [package.dependencies]
@@ -983,7 +983,7 @@ click = ">=8.0"
 importlib-metadata = {version = ">=3.6.0", markers = "python_version < \"3.10\""}
 itsdangerous = ">=2.0"
 Jinja2 = ">=3.0"
-Werkzeug = ">=2.0"
+Werkzeug = ">=2.2.2"

 [package.extras]
 async = ["asgiref (>=3.2)"]
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -62,6 +62,8 @@ utils.workspace = true
 uuid.workspace = true
 webpki-roots.workspace = true
 x509-parser.workspace = true
+native-tls.workspace = true
+postgres-native-tls.workspace = true

 workspace_hack.workspace = true
 tokio-util.workspace = true
--- a/proxy/src/auth/backend/link.rs
+++ b/proxy/src/auth/backend/link.rs
@@ -9,6 +9,7 @@ use crate::{
 use pq_proto::BeMessage as Be;
 use thiserror::Error;
 use tokio::io::{AsyncRead, AsyncWrite};
+use tokio_postgres::config::SslMode;
 use tracing::{info, info_span};

 #[derive(Debug, Error)]
@@ -87,6 +88,16 @@ pub(super) async fn authenticate(
        .dbname(&db_info.dbname)
        .user(&db_info.user);

+    // Backwards compatibility. pg_sni_proxy uses "--" in domain names
+    // while direct connections do not. Once we migrate to pg_sni_proxy
+    // everywhere, we can remove this.
+    if db_info.host.contains("--") {
+        // we need TLS connection with SNI info to properly route it
+        config.ssl_mode(SslMode::Require);
+    } else {
+        config.ssl_mode(SslMode::Disable);
+    }
+
    if let Some(password) = db_info.password {
        config.password(password.as_ref());
    }
@@ -96,6 +107,7 @@ pub(super) async fn authenticate(
        value: NodeInfo {
            config,
            aux: db_info.aux.into(),
+            allow_self_signed_compute: false, // caller may override
        },
    })
 }
--- a/proxy/src/bin/pg_sni_router.rs
+++ b/proxy/src/bin/pg_sni_router.rs
@@ -0,0 +1,250 @@
+/// A stand-alone program that routes connections, e.g. from
+/// `aaa--bbb--1234.external.domain` to `aaa.bbb.internal.domain:1234`.
+///
+/// This allows connecting to pods/services running in the same Kubernetes cluster from
+/// the outside. Similar to an ingress controller for HTTPS.
+use std::{net::SocketAddr, sync::Arc};
+
+use tokio::net::TcpListener;
+
+use anyhow::{anyhow, bail, ensure, Context};
+use clap::{self, Arg};
+use futures::TryFutureExt;
+use proxy::console::messages::MetricsAuxInfo;
+use proxy::stream::{PqStream, Stream};
+
+use tokio::io::{AsyncRead, AsyncWrite};
+use tokio_util::sync::CancellationToken;
+use utils::{project_git_version, sentry_init::init_sentry};
+
+use tracing::{error, info, warn};
+
+project_git_version!(GIT_VERSION);
+
+/// Builds the command-line definition of the SNI router.
+///
+/// `--tls-key`, `--tls-cert` and `--destination` are required; `--listen`
+/// defaults to `127.0.0.1:4432`.
+fn cli() -> clap::Command {
+    // Address the router accepts client connections on.
+    let listen = Arg::new("listen")
+        .short('l')
+        .long("listen")
+        .help("listen for incoming client connections on ip:port")
+        .default_value("127.0.0.1:4432");
+
+    // TLS identity presented to connecting clients.
+    let tls_key = Arg::new("tls-key")
+        .short('k')
+        .long("tls-key")
+        .help("path to TLS key for client postgres connections")
+        .required(true);
+    let tls_cert = Arg::new("tls-cert")
+        .short('c')
+        .long("tls-cert")
+        .help("path to TLS cert for client postgres connections")
+        .required(true);
+
+    // Domain zone appended to the name decoded from the SNI hostname.
+    let dest = Arg::new("dest")
+        .short('d')
+        .long("destination")
+        .help("append this domain zone to the SNI hostname to get the destination address")
+        .required(true);
+
+    clap::Command::new("Neon proxy/router")
+        .version(GIT_VERSION)
+        .arg(listen)
+        .arg(tls_key)
+        .arg(tls_cert)
+        .arg(dest)
+}
+
+/// Entry point of the SNI router.
+///
+/// Parses the command line, loads the TLS key and certificate chain, binds the
+/// listening socket and then runs the accept loop together with the signal
+/// handler until one of them finishes.
+///
+/// # Errors
+/// Fails if logging cannot be initialised, the TLS files cannot be read or
+/// parsed, the listen address is invalid or cannot be bound, or if either of
+/// the two spawned tasks returns an error.
+#[tokio::main]
+async fn main() -> anyhow::Result<()> {
+    let _logging_guard = proxy::logging::init().await?;
+    let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook();
+    let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]);
+
+    let args = cli().get_matches();
+    // `dest` is declared `required(true)` in `cli()`, so it is always present here.
+    let destination: String = args
+        .get_one::<String>("dest")
+        .expect("`dest` is a required argument")
+        .clone();
+
+    // Configure TLS
+    let tls_config: Arc<rustls::ServerConfig> = match (
+        args.get_one::<String>("tls-key"),
+        args.get_one::<String>("tls-cert"),
+    ) {
+        (Some(key_path), Some(cert_path)) => {
+            let key = {
+                let key_bytes = std::fs::read(key_path).context("TLS key file")?;
+                // Error messages are built lazily, only when a step actually fails.
+                let mut keys = rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..])
+                    .with_context(|| format!("Failed to read TLS keys at '{key_path}'"))?;
+
+                ensure!(keys.len() == 1, "keys.len() = {} (should be 1)", keys.len());
+                keys.pop()
+                    .map(rustls::PrivateKey)
+                    .expect("exactly one key is present, checked above")
+            };
+
+            let cert_chain_bytes = std::fs::read(cert_path)
+                .with_context(|| format!("Failed to read TLS cert file at '{cert_path}'."))?;
+
+            let cert_chain = {
+                rustls_pemfile::certs(&mut &cert_chain_bytes[..])
+                    .with_context(|| {
+                        format!(
+                            "Failed to read TLS certificate chain from bytes from file at '{cert_path}'."
+                        )
+                    })?
+                    .into_iter()
+                    .map(rustls::Certificate)
+                    .collect()
+            };
+
+            rustls::ServerConfig::builder()
+                .with_safe_default_cipher_suites()
+                .with_safe_default_kx_groups()
+                .with_protocol_versions(&[&rustls::version::TLS13, &rustls::version::TLS12])?
+                .with_no_client_auth()
+                .with_single_cert(cert_chain, key)?
+                .into()
+        }
+        _ => bail!("tls-key and tls-cert must be specified"),
+    };
+
+    // Start listening for incoming client connections
+    let proxy_address: SocketAddr = args.get_one::<String>("listen").unwrap().parse()?;
+    info!("Starting sni router on {proxy_address}");
+    let proxy_listener = TcpListener::bind(proxy_address).await?;
+
+    let cancellation_token = CancellationToken::new();
+
+    // The accept loop and the signal handler share the token: the signal
+    // handler receives the original, the accept loop a clone of it.
+    let main = proxy::flatten_err(tokio::spawn(task_main(
+        Arc::new(destination),
+        tls_config,
+        proxy_listener,
+        cancellation_token.clone(),
+    )));
+    let signals_task = proxy::flatten_err(tokio::spawn(proxy::handle_signals(cancellation_token)));
+
+    // Whichever task finishes first decides the outcome of the process.
+    tokio::select! {
+        res = main => { res?; },
+        res = signals_task => { res?; },
+    }
+
+    Ok(())
+}
+
+/// Accept loop of the router.
+///
+/// Accepts connections on `listener` and spawns one `handle_client` task per
+/// connection until `cancellation_token` is cancelled; then stops accepting and
+/// waits for all spawned tasks to finish.
+///
+/// Returns an error if keepalive cannot be enabled on the listener or if
+/// accepting a connection fails. Errors of individual client tasks are only
+/// logged.
+async fn task_main(
+    dest_suffix: Arc<String>,
+    tls_config: Arc<rustls::ServerConfig>,
+    listener: tokio::net::TcpListener,
+    cancellation_token: CancellationToken,
+) -> anyhow::Result<()> {
+    // When set for the server socket, the keepalive setting
+    // will be inherited by all accepted client sockets.
+    socket2::SockRef::from(&listener).set_keepalive(true)?;
+
+    // Handles of all per-client tasks, joined after the accept loop ends.
+    let mut connections = tokio::task::JoinSet::new();
+
+    loop {
+        tokio::select! {
+            accept_result = listener.accept() => {
+                let (socket, peer_addr) = accept_result?;
+                info!("accepted postgres client connection from {peer_addr}");
+
+                // Each task gets its own session id and its own handles to the shared config.
+                let session_id = uuid::Uuid::new_v4();
+                let tls_config = Arc::clone(&tls_config);
+                let dest_suffix = Arc::clone(&dest_suffix);
+
+                connections.spawn(
+                    async move {
+                        info!("spawned a task for {peer_addr}");
+
+                        socket
+                            .set_nodelay(true)
+                            .context("failed to set socket option")?;
+
+                        handle_client(dest_suffix, tls_config, session_id, socket).await
+                    }
+                    .unwrap_or_else(|e| {
+                        // Acknowledge that the task has finished with an error.
+                        error!("per-client task finished with an error: {e:#}");
+                    }),
+                );
+            }
+            _ = cancellation_token.cancelled() => {
+                // Stop accepting new connections; already spawned tasks keep running.
+                drop(listener);
+                break;
+            }
+        }
+    }
+
+    // Drain connections
+    info!("waiting for all client connections to finish");
+    while let Some(res) = connections.join_next().await {
+        if let Err(e) = res {
+            // Join errors caused by a panic or a cancellation are not reported here.
+            if !e.is_panic() && !e.is_cancelled() {
+                warn!("unexpected error from joined connection task: {e:?}");
+            }
+        }
+    }
+    info!("all client connections have finished");
+    Ok(())
+}
+
+const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)";
+
+/// Runs the server side of the postgres SSL negotiation on `raw_stream`.
+///
+/// The first startup packet has to be an `SslRequest`: it is acknowledged and
+/// the connection is upgraded to TLS with `tls_config`. Any other startup
+/// packet is answered with [`ERR_INSECURE_CONNECTION`].
+async fn ssl_handshake<S: AsyncRead + AsyncWrite + Unpin>(
+    raw_stream: S,
+    tls_config: Arc<rustls::ServerConfig>,
+) -> anyhow::Result<Stream<S>> {
+    let mut stream = PqStream::new(Stream::from_raw(raw_stream));
+
+    let msg = stream.read_startup_packet().await?;
+    info!("received {msg:?}");
+
+    match msg {
+        pq_proto::FeStartupPacket::SslRequest => {
+            // Tell the client that encryption is accepted.
+            let accept_encryption = pq_proto::BeMessage::EncryptionResponse(true);
+            stream.write_message(&accept_encryption).await?;
+
+            // Take the protocol wrapper apart to get at the raw stream and
+            // whatever has been read from it but not consumed yet.
+            let (raw, read_buf) = stream.into_inner();
+
+            // TODO: Normally the client sends nothing before the server has
+            // confirmed the TLS handshake, so read_buf is empty. A client
+            // could, however, pipeline SSLRequest + TLS ClientHello in one
+            // chunk (similar to the pipelining in our node js driver). We
+            // should probably support that by chaining read_buf with the
+            // stream.
+            ensure!(
+                read_buf.is_empty(),
+                "data is sent before server replied with EncryptionResponse"
+            );
+
+            // Upgrade the raw stream into a secure TLS-backed stream.
+            let secure = raw.upgrade(tls_config).await?;
+            Ok(secure)
+        }
+        _ => stream.throw_error_str(ERR_INSECURE_CONNECTION).await?,
+    }
+}
+
+/// Serves a single client connection.
+///
+/// Performs the TLS handshake, derives the destination address from the SNI
+/// hostname sent by the client, connects to it and then passes traffic between
+/// the two sides.
+///
+/// # Errors
+/// Fails if the handshake fails, if the SNI hostname is missing or does not
+/// have the `{service}--{namespace}--{port}.domain` shape, if the destination
+/// cannot be connected to, or if proxying itself fails.
+#[tracing::instrument(fields(session_id = ?session_id), skip_all)]
+async fn handle_client(
+    dest_suffix: Arc<String>,
+    tls_config: Arc<rustls::ServerConfig>,
+    session_id: uuid::Uuid,
+    stream: impl AsyncRead + AsyncWrite + Unpin,
+) -> anyhow::Result<()> {
+    let tls_stream = ssl_handshake(stream, tls_config).await?;
+
+    // Cut off first part of the SNI domain
+    // We receive required destination details in the format of
+    //   `{k8s_service_name}--{k8s_namespace}--{port}.non-sni-domain`
+    let sni = tls_stream
+        .sni_hostname()
+        .ok_or_else(|| anyhow!("SNI missing"))?;
+    let (label, _domain) = sni.split_once('.').context("invalid SNI")?;
+
+    // The SNI hostname comes from the client, so a label with fewer than three
+    // `--`-separated parts must be rejected with an error instead of indexing
+    // out of bounds.
+    let mut parts = label.splitn(3, "--");
+    let (service, namespace, port) = match (parts.next(), parts.next(), parts.next()) {
+        (Some(service), Some(namespace), Some(port)) => (service, namespace, port),
+        _ => bail!("invalid SNI: expected `service--namespace--port`, got '{label}'"),
+    };
+    let port = port.parse::<u16>().context("invalid port")?;
+    let destination = format!("{}.{}.{}:{}", service, namespace, dest_suffix, port);
+
+    info!("destination: {}", destination);
+
+    let client = tokio::net::TcpStream::connect(destination).await?;
+
+    let metrics_aux: MetricsAuxInfo = Default::default();
+    proxy::proxy::proxy_pass(tls_stream, client, &metrics_aux).await
+}
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -1,49 +1,23 @@
-//! Postgres protocol proxy/router.
-//!
-//! This service listens psql port and can check auth via external service
-//! (control plane API in our case) and can create new databases and accounts
-//! in somewhat transparent manner (again via communication with control plane API).
+use proxy::auth;
+use proxy::console;
+use proxy::http;
+use proxy::metrics;

-mod auth;
-mod cache;
-mod cancellation;
-mod compute;
-mod config;
-mod console;
-mod error;
-mod http;
-mod logging;
-mod metrics;
-mod parse;
-mod proxy;
-mod sasl;
-mod scram;
-mod stream;
-mod url;
-mod waiters;
-
-use anyhow::{bail, Context};
+use anyhow::bail;
 use clap::{self, Arg};
-use config::ProxyConfig;
-use futures::FutureExt;
-use std::{borrow::Cow, future::Future, net::SocketAddr};
-use tokio::{net::TcpListener, task::JoinError};
+use proxy::config::{self, ProxyConfig};
+use std::{borrow::Cow, net::SocketAddr};
+use tokio::net::TcpListener;
 use tokio_util::sync::CancellationToken;
-use tracing::{info, warn};
+use tracing::info;
+use tracing::warn;
 use utils::{project_git_version, sentry_init::init_sentry};

 project_git_version!(GIT_VERSION);

-/// Flattens `Result<Result<T>>` into `Result<T>`.
-async fn flatten_err(
-    f: impl Future<Output = Result<anyhow::Result<()>, JoinError>>,
-) -> anyhow::Result<()> {
-    f.map(|r| r.context("join error").and_then(|x| x)).await
-}
-
 #[tokio::main]
 async fn main() -> anyhow::Result<()> {
-    let _logging_guard = logging::init().await?;
+    let _logging_guard = proxy::logging::init().await?;
    let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook();
    let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]);

@@ -69,7 +43,7 @@ async fn main() -> anyhow::Result<()> {
    let proxy_listener = TcpListener::bind(proxy_address).await?;
    let cancellation_token = CancellationToken::new();

-    let mut client_tasks = vec![tokio::spawn(proxy::task_main(
+    let mut client_tasks = vec![tokio::spawn(proxy::proxy::task_main(
        config,
        proxy_listener,
        cancellation_token.clone(),
@@ -88,7 +62,7 @@ async fn main() -> anyhow::Result<()> {
    }

    let mut tasks = vec![
-        tokio::spawn(handle_signals(cancellation_token)),
+        tokio::spawn(proxy::handle_signals(cancellation_token)),
        tokio::spawn(http::server::task_main(http_listener)),
        tokio::spawn(console::mgmt::task_main(mgmt_listener)),
    ];
@@ -97,8 +71,9 @@ async fn main() -> anyhow::Result<()> {
        tasks.push(tokio::spawn(metrics::task_main(metrics_config)));
    }

-    let tasks = futures::future::try_join_all(tasks.into_iter().map(flatten_err));
-    let client_tasks = futures::future::try_join_all(client_tasks.into_iter().map(flatten_err));
+    let tasks = futures::future::try_join_all(tasks.into_iter().map(proxy::flatten_err));
+    let client_tasks =
+        futures::future::try_join_all(client_tasks.into_iter().map(proxy::flatten_err));
    tokio::select! {
        // We are only expecting an error from these forever tasks
        res = tasks => { res?; },
@@ -107,33 +82,6 @@ async fn main() -> anyhow::Result<()> {
    Ok(())
 }

-/// Handle unix signals appropriately.
-async fn handle_signals(token: CancellationToken) -> anyhow::Result<()> {
-    use tokio::signal::unix::{signal, SignalKind};
-
-    let mut hangup = signal(SignalKind::hangup())?;
-    let mut interrupt = signal(SignalKind::interrupt())?;
-    let mut terminate = signal(SignalKind::terminate())?;
-
-    loop {
-        tokio::select! {
-            // Hangup is commonly used for config reload.
-            _ = hangup.recv() => {
-                warn!("received SIGHUP; config reload is not supported");
-            }
-            // Shut down the whole application.
-            _ = interrupt.recv() => {
-                warn!("received SIGINT, exiting immediately");
-                bail!("interrupted");
-            }
-            _ = terminate.recv() => {
-                warn!("received SIGTERM, shutting down once all existing connections have closed");
-                token.cancel();
-            }
-        }
-    }
-}
-
 /// ProxyConfig is created at proxy startup, and lives forever.
 fn build_config(args: &clap::ArgMatches) -> anyhow::Result<&'static ProxyConfig> {
    let tls_config = match (
@@ -149,6 +97,14 @@ fn build_config(args: &clap::ArgMatches) -> anyhow::Result<&'static ProxyConfig>
        _ => bail!("either both or neither tls-key and tls-cert must be specified"),
    };

+    let allow_self_signed_compute: bool = args
+        .get_one::<String>("allow-self-signed-compute")
+        .unwrap()
+        .parse()?;
+    if allow_self_signed_compute {
+        warn!("allowing self-signed compute certificates");
+    }
+
    let metric_collection = match (
        args.get_one::<String>("metric-collection-endpoint"),
        args.get_one::<String>("metric-collection-interval"),
@@ -198,6 +154,7 @@ fn build_config(args: &clap::ArgMatches) -> anyhow::Result<&'static ProxyConfig>
        tls_config,
        auth_backend,
        metric_collection,
+        allow_self_signed_compute,
    }));

    Ok(config)
@@ -288,6 +245,12 @@ fn cli() -> clap::Command {
                .help("cache for `wake_compute` api method (use `size=0` to disable)")
                .default_value(config::CacheOptions::DEFAULT_OPTIONS_NODE_INFO),
        )
+        .arg(
+            Arg::new("allow-self-signed-compute")
+                .long("allow-self-signed-compute")
+                .help("Allow self-signed certificates for compute nodes (for testing)")
+                .default_value("false"),
+        )
 }

 #[cfg(test)]
--- a/proxy/src/compute.rs
+++ b/proxy/src/compute.rs
@@ -1,11 +1,11 @@
 use crate::{cancellation::CancelClosure, error::UserFacingError};
-use futures::TryFutureExt;
+use futures::{FutureExt, TryFutureExt};
 use itertools::Itertools;
 use pq_proto::StartupMessageParams;
-use std::{io, net::SocketAddr};
+use std::{io, net::SocketAddr, time::Duration};
 use thiserror::Error;
 use tokio::net::TcpStream;
-use tokio_postgres::NoTls;
+use tokio_postgres::tls::MakeTlsConnect;
 use tracing::{error, info, warn};

 const COULD_NOT_CONNECT: &str = "Couldn't connect to compute node";
@@ -19,6 +19,9 @@ pub enum ConnectionError {

    #[error("{COULD_NOT_CONNECT}: {0}")]
    CouldNotConnect(#[from] io::Error),
+
+    #[error("{COULD_NOT_CONNECT}: {0}")]
+    TlsError(#[from] native_tls::Error),
 }

 impl UserFacingError for ConnectionError {
@@ -125,14 +128,34 @@ impl std::ops::DerefMut for ConnCfg {
    }
 }

+/// The default connection config is the one produced by [`ConnCfg::new`].
+impl Default for ConnCfg {
+    fn default() -> Self {
+        ConnCfg::new()
+    }
+}
+
 impl ConnCfg {
    /// Establish a raw TCP connection to the compute node.
-    async fn connect_raw(&self) -> io::Result<(SocketAddr, TcpStream)> {
+    async fn connect_raw(&self) -> io::Result<(SocketAddr, TcpStream, &str)> {
        use tokio_postgres::config::Host;

+        // wrap TcpStream::connect with timeout
+        let connect_with_timeout = |host, port| {
+            let connection_timeout = Duration::from_millis(10000);
+            tokio::time::timeout(connection_timeout, TcpStream::connect((host, port))).map(
+                move |res| match res {
+                    Ok(tcpstream_connect_res) => tcpstream_connect_res,
+                    Err(_) => Err(io::Error::new(
+                        io::ErrorKind::TimedOut,
+                        format!("exceeded connection timeout {connection_timeout:?}"),
+                    )),
+                },
+            )
+        };
+
        let connect_once = |host, port| {
            info!("trying to connect to compute node at {host}:{port}");
-            TcpStream::connect((host, port)).and_then(|socket| async {
+            connect_with_timeout(host, port).and_then(|socket| async {
                let socket_addr = socket.peer_addr()?;
                // This prevents load balancer from severing the connection.
                socket2::SockRef::from(&socket).set_keepalive(true)?;
@@ -165,9 +188,8 @@ impl ConnCfg {
                Host::Unix(_) => continue, // unix sockets are not welcome here
            };

-            // TODO: maybe we should add a timeout.
            match connect_once(host, *port).await {
-                Ok(socket) => return Ok(socket),
+                Ok((sockaddr, stream)) => return Ok((sockaddr, stream, host)),
                Err(err) => {
                    // We can't throw an error here, as there might be more hosts to try.
                    warn!("couldn't connect to compute node at {host}:{port}: {err}");
@@ -187,7 +209,10 @@ impl ConnCfg {

 pub struct PostgresConnection {
    /// Socket connected to a compute node.
-    pub stream: TcpStream,
+    pub stream: tokio_postgres::maybe_tls_stream::MaybeTlsStream<
+        tokio::net::TcpStream,
+        postgres_native_tls::TlsStream<tokio::net::TcpStream>,
+    >,
    /// PostgreSQL connection parameters.
    pub params: std::collections::HashMap<String, String>,
    /// Query cancellation token.
@@ -195,11 +220,27 @@ pub struct PostgresConnection {
 }

 impl ConnCfg {
-    async fn do_connect(&self) -> Result<PostgresConnection, ConnectionError> {
-        // TODO: establish a secure connection to the DB.
-        let (socket_addr, mut stream) = self.connect_raw().await?;
-        let (client, connection) = self.0.connect_raw(&mut stream, NoTls).await?;
-        info!("connected to compute node at {socket_addr}");
+    async fn do_connect(
+        &self,
+        allow_self_signed_compute: bool,
+    ) -> Result<PostgresConnection, ConnectionError> {
+        let (socket_addr, stream, host) = self.connect_raw().await?;
+
+        let tls_connector = native_tls::TlsConnector::builder()
+            .danger_accept_invalid_certs(allow_self_signed_compute)
+            .build()
+            .unwrap();
+        let mut mk_tls = postgres_native_tls::MakeTlsConnector::new(tls_connector);
+        let tls = MakeTlsConnect::<tokio::net::TcpStream>::make_tls_connect(&mut mk_tls, host)?;
+
+        // connect_raw() will not use TLS if sslmode is "disable"
+        let (client, connection) = self.0.connect_raw(stream, tls).await?;
+        let stream = connection.stream.into_inner();
+
+        info!(
+            "connected to compute node at {host} ({socket_addr}) sslmode={:?}",
+            self.0.get_ssl_mode()
+        );

        // This is very ugly but as of now there's no better way to
        // extract the connection parameters from tokio-postgres' connection.
@@ -220,8 +261,11 @@ impl ConnCfg {
    }

    /// Connect to a corresponding compute node.
-    pub async fn connect(&self) -> Result<PostgresConnection, ConnectionError> {
-        self.do_connect()
+    pub async fn connect(
+        &self,
+        allow_self_signed_compute: bool,
+    ) -> Result<PostgresConnection, ConnectionError> {
+        self.do_connect(allow_self_signed_compute)
            .inspect_err(|err| {
                // Immediately log the error we have at our disposal.
                error!("couldn't connect to compute node: {err}");
--- a/proxy/src/config.rs
+++ b/proxy/src/config.rs
@@ -12,6 +12,7 @@ pub struct ProxyConfig {
    pub tls_config: Option<TlsConfig>,
    pub auth_backend: auth::BackendType<'static, ()>,
    pub metric_collection: Option<MetricCollectionConfig>,
+    pub allow_self_signed_compute: bool,
 }

 #[derive(Debug)]
--- a/proxy/src/console/provider.rs
+++ b/proxy/src/console/provider.rs
@@ -170,6 +170,9 @@ pub struct NodeInfo {

    /// Labels for proxy's metrics.
    pub aux: Arc<MetricsAuxInfo>,
+
+    /// Whether we should accept self-signed certificates (for testing)
+    pub allow_self_signed_compute: bool,
 }

 pub type NodeInfoCache = TimedLru<Arc<str>, NodeInfo>;
--- a/proxy/src/console/provider/mock.rs
+++ b/proxy/src/console/provider/mock.rs
@@ -8,6 +8,7 @@ use crate::{auth::ClientCredentials, compute, error::io_error, scram, url::ApiUr
 use async_trait::async_trait;
 use futures::TryFutureExt;
 use thiserror::Error;
+use tokio_postgres::config::SslMode;
 use tracing::{error, info, info_span, warn, Instrument};

 #[derive(Debug, Error)]
@@ -86,11 +87,13 @@ impl Api {
        let mut config = compute::ConnCfg::new();
        config
            .host(self.endpoint.host_str().unwrap_or("localhost"))
-            .port(self.endpoint.port().unwrap_or(5432));
+            .port(self.endpoint.port().unwrap_or(5432))
+            .ssl_mode(SslMode::Disable);

        let node = NodeInfo {
            config,
            aux: Default::default(),
+            allow_self_signed_compute: false,
        };

        Ok(node)
--- a/proxy/src/console/provider/neon.rs
+++ b/proxy/src/console/provider/neon.rs
@@ -8,6 +8,7 @@ use super::{
 use crate::{auth::ClientCredentials, compute, http, scram};
 use async_trait::async_trait;
 use futures::TryFutureExt;
+use tokio_postgres::config::SslMode;
 use tracing::{error, info, info_span, warn, Instrument};

 #[derive(Clone)]
@@ -100,11 +101,12 @@ impl Api {
            // We'll set username and such later using the startup message.
            // TODO: add more type safety (in progress).
            let mut config = compute::ConnCfg::new();
-            config.host(host).port(port);
+            config.host(host).port(port).ssl_mode(SslMode::Disable); // TLS is not configured on compute nodes.

            let node = NodeInfo {
                config,
                aux: body.aux.into(),
+                allow_self_signed_compute: false,
            };

            Ok(node)
--- a/proxy/src/lib.rs
+++ b/proxy/src/lib.rs
@@ -0,0 +1,57 @@
+use anyhow::{bail, Context};
+use futures::{Future, FutureExt};
+use tokio::task::JoinError;
+use tokio_util::sync::CancellationToken;
+use tracing::warn;
+
+pub mod auth;
+pub mod cache;
+pub mod cancellation;
+pub mod compute;
+pub mod config;
+pub mod console;
+pub mod error;
+pub mod http;
+pub mod logging;
+pub mod metrics;
+pub mod parse;
+pub mod proxy;
+pub mod sasl;
+pub mod scram;
+pub mod stream;
+pub mod url;
+pub mod waiters;
+
+/// Listens for unix signals and reacts to each of them.
+///
+/// * `SIGHUP` is logged and otherwise ignored.
+/// * `SIGINT` makes the function return an `interrupted` error.
+/// * `SIGTERM` cancels `token`; the loop keeps running afterwards.
+pub async fn handle_signals(token: CancellationToken) -> anyhow::Result<()> {
+    use tokio::signal::unix::{signal, SignalKind};
+
+    let mut sighup = signal(SignalKind::hangup())?;
+    let mut sigint = signal(SignalKind::interrupt())?;
+    let mut sigterm = signal(SignalKind::terminate())?;
+
+    loop {
+        tokio::select! {
+            // Commonly used to request a config reload, which we do not offer.
+            _ = sighup.recv() => {
+                warn!("received SIGHUP; config reload is not supported");
+            }
+            // Ends the whole application right away.
+            _ = sigint.recv() => {
+                warn!("received SIGINT, exiting immediately");
+                bail!("interrupted");
+            }
+            // Graceful shutdown: signal cancellation and keep listening.
+            _ = sigterm.recv() => {
+                warn!("received SIGTERM, shutting down once all existing connections have closed");
+                token.cancel();
+            }
+        }
+    }
+}
+
+/// Collapses the nested result of an awaited task handle into a single one.
+///
+/// A join failure is returned as an error carrying the context "join error";
+/// otherwise the task's own result is passed through unchanged.
+pub async fn flatten_err(
+    f: impl Future<Output = Result<anyhow::Result<()>, JoinError>>,
+) -> anyhow::Result<()> {
+    f.map(|joined| match joined.context("join error") {
+        Ok(task_result) => task_result,
+        Err(join_error) => Err(join_error),
+    })
+    .await
+}
--- a/Show More
+++ b/Show More