Remove pageserver from deploy script

[compute_ctl] Do not initialize last_active on start (#4137 )
Our scale-to-zero logic was optimized for short auto-suspend intervals, e.g. minutes or hours. In this case, if compute was restarted by k8s due to some reason (OOM, k8s node went down, pod relocation, etc.), `last_active` got bumped, we start counting auto-suspend timeout again. It's not a big deal, i.e. we suspend completely idle compute not after 5 minutes, but after 10 minutes or so. Yet, some clients may want days or even weeks. And chance that compute could be restarted during this interval is pretty high, but in this case we could be not able to suspend some computes for weeks. After this commit, we won't initialize `last_active` on start, so `/status` could return an unset attribute. This means that there was no user activity since start. Control-plane should deal with it by taking `max()` out of all available activity timestamps: `started_at`, `last_active`, etc. compute_ctl part of neondatabase/cloud#4853
2026-05-20 22:50:38 +00:00 · 2023-05-05 18:27:56 +03:00 · 2023-05-05 11:45:37 +02:00 · 2023-05-05 02:57:47 +03:00 · 2023-05-04 18:22:04 +01:00 · 2023-05-04 18:10:40 +03:00
195 changed files with 8215 additions and 2434 deletions
--- a/.config/hakari.toml
+++ b/.config/hakari.toml
@@ -4,7 +4,7 @@
 hakari-package = "workspace_hack"

 # Format for `workspace-hack = ...` lines in other Cargo.tomls. Requires cargo-hakari 0.9.8 or above.
-dep-format-version = "3"
+dep-format-version = "4"

 # Setting workspace.resolver = "2" in the root Cargo.toml is HIGHLY recommended.
 # Hakari works much better with the new feature resolver.
--- a/.github/ansible/prod.ap-southeast-1.hosts.yaml
+++ b/.github/ansible/prod.ap-southeast-1.hosts.yaml
@@ -17,7 +17,7 @@ storage:
          kind: "LayerAccessThreshold"
          period: "10m"
          threshold: &default_eviction_threshold "24h"
-      evictions_low_residence_duration_metric_threshold: *default_eviction_threshold
+        evictions_low_residence_duration_metric_threshold: *default_eviction_threshold
      remote_storage:
        bucket_name: "{{ bucket_name }}"
        bucket_region: "{{ bucket_region }}"
--- a/.github/ansible/prod.eu-central-1.hosts.yaml
+++ b/.github/ansible/prod.eu-central-1.hosts.yaml
@@ -17,7 +17,7 @@ storage:
          kind: "LayerAccessThreshold"
          period: "10m"
          threshold: &default_eviction_threshold "24h"
-      evictions_low_residence_duration_metric_threshold: *default_eviction_threshold
+        evictions_low_residence_duration_metric_threshold: *default_eviction_threshold
      remote_storage:
        bucket_name: "{{ bucket_name }}"
        bucket_region: "{{ bucket_region }}"
--- a/.github/ansible/prod.us-east-1.hosts.yaml
+++ b/.github/ansible/prod.us-east-1.hosts.yaml
@@ -0,0 +1,50 @@
+storage:
+  vars:
+    bucket_name: neon-prod-storage-us-east-1
+    bucket_region: us-east-1
+    console_mgmt_base_url: http://neon-internal-api.aws.neon.tech
+    broker_endpoint: http://storage-broker-lb.theta.us-east-1.internal.aws.neon.tech:50051
+    pageserver_config_stub:
+      pg_distrib_dir: /usr/local
+      metric_collection_endpoint: http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events
+      metric_collection_interval: 10min
+      disk_usage_based_eviction:
+        max_usage_pct: 85 # TODO: decrease to 80 after all pageservers are below 80
+        min_avail_bytes: 0
+        period: "10s"
+      tenant_config:
+        eviction_policy:
+          kind: "LayerAccessThreshold"
+          period: "10m"
+          threshold: &default_eviction_threshold "24h"
+        evictions_low_residence_duration_metric_threshold: *default_eviction_threshold
+      remote_storage:
+        bucket_name: "{{ bucket_name }}"
+        bucket_region: "{{ bucket_region }}"
+        prefix_in_bucket: "pageserver/v1"
+    safekeeper_s3_prefix: safekeeper/v1/wal
+    hostname_suffix: ""
+    remote_user: ssm-user
+    ansible_aws_ssm_region: us-east-1
+    ansible_aws_ssm_bucket_name: neon-prod-storage-us-east-1
+    console_region_id: aws-us-east-1
+    sentry_environment: production
+
+  children:
+    pageservers:
+      hosts:
+        pageserver-0.us-east-1.aws.neon.tech:
+          ansible_host: i-085222088b0d2e0c7
+        pageserver-1.us-east-1.aws.neon.tech:
+          ansible_host: i-0969d4f684d23a21e
+        pageserver-2.us-east-1.aws.neon.tech:
+          ansible_host: i-05dee87895da58dad
+
+    safekeepers:
+      hosts:
+        safekeeper-0.us-east-1.aws.neon.tech:
+          ansible_host: i-04ce739e88793d864
+        safekeeper-1.us-east-1.aws.neon.tech:
+          ansible_host: i-0e9e6c9227fb81410
+        safekeeper-2.us-east-1.aws.neon.tech:
+          ansible_host: i-072f4dd86a327d52f
--- a/.github/ansible/prod.us-east-2.hosts.yaml
+++ b/.github/ansible/prod.us-east-2.hosts.yaml
@@ -17,7 +17,7 @@ storage:
          kind: "LayerAccessThreshold"
          period: "10m"
          threshold: &default_eviction_threshold "24h"
-      evictions_low_residence_duration_metric_threshold: *default_eviction_threshold
+        evictions_low_residence_duration_metric_threshold: *default_eviction_threshold
      remote_storage:
        bucket_name: "{{ bucket_name }}"
        bucket_region: "{{ bucket_region }}"
--- a/.github/ansible/prod.us-west-2.hosts.yaml
+++ b/.github/ansible/prod.us-west-2.hosts.yaml
@@ -17,7 +17,7 @@ storage:
          kind: "LayerAccessThreshold"
          period: "10m"
          threshold: &default_eviction_threshold "24h"
-      evictions_low_residence_duration_metric_threshold: *default_eviction_threshold
+        evictions_low_residence_duration_metric_threshold: *default_eviction_threshold
      remote_storage:
        bucket_name: "{{ bucket_name }}"
        bucket_region: "{{ bucket_region }}"
@@ -34,13 +34,19 @@ storage:
    pageservers:
      hosts:
        pageserver-0.us-west-2.aws.neon.tech:
-          ansible_host: i-0d9f6dfae0e1c780d 
+          ansible_host: i-0d9f6dfae0e1c780d
        pageserver-1.us-west-2.aws.neon.tech:
          ansible_host: i-0c834be1dddba8b3f
        pageserver-2.us-west-2.aws.neon.tech:
          ansible_host: i-051642d372c0a4f32
        pageserver-3.us-west-2.aws.neon.tech:
          ansible_host: i-00c3844beb9ad1c6b
+        pageserver-4.us-west-2.aws.neon.tech:
+          ansible_host: i-013263dd1c239adcc
+        pageserver-6.us-west-2.aws.neon.tech:
+          ansible_host: i-01cdf7d2bc1433b6a
+        pageserver-7.us-west-2.aws.neon.tech:
+          ansible_host: i-02eec9b40617db5bc

    safekeepers:
      hosts:
@@ -49,5 +55,16 @@ storage:
        safekeeper-1.us-west-2.aws.neon.tech:
          ansible_host: i-074682f9d3c712e7c
        safekeeper-2.us-west-2.aws.neon.tech:
-          ansible_host: i-042b7efb1729d7966 
-          
+          ansible_host: i-042b7efb1729d7966
+        safekeeper-3.us-west-2.aws.neon.tech:
+          ansible_host: i-089f6b9ef426dff76
+        safekeeper-4.us-west-2.aws.neon.tech:
+          ansible_host: i-0fe6bf912c4710c82
+        safekeeper-5.us-west-2.aws.neon.tech:
+          ansible_host: i-0a83c1c46d2b4e409
+        safekeeper-6.us-west-2.aws.neon.tech:
+          ansible_host: i-0fef5317b8fdc9f8d
+        safekeeper-7.us-west-2.aws.neon.tech:
+          ansible_host: i-0be739190d4289bf9
+        safekeeper-8.us-west-2.aws.neon.tech:
+          ansible_host: i-00e851803669e5cfe                    
--- a/.github/ansible/staging.eu-central-1.hosts.yaml
+++ b/.github/ansible/staging.eu-central-1.hosts.yaml
@@ -0,0 +1,47 @@
+storage:
+  vars:
+    bucket_name: neon-dev-storage-eu-central-1
+    bucket_region: eu-central-1
+    # We only register/update storage in one preview console and manually copy to other instances
+    console_mgmt_base_url: http://neon-internal-api.helium.aws.neon.build
+    broker_endpoint: http://storage-broker-lb.alpha.eu-central-1.internal.aws.neon.build:50051
+    pageserver_config_stub:
+      pg_distrib_dir: /usr/local
+      metric_collection_endpoint: http://neon-internal-api.helium.aws.neon.build/billing/api/v1/usage_events
+      metric_collection_interval: 10min
+      disk_usage_based_eviction:
+        max_usage_pct: 80
+        min_avail_bytes: 0
+        period: "10s"
+      tenant_config:
+        eviction_policy:
+          kind: "LayerAccessThreshold"
+          period: "20m"
+          threshold: &default_eviction_threshold "20m"
+      evictions_low_residence_duration_metric_threshold: *default_eviction_threshold
+      remote_storage:
+        bucket_name: "{{ bucket_name }}"
+        bucket_region: "{{ bucket_region }}"
+        prefix_in_bucket: "pageserver/v1"
+    safekeeper_s3_prefix: safekeeper/v1/wal
+    hostname_suffix: ""
+    remote_user: ssm-user
+    ansible_aws_ssm_region: eu-central-1
+    ansible_aws_ssm_bucket_name: neon-dev-storage-eu-central-1
+    console_region_id: aws-eu-central-1
+    sentry_environment: staging
+
+  children:
+    pageservers:
+      hosts:
+        pageserver-0.eu-central-1.aws.neon.build:
+          ansible_host: i-011f93ec26cfba2d4
+
+    safekeepers:
+      hosts:
+        safekeeper-0.eu-central-1.aws.neon.build:
+          ansible_host: i-0ff026d27babf8ddd
+        safekeeper-1.eu-central-1.aws.neon.build:
+          ansible_host: i-03983a49ee54725d9
+        safekeeper-2.eu-central-1.aws.neon.build:
+          ansible_host: i-0bd025ecdb61b0db3
--- a/.github/ansible/staging.eu-west-1.hosts.yaml
+++ b/.github/ansible/staging.eu-west-1.hosts.yaml
@@ -17,7 +17,7 @@ storage:
          kind: "LayerAccessThreshold"
          period: "20m"
          threshold: &default_eviction_threshold "20m"
-      evictions_low_residence_duration_metric_threshold: *default_eviction_threshold
+        evictions_low_residence_duration_metric_threshold: *default_eviction_threshold
      remote_storage:
        bucket_name: "{{ bucket_name }}"
        bucket_region: "{{ bucket_region }}"
@@ -35,6 +35,8 @@ storage:
      hosts:
        pageserver-0.eu-west-1.aws.neon.build:
          ansible_host: i-01d496c5041c7f34c
+        pageserver-1.eu-west-1.aws.neon.build:
+          ansible_host: i-0e8013e239ce3928c

    safekeepers:
      hosts:
@@ -44,3 +46,15 @@ storage:
          ansible_host: i-06969ee1bf2958bfc
        safekeeper-2.eu-west-1.aws.neon.build:
          ansible_host: i-087892e9625984a0b
+        safekeeper-3.eu-west-1.aws.neon.build:
+          ansible_host: i-0a6f91660e99e8891
+        safekeeper-4.eu-west-1.aws.neon.build:
+          ansible_host: i-0012e309e28e7c249
+        safekeeper-5.eu-west-1.aws.neon.build:
+          ansible_host: i-085a2b1193287b32e
+        safekeeper-6.eu-west-1.aws.neon.build:
+          ansible_host: i-0c713248465ed0fbd
+        safekeeper-7.eu-west-1.aws.neon.build:
+          ansible_host: i-02ad231aed2a80b7a
+        safekeeper-8.eu-west-1.aws.neon.build:
+          ansible_host: i-0dbbd8ffef66efda8
--- a/.github/ansible/staging.us-east-2.hosts.yaml
+++ b/.github/ansible/staging.us-east-2.hosts.yaml
@@ -17,7 +17,7 @@ storage:
          kind: "LayerAccessThreshold"
          period: "20m"
          threshold: &default_eviction_threshold "20m"
-      evictions_low_residence_duration_metric_threshold: *default_eviction_threshold
+        evictions_low_residence_duration_metric_threshold: *default_eviction_threshold
      remote_storage:
        bucket_name: "{{ bucket_name }}"
        bucket_region: "{{ bucket_region }}"
@@ -48,9 +48,9 @@ storage:
      hosts:
        safekeeper-0.us-east-2.aws.neon.build:
          ansible_host: i-027662bd552bf5db0
-        safekeeper-1.us-east-2.aws.neon.build:
-          ansible_host: i-0171efc3604a7b907
        safekeeper-2.us-east-2.aws.neon.build:
          ansible_host: i-0de0b03a51676a6ce
+        safekeeper-3.us-east-2.aws.neon.build:
+          ansible_host: i-05f8ba2cda243bd18
        safekeeper-99.us-east-2.aws.neon.build:
          ansible_host: i-0d61b6a2ea32028d5
--- a/.github/helm-values/dev-eu-central-1-alpha.neon-storage-broker.yaml
+++ b/.github/helm-values/dev-eu-central-1-alpha.neon-storage-broker.yaml
@@ -0,0 +1,52 @@
+# Helm chart values for neon-storage-broker
+podLabels:
+  neon_env: staging
+  neon_service: storage-broker
+
+# Use L4 LB
+service:
+  # service.annotations -- Annotations to add to the service
+  annotations:
+    service.beta.kubernetes.io/aws-load-balancer-type: external  # use newer AWS Load Balancer Controller
+    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
+    service.beta.kubernetes.io/aws-load-balancer-scheme: internal  # deploy LB to private subnet
+    # assign service to this name at external-dns
+    external-dns.alpha.kubernetes.io/hostname: storage-broker-lb.alpha.eu-central-1.internal.aws.neon.build
+  # service.type -- Service type
+  type: LoadBalancer
+  # service.port -- broker listen port
+  port: 50051
+
+ingress:
+  enabled: false
+
+metrics:
+  enabled: false
+
+extraManifests:
+  - apiVersion: operator.victoriametrics.com/v1beta1
+    kind: VMServiceScrape
+    metadata:
+      name: "{{ include \"neon-storage-broker.fullname\" . }}"
+      labels:
+        helm.sh/chart: neon-storage-broker-{{ .Chart.Version }}
+        app.kubernetes.io/name: neon-storage-broker
+        app.kubernetes.io/instance: neon-storage-broker
+        app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
+        app.kubernetes.io/managed-by: Helm
+      namespace: "{{ .Release.Namespace }}"
+    spec:
+      selector:
+        matchLabels:
+          app.kubernetes.io/name: "neon-storage-broker"
+      endpoints:
+        - port: broker
+          path: /metrics
+          interval: 10s
+          scrapeTimeout: 10s
+      namespaceSelector:
+        matchNames:
+          - "{{ .Release.Namespace }}"
+
+settings:
+  sentryEnvironment: "staging"
--- a/.github/helm-values/dev-eu-central-1-alpha.pg-sni-router.yaml
+++ b/.github/helm-values/dev-eu-central-1-alpha.pg-sni-router.yaml
@@ -0,0 +1,19 @@
+useCertManager: true
+
+replicaCount: 3
+
+exposedService:
+  # exposedService.port -- Exposed Service proxy port
+  port: 4432
+  annotations:
+    external-dns.alpha.kubernetes.io/hostname: "*.snirouter.alpha.eu-central-1.internal.aws.neon.build"
+
+settings:
+  domain: "*.snirouter.alpha.eu-central-1.internal.aws.neon.build"
+  sentryEnvironment: "staging"
+
+imagePullSecrets:
+  - name: docker-hub-neon
+
+metrics:
+  enabled: false
--- a/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml
+++ b/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml
@@ -7,13 +7,13 @@ deploymentStrategy:
    maxSurge: 100%
    maxUnavailable: 50%

-# Delay the kill signal by 7 days (7 * 24 * 60 * 60)
+# Delay the kill signal by 5 minutes (5 * 60)
 # The pod(s) will stay in Terminating, keeps the existing connections
 # but doesn't receive new ones
 containerLifecycle:
  preStop:
    exec:
-      command: ["/bin/sh", "-c", "sleep 604800"]
+      command: ["/bin/sh", "-c", "sleep 300"]
 terminationGracePeriodSeconds: 604800

 image:
@@ -23,6 +23,7 @@ settings:
  authBackend: "console"
  authEndpoint: "http://neon-internal-api.aws.neon.build/management/api/v2"
  domain: "*.eu-west-1.aws.neon.build"
+  otelExporterOtlpEndpoint: "https://otel-collector.zeta.eu-west-1.internal.aws.neon.build"
  sentryEnvironment: "staging"
  wssPort: 8443
  metricCollectionEndpoint: "http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events"
--- a/.github/helm-values/dev-eu-west-1-zeta.pg-sni-router.yaml
+++ b/.github/helm-values/dev-eu-west-1-zeta.pg-sni-router.yaml
@@ -0,0 +1,19 @@
+useCertManager: true
+
+replicaCount: 3
+
+exposedService:
+  # exposedService.port -- Exposed Service proxy port
+  port: 4432
+  annotations:
+    external-dns.alpha.kubernetes.io/hostname: "*.snirouter.zeta.eu-west-1.internal.aws.neon.build"
+
+settings:
+  domain: "*.snirouter.zeta.eu-west-1.internal.aws.neon.build"
+  sentryEnvironment: "staging"
+
+imagePullSecrets:
+  - name: docker-hub-neon
+
+metrics:
+  enabled: false
--- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml
+++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml
@@ -9,6 +9,7 @@ settings:
  authEndpoint: "https://console.stage.neon.tech/authenticate_proxy_request/"
  uri: "https://console.stage.neon.tech/psql_session/"
  domain: "pg.neon.build"
+  otelExporterOtlpEndpoint: "https://otel-collector.beta.us-east-2.internal.aws.neon.build"
  sentryEnvironment: "staging"
  metricCollectionEndpoint: "http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events"
  metricCollectionInterval: "1min"
--- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml
+++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml
@@ -1,6 +1,22 @@
 # Helm chart values for neon-proxy-scram.
 # This is a YAML-formatted file.

+deploymentStrategy:
+  type: RollingUpdate
+  rollingUpdate:
+    maxSurge: 100%
+    maxUnavailable: 50%
+
+# Delay the kill signal by 5 minutes (5 * 60)
+# The pod(s) will stay in Terminating, keeps the existing connections
+# but doesn't receive new ones
+containerLifecycle:
+  preStop:
+    exec:
+      command: ["/bin/sh", "-c", "sleep 300"]
+terminationGracePeriodSeconds: 604800
+
+
 image:
  repository: neondatabase/neon

@@ -8,6 +24,7 @@ settings:
  authBackend: "console"
  authEndpoint: "http://neon-internal-api.aws.neon.build/management/api/v2"
  domain: "*.cloud.stage.neon.tech"
+  otelExporterOtlpEndpoint: "https://otel-collector.beta.us-east-2.internal.aws.neon.build"
  sentryEnvironment: "staging"
  wssPort: 8443
  metricCollectionEndpoint: "http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events"
--- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml
+++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml
@@ -7,15 +7,16 @@ deploymentStrategy:
    maxSurge: 100%
    maxUnavailable: 50%

-# Delay the kill signal by 7 days (7 * 24 * 60 * 60)
+# Delay the kill signal by 5 minutes (5 * 60)
 # The pod(s) will stay in Terminating, keeps the existing connections
 # but doesn't receive new ones
 containerLifecycle:
  preStop:
    exec:
-      command: ["/bin/sh", "-c", "sleep 604800"]
+      command: ["/bin/sh", "-c", "sleep 300"]
 terminationGracePeriodSeconds: 604800

+
 image:
  repository: neondatabase/neon

@@ -24,6 +25,7 @@ settings:
  authEndpoint: "http://neon-internal-api.aws.neon.build/management/api/v2"
  domain: "*.us-east-2.aws.neon.build"
  extraDomains: ["*.us-east-2.postgres.zenith.tech", "*.us-east-2.retooldb-staging.com"]
+  otelExporterOtlpEndpoint: "https://otel-collector.beta.us-east-2.internal.aws.neon.build"
  sentryEnvironment: "staging"
  wssPort: 8443
  metricCollectionEndpoint: "http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events"
--- a/.github/helm-values/dev-us-east-2-beta.pg-sni-router.yaml
+++ b/.github/helm-values/dev-us-east-2-beta.pg-sni-router.yaml
@@ -0,0 +1,19 @@
+useCertManager: true
+
+replicaCount: 3
+
+exposedService:
+  # exposedService.port -- Exposed Service proxy port
+  port: 4432
+  annotations:
+    external-dns.alpha.kubernetes.io/hostname: "*.snirouter.beta.us-east-2.internal.aws.neon.build"
+
+settings:
+  domain: "*.snirouter.beta.us-east-2.internal.aws.neon.build"
+  sentryEnvironment: "staging"
+
+imagePullSecrets:
+  - name: docker-hub-neon
+
+metrics:
+  enabled: false
--- a/.github/helm-values/preview-template.neon-proxy-scram.yaml
+++ b/.github/helm-values/preview-template.neon-proxy-scram.yaml
@@ -0,0 +1,67 @@
+# Helm chart values for neon-proxy-scram.
+# This is a YAML-formatted file.
+
+deploymentStrategy:
+  type: RollingUpdate
+  rollingUpdate:
+    maxSurge: 100%
+    maxUnavailable: 50%
+
+image:
+  repository: neondatabase/neon
+
+settings:
+  authBackend: "console"
+  authEndpoint: "http://neon-internal-api.${PREVIEW_NAME}.aws.neon.build/management/api/v2"
+  domain: "*.cloud.${PREVIEW_NAME}.aws.neon.build"
+  sentryEnvironment: "staging"
+  wssPort: 8443
+  metricCollectionEndpoint: "http://neon-internal-api.${PREVIEW_NAME}.aws.neon.build/billing/api/v1/usage_events"
+  metricCollectionInterval: "1min"
+
+# -- Additional labels for neon-proxy pods
+podLabels:
+  neon_service: proxy-scram
+  neon_env: test
+  neon_region: ${PREVIEW_NAME}.eu-central-1
+
+
+exposedService:
+  annotations:
+    service.beta.kubernetes.io/aws-load-balancer-type: external
+    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
+    service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
+    external-dns.alpha.kubernetes.io/hostname: cloud.${PREVIEW_NAME}.aws.neon.build
+  httpsPort: 443
+
+#metrics:
+#  enabled: true
+#  serviceMonitor:
+#    enabled: true
+#    selector:
+#      release: kube-prometheus-stack
+
+extraManifests:
+  - apiVersion: operator.victoriametrics.com/v1beta1
+    kind: VMServiceScrape
+    metadata:
+      name: "{{ include \"neon-proxy.fullname\" . }}"
+      labels:
+        helm.sh/chart: neon-proxy-{{ .Chart.Version }}
+        app.kubernetes.io/name: neon-proxy
+        app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}"
+        app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
+        app.kubernetes.io/managed-by: Helm
+      namespace: "{{ .Release.Namespace }}"
+    spec:
+      selector:
+        matchLabels:
+          app.kubernetes.io/name: "neon-proxy"
+      endpoints:
+        - port: http
+          path: /metrics
+          interval: 10s
+          scrapeTimeout: 10s
+      namespaceSelector:
+        matchNames:
+          - "{{ .Release.Namespace }}"
--- a/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml
+++ b/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml
@@ -7,13 +7,13 @@ deploymentStrategy:
    maxSurge: 100%
    maxUnavailable: 50%

-# Delay the kill signal by 7 days (7 * 24 * 60 * 60)
+# Delay the kill signal by 5 minutes (5 * 60)
 # The pod(s) will stay in Terminating, keeps the existing connections
 # but doesn't receive new ones
 containerLifecycle:
  preStop:
    exec:
-      command: ["/bin/sh", "-c", "sleep 604800"]
+      command: ["/bin/sh", "-c", "sleep 300"]
 terminationGracePeriodSeconds: 604800


--- a/.github/helm-values/prod-ap-southeast-1-epsilon.pg-sni-router.yaml
+++ b/.github/helm-values/prod-ap-southeast-1-epsilon.pg-sni-router.yaml
@@ -0,0 +1,19 @@
+useCertManager: true
+
+replicaCount: 3
+
+exposedService:
+  # exposedService.port -- Exposed Service proxy port
+  port: 4432
+  annotations:
+    external-dns.alpha.kubernetes.io/hostname: "*.snirouter.epsilon.ap-southeast-1.internal.aws.neon.tech"
+
+settings:
+  domain: "*.snirouter.epsilon.ap-southeast-1.internal.aws.neon.tech"
+  sentryEnvironment: "production"
+
+imagePullSecrets:
+  - name: docker-hub-neon
+
+metrics:
+  enabled: false
--- a/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml
+++ b/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml
@@ -7,13 +7,13 @@ deploymentStrategy:
    maxSurge: 100%
    maxUnavailable: 50%

-# Delay the kill signal by 7 days (7 * 24 * 60 * 60)
+# Delay the kill signal by 5 minutes (5 * 60)
 # The pod(s) will stay in Terminating, keeps the existing connections
 # but doesn't receive new ones
 containerLifecycle:
  preStop:
    exec:
-      command: ["/bin/sh", "-c", "sleep 604800"]
+      command: ["/bin/sh", "-c", "sleep 300"]
 terminationGracePeriodSeconds: 604800


--- a/.github/helm-values/prod-eu-central-1-gamma.pg-sni-router.yaml
+++ b/.github/helm-values/prod-eu-central-1-gamma.pg-sni-router.yaml
@@ -0,0 +1,19 @@
+useCertManager: true
+
+replicaCount: 3
+
+exposedService:
+  # exposedService.port -- Exposed Service proxy port
+  port: 4432
+  annotations:
+    external-dns.alpha.kubernetes.io/hostname: "*.snirouter.gamma.eu-central-1.internal.aws.neon.tech"
+
+settings:
+  domain: "*.snirouter.gamma.eu-central-1.internal.aws.neon.tech"
+  sentryEnvironment: "production"
+
+imagePullSecrets:
+  - name: docker-hub-neon
+
+metrics:
+  enabled: false
--- a/.github/helm-values/prod-us-east-1-theta.neon-proxy-scram.yaml
+++ b/.github/helm-values/prod-us-east-1-theta.neon-proxy-scram.yaml
@@ -0,0 +1,69 @@
+# Helm chart values for neon-proxy-scram.
+# This is a YAML-formatted file.
+
+deploymentStrategy:
+  type: RollingUpdate
+  rollingUpdate:
+    maxSurge: 100%
+    maxUnavailable: 50%
+
+# Delay the kill signal by 5 minutes (5 * 60)
+# The pod(s) will stay in Terminating, keeps the existing connections
+# but doesn't receive new ones
+containerLifecycle:
+  preStop:
+    exec:
+      command: ["/bin/sh", "-c", "sleep 300"]
+terminationGracePeriodSeconds: 604800
+
+image:
+  repository: neondatabase/neon
+
+settings:
+  authBackend: "console"
+  authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2"
+  domain: "*.us-east-1.aws.neon.tech"
+  # *.us-east-1.retooldb.com hasn't been delegated yet.
+  extraDomains: ["*.us-east-1.postgres.vercel-storage.com"]
+  sentryEnvironment: "production"
+  wssPort: 8443
+  metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events"
+  metricCollectionInterval: "10min"
+
+podLabels:
+  neon_service: proxy-scram
+  neon_env: prod
+  neon_region: us-east-1
+
+exposedService:
+  annotations:
+    service.beta.kubernetes.io/aws-load-balancer-type: external
+    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
+    service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
+    external-dns.alpha.kubernetes.io/hostname: us-east-1.aws.neon.tech
+  httpsPort: 443
+
+extraManifests:
+  - apiVersion: operator.victoriametrics.com/v1beta1
+    kind: VMServiceScrape
+    metadata:
+      name: "{{ include \"neon-proxy.fullname\" . }}"
+      labels:
+        helm.sh/chart: neon-proxy-{{ .Chart.Version }}
+        app.kubernetes.io/name: neon-proxy
+        app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}"
+        app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
+        app.kubernetes.io/managed-by: Helm
+      namespace: "{{ .Release.Namespace }}"
+    spec:
+      selector:
+        matchLabels:
+          app.kubernetes.io/name: "neon-proxy"
+      endpoints:
+        - port: http
+          path: /metrics
+          interval: 10s
+          scrapeTimeout: 10s
+      namespaceSelector:
+        matchNames:
+          - "{{ .Release.Namespace }}"
--- a/.github/helm-values/prod-us-east-1-theta.neon-storage-broker.yaml
+++ b/.github/helm-values/prod-us-east-1-theta.neon-storage-broker.yaml
@@ -0,0 +1,52 @@
+# Helm chart values for neon-storage-broker
+podLabels:
+  neon_env: production
+  neon_service: storage-broker
+
+# Use L4 LB
+service:
+  # service.annotations -- Annotations to add to the service
+  annotations:
+    service.beta.kubernetes.io/aws-load-balancer-type: external  # use newer AWS Load Balancer Controller
+    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
+    service.beta.kubernetes.io/aws-load-balancer-scheme: internal  # deploy LB to private subnet
+    # assign service to this name at external-dns
+    external-dns.alpha.kubernetes.io/hostname: storage-broker-lb.theta.us-east-1.internal.aws.neon.tech
+  # service.type -- Service type
+  type: LoadBalancer
+  # service.port -- broker listen port
+  port: 50051
+
+ingress:
+  enabled: false
+
+metrics:
+  enabled: false
+
+extraManifests:
+  - apiVersion: operator.victoriametrics.com/v1beta1
+    kind: VMServiceScrape
+    metadata:
+      name: "{{ include \"neon-storage-broker.fullname\" . }}"
+      labels:
+        helm.sh/chart: neon-storage-broker-{{ .Chart.Version }}
+        app.kubernetes.io/name: neon-storage-broker
+        app.kubernetes.io/instance: neon-storage-broker
+        app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
+        app.kubernetes.io/managed-by: Helm
+      namespace: "{{ .Release.Namespace }}"
+    spec:
+      selector:
+        matchLabels:
+          app.kubernetes.io/name: "neon-storage-broker"
+      endpoints:
+        - port: broker
+          path: /metrics
+          interval: 10s
+          scrapeTimeout: 10s
+      namespaceSelector:
+        matchNames:
+          - "{{ .Release.Namespace }}"
+
+settings:
+  sentryEnvironment: "production"
--- a/.github/helm-values/prod-us-east-1-theta.pg-sni-router.yaml
+++ b/.github/helm-values/prod-us-east-1-theta.pg-sni-router.yaml
@@ -0,0 +1,19 @@
+useCertManager: true
+
+replicaCount: 3
+
+exposedService:
+  # exposedService.port -- Exposed Service proxy port
+  port: 4432
+  annotations:
+    external-dns.alpha.kubernetes.io/hostname: "*.snirouter.theta.us-east-1.internal.aws.neon.tech"
+
+settings:
+  domain: "*.snirouter.theta.us-east-1.internal.aws.neon.tech"
+  sentryEnvironment: "production"
+
+imagePullSecrets:
+  - name: docker-hub-neon
+
+metrics:
+  enabled: false
--- a/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml
+++ b/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml
@@ -7,13 +7,13 @@ deploymentStrategy:
    maxSurge: 100%
    maxUnavailable: 50%

-# Delay the kill signal by 7 days (7 * 24 * 60 * 60)
+# Delay the kill signal by 5 minutes (5 * 60)
 # The pod(s) will stay in Terminating, keeps the existing connections
 # but doesn't receive new ones
 containerLifecycle:
  preStop:
    exec:
-      command: ["/bin/sh", "-c", "sleep 604800"]
+      command: ["/bin/sh", "-c", "sleep 300"]
 terminationGracePeriodSeconds: 604800


--- a/.github/helm-values/prod-us-east-2-delta.pg-sni-router.yaml
+++ b/.github/helm-values/prod-us-east-2-delta.pg-sni-router.yaml
@@ -0,0 +1,19 @@
+useCertManager: true
+
+replicaCount: 3
+
+exposedService:
+  # exposedService.port -- Exposed Service proxy port
+  port: 4432
+  annotations:
+    external-dns.alpha.kubernetes.io/hostname: "*.snirouter.delta.us-east-2.internal.aws.neon.tech"
+
+settings:
+  domain: "*.snirouter.delta.us-east-2.internal.aws.neon.tech"
+  sentryEnvironment: "production"
+
+imagePullSecrets:
+  - name: docker-hub-neon
+
+metrics:
+  enabled: false
--- a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram-legacy.yaml
+++ b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram-legacy.yaml
@@ -7,13 +7,13 @@ deploymentStrategy:
    maxSurge: 100%
    maxUnavailable: 50%

-# Delay the kill signal by 7 days (7 * 24 * 60 * 60)
+# Delay the kill signal by 5 minutes (5 * 60)
 # The pod(s) will stay in Terminating, keeps the existing connections
 # but doesn't receive new ones
 containerLifecycle:
  preStop:
    exec:
-      command: ["/bin/sh", "-c", "sleep 604800"]
+      command: ["/bin/sh", "-c", "sleep 300"]
 terminationGracePeriodSeconds: 604800


--- a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml
+++ b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml
@@ -7,13 +7,13 @@ deploymentStrategy:
    maxSurge: 100%
    maxUnavailable: 50%

-# Delay the kill signal by 7 days (7 * 24 * 60 * 60)
+# Delay the kill signal by 5 minutes (5 * 60)
 # The pod(s) will stay in Terminating, keeps the existing connections
 # but doesn't receive new ones
 containerLifecycle:
  preStop:
    exec:
-      command: ["/bin/sh", "-c", "sleep 604800"]
+      command: ["/bin/sh", "-c", "sleep 300"]
 terminationGracePeriodSeconds: 604800


--- a/.github/helm-values/prod-us-west-2-eta.pg-sni-router.yaml
+++ b/.github/helm-values/prod-us-west-2-eta.pg-sni-router.yaml
@@ -0,0 +1,19 @@
+useCertManager: true
+
+replicaCount: 3
+
+exposedService:
+  # exposedService.port -- Exposed Service proxy port
+  port: 4432
+  annotations:
+    external-dns.alpha.kubernetes.io/hostname: "*.snirouter.eta.us-west-2.internal.aws.neon.tech"
+
+settings:
+  domain: "*.snirouter.eta.us-west-2.internal.aws.neon.tech"
+  sentryEnvironment: "production"
+
+imagePullSecrets:
+  - name: docker-hub-neon
+
+metrics:
+  enabled: false
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -111,8 +111,21 @@ jobs:
      - name: Get postgres headers
        run: make postgres-headers -j$(nproc)

-      - name: Run cargo clippy
-        run: ./run_clippy.sh
+      # cargo hack runs the given cargo subcommand (clippy in this case) for all feature combinations.
+      # This will catch compiler & clippy warnings in all feature combinations.
+      # TODO: use cargo hack for build and test as well, but, that's quite expensive.
+      # NB: keep clippy args in sync with ./run_clippy.sh
+      - run: |
+          CLIPPY_COMMON_ARGS="$( source .neon_clippy_args; echo "$CLIPPY_COMMON_ARGS")"
+          if [ "$CLIPPY_COMMON_ARGS" = "" ]; then
+            echo "No clippy args found in .neon_clippy_args"
+            exit 1
+          fi
+          echo "CLIPPY_COMMON_ARGS=${CLIPPY_COMMON_ARGS}" >> $GITHUB_ENV
+      - name: Run cargo clippy (debug)
+        run: cargo hack --feature-powerset clippy $CLIPPY_COMMON_ARGS
+      - name: Run cargo clippy (release)
+        run: cargo hack --feature-powerset clippy --release $CLIPPY_COMMON_ARGS

      # Use `${{ !cancelled() }}` to run quck tests after the longer clippy run
      - name: Check formatting
@@ -405,10 +418,7 @@ jobs:
      - uses: actions/github-script@v6
        if: >
          !cancelled() &&
-          github.event_name == 'pull_request' && (
-            steps.create-allure-report-debug.outputs.report-url ||
-            steps.create-allure-report-release.outputs.report-url
-          )
+          github.event_name == 'pull_request'
        with:
          # Retry script for 5XX server errors: https://github.com/actions/github-script#retries
          retries: 5
@@ -541,7 +551,7 @@ jobs:
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
      options: --init
-    needs: [ push-docker-hub, tag ]
+    needs: [ promote-images, tag ]
    steps:
      - name: Set PR's status to pending and request a remote CI test
        run: |
@@ -584,8 +594,7 @@ jobs:
  neon-image:
    runs-on: [ self-hosted, gen3, large ]
    needs: [ tag ]
-    # https://github.com/GoogleContainerTools/kaniko/issues/2005
-    container: gcr.io/kaniko-project/executor:v1.7.0-debug
+    container: gcr.io/kaniko-project/executor:v1.9.2-debug
    defaults:
      run:
        shell: sh -eu {0}
@@ -597,11 +606,32 @@ jobs:
          submodules: true
          fetch-depth: 0

-      - name: Configure ECR login
-        run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
+      - name: Configure ECR and Docker Hub login
+        run: |
+          DOCKERHUB_AUTH=$(echo -n "${{ secrets.NEON_DOCKERHUB_USERNAME }}:${{ secrets.NEON_DOCKERHUB_PASSWORD }}" | base64)
+          echo "::add-mask::${DOCKERHUB_AUTH}"
+
+          cat <<-EOF > /kaniko/.docker/config.json
+            {
+              "auths": {
+                "https://index.docker.io/v1/": {
+                  "auth": "${DOCKERHUB_AUTH}"
+                }
+              },
+              "credHelpers": {
+                "369495373322.dkr.ecr.eu-central-1.amazonaws.com": "ecr-login"
+              }
+            }
+          EOF

      - name: Kaniko build neon
-        run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
+        run:
+          /kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true
+                           --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
+                           --context .
+                           --build-arg GIT_VERSION=${{ github.sha }}
+                           --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
+                           --destination neondatabase/neon:${{needs.tag.outputs.build-tag}}

      # Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied
      - name: Cleanup ECR folder
@@ -652,7 +682,7 @@ jobs:
  compute-tools-image:
    runs-on: [ self-hosted, gen3, large ]
    needs: [ tag ]
-    container: gcr.io/kaniko-project/executor:v1.7.0-debug
+    container: gcr.io/kaniko-project/executor:v1.9.2-debug
    defaults:
      run:
        shell: sh -eu {0}
@@ -661,18 +691,41 @@ jobs:
      - name: Checkout
        uses: actions/checkout@v1 # v3 won't work with kaniko

-      - name: Configure ECR login
-        run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
+      - name: Configure ECR and Docker Hub login
+        run: |
+          DOCKERHUB_AUTH=$(echo -n "${{ secrets.NEON_DOCKERHUB_USERNAME }}:${{ secrets.NEON_DOCKERHUB_PASSWORD }}" | base64)
+          echo "::add-mask::${DOCKERHUB_AUTH}"
+
+          cat <<-EOF > /kaniko/.docker/config.json
+            {
+              "auths": {
+                "https://index.docker.io/v1/": {
+                  "auth": "${DOCKERHUB_AUTH}"
+                }
+              },
+              "credHelpers": {
+                "369495373322.dkr.ecr.eu-central-1.amazonaws.com": "ecr-login"
+              }
+            }
+          EOF

      - name: Kaniko build compute tools
-        run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-tools --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}}
+        run:
+          /kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true
+                           --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
+                           --context .
+                           --build-arg GIT_VERSION=${{ github.sha }}
+                           --dockerfile Dockerfile.compute-tools
+                           --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}}
+                           --destination neondatabase/compute-tools:${{needs.tag.outputs.build-tag}}

+      # Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied
      - name: Cleanup ECR folder
        run: rm -rf ~/.ecr

  compute-node-image:
    runs-on: [ self-hosted, gen3, large ]
-    container: gcr.io/kaniko-project/executor:v1.7.0-debug
+    container: gcr.io/kaniko-project/executor:v1.9.2-debug
    needs: [ tag ]
    strategy:
      fail-fast: false
@@ -689,12 +742,36 @@ jobs:
          submodules: true
          fetch-depth: 0

-      - name: Configure ECR login
-        run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
+      - name: Configure ECR and Docker Hub login
+        run: |
+          DOCKERHUB_AUTH=$(echo -n "${{ secrets.NEON_DOCKERHUB_USERNAME }}:${{ secrets.NEON_DOCKERHUB_PASSWORD }}" | base64)
+          echo "::add-mask::${DOCKERHUB_AUTH}"
+
+          cat <<-EOF > /kaniko/.docker/config.json
+            {
+              "auths": {
+                "https://index.docker.io/v1/": {
+                  "auth": "${DOCKERHUB_AUTH}"
+                }
+              },
+              "credHelpers": {
+                "369495373322.dkr.ecr.eu-central-1.amazonaws.com": "ecr-login"
+              }
+            }
+          EOF

      - name: Kaniko build compute node with extensions
-        run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache  --context . --build-arg GIT_VERSION=${{ github.sha }} --build-arg PG_VERSION=${{ matrix.version }} --dockerfile Dockerfile.compute-node --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
+        run:
+          /kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true
+                           --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
+                           --context .
+                           --build-arg GIT_VERSION=${{ github.sha }}
+                           --build-arg PG_VERSION=${{ matrix.version }}
+                           --dockerfile Dockerfile.compute-node
+                           --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
+                           --destination neondatabase/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}

+      # Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied
      - name: Cleanup ECR folder
        run: rm -rf ~/.ecr

@@ -786,13 +863,11 @@ jobs:
    runs-on: [ self-hosted, gen3, small ]
    needs: [ tag, test-images, vm-compute-node-image ]
    container: golang:1.19-bullseye
-    if: github.event_name != 'workflow_dispatch'
+    # Don't add if-condition here.
+    # The job should always be run because we have dependant other jobs that shouldn't be skipped

    steps:
      - name: Install Crane & ECR helper
-        if: |
-          (github.ref_name == 'main' || github.ref_name == 'release') &&
-          github.event_name != 'workflow_dispatch'
        run: |
          go install github.com/google/go-containerregistry/cmd/crane@31786c6cbb82d6ec4fb8eb79cd9387905130534e # v0.11.0
          go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@69c85dc22db6511932bbf119e1a0cc5c90c69a7f # v0.6.0
@@ -802,10 +877,15 @@ jobs:
          mkdir /github/home/.docker/
          echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json

+      - name: Copy vm-compute-node images to Docker Hub
+        run: |
+          crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} vm-compute-node-v14
+          crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} vm-compute-node-v15
+
      - name: Add latest tag to images
        if: |
          (github.ref_name == 'main' || github.ref_name == 'release') &&
-          github.event_name != 'workflow_dispatch'
+           github.event_name != 'workflow_dispatch'
        run: |
          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} latest
          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} latest
@@ -814,50 +894,10 @@ jobs:
          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest

-      - name: Cleanup ECR folder
-        run: rm -rf ~/.ecr
-
-  push-docker-hub:
-    runs-on: [ self-hosted, dev, x64 ]
-    needs: [ promote-images, tag ]
-    container: golang:1.19-bullseye
-
-    steps:
-      - name: Install Crane & ECR helper
-        run: |
-          go install github.com/google/go-containerregistry/cmd/crane@31786c6cbb82d6ec4fb8eb79cd9387905130534e # v0.11.0
-          go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@69c85dc22db6511932bbf119e1a0cc5c90c69a7f # v0.6.0
-
-      - name: Configure ECR login
-        run: |
-          mkdir /github/home/.docker/
-          echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json
-
-      - name: Pull neon image from ECR
-        run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} neon
-
-      - name: Pull compute tools image from ECR
-        run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} compute-tools
-
-      - name: Pull compute node v14 image from ECR
-        run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} compute-node-v14
-
-      - name: Pull vm compute node v14 image from ECR
-        run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} vm-compute-node-v14
-
-      - name: Pull compute node v15 image from ECR
-        run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} compute-node-v15
-
-      - name: Pull vm compute node v15 image from ECR
-        run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} vm-compute-node-v15
-
-      - name: Pull rust image from ECR
-        run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned rust
-
      - name: Push images to production ECR
        if: |
          (github.ref_name == 'main' || github.ref_name == 'release') &&
-          github.event_name != 'workflow_dispatch'
+           github.event_name != 'workflow_dispatch'
        run: |
          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/neon:latest
          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest
@@ -872,28 +912,12 @@ jobs:
          echo "" > /github/home/.docker/config.json
          crane auth login -u ${{ secrets.NEON_DOCKERHUB_USERNAME }} -p ${{ secrets.NEON_DOCKERHUB_PASSWORD }} index.docker.io

-      - name: Push neon image to Docker Hub
-        run: crane push neon neondatabase/neon:${{needs.tag.outputs.build-tag}}
+      - name: Push vm-compute-node to Docker Hub
+        run: |
+          crane push vm-compute-node-v14 neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}}
+          crane push vm-compute-node-v15 neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}}

-      - name: Push compute tools image to Docker Hub
-        run: crane push compute-tools neondatabase/compute-tools:${{needs.tag.outputs.build-tag}}
-
-      - name: Push compute node v14 image to Docker Hub
-        run: crane push compute-node-v14 neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}}
-
-      - name: Push vm compute node v14 image to Docker Hub
-        run: crane push vm-compute-node-v14 neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}}
-
-      - name: Push compute node v15 image to Docker Hub
-        run: crane push compute-node-v15 neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}}
-
-      - name: Push vm compute node v15 image to Docker Hub
-        run: crane push vm-compute-node-v15 neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}}
-
-      - name: Push rust image to Docker Hub
-        run: crane push rust neondatabase/rust:pinned
-
-      - name: Add latest tag to images in Docker Hub
+      - name: Push latest tags to Docker Hub
        if: |
          (github.ref_name == 'main' || github.ref_name == 'release') &&
          github.event_name != 'workflow_dispatch'
@@ -913,7 +937,7 @@ jobs:
    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
    # We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version.
    # If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly
-    needs: [ push-docker-hub, tag, regress-tests ]
+    needs: [ promote-images, tag, regress-tests ]
    if: |
      contains(github.event.pull_request.labels.*.name, 'deploy-test-storage') &&
      github.event_name != 'workflow_dispatch'
@@ -947,7 +971,7 @@ jobs:
  deploy:
    runs-on: [ self-hosted, gen3, small ]
    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
-    needs: [ push-docker-hub, tag, regress-tests ]
+    needs: [ promote-images, tag, regress-tests ]
    if: ( github.ref_name == 'main' || github.ref_name == 'release' ) && github.event_name != 'workflow_dispatch'
    steps:
      - name: Fix git ownership
@@ -984,7 +1008,7 @@ jobs:
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      options: --init
-    needs: [ push-docker-hub, tag, regress-tests ]
+    needs: [ promote-images, tag, regress-tests ]
    if: github.ref_name == 'release' && github.event_name != 'workflow_dispatch'
    steps:
      - name: Promote compatibility snapshot for the release
--- a/.github/workflows/deploy-dev.yml
+++ b/.github/workflows/deploy-dev.yml
@@ -27,6 +27,11 @@ on:
        required: true
        type: boolean
        default: true
+      deployPgSniRouter:
+        description: 'Deploy pg-sni-router'
+        required: true
+        type: boolean
+        default: true

 env:
  AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
@@ -48,7 +53,8 @@ jobs:
        shell: bash
    strategy:
      matrix:
-        target_region: [ eu-west-1, us-east-2 ]
+        # TODO(sergey): Fix storage deploy in eu-central-1
+        target_region: [ eu-west-1, us-east-2]
    environment:
      name: dev-${{ matrix.target_region }}
    steps:
@@ -133,6 +139,53 @@ jobs:
  
      - name: Cleanup helm folder
        run: rm -rf ~/.cache
+
+  deploy-preview-proxy-new:
+    runs-on: [ self-hosted, gen3, small ]
+    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
+    if: inputs.deployProxy
+    defaults:
+      run:
+        shell: bash
+    strategy:
+      matrix:
+        include:
+          - target_region:  eu-central-1
+            target_cluster: dev-eu-central-1-alpha
+    environment:
+      name: dev-${{ matrix.target_region }}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+        with:
+          submodules: true
+          fetch-depth: 0
+          ref: ${{ inputs.branch }}
+  
+      - name: Configure AWS Credentials
+        uses: aws-actions/configure-aws-credentials@v1-node16
+        with:
+          role-to-assume: arn:aws:iam::369495373322:role/github-runner
+          aws-region: eu-central-1
+          role-skip-session-tagging: true
+          role-duration-seconds: 1800
+  
+      - name: Configure environment
+        run: |
+          helm repo add neondatabase https://neondatabase.github.io/helm-charts
+          aws --region ${{ matrix.target_region }} eks update-kubeconfig --name  ${{ matrix.target_cluster }}
+  
+      - name: Re-deploy preview proxies
+        run: |
+          DOCKER_TAG=${{ inputs.dockerTag }}
+          for PREVIEW_NAME in helium argon krypton xenon radon oganesson hydrogen nitrogen oxygen fluorine chlorine; do
+            export PREVIEW_NAME
+            envsubst <.github/helm-values/preview-template.neon-proxy-scram.yaml >preview-${PREVIEW_NAME}.neon-proxy-scram.yaml
+            helm upgrade neon-proxy-scram-${PREVIEW_NAME} neondatabase/neon-proxy --namespace neon-proxy-${PREVIEW_NAME} --create-namespace --install --atomic -f preview-${PREVIEW_NAME}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
+          done
+
+      - name: Cleanup helm folder
+        run: rm -rf ~/.cache
  
  deploy-storage-broker-new:
    runs-on: [ self-hosted, gen3, small ]
@@ -148,6 +201,8 @@ jobs:
            target_cluster: dev-us-east-2-beta
          - target_region:  eu-west-1
            target_cluster: dev-eu-west-1-zeta
+          - target_region:  eu-central-1
+            target_cluster: dev-eu-central-1-alpha
    environment:
      name: dev-${{ matrix.target_region }}
    steps:
@@ -177,3 +232,49 @@ jobs:
  
      - name: Cleanup helm folder
        run: rm -rf ~/.cache
+
+  deploy-pg-sni-router:
+    runs-on: [ self-hosted, gen3, small ]
+    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
+    if: inputs.deployPgSniRouter
+    defaults:
+      run:
+        shell: bash
+    strategy:
+      matrix:
+        include:
+          - target_region:  us-east-2
+            target_cluster: dev-us-east-2-beta
+          - target_region:  eu-west-1
+            target_cluster: dev-eu-west-1-zeta
+          - target_region:  eu-central-1
+            target_cluster: dev-eu-central-1-alpha
+    environment:
+      name: dev-${{ matrix.target_region }}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+        with:
+          submodules: true
+          fetch-depth: 0
+          ref: ${{ inputs.branch }}
+  
+      - name: Configure AWS Credentials
+        uses: aws-actions/configure-aws-credentials@v1-node16
+        with:
+          role-to-assume: arn:aws:iam::369495373322:role/github-runner
+          aws-region: eu-central-1
+          role-skip-session-tagging: true
+          role-duration-seconds: 1800
+  
+      - name: Configure environment
+        run: |
+          helm repo add neondatabase https://neondatabase.github.io/helm-charts
+          aws --region ${{ matrix.target_region }} eks update-kubeconfig --name  ${{ matrix.target_cluster }}
+  
+      - name: Deploy pg-sni-router
+        run:
+          helm upgrade neon-pg-sni-router neondatabase/neon-pg-sni-router --namespace neon-pg-sni-router --create-namespace --install --debug --atomic -f .github/helm-values/${{ matrix.target_cluster }}.pg-sni-router.yaml --set image.tag=${{ inputs.dockerTag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 15m0s
+  
+      - name: Cleanup helm folder
+        run: rm -rf ~/.cache
--- a/.github/workflows/deploy-prod.yml
+++ b/.github/workflows/deploy-prod.yml
@@ -27,6 +27,11 @@ on:
        required: true
        type: boolean
        default: true
+      deployPgSniRouter:
+        description: 'Deploy pg-sni-router'
+        required: true
+        type: boolean
+        default: true
      disclamerAcknowledged:
        description: 'I confirm that there is an emergency and I can not use regular release workflow'
        required: true
@@ -49,7 +54,7 @@ jobs:
        shell: bash
    strategy:
      matrix:
-        target_region: [ us-east-2, us-west-2, eu-central-1, ap-southeast-1 ]
+        target_region: [ us-east-2, us-west-2, eu-central-1, ap-southeast-1, us-east-1 ]
    environment:
      name: prod-${{ matrix.target_region }}
    steps:
@@ -97,6 +102,10 @@ jobs:
            target_cluster: prod-ap-southeast-1-epsilon
            deploy_link_proxy: false
            deploy_legacy_scram_proxy: false
+          - target_region: us-east-1
+            target_cluster: prod-us-east-1-theta
+            deploy_link_proxy: false
+            deploy_legacy_scram_proxy: false
    environment:
      name: prod-${{ matrix.target_region }}
    steps:
@@ -147,6 +156,8 @@ jobs:
            target_cluster: prod-eu-central-1-gamma
          - target_region: ap-southeast-1
            target_cluster: prod-ap-southeast-1-epsilon
+          - target_region: us-east-1
+            target_cluster: prod-us-east-1-theta
    environment:
      name: prod-${{ matrix.target_region }}
    steps:
@@ -165,3 +176,42 @@ jobs:
      - name: Deploy storage-broker
        run:
          helm upgrade neon-storage-broker-lb neondatabase/neon-storage-broker --namespace neon-storage-broker-lb --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${{ inputs.dockerTag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s
+
+  deploy-pg-sni-router:
+    runs-on: prod
+    container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
+    if: inputs.deployPgSniRouter && inputs.disclamerAcknowledged
+    defaults:
+      run:
+        shell: bash
+    strategy:
+      matrix:
+        include:
+          - target_region:  us-east-2
+            target_cluster: prod-us-east-2-delta
+          - target_region:  us-west-2
+            target_cluster: prod-us-west-2-eta
+          - target_region: eu-central-1
+            target_cluster: prod-eu-central-1-gamma
+          - target_region: ap-southeast-1
+            target_cluster: prod-ap-southeast-1-epsilon
+          - target_region: us-east-1
+            target_cluster: prod-us-east-1-theta
+    environment:
+      name: prod-${{ matrix.target_region }}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+        with:
+          submodules: true
+          fetch-depth: 0
+          ref: ${{ inputs.branch }}
+  
+      - name: Configure environment
+        run: |
+          helm repo add neondatabase https://neondatabase.github.io/helm-charts
+          aws --region ${{ matrix.target_region }} eks update-kubeconfig --name  ${{ matrix.target_cluster }}
+  
+      - name: Deploy pg-sni-router
+        run:
+          helm upgrade neon-pg-sni-router neondatabase/neon-pg-sni-router --namespace neon-pg-sni-router --create-namespace --install --debug --atomic -f .github/helm-values/${{ matrix.target_cluster }}.pg-sni-router.yaml --set image.tag=${{ inputs.dockerTag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 15m0s
--- a/.neon_clippy_args
+++ b/.neon_clippy_args
@@ -0,0 +1,4 @@
+# * `-A unknown_lints` – do not warn about unknown lint suppressions
+#                        that people with newer toolchains might use
+# * `-D warnings`      - fail on any warnings (`cargo` returns non-zero exit status)
+export CLIPPY_COMMON_ARGS="--locked --workspace --all-targets -- -A unknown_lints -D warnings"
--- a/Cargo.lock
+++ b/Cargo.lock
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -24,10 +24,10 @@ atty = "0.2.14"
 aws-config = { version = "0.51.0", default-features = false, features=["rustls"] }
 aws-sdk-s3 = "0.21.0"
 aws-smithy-http = "0.51.0"
-aws-types = "0.51.0"
+aws-types = "0.55"
 base64 = "0.13.0"
 bincode = "1.3"
-bindgen = "0.61"
+bindgen = "0.65"
 bstr = "1.0"
 byteorder = "1.4"
 bytes = "1.0"
@@ -50,7 +50,7 @@ git-version = "0.3"
 hashbrown = "0.13"
 hashlink = "0.8.1"
 hex = "0.4"
-hex-literal = "0.3"
+hex-literal = "0.4"
 hmac = "0.12.1"
 hostname = "0.3.1"
 humantime = "2.1"
@@ -62,6 +62,7 @@ jsonwebtoken = "8"
 libc = "0.2"
 md5 = "0.7.0"
 memoffset = "0.8"
+native-tls = "0.2"
 nix = "0.26"
 notify = "5.0.0"
 num_cpus = "1.15"
@@ -80,18 +81,18 @@ reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"
 reqwest-tracing = { version = "0.4.0", features = ["opentelemetry_0_18"] }
 reqwest-middleware = "0.2.0"
 routerify = "3"
-rpds = "0.12.0"
+rpds = "0.13"
 rustls = "0.20"
 rustls-pemfile = "1"
 rustls-split = "0.3"
 scopeguard = "1.1"
-sentry = { version = "0.29", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] }
+sentry = { version = "0.30", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] }
 serde = { version = "1.0", features = ["derive"] }
 serde_json = "1"
 serde_with = "2.0"
 sha2 = "0.10.2"
 signal-hook = "0.3"
-socket2 = "0.4.4"
+socket2 = "0.5"
 strum = "0.24"
 strum_macros = "0.24"
 svg_fmt = "0.4.1"
@@ -106,27 +107,29 @@ tokio-postgres-rustls = "0.9.0"
 tokio-rustls = "0.23"
 tokio-stream = "0.1"
 tokio-util = { version = "0.7", features = ["io"] }
-toml = "0.5"
-toml_edit = { version = "0.17", features = ["easy"] }
-tonic = {version = "0.8", features = ["tls", "tls-roots"]}
+toml = "0.7"
+toml_edit = "0.19"
+tonic = {version = "0.9", features = ["tls", "tls-roots"]}
 tracing = "0.1"
+tracing-error = "0.2.0"
 tracing-opentelemetry = "0.18.0"
 tracing-subscriber = { version = "0.3", features = ["env-filter"] }
 url = "2.2"
 uuid = { version = "1.2", features = ["v4", "serde"] }
 walkdir = "2.3.2"
-webpki-roots = "0.22.5"
-x509-parser = "0.14"
+webpki-roots = "0.23"
+x509-parser = "0.15"

 ## TODO replace this with tracing
 env_logger = "0.10"
 log = "0.4"

 ## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
-postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
-postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
+postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="0bc41d8503c092b040142214aac3cf7d11d0c19f" }
+postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="0bc41d8503c092b040142214aac3cf7d11d0c19f" }
+postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="0bc41d8503c092b040142214aac3cf7d11d0c19f" }
+postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="0bc41d8503c092b040142214aac3cf7d11d0c19f" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="0bc41d8503c092b040142214aac3cf7d11d0c19f" }
 tokio-tar = { git = "https://github.com/neondatabase/tokio-tar.git", rev="404df61437de0feef49ba2ccdbdd94eb8ad6e142" }

 ## Other git libraries
@@ -154,14 +157,20 @@ workspace_hack = { version = "0.1", path = "./workspace_hack/" }
 ## Build dependencies
 criterion = "0.4"
 rcgen = "0.10"
-rstest = "0.16"
+rstest = "0.17"
 tempfile = "3.4"
-tonic-build = "0.8"
+tonic-build = "0.9"
+
+[patch.crates-io]

 # This is only needed for proxy's tests.
 # TODO: we should probably fork `tokio-postgres-rustls` instead.
-[patch.crates-io]
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="0bc41d8503c092b040142214aac3cf7d11d0c19f" }
+
+# Changes the MAX_THREADS limit from 4096 to 32768.
+# This is a temporary workaround for using tracing from many threads in safekeepers code,
+# until async safekeepers patch is merged to the main.
+sharded-slab = { git = "https://github.com/neondatabase/sharded-slab.git", rev="98d16753ab01c61f0a028de44167307a00efea00" }

 ################# Binary contents sections

--- a/11
+++ b/11
@@ -44,7 +44,15 @@ COPY --chown=nonroot . .
 # Show build caching stats to check if it was used in the end.
 # Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats.
 RUN set -e \
-&& mold -run cargo build --bin pageserver --bin pageserver_binutils --bin draw_timeline_dir --bin safekeeper --bin storage_broker --bin proxy --locked --release \
+    && mold -run cargo build  \
+      --bin pg_sni_router  \
+      --bin pageserver  \
+      --bin pageserver_binutils  \
+      --bin draw_timeline_dir \
+      --bin safekeeper  \
+      --bin storage_broker  \
+      --bin proxy  \
+      --locked --release \
    && cachepot -s

 # Build final image
@@ -63,6 +71,7 @@ RUN set -e \
    && useradd -d /data neon \
    && chown -R neon:neon /data

+COPY --from=build --chown=neon:neon /home/nonroot/target/release/pg_sni_router       /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver          /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver_binutils /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/draw_timeline_dir   /usr/local/bin
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -12,7 +12,7 @@ FROM debian:bullseye-slim AS build-deps
 RUN apt update &&  \
    apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \
    zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libssl-dev \
-    libicu-dev libxslt1-dev
+    libicu-dev libxslt1-dev liblz4-dev libzstd-dev

 #########################################################################################
 #
@@ -24,8 +24,13 @@ FROM build-deps AS pg-build
 ARG PG_VERSION
 COPY vendor/postgres-${PG_VERSION} postgres
 RUN cd postgres && \
-    ./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp --with-icu \
-    --with-libxml --with-libxslt && \
+    export CONFIGURE_CMD="./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp \
+    --with-icu --with-libxml --with-libxslt --with-lz4" && \
+    if [ "${PG_VERSION}" != "v14" ]; then \
+        # zstd is available only from PG15
+        export CONFIGURE_CMD="${CONFIGURE_CMD} --with-zstd"; \
+    fi && \
+    eval $CONFIGURE_CMD && \
    make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \
    make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \
    # Install headers
@@ -565,13 +570,17 @@ COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-deb
 # Install:
 # libreadline8 for psql
 # libicu67, locales for collations (including ICU and plpgsql_check)
+# liblz4-1 for lz4
 # libossp-uuid16 for extension ossp-uuid
 # libgeos, libgdal, libsfcgal1, libproj and libprotobuf-c1 for PostGIS
 # libxml2, libxslt1.1 for xml2
+# libzstd1 for zstd
 RUN apt update &&  \
    apt install --no-install-recommends -y \
+        gdb \
        locales \
        libicu67 \
+        liblz4-1 \
        libreadline8 \
        libossp-uuid16 \
        libgeos-c1v5 \
@@ -581,7 +590,8 @@ RUN apt update &&  \
        libsfcgal1 \
        libxml2 \
        libxslt1.1 \
-        gdb && \
+        libzstd1 \
+        procps && \
    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
    localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8

--- a/Dockerfile.vm-compute-node
+++ b/Dockerfile.vm-compute-node
@@ -54,7 +54,7 @@ RUN set -e \

 RUN set -e \
 	&& echo "::sysinit:cgconfigparser -l /etc/cgconfig.conf -s 1664" >> /etc/inittab \
-	&& CONNSTR="dbname=neondb user=cloud_admin sslmode=disable" \
+	&& CONNSTR="dbname=postgres user=cloud_admin sslmode=disable" \
 	&& ARGS="--auto-restart --cgroup=neon-postgres --pgconnstr=\"$CONNSTR\"" \
 	&& echo "::respawn:su vm-informant -c '/usr/local/bin/vm-informant $ARGS'" >> /etc/inittab

--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -73,7 +73,7 @@ fn main() -> Result<()> {
    // Try to use just 'postgres' if no path is provided
    let pgbin = matches.get_one::<String>("pgbin").unwrap();

-    let mut spec = None;
+    let spec;
    let mut live_config_allowed = false;
    match spec_json {
        // First, try to get cluster spec from the cli argument
@@ -89,9 +89,13 @@ fn main() -> Result<()> {
            } else if let Some(id) = compute_id {
                if let Some(cp_base) = control_plane_uri {
                    live_config_allowed = true;
-                    if let Ok(s) = get_spec_from_control_plane(cp_base, id) {
-                        spec = Some(s);
-                    }
+                    spec = match get_spec_from_control_plane(cp_base, id) {
+                        Ok(s) => s,
+                        Err(e) => {
+                            error!("cannot get response from control plane: {}", e);
+                            panic!("neither spec nor confirmation that compute is in the Empty state was received");
+                        }
+                    };
                } else {
                    panic!("must specify both --control-plane-uri and --compute-id or none");
                }
@@ -114,7 +118,6 @@ fn main() -> Result<()> {
        spec_set = false;
    }
    let compute_node = ComputeNode {
-        start_time: Utc::now(),
        connstr: Url::parse(connstr).context("cannot parse connstr as a URL")?,
        pgdata: pgdata.to_string(),
        pgbin: pgbin.to_string(),
@@ -147,6 +150,17 @@ fn main() -> Result<()> {
    let mut state = compute.state.lock().unwrap();
    let pspec = state.pspec.as_ref().expect("spec must be set");
    let startup_tracing_context = pspec.spec.startup_tracing_context.clone();
+
+    // Record for how long we slept waiting for the spec.
+    state.metrics.wait_for_spec_ms = Utc::now()
+        .signed_duration_since(state.start_time)
+        .to_std()
+        .unwrap()
+        .as_millis() as u64;
+    // Reset start time to the actual start of the configuration, so that
+    // total startup time was properly measured at the end.
+    state.start_time = Utc::now();
+
    state.status = ComputeStatus::Init;
    compute.state_changed.notify_all();
    drop(state);
--- a/compute_tools/src/checker.rs
+++ b/compute_tools/src/checker.rs
@@ -1,12 +1,28 @@
 use anyhow::{anyhow, Result};
-use postgres::Client;
 use tokio_postgres::NoTls;
 use tracing::{error, instrument};

 use crate::compute::ComputeNode;

+/// Update timestamp in a row in a special service table to check
+/// that we can actually write some data in this particular timeline.
+/// Create table if it's missing.
 #[instrument(skip_all)]
-pub fn create_writability_check_data(client: &mut Client) -> Result<()> {
+pub async fn check_writability(compute: &ComputeNode) -> Result<()> {
+    // Connect to the database.
+    let (client, connection) = tokio_postgres::connect(compute.connstr.as_str(), NoTls).await?;
+    if client.is_closed() {
+        return Err(anyhow!("connection to postgres closed"));
+    }
+
+    // The connection object performs the actual communication with the database,
+    // so spawn it off to run on its own.
+    tokio::spawn(async move {
+        if let Err(e) = connection.await {
+            error!("connection error: {}", e);
+        }
+    });
+
    let query = "
    CREATE TABLE IF NOT EXISTS health_check (
        id serial primary key,
@@ -15,31 +31,15 @@ pub fn create_writability_check_data(client: &mut Client) -> Result<()> {
    INSERT INTO health_check VALUES (1, now())
        ON CONFLICT (id) DO UPDATE
         SET updated_at = now();";
-    let result = client.simple_query(query)?;
-    if result.len() < 2 {
-        return Err(anyhow::format_err!("executed  {} queries", result.len()));
-    }
-    Ok(())
-}
-
-#[instrument(skip_all)]
-pub async fn check_writability(compute: &ComputeNode) -> Result<()> {
-    let (client, connection) = tokio_postgres::connect(compute.connstr.as_str(), NoTls).await?;
-    if client.is_closed() {
-        return Err(anyhow!("connection to postgres closed"));
-    }
-    tokio::spawn(async move {
-        if let Err(e) = connection.await {
-            error!("connection error: {}", e);
-        }
-    });
-
-    let result = client
-        .simple_query("UPDATE health_check SET updated_at = now() WHERE id = 1;")
-        .await?;
-
-    if result.len() != 1 {
-        return Err(anyhow!("statement can't be executed"));
+
+    let result = client.simple_query(query).await?;
+
+    if result.len() != 2 {
+        return Err(anyhow::format_err!(
+            "expected 2 query results, but got {}",
+            result.len()
+        ));
    }
+
    Ok(())
 }
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -30,16 +30,14 @@ use utils::id::{TenantId, TimelineId};
 use utils::lsn::Lsn;

 use compute_api::responses::{ComputeMetrics, ComputeStatus};
-use compute_api::spec::ComputeSpec;
+use compute_api::spec::{ComputeMode, ComputeSpec};

-use crate::checker::create_writability_check_data;
 use crate::config;
 use crate::pg_helpers::*;
 use crate::spec::*;

 /// Compute node info shared across several `compute_ctl` threads.
 pub struct ComputeNode {
-    pub start_time: DateTime<Utc>,
    // Url type maintains proper escaping
    pub connstr: url::Url,
    pub pgdata: String,
@@ -67,9 +65,11 @@ pub struct ComputeNode {

 #[derive(Clone, Debug)]
 pub struct ComputeState {
+    pub start_time: DateTime<Utc>,
    pub status: ComputeStatus,
-    /// Timestamp of the last Postgres activity
-    pub last_active: DateTime<Utc>,
+    /// Timestamp of the last Postgres activity. It could be `None` if
+    /// compute wasn't used since start.
+    pub last_active: Option<DateTime<Utc>>,
    pub error: Option<String>,
    pub pspec: Option<ParsedSpec>,
    pub metrics: ComputeMetrics,
@@ -78,8 +78,9 @@ pub struct ComputeState {
 impl ComputeState {
    pub fn new() -> Self {
        Self {
+            start_time: Utc::now(),
            status: ComputeStatus::Empty,
-            last_active: Utc::now(),
+            last_active: None,
            error: None,
            pspec: None,
            metrics: ComputeMetrics::default(),
@@ -250,40 +251,54 @@ impl ComputeNode {
    #[instrument(skip(self, compute_state))]
    pub fn prepare_pgdata(&self, compute_state: &ComputeState) -> Result<()> {
        let pspec = compute_state.pspec.as_ref().expect("spec must be set");
+        let spec = &pspec.spec;
        let pgdata_path = Path::new(&self.pgdata);

        // Remove/create an empty pgdata directory and put configuration there.
        self.create_pgdata()?;
        config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &pspec.spec)?;

-        let start_lsn = if let Some(lsn) = pspec.spec.start_lsn {
-            lsn
-        } else {
-            info!("starting safekeepers syncing");
-            let last_lsn = self
-                .sync_safekeepers(pspec.storage_auth_token.clone())
-                .with_context(|| "failed to sync safekeepers")?;
-            info!("safekeepers synced at LSN {}", last_lsn);
-            last_lsn
+        // Syncing safekeepers is only safe with primary nodes: if a primary
+        // is already connected it will be kicked out, so a secondary (standby)
+        // cannot sync safekeepers.
+        let lsn = match spec.mode {
+            ComputeMode::Primary => {
+                info!("starting safekeepers syncing");
+                let lsn = self
+                    .sync_safekeepers(pspec.storage_auth_token.clone())
+                    .with_context(|| "failed to sync safekeepers")?;
+                info!("safekeepers synced at LSN {}", lsn);
+                lsn
+            }
+            ComputeMode::Static(lsn) => {
+                info!("Starting read-only node at static LSN {}", lsn);
+                lsn
+            }
+            ComputeMode::Replica => {
+                info!("Initializing standby from latest Pageserver LSN");
+                Lsn(0)
+            }
        };

        info!(
            "getting basebackup@{} from pageserver {}",
-            start_lsn, &pspec.pageserver_connstr
+            lsn, &pspec.pageserver_connstr
        );
-        self.get_basebackup(compute_state, start_lsn)
-            .with_context(|| {
-                format!(
-                    "failed to get basebackup@{} from pageserver {}",
-                    start_lsn, &pspec.pageserver_connstr
-                )
-            })?;
+        self.get_basebackup(compute_state, lsn).with_context(|| {
+            format!(
+                "failed to get basebackup@{} from pageserver {}",
+                lsn, &pspec.pageserver_connstr
+            )
+        })?;

        // Update pg_hba.conf received with basebackup.
        update_pg_hba(pgdata_path)?;

-        if pspec.spec.standby {
-            std::fs::File::create(pgdata_path.join("standby.signal"))?;
+        match spec.mode {
+            ComputeMode::Primary | ComputeMode::Static(..) => {}
+            ComputeMode::Replica => {
+                add_standby_signal(pgdata_path)?;
+            }
        }

        Ok(())
@@ -352,7 +367,6 @@ impl ComputeNode {
        handle_databases(spec, &mut client)?;
        handle_role_deletions(spec, self.connstr.as_str(), &mut client)?;
        handle_grants(spec, self.connstr.as_str(), &mut client)?;
-        create_writability_check_data(&mut client)?;
        handle_extensions(spec, &mut client)?;

        // 'Close' connection
@@ -389,11 +403,13 @@ impl ComputeNode {
        self.pg_reload_conf(&mut client)?;

        // Proceed with post-startup configuration. Note, that order of operations is important.
-        handle_roles(&spec, &mut client)?;
-        handle_databases(&spec, &mut client)?;
-        handle_role_deletions(&spec, self.connstr.as_str(), &mut client)?;
-        handle_grants(&spec, self.connstr.as_str(), &mut client)?;
-        handle_extensions(&spec, &mut client)?;
+        if spec.mode == ComputeMode::Primary {
+            handle_roles(&spec, &mut client)?;
+            handle_databases(&spec, &mut client)?;
+            handle_role_deletions(&spec, self.connstr.as_str(), &mut client)?;
+            handle_grants(&spec, self.connstr.as_str(), &mut client)?;
+            handle_extensions(&spec, &mut client)?;
+        }

        // 'Close' connection
        drop(client);
@@ -426,7 +442,7 @@ impl ComputeNode {

        let pg = self.start_postgres(spec.storage_auth_token.clone())?;

-        if !spec.spec.standby {
+        if spec.spec.mode == ComputeMode::Primary {
            self.apply_config(&compute_state)?;
        }

@@ -439,7 +455,7 @@ impl ComputeNode {
                .unwrap()
                .as_millis() as u64;
            state.metrics.total_startup_ms = startup_end_time
-                .signed_duration_since(self.start_time)
+                .signed_duration_since(compute_state.start_time)
                .to_std()
                .unwrap()
                .as_millis() as u64;
--- a/compute_tools/src/config.rs
+++ b/compute_tools/src/config.rs
@@ -6,7 +6,7 @@ use std::path::Path;
 use anyhow::Result;

 use crate::pg_helpers::PgOptionsSerialize;
-use compute_api::spec::ComputeSpec;
+use compute_api::spec::{ComputeMode, ComputeSpec};

 /// Check that `line` is inside a text file and put it there if it is not.
 /// Create file if it doesn't exist.
@@ -34,17 +34,25 @@ pub fn line_in_file(path: &Path, line: &str) -> Result<bool> {
 /// Create or completely rewrite configuration file specified by `path`
 pub fn write_postgres_conf(path: &Path, spec: &ComputeSpec) -> Result<()> {
    // File::create() destroys the file content if it exists.
-    let mut postgres_conf = File::create(path)?;
+    let mut file = File::create(path)?;

-    write_auto_managed_block(&mut postgres_conf, &spec.cluster.settings.as_pg_settings())?;
-
-    Ok(())
-}
-
-// Write Postgres config block wrapped with generated comment section
-fn write_auto_managed_block(file: &mut File, buf: &str) -> Result<()> {
    writeln!(file, "# Managed by compute_ctl: begin")?;
-    writeln!(file, "{}", buf)?;
+
+    write!(file, "{}", &spec.cluster.settings.as_pg_settings())?;
+
+    match spec.mode {
+        ComputeMode::Primary => {}
+        ComputeMode::Static(lsn) => {
+            // hot_standby is 'on' by default, but let's be explicit
+            writeln!(file, "hot_standby=on")?;
+            writeln!(file, "recovery_target_lsn='{lsn}'")?;
+        }
+        ComputeMode::Replica => {
+            // hot_standby is 'on' by default, but let's be explicit
+            writeln!(file, "hot_standby=on")?;
+        }
+    }
+
    writeln!(file, "# Managed by compute_ctl: end")?;

    Ok(())
--- a/compute_tools/src/http/api.rs
+++ b/compute_tools/src/http/api.rs
@@ -18,6 +18,7 @@ use tracing_utils::http::OtelName;

 fn status_response_from_state(state: &ComputeState) -> ComputeStatusResponse {
    ComputeStatusResponse {
+        start_time: state.start_time,
        tenant: state
            .pspec
            .as_ref()
@@ -85,7 +86,10 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
            let res = crate::checker::check_writability(compute).await;
            match res {
                Ok(_) => Response::new(Body::from("true")),
-                Err(e) => Response::new(Body::from(e.to_string())),
+                Err(e) => {
+                    error!("check_writability failed: {}", e);
+                    Response::new(Body::from(e.to_string()))
+                }
            }
        }

--- a/compute_tools/src/http/openapi_spec.yaml
+++ b/compute_tools/src/http/openapi_spec.yaml
@@ -152,11 +152,14 @@ components:
      type: object
      description: Compute startup metrics.
      required:
+        - wait_for_spec_ms
        - sync_safekeepers_ms
        - basebackup_ms
        - config_ms
        - total_startup_ms
      properties:
+        wait_for_spec_ms:
+          type: integer
        sync_safekeepers_ms:
          type: integer
        basebackup_ms:
@@ -178,18 +181,27 @@ components:
    ComputeState:
      type: object
      required:
+        - start_time
        - status
-        - last_active
      properties:
+        start_time:
+          type: string
+          description: |
+            Time when compute was started. If initially compute was started in the `empty`
+            state and then provided with valid spec, `start_time` will be reset to the
+            moment, when spec was received.
+          example: "2022-10-12T07:20:50.52Z"
        status:
          $ref: '#/components/schemas/ComputeStatus'
        last_active:
          type: string
-          description: The last detected compute activity timestamp in UTC and RFC3339 format.
+          description: |
+            The last detected compute activity timestamp in UTC and RFC3339 format.
+            It could be empty if compute was never used by user since start.
          example: "2022-10-12T07:20:50.52Z"
        error:
          type: string
-          description: Text of the error during compute startup, if any.
+          description: Text of the error during compute startup or reconfiguration, if any.
          example: ""
        tenant:
          type: string
@@ -212,9 +224,12 @@ components:
    ComputeStatus:
      type: string
      enum:
+        - empty
        - init
        - failed
        - running
+        - configuration_pending
+        - configuration
      example: running

    #
--- a/compute_tools/src/monitor.rs
+++ b/compute_tools/src/monitor.rs
@@ -74,7 +74,7 @@ fn watch_compute_activity(compute: &ComputeNode) {
                            // Found non-idle backend, so the last activity is NOW.
                            // Save it and exit the for loop. Also clear the idle backend
                            // `state_change` timestamps array as it doesn't matter now.
-                            last_active = Utc::now();
+                            last_active = Some(Utc::now());
                            idle_backs.clear();
                            break;
                        }
@@ -82,15 +82,16 @@ fn watch_compute_activity(compute: &ComputeNode) {

                    // Get idle backend `state_change` with the max timestamp.
                    if let Some(last) = idle_backs.iter().max() {
-                        last_active = *last;
+                        last_active = Some(*last);
                    }
                }

                // Update the last activity in the shared state if we got a more recent one.
                let mut state = compute.state.lock().unwrap();
+                // NB: `Some(<DateTime>)` is always greater than `None`.
                if last_active > state.last_active {
                    state.last_active = last_active;
-                    debug!("set the last compute activity time to: {}", last_active);
+                    debug!("set the last compute activity time to: {:?}", last_active);
                }
            }
            Err(e) => {
--- a/compute_tools/src/pg_helpers.rs
+++ b/compute_tools/src/pg_helpers.rs
@@ -94,6 +94,7 @@ impl PgOptionsSerialize for GenericOptions {

 pub trait GenericOptionsSearch {
    fn find(&self, name: &str) -> Option<String>;
+    fn find_ref(&self, name: &str) -> Option<&GenericOption>;
 }

 impl GenericOptionsSearch for GenericOptions {
@@ -103,6 +104,12 @@ impl GenericOptionsSearch for GenericOptions {
        let op = ops.iter().find(|s| s.name == name)?;
        op.value.clone()
    }
+
+    /// Lookup option by name, returning ref
+    fn find_ref(&self, name: &str) -> Option<&GenericOption> {
+        let ops = self.as_ref()?;
+        ops.iter().find(|s| s.name == name)
+    }
 }

 pub trait RoleExt {
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -1,45 +1,121 @@
+use std::fs::File;
 use std::path::Path;
 use std::str::FromStr;

 use anyhow::{anyhow, bail, Result};
 use postgres::config::Config;
 use postgres::{Client, NoTls};
-use tracing::{info, info_span, instrument, span_enabled, warn, Level};
+use reqwest::StatusCode;
+use tracing::{error, info, info_span, instrument, span_enabled, warn, Level};

 use crate::config;
 use crate::params::PG_HBA_ALL_MD5;
 use crate::pg_helpers::*;

-use compute_api::responses::ControlPlaneSpecResponse;
+use compute_api::responses::{ControlPlaneComputeStatus, ControlPlaneSpecResponse};
 use compute_api::spec::{ComputeSpec, Database, PgIdent, Role};

+// Do control plane request and return response if any. In case of error it
+// returns a bool flag indicating whether it makes sense to retry the request
+// and a string with error message.
+fn do_control_plane_request(
+    uri: &str,
+    jwt: &str,
+) -> Result<ControlPlaneSpecResponse, (bool, String)> {
+    let resp = reqwest::blocking::Client::new()
+        .get(uri)
+        .header("Authorization", jwt)
+        .send()
+        .map_err(|e| {
+            (
+                true,
+                format!("could not perform spec request to control plane: {}", e),
+            )
+        })?;
+
+    match resp.status() {
+        StatusCode::OK => match resp.json::<ControlPlaneSpecResponse>() {
+            Ok(spec_resp) => Ok(spec_resp),
+            Err(e) => Err((
+                true,
+                format!("could not deserialize control plane response: {}", e),
+            )),
+        },
+        StatusCode::SERVICE_UNAVAILABLE => {
+            Err((true, "control plane is temporarily unavailable".to_string()))
+        }
+        StatusCode::BAD_GATEWAY => {
+            // We have a problem with intermittent 502 errors now
+            // https://github.com/neondatabase/cloud/issues/2353
+            // It's fine to retry GET request in this case.
+            Err((true, "control plane request failed with 502".to_string()))
+        }
+        // Another code, likely 500 or 404, means that compute is unknown to the control plane
+        // or some internal failure happened. Doesn't make much sense to retry in this case.
+        _ => Err((
+            false,
+            format!(
+                "unexpected control plane response status code: {}",
+                resp.status()
+            ),
+        )),
+    }
+}
+
 /// Request spec from the control-plane by compute_id. If `NEON_CONSOLE_JWT`
 /// env variable is set, it will be used for authorization.
-pub fn get_spec_from_control_plane(base_uri: &str, compute_id: &str) -> Result<ComputeSpec> {
+pub fn get_spec_from_control_plane(
+    base_uri: &str,
+    compute_id: &str,
+) -> Result<Option<ComputeSpec>> {
    let cp_uri = format!("{base_uri}/management/api/v2/computes/{compute_id}/spec");
-    let jwt: String = match std::env::var("NEON_CONSOLE_JWT") {
+    let jwt: String = match std::env::var("NEON_CONTROL_PLANE_TOKEN") {
        Ok(v) => v,
        Err(_) => "".to_string(),
    };
+    let mut attempt = 1;
+    let mut spec: Result<Option<ComputeSpec>> = Ok(None);
+
    info!("getting spec from control plane: {}", cp_uri);

-    // TODO: check the response. We should distinguish cases when it's
-    // - network error, then retry
-    // - no spec for compute yet, then wait
-    // - compute id is unknown or any other error, then bail out
-    let resp: ControlPlaneSpecResponse = reqwest::blocking::Client::new()
-        .get(cp_uri)
-        .header("Authorization", jwt)
-        .send()
-        .map_err(|e| anyhow!("could not send spec request to control plane: {}", e))?
-        .json()
-        .map_err(|e| anyhow!("could not get compute spec from control plane: {}", e))?;
+    // Do 3 attempts to get spec from the control plane using the following logic:
+    // - network error -> then retry
+    // - compute id is unknown or any other error -> bail out
+    // - no spec for compute yet (Empty state) -> return Ok(None)
+    // - got spec -> return Ok(Some(spec))
+    while attempt < 4 {
+        spec = match do_control_plane_request(&cp_uri, &jwt) {
+            Ok(spec_resp) => match spec_resp.status {
+                ControlPlaneComputeStatus::Empty => Ok(None),
+                ControlPlaneComputeStatus::Attached => {
+                    if let Some(spec) = spec_resp.spec {
+                        Ok(Some(spec))
+                    } else {
+                        bail!("compute is attached, but spec is empty")
+                    }
+                }
+            },
+            Err((retry, msg)) => {
+                if retry {
+                    Err(anyhow!(msg))
+                } else {
+                    bail!(msg);
+                }
+            }
+        };

-    if let Some(spec) = resp.spec {
-        Ok(spec)
-    } else {
-        bail!("could not get compute spec from control plane")
+        if let Err(e) = &spec {
+            error!("attempt {} to get spec failed with: {}", attempt, e);
+        } else {
+            return spec;
+        }
+
+        attempt += 1;
+        std::thread::sleep(std::time::Duration::from_millis(100));
    }
+
+    // All attempts failed, return error.
+    spec
 }

 /// It takes cluster specification and does the following:
@@ -70,6 +146,21 @@ pub fn update_pg_hba(pgdata_path: &Path) -> Result<()> {
    Ok(())
 }

+/// Create a standby.signal file
+pub fn add_standby_signal(pgdata_path: &Path) -> Result<()> {
+    // XXX: consider making it a part of spec.json
+    info!("adding standby.signal");
+    let signalfile = pgdata_path.join("standby.signal");
+
+    if !signalfile.exists() {
+        info!("created standby.signal");
+        File::create(signalfile)?;
+    } else {
+        info!("reused pre-existing standby.signal");
+    }
+    Ok(())
+}
+
 /// Given a cluster spec json and open transaction it handles roles creation,
 /// deletion and update.
 #[instrument(skip_all)]
--- a/control_plane/Cargo.toml
+++ b/control_plane/Cargo.toml
@@ -30,4 +30,5 @@ postgres_connection.workspace = true
 storage_broker.workspace = true
 utils.workspace = true

+compute_api.workspace = true
 workspace_hack.workspace = true
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -7,6 +7,7 @@
 //!
 use anyhow::{anyhow, bail, Context, Result};
 use clap::{value_parser, Arg, ArgAction, ArgMatches, Command};
+use compute_api::spec::ComputeMode;
 use control_plane::endpoint::ComputeControlPlane;
 use control_plane::local_env::LocalEnv;
 use control_plane::pageserver::PageServerNode;
@@ -474,7 +475,14 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
            env.register_branch_mapping(name.to_string(), tenant_id, timeline_id)?;

            println!("Creating endpoint for imported timeline ...");
-            cplane.new_endpoint(tenant_id, name, timeline_id, None, None, pg_version)?;
+            cplane.new_endpoint(
+                tenant_id,
+                name,
+                timeline_id,
+                None,
+                pg_version,
+                ComputeMode::Primary,
+            )?;
            println!("Done");
        }
        Some(("branch", branch_match)) => {
@@ -560,20 +568,20 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                .iter()
                .filter(|(_, endpoint)| endpoint.tenant_id == tenant_id)
            {
-                let lsn_str = match endpoint.lsn {
-                    None => {
-                        // -> primary endpoint
+                let lsn_str = match endpoint.mode {
+                    ComputeMode::Static(lsn) => {
+                        // -> read-only endpoint
+                        // Use the node's LSN.
+                        lsn.to_string()
+                    }
+                    _ => {
+                        // -> primary endpoint or hot replica
                        // Use the LSN at the end of the timeline.
                        timeline_infos
                            .get(&endpoint.timeline_id)
                            .map(|bi| bi.last_record_lsn.to_string())
                            .unwrap_or_else(|| "?".to_string())
                    }
-                    Some(lsn) => {
-                        // -> read-only endpoint
-                        // Use the endpoint's LSN.
-                        lsn.to_string()
-                    }
                };

                let branch_name = timeline_name_mappings
@@ -619,7 +627,19 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                .copied()
                .context("Failed to parse postgres version from the argument string")?;

-            cplane.new_endpoint(tenant_id, &endpoint_id, timeline_id, lsn, port, pg_version)?;
+            let hot_standby = sub_args
+                .get_one::<bool>("hot-standby")
+                .copied()
+                .unwrap_or(false);
+
+            let mode = match (lsn, hot_standby) {
+                (Some(lsn), false) => ComputeMode::Static(lsn),
+                (None, true) => ComputeMode::Replica,
+                (None, false) => ComputeMode::Primary,
+                (Some(_), true) => anyhow::bail!("cannot specify both lsn and hot-standby"),
+            };
+
+            cplane.new_endpoint(tenant_id, &endpoint_id, timeline_id, port, pg_version, mode)?;
        }
        "start" => {
            let port: Option<u16> = sub_args.get_one::<u16>("port").copied();
@@ -637,7 +657,21 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                None
            };

+            let hot_standby = sub_args
+                .get_one::<bool>("hot-standby")
+                .copied()
+                .unwrap_or(false);
+
            if let Some(endpoint) = endpoint {
+                match (&endpoint.mode, hot_standby) {
+                    (ComputeMode::Static(_), true) => {
+                        bail!("Cannot start a node in hot standby mode when it is already configured as a static replica")
+                    }
+                    (ComputeMode::Primary, true) => {
+                        bail!("Cannot start a node as a hot standby replica, it is already configured as primary node")
+                    }
+                    _ => {}
+                }
                println!("Starting existing endpoint {endpoint_id}...");
                endpoint.start(&auth_token)?;
            } else {
@@ -659,6 +693,14 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                    .get_one::<u32>("pg-version")
                    .copied()
                    .context("Failed to `pg-version` from the argument string")?;
+
+                let mode = match (lsn, hot_standby) {
+                    (Some(lsn), false) => ComputeMode::Static(lsn),
+                    (None, true) => ComputeMode::Replica,
+                    (None, false) => ComputeMode::Primary,
+                    (Some(_), true) => anyhow::bail!("cannot specify both lsn and hot-standby"),
+                };
+
                // when used with custom port this results in non obvious behaviour
                // port is remembered from first start command, i e
                // start --port X
@@ -670,9 +712,9 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                    tenant_id,
                    endpoint_id,
                    timeline_id,
-                    lsn,
                    port,
                    pg_version,
+                    mode,
                )?;
                ep.start(&auth_token)?;
            }
@@ -928,6 +970,12 @@ fn cli() -> Command {
        .help("Specify Lsn on the timeline to start from. By default, end of the timeline would be used.")
        .required(false);

+    let hot_standby_arg = Arg::new("hot-standby")
+        .value_parser(value_parser!(bool))
+        .long("hot-standby")
+        .help("If set, the node will be a hot replica on the specified timeline")
+        .required(false);
+
    Command::new("Neon CLI")
        .arg_required_else_help(true)
        .version(GIT_VERSION)
@@ -1052,6 +1100,7 @@ fn cli() -> Command {
                            .long("config-only")
                            .required(false))
                    .arg(pg_version_arg.clone())
+                    .arg(hot_standby_arg.clone())
                )
                .subcommand(Command::new("start")
                    .about("Start postgres.\n If the endpoint doesn't exist yet, it is created.")
@@ -1062,6 +1111,7 @@ fn cli() -> Command {
                    .arg(lsn_arg)
                    .arg(port_arg)
                    .arg(pg_version_arg)
+                    .arg(hot_standby_arg)
                )
                .subcommand(
                    Command::new("stop")
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -11,15 +11,33 @@ use std::sync::Arc;
 use std::time::Duration;

 use anyhow::{Context, Result};
+use serde::{Deserialize, Serialize};
+use serde_with::{serde_as, DisplayFromStr};
 use utils::{
    id::{TenantId, TimelineId},
    lsn::Lsn,
 };

-use crate::local_env::{LocalEnv, DEFAULT_PG_VERSION};
+use crate::local_env::LocalEnv;
 use crate::pageserver::PageServerNode;
 use crate::postgresql_conf::PostgresConf;

+use compute_api::spec::ComputeMode;
+
+// contents of a endpoint.json file
+#[serde_as]
+#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
+pub struct EndpointConf {
+    name: String,
+    #[serde_as(as = "DisplayFromStr")]
+    tenant_id: TenantId,
+    #[serde_as(as = "DisplayFromStr")]
+    timeline_id: TimelineId,
+    mode: ComputeMode,
+    port: u16,
+    pg_version: u32,
+}
+
 //
 // ComputeControlPlane
 //
@@ -68,23 +86,34 @@ impl ComputeControlPlane {
        tenant_id: TenantId,
        name: &str,
        timeline_id: TimelineId,
-        lsn: Option<Lsn>,
        port: Option<u16>,
        pg_version: u32,
+        mode: ComputeMode,
    ) -> Result<Arc<Endpoint>> {
        let port = port.unwrap_or_else(|| self.get_port());
+
        let ep = Arc::new(Endpoint {
            name: name.to_owned(),
            address: SocketAddr::new("127.0.0.1".parse().unwrap(), port),
            env: self.env.clone(),
            pageserver: Arc::clone(&self.pageserver),
            timeline_id,
-            lsn,
+            mode,
            tenant_id,
            pg_version,
        });
-
        ep.create_pgdata()?;
+        std::fs::write(
+            ep.endpoint_path().join("endpoint.json"),
+            serde_json::to_string_pretty(&EndpointConf {
+                name: name.to_string(),
+                tenant_id,
+                timeline_id,
+                mode,
+                port,
+                pg_version,
+            })?,
+        )?;
        ep.setup_pg_conf()?;

        self.endpoints.insert(ep.name.clone(), Arc::clone(&ep));
@@ -101,8 +130,7 @@ pub struct Endpoint {
    name: String,
    pub tenant_id: TenantId,
    pub timeline_id: TimelineId,
-    // Some(lsn) if this is a read-only endpoint anchored at 'lsn'. None for the primary.
-    pub lsn: Option<Lsn>,
+    pub mode: ComputeMode,

    // port and address of the Postgres server
    pub address: SocketAddr,
@@ -131,42 +159,20 @@ impl Endpoint {
        let fname = entry.file_name();
        let name = fname.to_str().unwrap().to_string();

-        // Read config file into memory
-        let cfg_path = entry.path().join("pgdata").join("postgresql.conf");
-        let cfg_path_str = cfg_path.to_string_lossy();
-        let mut conf_file = File::open(&cfg_path)
-            .with_context(|| format!("failed to open config file in {}", cfg_path_str))?;
-        let conf = PostgresConf::read(&mut conf_file)
-            .with_context(|| format!("failed to read config file in {}", cfg_path_str))?;
-
-        // Read a few options from the config file
-        let context = format!("in config file {}", cfg_path_str);
-        let port: u16 = conf.parse_field("port", &context)?;
-        let timeline_id: TimelineId = conf.parse_field("neon.timeline_id", &context)?;
-        let tenant_id: TenantId = conf.parse_field("neon.tenant_id", &context)?;
-
-        // Read postgres version from PG_VERSION file to determine which postgres version binary to use.
-        // If it doesn't exist, assume broken data directory and use default pg version.
-        let pg_version_path = entry.path().join("PG_VERSION");
-
-        let pg_version_str =
-            fs::read_to_string(pg_version_path).unwrap_or_else(|_| DEFAULT_PG_VERSION.to_string());
-        let pg_version = u32::from_str(&pg_version_str)?;
-
-        // parse recovery_target_lsn, if any
-        let recovery_target_lsn: Option<Lsn> =
-            conf.parse_field_optional("recovery_target_lsn", &context)?;
+        // Read the endpoint.json file
+        let conf: EndpointConf =
+            serde_json::from_slice(&std::fs::read(entry.path().join("endpoint.json"))?)?;

        // ok now
        Ok(Endpoint {
-            address: SocketAddr::new("127.0.0.1".parse().unwrap(), port),
+            address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.port),
            name,
            env: env.clone(),
            pageserver: Arc::clone(pageserver),
-            timeline_id,
-            lsn: recovery_target_lsn,
-            tenant_id,
-            pg_version,
+            timeline_id: conf.timeline_id,
+            mode: conf.mode,
+            tenant_id: conf.tenant_id,
+            pg_version: conf.pg_version,
        })
    }

@@ -299,50 +305,83 @@ impl Endpoint {
        conf.append("neon.pageserver_connstring", &pageserver_connstr);
        conf.append("neon.tenant_id", &self.tenant_id.to_string());
        conf.append("neon.timeline_id", &self.timeline_id.to_string());
-        if let Some(lsn) = self.lsn {
-            conf.append("recovery_target_lsn", &lsn.to_string());
-        }

        conf.append_line("");
-        // Configure backpressure
-        // - Replication write lag depends on how fast the walreceiver can process incoming WAL.
-        //   This lag determines latency of get_page_at_lsn. Speed of applying WAL is about 10MB/sec,
-        //   so to avoid expiration of 1 minute timeout, this lag should not be larger than 600MB.
-        //   Actually latency should be much smaller (better if < 1sec). But we assume that recently
-        //   updates pages are not requested from pageserver.
-        // - Replication flush lag depends on speed of persisting data by checkpointer (creation of
-        //   delta/image layers) and advancing disk_consistent_lsn. Safekeepers are able to
-        //   remove/archive WAL only beyond disk_consistent_lsn. Too large a lag can cause long
-        //   recovery time (in case of pageserver crash) and disk space overflow at safekeepers.
-        // - Replication apply lag depends on speed of uploading changes to S3 by uploader thread.
-        //   To be able to restore database in case of pageserver node crash, safekeeper should not
-        //   remove WAL beyond this point. Too large lag can cause space exhaustion in safekeepers
-        //   (if they are not able to upload WAL to S3).
-        conf.append("max_replication_write_lag", "15MB");
-        conf.append("max_replication_flush_lag", "10GB");
+        // Replication-related configurations, such as WAL sending
+        match &self.mode {
+            ComputeMode::Primary => {
+                // Configure backpressure
+                // - Replication write lag depends on how fast the walreceiver can process incoming WAL.
+                //   This lag determines latency of get_page_at_lsn. Speed of applying WAL is about 10MB/sec,
+                //   so to avoid expiration of 1 minute timeout, this lag should not be larger than 600MB.
+                //   Actually latency should be much smaller (better if < 1sec). But we assume that recently
+                //   updates pages are not requested from pageserver.
+                // - Replication flush lag depends on speed of persisting data by checkpointer (creation of
+                //   delta/image layers) and advancing disk_consistent_lsn. Safekeepers are able to
+                //   remove/archive WAL only beyond disk_consistent_lsn. Too large a lag can cause long
+                //   recovery time (in case of pageserver crash) and disk space overflow at safekeepers.
+                // - Replication apply lag depends on speed of uploading changes to S3 by uploader thread.
+                //   To be able to restore database in case of pageserver node crash, safekeeper should not
+                //   remove WAL beyond this point. Too large lag can cause space exhaustion in safekeepers
+                //   (if they are not able to upload WAL to S3).
+                conf.append("max_replication_write_lag", "15MB");
+                conf.append("max_replication_flush_lag", "10GB");

-        if !self.env.safekeepers.is_empty() {
-            // Configure Postgres to connect to the safekeepers
-            conf.append("synchronous_standby_names", "walproposer");
+                if !self.env.safekeepers.is_empty() {
+                    // Configure Postgres to connect to the safekeepers
+                    conf.append("synchronous_standby_names", "walproposer");

-            let safekeepers = self
-                .env
-                .safekeepers
-                .iter()
-                .map(|sk| format!("localhost:{}", sk.pg_port))
-                .collect::<Vec<String>>()
-                .join(",");
-            conf.append("neon.safekeepers", &safekeepers);
-        } else {
-            // We only use setup without safekeepers for tests,
-            // and don't care about data durability on pageserver,
-            // so set more relaxed synchronous_commit.
-            conf.append("synchronous_commit", "remote_write");
+                    let safekeepers = self
+                        .env
+                        .safekeepers
+                        .iter()
+                        .map(|sk| format!("localhost:{}", sk.pg_port))
+                        .collect::<Vec<String>>()
+                        .join(",");
+                    conf.append("neon.safekeepers", &safekeepers);
+                } else {
+                    // We only use setup without safekeepers for tests,
+                    // and don't care about data durability on pageserver,
+                    // so set more relaxed synchronous_commit.
+                    conf.append("synchronous_commit", "remote_write");

-            // Configure the node to stream WAL directly to the pageserver
-            // This isn't really a supported configuration, but can be useful for
-            // testing.
-            conf.append("synchronous_standby_names", "pageserver");
+                    // Configure the node to stream WAL directly to the pageserver
+                    // This isn't really a supported configuration, but can be useful for
+                    // testing.
+                    conf.append("synchronous_standby_names", "pageserver");
+                }
+            }
+            ComputeMode::Static(lsn) => {
+                conf.append("recovery_target_lsn", &lsn.to_string());
+            }
+            ComputeMode::Replica => {
+                assert!(!self.env.safekeepers.is_empty());
+
+                // TODO: use future host field from safekeeper spec
+                // Pass the list of safekeepers to the replica so that it can connect to any of them,
+                // whichever is availiable.
+                let sk_ports = self
+                    .env
+                    .safekeepers
+                    .iter()
+                    .map(|x| x.pg_port.to_string())
+                    .collect::<Vec<_>>()
+                    .join(",");
+                let sk_hosts = vec!["localhost"; self.env.safekeepers.len()].join(",");
+
+                let connstr = format!(
+                    "host={} port={} options='-c timeline_id={} tenant_id={}' application_name=replica replication=true",
+                    sk_hosts,
+                    sk_ports,
+                    &self.timeline_id.to_string(),
+                    &self.tenant_id.to_string(),
+                );
+
+                let slot_name = format!("repl_{}_", self.timeline_id);
+                conf.append("primary_conninfo", connstr.as_str());
+                conf.append("primary_slot_name", slot_name.as_str());
+                conf.append("hot_standby", "on");
+            }
        }

        let mut file = File::create(self.pgdata().join("postgresql.conf"))?;
@@ -355,21 +394,27 @@ impl Endpoint {
    }

    fn load_basebackup(&self, auth_token: &Option<String>) -> Result<()> {
-        let backup_lsn = if let Some(lsn) = self.lsn {
-            Some(lsn)
-        } else if !self.env.safekeepers.is_empty() {
-            // LSN 0 means that it is bootstrap and we need to download just
-            // latest data from the pageserver. That is a bit clumsy but whole bootstrap
-            // procedure evolves quite actively right now, so let's think about it again
-            // when things would be more stable (TODO).
-            let lsn = self.sync_safekeepers(auth_token, self.pg_version)?;
-            if lsn == Lsn(0) {
-                None
-            } else {
-                Some(lsn)
+        let backup_lsn = match &self.mode {
+            ComputeMode::Primary => {
+                if !self.env.safekeepers.is_empty() {
+                    // LSN 0 means that it is bootstrap and we need to download just
+                    // latest data from the pageserver. That is a bit clumsy but whole bootstrap
+                    // procedure evolves quite actively right now, so let's think about it again
+                    // when things would be more stable (TODO).
+                    let lsn = self.sync_safekeepers(auth_token, self.pg_version)?;
+                    if lsn == Lsn(0) {
+                        None
+                    } else {
+                        Some(lsn)
+                    }
+                } else {
+                    None
+                }
+            }
+            ComputeMode::Static(lsn) => Some(*lsn),
+            ComputeMode::Replica => {
+                None // Take the latest snapshot available to start with
            }
-        } else {
-            None
        };

        self.do_basebackup(backup_lsn)?;
@@ -466,7 +511,7 @@ impl Endpoint {
        // 3. Load basebackup
        self.load_basebackup(auth_token)?;

-        if self.lsn.is_some() {
+        if self.mode != ComputeMode::Primary {
            File::create(self.pgdata().join("standby.signal"))?;
        }

--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -359,8 +359,8 @@ impl PageServerNode {
                .transpose()
                .context("Failed to parse 'trace_read_requests' as bool")?,
            eviction_policy: settings
-                .get("eviction_policy")
-                .map(|x| serde_json::from_str(x))
+                .remove("eviction_policy")
+                .map(serde_json::from_str)
                .transpose()
                .context("Failed to parse 'eviction_policy' json")?,
            min_resident_size_override: settings
@@ -368,6 +368,9 @@ impl PageServerNode {
                .map(|x| x.parse::<u64>())
                .transpose()
                .context("Failed to parse 'min_resident_size_override' as integer")?,
+            evictions_low_residence_duration_metric_threshold: settings
+                .remove("evictions_low_residence_duration_metric_threshold")
+                .map(|x| x.to_string()),
        };
        if !settings.is_empty() {
            bail!("Unrecognized tenant settings: {settings:?}")
@@ -445,6 +448,9 @@ impl PageServerNode {
                    .map(|x| x.parse::<u64>())
                    .transpose()
                    .context("Failed to parse 'min_resident_size_override' as an integer")?,
+                evictions_low_residence_duration_metric_threshold: settings
+                    .get("evictions_low_residence_duration_metric_threshold")
+                    .map(|x| x.to_string()),
            })
            .send()?
            .error_from_body()?;
--- a/control_plane/src/postgresql_conf.rs
+++ b/control_plane/src/postgresql_conf.rs
@@ -13,7 +13,7 @@ use std::io::BufRead;
 use std::str::FromStr;

 /// In-memory representation of a postgresql.conf file
-#[derive(Default)]
+#[derive(Default, Debug)]
 pub struct PostgresConf {
    lines: Vec<String>,
    hash: HashMap<String, String>,
--- a/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json
+++ b/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json
@@ -28,11 +28,6 @@
                "value": "replica",
                "vartype": "enum"
            },
-            {
-                "name": "hot_standby",
-                "value": "on",
-                "vartype": "bool"
-            },
            {
                "name": "wal_log_hints",
                "value": "on",
--- a/libs/compute_api/src/responses.rs
+++ b/libs/compute_api/src/responses.rs
@@ -14,11 +14,12 @@ pub struct GenericAPIError {
 #[derive(Serialize, Debug)]
 #[serde(rename_all = "snake_case")]
 pub struct ComputeStatusResponse {
+    pub start_time: DateTime<Utc>,
    pub tenant: Option<String>,
    pub timeline: Option<String>,
    pub status: ComputeStatus,
    #[serde(serialize_with = "rfc3339_serialize")]
-    pub last_active: DateTime<Utc>,
+    pub last_active: Option<DateTime<Utc>>,
    pub error: Option<String>,
 }

@@ -28,7 +29,7 @@ pub struct ComputeState {
    pub status: ComputeStatus,
    /// Timestamp of the last Postgres activity
    #[serde(serialize_with = "rfc3339_serialize")]
-    pub last_active: DateTime<Utc>,
+    pub last_active: Option<DateTime<Utc>>,
    pub error: Option<String>,
 }

@@ -53,16 +54,21 @@ pub enum ComputeStatus {
    Failed,
 }

-fn rfc3339_serialize<S>(x: &DateTime<Utc>, s: S) -> Result<S::Ok, S::Error>
+fn rfc3339_serialize<S>(x: &Option<DateTime<Utc>>, s: S) -> Result<S::Ok, S::Error>
 where
    S: Serializer,
 {
-    x.to_rfc3339().serialize(s)
+    if let Some(x) = x {
+        x.to_rfc3339().serialize(s)
+    } else {
+        s.serialize_none()
+    }
 }

 /// Response of the /metrics.json API
 #[derive(Clone, Debug, Default, Serialize)]
 pub struct ComputeMetrics {
+    pub wait_for_spec_ms: u64,
    pub sync_safekeepers_ms: u64,
    pub basebackup_ms: u64,
    pub config_ms: u64,
@@ -75,4 +81,16 @@ pub struct ComputeMetrics {
 #[derive(Deserialize, Debug)]
 pub struct ControlPlaneSpecResponse {
    pub spec: Option<ComputeSpec>,
+    pub status: ControlPlaneComputeStatus,
+}
+
+#[derive(Deserialize, Clone, Copy, Debug, PartialEq, Eq)]
+#[serde(rename_all = "snake_case")]
+pub enum ControlPlaneComputeStatus {
+    // Compute is known to control-plane, but it's not
+    // yet attached to any timeline / endpoint.
+    Empty,
+    // Compute is attached to some timeline / endpoint and
+    // should be able to start with provided spec.
+    Attached,
 }
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -3,7 +3,7 @@
 //! The spec.json file is used to pass information to 'compute_ctl'. It contains
 //! all the information needed to start up the right version of PostgreSQL,
 //! and connect it to the storage nodes.
-use serde::Deserialize;
+use serde::{Deserialize, Serialize};
 use serde_with::{serde_as, DisplayFromStr};
 use std::collections::HashMap;
 use utils::lsn::Lsn;
@@ -27,22 +27,29 @@ pub struct ComputeSpec {
    pub cluster: Cluster,
    pub delta_operations: Option<Vec<DeltaOp>>,

-    // If 'lsn' is set, start the compute at the given LSN.
-    // If 'lsn' is not set, compute_ctl performs "sync safekeepers" step first,
-    // and uses the last LSN on the timeline as the starting point.
-    #[serde_as(as = "Option<DisplayFromStr>")]
-    pub start_lsn: Option<Lsn>,
-
-    // If standy == true, a standby.signal file is created, putting Postgres
-    // into standby mode.
    #[serde(default)]
-    pub standby: bool,
+    pub mode: ComputeMode,

    pub storage_auth_token: Option<String>,

    pub startup_tracing_context: Option<HashMap<String, String>>,
 }

+#[serde_as]
+#[derive(Clone, Copy, Debug, Default, Eq, PartialEq, Deserialize, Serialize)]
+pub enum ComputeMode {
+    /// A read-write node
+    #[default]
+    Primary,
+    /// A read-only node, pinned at a particular LSN
+    Static(#[serde_as(as = "DisplayFromStr")] Lsn),
+    /// A read-only node that follows the tip of the branch in hot standby mode
+    ///
+    /// Future versions may want to distinguish between replicas with hot standby
+    /// feedback and other kinds of replication configurations.
+    Replica,
+}
+
 #[derive(Clone, Debug, Default, Deserialize)]
 pub struct Cluster {
    pub cluster_id: String,
--- a/libs/consumption_metrics/Cargo.toml
+++ b/libs/consumption_metrics/Cargo.toml
@@ -4,13 +4,12 @@ version = "0.1.0"
 edition = "2021"
 license = "Apache-2.0"

-# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
-
 [dependencies]
-anyhow = "1.0.68"
-chrono = { version = "0.4", default-features = false, features = ["clock", "serde"] }
-rand = "0.8.3"
-serde = "1.0.152"
-serde_with = "2.1.0"
-utils = { version = "0.1.0", path = "../utils" }
-workspace_hack = { version = "0.1.0", path = "../../workspace_hack" }
+anyhow.workspace = true
+chrono.workspace = true
+rand.workspace = true
+serde.workspace = true
+serde_with.workspace = true
+utils.workspace = true
+
+workspace_hack.workspace = true
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -135,6 +135,7 @@ pub struct TenantCreateRequest {
    // For now, this field is not even documented in the openapi_spec.yml.
    pub eviction_policy: Option<serde_json::Value>,
    pub min_resident_size_override: Option<u64>,
+    pub evictions_low_residence_duration_metric_threshold: Option<String>,
 }

 #[serde_as]
@@ -181,6 +182,7 @@ pub struct TenantConfigRequest {
    // For now, this field is not even documented in the openapi_spec.yml.
    pub eviction_policy: Option<serde_json::Value>,
    pub min_resident_size_override: Option<u64>,
+    pub evictions_low_residence_duration_metric_threshold: Option<String>,
 }

 impl TenantConfigRequest {
@@ -202,6 +204,7 @@ impl TenantConfigRequest {
            trace_read_requests: None,
            eviction_policy: None,
            min_resident_size_override: None,
+            evictions_low_residence_duration_metric_threshold: None,
        }
    }
 }
--- a/libs/postgres_backend/src/lib.rs
+++ b/libs/postgres_backend/src/lib.rs
@@ -50,11 +50,14 @@ impl QueryError {
    }
 }

+/// Returns true if the given error is a normal consequence of a network issue,
+/// or the client closing the connection. These errors can happen during normal
+/// operations, and don't indicate a bug in our code.
 pub fn is_expected_io_error(e: &io::Error) -> bool {
    use io::ErrorKind::*;
    matches!(
        e.kind(),
-        ConnectionRefused | ConnectionAborted | ConnectionReset | TimedOut
+        BrokenPipe | ConnectionRefused | ConnectionAborted | ConnectionReset | TimedOut
    )
 }

--- a/libs/postgres_ffi/build.rs
+++ b/libs/postgres_ffi/build.rs
@@ -5,7 +5,7 @@ use std::path::PathBuf;
 use std::process::Command;

 use anyhow::{anyhow, Context};
-use bindgen::callbacks::ParseCallbacks;
+use bindgen::callbacks::{DeriveInfo, ParseCallbacks};

 #[derive(Debug)]
 struct PostgresFfiCallbacks;
@@ -20,7 +20,7 @@ impl ParseCallbacks for PostgresFfiCallbacks {

    // Add any custom #[derive] attributes to the data structures that bindgen
    // creates.
-    fn add_derives(&self, name: &str) -> Vec<String> {
+    fn add_derives(&self, derive_info: &DeriveInfo) -> Vec<String> {
        // This is the list of data structures that we want to serialize/deserialize.
        let serde_list = [
            "XLogRecord",
@@ -31,7 +31,7 @@ impl ParseCallbacks for PostgresFfiCallbacks {
            "ControlFileData",
        ];

-        if serde_list.contains(&name) {
+        if serde_list.contains(&derive_info.name) {
            vec![
                "Default".into(), // Default allows us to easily fill the padding fields with 0.
                "Serialize".into(),
--- a/libs/postgres_ffi/src/lib.rs
+++ b/libs/postgres_ffi/src/lib.rs
@@ -95,10 +95,13 @@ pub fn generate_wal_segment(
    segno: u64,
    system_id: u64,
    pg_version: u32,
+    lsn: Lsn,
 ) -> Result<Bytes, SerializeError> {
+    assert_eq!(segno, lsn.segment_number(WAL_SEGMENT_SIZE));
+
    match pg_version {
-        14 => v14::xlog_utils::generate_wal_segment(segno, system_id),
-        15 => v15::xlog_utils::generate_wal_segment(segno, system_id),
+        14 => v14::xlog_utils::generate_wal_segment(segno, system_id, lsn),
+        15 => v15::xlog_utils::generate_wal_segment(segno, system_id, lsn),
        _ => Err(SerializeError::BadInput),
    }
 }
--- a/libs/postgres_ffi/src/pg_constants.rs
+++ b/libs/postgres_ffi/src/pg_constants.rs
@@ -195,6 +195,7 @@ pub const FIRST_NORMAL_OBJECT_ID: u32 = 16384;

 pub const XLOG_CHECKPOINT_SHUTDOWN: u8 = 0x00;
 pub const XLOG_CHECKPOINT_ONLINE: u8 = 0x10;
+pub const XLP_FIRST_IS_CONTRECORD: u16 = 0x0001;
 pub const XLP_LONG_HEADER: u16 = 0x0002;

 /* From fsm_internals.h */
--- a/libs/postgres_ffi/src/xlog_utils.rs
+++ b/libs/postgres_ffi/src/xlog_utils.rs
@@ -270,6 +270,11 @@ impl XLogPageHeaderData {
        use utils::bin_ser::LeSer;
        XLogPageHeaderData::des_from(&mut buf.reader())
    }
+
+    pub fn encode(&self) -> Result<Bytes, SerializeError> {
+        use utils::bin_ser::LeSer;
+        self.ser().map(|b| b.into())
+    }
 }

 impl XLogLongPageHeaderData {
@@ -328,22 +333,32 @@ impl CheckPoint {
    }
 }

-//
-// Generate new, empty WAL segment.
-// We need this segment to start compute node.
-//
-pub fn generate_wal_segment(segno: u64, system_id: u64) -> Result<Bytes, SerializeError> {
+/// Generate new, empty WAL segment, with correct block headers at the first
+/// page of the segment and the page that contains the given LSN.
+/// We need this segment to start compute node.
+pub fn generate_wal_segment(segno: u64, system_id: u64, lsn: Lsn) -> Result<Bytes, SerializeError> {
    let mut seg_buf = BytesMut::with_capacity(WAL_SEGMENT_SIZE);

    let pageaddr = XLogSegNoOffsetToRecPtr(segno, 0, WAL_SEGMENT_SIZE);
+
+    let page_off = lsn.block_offset();
+    let seg_off = lsn.segment_offset(WAL_SEGMENT_SIZE);
+
+    let first_page_only = seg_off < XLOG_BLCKSZ;
+    let (shdr_rem_len, infoflags) = if first_page_only {
+        (seg_off, pg_constants::XLP_FIRST_IS_CONTRECORD)
+    } else {
+        (0, 0)
+    };
+
    let hdr = XLogLongPageHeaderData {
        std: {
            XLogPageHeaderData {
                xlp_magic: XLOG_PAGE_MAGIC as u16,
-                xlp_info: pg_constants::XLP_LONG_HEADER,
+                xlp_info: pg_constants::XLP_LONG_HEADER | infoflags,
                xlp_tli: PG_TLI,
                xlp_pageaddr: pageaddr,
-                xlp_rem_len: 0,
+                xlp_rem_len: shdr_rem_len as u32,
                ..Default::default() // Put 0 in padding fields.
            }
        },
@@ -357,6 +372,33 @@ pub fn generate_wal_segment(segno: u64, system_id: u64) -> Result<Bytes, Seriali

    //zero out the rest of the file
    seg_buf.resize(WAL_SEGMENT_SIZE, 0);
+
+    if !first_page_only {
+        let block_offset = lsn.page_offset_in_segment(WAL_SEGMENT_SIZE) as usize;
+        let header = XLogPageHeaderData {
+            xlp_magic: XLOG_PAGE_MAGIC as u16,
+            xlp_info: if page_off >= pg_constants::SIZE_OF_PAGE_HEADER as u64 {
+                pg_constants::XLP_FIRST_IS_CONTRECORD
+            } else {
+                0
+            },
+            xlp_tli: PG_TLI,
+            xlp_pageaddr: lsn.page_lsn().0,
+            xlp_rem_len: if page_off >= pg_constants::SIZE_OF_PAGE_HEADER as u64 {
+                page_off as u32
+            } else {
+                0u32
+            },
+            ..Default::default() // Put 0 in padding fields.
+        };
+        let hdr_bytes = header.encode()?;
+
+        debug_assert!(seg_buf.len() > block_offset + hdr_bytes.len());
+        debug_assert_ne!(block_offset, 0);
+
+        seg_buf[block_offset..block_offset + hdr_bytes.len()].copy_from_slice(&hdr_bytes[..]);
+    }
+
    Ok(seg_buf.freeze())
 }

--- a/libs/postgres_ffi/wal_craft/src/lib.rs
+++ b/libs/postgres_ffi/wal_craft/src/lib.rs
@@ -1,15 +1,13 @@
-use anyhow::*;
-use core::time::Duration;
+use anyhow::{bail, ensure};
 use log::*;
 use postgres::types::PgLsn;
 use postgres::Client;
 use postgres_ffi::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ};
 use postgres_ffi::{XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD};
 use std::cmp::Ordering;
-use std::fs;
 use std::path::{Path, PathBuf};
-use std::process::{Command, Stdio};
-use std::time::Instant;
+use std::process::Command;
+use std::time::{Duration, Instant};
 use tempfile::{tempdir, TempDir};

 #[derive(Debug, Clone, PartialEq, Eq)]
@@ -56,7 +54,7 @@ impl Conf {
        self.datadir.join("pg_wal")
    }

-    fn new_pg_command(&self, command: impl AsRef<Path>) -> Result<Command> {
+    fn new_pg_command(&self, command: impl AsRef<Path>) -> anyhow::Result<Command> {
        let path = self.pg_bin_dir()?.join(command);
        ensure!(path.exists(), "Command {:?} does not exist", path);
        let mut cmd = Command::new(path);
@@ -66,7 +64,7 @@ impl Conf {
        Ok(cmd)
    }

-    pub fn initdb(&self) -> Result<()> {
+    pub fn initdb(&self) -> anyhow::Result<()> {
        if let Some(parent) = self.datadir.parent() {
            info!("Pre-creating parent directory {:?}", parent);
            // Tests may be run concurrently and there may be a race to create `test_output/`.
@@ -80,7 +78,7 @@ impl Conf {
        let output = self
            .new_pg_command("initdb")?
            .arg("-D")
-            .arg(self.datadir.as_os_str())
+            .arg(&self.datadir)
            .args(["-U", "postgres", "--no-instructions", "--no-sync"])
            .output()?;
        debug!("initdb output: {:?}", output);
@@ -93,26 +91,18 @@ impl Conf {
        Ok(())
    }

-    pub fn start_server(&self) -> Result<PostgresServer> {
+    pub fn start_server(&self) -> anyhow::Result<PostgresServer> {
        info!("Starting Postgres server in {:?}", self.datadir);
-        let log_file = fs::File::create(self.datadir.join("pg.log")).with_context(|| {
-            format!(
-                "Failed to create pg.log file in directory {}",
-                self.datadir.display()
-            )
-        })?;
        let unix_socket_dir = tempdir()?; // We need a directory with a short name for Unix socket (up to 108 symbols)
        let unix_socket_dir_path = unix_socket_dir.path().to_owned();
        let server_process = self
            .new_pg_command("postgres")?
            .args(["-c", "listen_addresses="])
            .arg("-k")
-            .arg(unix_socket_dir_path.as_os_str())
+            .arg(&unix_socket_dir_path)
            .arg("-D")
-            .arg(self.datadir.as_os_str())
-            .args(["-c", "logging_collector=on"]) // stderr will mess up with tests output
+            .arg(&self.datadir)
            .args(REQUIRED_POSTGRES_CONFIG.iter().flat_map(|cfg| ["-c", cfg]))
-            .stderr(Stdio::from(log_file))
            .spawn()?;
        let server = PostgresServer {
            process: server_process,
@@ -121,7 +111,7 @@ impl Conf {
                let mut c = postgres::Config::new();
                c.host_path(&unix_socket_dir_path);
                c.user("postgres");
-                c.connect_timeout(Duration::from_millis(1000));
+                c.connect_timeout(Duration::from_millis(10000));
                c
            },
        };
@@ -132,7 +122,7 @@ impl Conf {
        &self,
        first_segment_name: &str,
        last_segment_name: &str,
-    ) -> Result<std::process::Output> {
+    ) -> anyhow::Result<std::process::Output> {
        let first_segment_file = self.datadir.join(first_segment_name);
        let last_segment_file = self.datadir.join(last_segment_name);
        info!(
@@ -142,10 +132,7 @@ impl Conf {
        );
        let output = self
            .new_pg_command("pg_waldump")?
-            .args([
-                &first_segment_file.as_os_str(),
-                &last_segment_file.as_os_str(),
-            ])
+            .args([&first_segment_file, &last_segment_file])
            .output()?;
        debug!("waldump output: {:?}", output);
        Ok(output)
@@ -153,10 +140,9 @@ impl Conf {
 }

 impl PostgresServer {
-    pub fn connect_with_timeout(&self) -> Result<Client> {
+    pub fn connect_with_timeout(&self) -> anyhow::Result<Client> {
        let retry_until = Instant::now() + *self.client_config.get_connect_timeout().unwrap();
        while Instant::now() < retry_until {
-            use std::result::Result::Ok;
            if let Ok(client) = self.client_config.connect(postgres::NoTls) {
                return Ok(client);
            }
@@ -173,7 +159,6 @@ impl PostgresServer {

 impl Drop for PostgresServer {
    fn drop(&mut self) {
-        use std::result::Result::Ok;
        match self.process.try_wait() {
            Ok(Some(_)) => return,
            Ok(None) => {
@@ -188,12 +173,12 @@ impl Drop for PostgresServer {
 }

 pub trait PostgresClientExt: postgres::GenericClient {
-    fn pg_current_wal_insert_lsn(&mut self) -> Result<PgLsn> {
+    fn pg_current_wal_insert_lsn(&mut self) -> anyhow::Result<PgLsn> {
        Ok(self
            .query_one("SELECT pg_current_wal_insert_lsn()", &[])?
            .get(0))
    }
-    fn pg_current_wal_flush_lsn(&mut self) -> Result<PgLsn> {
+    fn pg_current_wal_flush_lsn(&mut self) -> anyhow::Result<PgLsn> {
        Ok(self
            .query_one("SELECT pg_current_wal_flush_lsn()", &[])?
            .get(0))
@@ -202,7 +187,7 @@ pub trait PostgresClientExt: postgres::GenericClient {

 impl<C: postgres::GenericClient> PostgresClientExt for C {}

-pub fn ensure_server_config(client: &mut impl postgres::GenericClient) -> Result<()> {
+pub fn ensure_server_config(client: &mut impl postgres::GenericClient) -> anyhow::Result<()> {
    client.execute("create extension if not exists neon_test_utils", &[])?;

    let wal_keep_size: String = client.query_one("SHOW wal_keep_size", &[])?.get(0);
@@ -236,13 +221,13 @@ pub trait Crafter {
    /// * A vector of some valid "interesting" intermediate LSNs which one may start reading from.
    ///   May include or exclude Lsn(0) and the end-of-wal.
    /// * The expected end-of-wal LSN.
-    fn craft(client: &mut impl postgres::GenericClient) -> Result<(Vec<PgLsn>, PgLsn)>;
+    fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec<PgLsn>, PgLsn)>;
 }

 fn craft_internal<C: postgres::GenericClient>(
    client: &mut C,
-    f: impl Fn(&mut C, PgLsn) -> Result<(Vec<PgLsn>, Option<PgLsn>)>,
-) -> Result<(Vec<PgLsn>, PgLsn)> {
+    f: impl Fn(&mut C, PgLsn) -> anyhow::Result<(Vec<PgLsn>, Option<PgLsn>)>,
+) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
    ensure_server_config(client)?;

    let initial_lsn = client.pg_current_wal_insert_lsn()?;
@@ -274,7 +259,7 @@ fn craft_internal<C: postgres::GenericClient>(
 pub struct Simple;
 impl Crafter for Simple {
    const NAME: &'static str = "simple";
-    fn craft(client: &mut impl postgres::GenericClient) -> Result<(Vec<PgLsn>, PgLsn)> {
+    fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
        craft_internal(client, |client, _| {
            client.execute("CREATE table t(x int)", &[])?;
            Ok((Vec::new(), None))
@@ -285,7 +270,7 @@ impl Crafter for Simple {
 pub struct LastWalRecordXlogSwitch;
 impl Crafter for LastWalRecordXlogSwitch {
    const NAME: &'static str = "last_wal_record_xlog_switch";
-    fn craft(client: &mut impl postgres::GenericClient) -> Result<(Vec<PgLsn>, PgLsn)> {
+    fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
        // Do not use generate_internal because here we end up with flush_lsn exactly on
        // the segment boundary and insert_lsn after the initial page header, which is unusual.
        ensure_server_config(client)?;
@@ -307,7 +292,7 @@ impl Crafter for LastWalRecordXlogSwitch {
 pub struct LastWalRecordXlogSwitchEndsOnPageBoundary;
 impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary {
    const NAME: &'static str = "last_wal_record_xlog_switch_ends_on_page_boundary";
-    fn craft(client: &mut impl postgres::GenericClient) -> Result<(Vec<PgLsn>, PgLsn)> {
+    fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
        // Do not use generate_internal because here we end up with flush_lsn exactly on
        // the segment boundary and insert_lsn after the initial page header, which is unusual.
        ensure_server_config(client)?;
@@ -374,7 +359,7 @@ impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary {
 fn craft_single_logical_message(
    client: &mut impl postgres::GenericClient,
    transactional: bool,
-) -> Result<(Vec<PgLsn>, PgLsn)> {
+) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
    craft_internal(client, |client, initial_lsn| {
        ensure!(
            initial_lsn < PgLsn::from(0x0200_0000 - 1024 * 1024),
@@ -416,7 +401,7 @@ fn craft_single_logical_message(
 pub struct WalRecordCrossingSegmentFollowedBySmallOne;
 impl Crafter for WalRecordCrossingSegmentFollowedBySmallOne {
    const NAME: &'static str = "wal_record_crossing_segment_followed_by_small_one";
-    fn craft(client: &mut impl postgres::GenericClient) -> Result<(Vec<PgLsn>, PgLsn)> {
+    fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
        craft_single_logical_message(client, true)
    }
 }
@@ -424,7 +409,7 @@ impl Crafter for WalRecordCrossingSegmentFollowedBySmallOne {
 pub struct LastWalRecordCrossingSegment;
 impl Crafter for LastWalRecordCrossingSegment {
    const NAME: &'static str = "last_wal_record_crossing_segment";
-    fn craft(client: &mut impl postgres::GenericClient) -> Result<(Vec<PgLsn>, PgLsn)> {
+    fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
        craft_single_logical_message(client, false)
    }
 }
--- a/libs/pq_proto/Cargo.toml
+++ b/libs/pq_proto/Cargo.toml
@@ -10,7 +10,6 @@ byteorder.workspace = true
 pin-project-lite.workspace = true
 postgres-protocol.workspace = true
 rand.workspace = true
-serde.workspace = true
 tokio.workspace = true
 tracing.workspace = true
 thiserror.workspace = true
--- a/libs/pq_proto/src/lib.rs
+++ b/libs/pq_proto/src/lib.rs
@@ -6,15 +6,10 @@ pub mod framed;

 use byteorder::{BigEndian, ReadBytesExt};
 use bytes::{Buf, BufMut, Bytes, BytesMut};
-use postgres_protocol::PG_EPOCH;
-use serde::{Deserialize, Serialize};
-use std::{
-    borrow::Cow,
-    collections::HashMap,
-    fmt, io, str,
-    time::{Duration, SystemTime},
-};
-use tracing::{trace, warn};
+use std::{borrow::Cow, collections::HashMap, fmt, io, str};
+
+// re-export for use in utils pageserver_feedback.rs
+pub use postgres_protocol::PG_EPOCH;

 pub type Oid = u32;
 pub type SystemId = u64;
@@ -618,7 +613,7 @@ pub struct XLogDataBody<'a> {

 #[derive(Debug)]
 pub struct WalSndKeepAlive {
-    pub sent_ptr: u64,
+    pub wal_end: u64, // current end of WAL on the server
    pub timestamp: i64,
    pub request_reply: bool,
 }
@@ -664,7 +659,7 @@ fn write_cstr(s: impl AsRef<[u8]>, buf: &mut BytesMut) -> Result<(), ProtocolErr
 }

 /// Read cstring from buf, advancing it.
-fn read_cstr(buf: &mut Bytes) -> Result<Bytes, ProtocolError> {
+pub fn read_cstr(buf: &mut Bytes) -> Result<Bytes, ProtocolError> {
    let pos = buf
        .iter()
        .position(|x| *x == 0)
@@ -929,7 +924,7 @@ impl<'a> BeMessage<'a> {
                buf.put_u8(b'd');
                write_body(buf, |buf| {
                    buf.put_u8(b'k');
-                    buf.put_u64(req.sent_ptr);
+                    buf.put_u64(req.wal_end);
                    buf.put_i64(req.timestamp);
                    buf.put_u8(u8::from(req.request_reply));
                });
@@ -939,175 +934,10 @@ impl<'a> BeMessage<'a> {
    }
 }

-/// Feedback pageserver sends to safekeeper and safekeeper resends to compute.
-/// Serialized in custom flexible key/value format. In replication protocol, it
-/// is marked with NEON_STATUS_UPDATE_TAG_BYTE to differentiate from postgres
-/// Standby status update / Hot standby feedback messages.
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
-pub struct PageserverFeedback {
-    /// Last known size of the timeline. Used to enforce timeline size limit.
-    pub current_timeline_size: u64,
-    /// LSN last received and ingested by the pageserver.
-    pub last_received_lsn: u64,
-    /// LSN up to which data is persisted by the pageserver to its local disc.
-    pub disk_consistent_lsn: u64,
-    /// LSN up to which data is persisted by the pageserver on s3; safekeepers
-    /// consider WAL before it can be removed.
-    pub remote_consistent_lsn: u64,
-    pub replytime: SystemTime,
-}
-
-// NOTE: Do not forget to increment this number when adding new fields to PageserverFeedback.
-// Do not remove previously available fields because this might be backwards incompatible.
-pub const PAGESERVER_FEEDBACK_FIELDS_NUMBER: u8 = 5;
-
-impl PageserverFeedback {
-    pub fn empty() -> PageserverFeedback {
-        PageserverFeedback {
-            current_timeline_size: 0,
-            last_received_lsn: 0,
-            remote_consistent_lsn: 0,
-            disk_consistent_lsn: 0,
-            replytime: SystemTime::now(),
-        }
-    }
-
-    // Serialize PageserverFeedback using custom format
-    // to support protocol extensibility.
-    //
-    // Following layout is used:
-    // char - number of key-value pairs that follow.
-    //
-    // key-value pairs:
-    // null-terminated string - key,
-    // uint32 - value length in bytes
-    // value itself
-    //
-    // TODO: change serialized fields names once all computes migrate to rename.
-    pub fn serialize(&self, buf: &mut BytesMut) {
-        buf.put_u8(PAGESERVER_FEEDBACK_FIELDS_NUMBER); // # of keys
-        buf.put_slice(b"current_timeline_size\0");
-        buf.put_i32(8);
-        buf.put_u64(self.current_timeline_size);
-
-        buf.put_slice(b"ps_writelsn\0");
-        buf.put_i32(8);
-        buf.put_u64(self.last_received_lsn);
-        buf.put_slice(b"ps_flushlsn\0");
-        buf.put_i32(8);
-        buf.put_u64(self.disk_consistent_lsn);
-        buf.put_slice(b"ps_applylsn\0");
-        buf.put_i32(8);
-        buf.put_u64(self.remote_consistent_lsn);
-
-        let timestamp = self
-            .replytime
-            .duration_since(*PG_EPOCH)
-            .expect("failed to serialize pg_replytime earlier than PG_EPOCH")
-            .as_micros() as i64;
-
-        buf.put_slice(b"ps_replytime\0");
-        buf.put_i32(8);
-        buf.put_i64(timestamp);
-    }
-
-    // Deserialize PageserverFeedback message
-    // TODO: change serialized fields names once all computes migrate to rename.
-    pub fn parse(mut buf: Bytes) -> PageserverFeedback {
-        let mut rf = PageserverFeedback::empty();
-        let nfields = buf.get_u8();
-        for _ in 0..nfields {
-            let key = read_cstr(&mut buf).unwrap();
-            match key.as_ref() {
-                b"current_timeline_size" => {
-                    let len = buf.get_i32();
-                    assert_eq!(len, 8);
-                    rf.current_timeline_size = buf.get_u64();
-                }
-                b"ps_writelsn" => {
-                    let len = buf.get_i32();
-                    assert_eq!(len, 8);
-                    rf.last_received_lsn = buf.get_u64();
-                }
-                b"ps_flushlsn" => {
-                    let len = buf.get_i32();
-                    assert_eq!(len, 8);
-                    rf.disk_consistent_lsn = buf.get_u64();
-                }
-                b"ps_applylsn" => {
-                    let len = buf.get_i32();
-                    assert_eq!(len, 8);
-                    rf.remote_consistent_lsn = buf.get_u64();
-                }
-                b"ps_replytime" => {
-                    let len = buf.get_i32();
-                    assert_eq!(len, 8);
-                    let raw_time = buf.get_i64();
-                    if raw_time > 0 {
-                        rf.replytime = *PG_EPOCH + Duration::from_micros(raw_time as u64);
-                    } else {
-                        rf.replytime = *PG_EPOCH - Duration::from_micros(-raw_time as u64);
-                    }
-                }
-                _ => {
-                    let len = buf.get_i32();
-                    warn!(
-                        "PageserverFeedback parse. unknown key {} of len {len}. Skip it.",
-                        String::from_utf8_lossy(key.as_ref())
-                    );
-                    buf.advance(len as usize);
-                }
-            }
-        }
-        trace!("PageserverFeedback parsed is {:?}", rf);
-        rf
-    }
-}
-
 #[cfg(test)]
 mod tests {
    use super::*;

-    #[test]
-    fn test_replication_feedback_serialization() {
-        let mut rf = PageserverFeedback::empty();
-        // Fill rf with some values
-        rf.current_timeline_size = 12345678;
-        // Set rounded time to be able to compare it with deserialized value,
-        // because it is rounded up to microseconds during serialization.
-        rf.replytime = *PG_EPOCH + Duration::from_secs(100_000_000);
-        let mut data = BytesMut::new();
-        rf.serialize(&mut data);
-
-        let rf_parsed = PageserverFeedback::parse(data.freeze());
-        assert_eq!(rf, rf_parsed);
-    }
-
-    #[test]
-    fn test_replication_feedback_unknown_key() {
-        let mut rf = PageserverFeedback::empty();
-        // Fill rf with some values
-        rf.current_timeline_size = 12345678;
-        // Set rounded time to be able to compare it with deserialized value,
-        // because it is rounded up to microseconds during serialization.
-        rf.replytime = *PG_EPOCH + Duration::from_secs(100_000_000);
-        let mut data = BytesMut::new();
-        rf.serialize(&mut data);
-
-        // Add an extra field to the buffer and adjust number of keys
-        if let Some(first) = data.first_mut() {
-            *first = PAGESERVER_FEEDBACK_FIELDS_NUMBER + 1;
-        }
-
-        data.put_slice(b"new_field_one\0");
-        data.put_i32(8);
-        data.put_u64(42);
-
-        // Parse serialized data and check that new field is not parsed
-        let rf_parsed = PageserverFeedback::parse(data.freeze());
-        assert_eq!(rf, rf_parsed);
-    }
-
    #[test]
    fn test_startup_message_params_options_escaped() {
        fn split_options(params: &StartupMessageParams) -> Vec<Cow<'_, str>> {
--- a/libs/remote_storage/tests/pagination_tests.rs
+++ b/libs/remote_storage/tests/pagination_tests.rs
@@ -99,7 +99,11 @@ struct S3WithTestBlobs {
 #[async_trait::async_trait]
 impl AsyncTestContext for MaybeEnabledS3 {
    async fn setup() -> Self {
-        utils::logging::init(utils::logging::LogFormat::Test).expect("logging init failed");
+        utils::logging::init(
+            utils::logging::LogFormat::Test,
+            utils::logging::TracingErrorLayerEnablement::Disabled,
+        )
+        .expect("logging init failed");
        if env::var(ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME).is_err() {
            info!(
                "`{}` env variable is not set, skipping the test",
@@ -204,12 +208,7 @@ async fn upload_s3_data(
            let data = format!("remote blob data {i}").into_bytes();
            let data_len = data.len();
            task_client
-                .upload(
-                    Box::new(std::io::Cursor::new(data)),
-                    data_len,
-                    &blob_path,
-                    None,
-                )
+                .upload(std::io::Cursor::new(data), data_len, &blob_path, None)
                .await?;

            Ok::<_, anyhow::Error>((blob_prefix, blob_path))
--- a/libs/tracing-utils/Cargo.toml
+++ b/libs/tracing-utils/Cargo.toml
@@ -14,4 +14,5 @@ tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
 tracing.workspace = true
 tracing-opentelemetry.workspace = true
 tracing-subscriber.workspace = true
-workspace_hack = { version = "0.1", path = "../../workspace_hack" }
+
+workspace_hack.workspace = true
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -11,6 +11,7 @@ async-trait.workspace = true
 anyhow.workspace = true
 bincode.workspace = true
 bytes.workspace = true
+chrono.workspace = true
 heapless.workspace = true
 hex = { workspace = true, features = ["serde"] }
 hyper = { workspace = true, features = ["full"] }
@@ -27,14 +28,16 @@ signal-hook.workspace = true
 thiserror.workspace = true
 tokio.workspace = true
 tracing.workspace = true
-tracing-subscriber = { workspace = true, features = ["json"] }
+tracing-error.workspace = true
+tracing-subscriber = { workspace = true, features = ["json", "registry"] }
 rand.workspace = true
 serde_with.workspace = true
 strum.workspace = true
 strum_macros.workspace = true
 url.workspace = true
-uuid = { version = "1.2", features = ["v4", "serde"] }
+uuid.workspace = true

+pq_proto.workspace = true
 metrics.workspace = true
 workspace_hack.workspace = true

--- a/libs/utils/src/http/endpoint.rs
+++ b/libs/utils/src/http/endpoint.rs
@@ -76,6 +76,7 @@ where

        let log_quietly = method == Method::GET;
        async move {
+            let cancellation_guard = RequestCancelled::warn_when_dropped_without_responding();
            if log_quietly {
                debug!("Handling request");
            } else {
@@ -87,7 +88,11 @@ where
            // Usage of the error handler also means that we expect only the `ApiError` errors to be raised in this call.
            //
            // Panics are not handled separately, there's a `tracing_panic_hook` from another module to do that globally.
-            match (self.0)(request).await {
+            let res = (self.0)(request).await;
+
+            cancellation_guard.disarm();
+
+            match res {
                Ok(response) => {
                    let response_status = response.status();
                    if log_quietly && response_status.is_success() {
@@ -105,6 +110,40 @@ where
    }
 }

+/// Drop guard to WARN in case the request was dropped before completion.
+struct RequestCancelled {
+    warn: Option<tracing::Span>,
+}
+
+impl RequestCancelled {
+    /// Create the drop guard using the [`tracing::Span::current`] as the span.
+    fn warn_when_dropped_without_responding() -> Self {
+        RequestCancelled {
+            warn: Some(tracing::Span::current()),
+        }
+    }
+
+    /// Consume the drop guard without logging anything.
+    fn disarm(mut self) {
+        self.warn = None;
+    }
+}
+
+impl Drop for RequestCancelled {
+    fn drop(&mut self) {
+        if std::thread::panicking() {
+            // we are unwinding due to panicking, assume we are not dropped for cancellation
+        } else if let Some(span) = self.warn.take() {
+            // the span has all of the info already, but the outer `.instrument(span)` has already
+            // been dropped, so we need to manually re-enter it for this message.
+            //
+            // this is what the instrument would do before polling so it is fine.
+            let _g = span.entered();
+            warn!("request was dropped before completing");
+        }
+    }
+}
+
 async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
    SERVE_METRICS_COUNT.inc();

--- a/libs/utils/src/http/json.rs
+++ b/libs/utils/src/http/json.rs
@@ -1,9 +1,7 @@
-use std::fmt::Display;
-
 use anyhow::Context;
 use bytes::Buf;
 use hyper::{header, Body, Request, Response, StatusCode};
-use serde::{Deserialize, Serialize, Serializer};
+use serde::{Deserialize, Serialize};

 use super::error::ApiError;

@@ -33,12 +31,3 @@ pub fn json_response<T: Serialize>(
        .map_err(|e| ApiError::InternalServerError(e.into()))?;
    Ok(response)
 }
-
-/// Serialize through Display trait.
-pub fn display_serialize<S, F>(z: &F, s: S) -> Result<S::Ok, S::Error>
-where
-    S: Serializer,
-    F: Display,
-{
-    s.serialize_str(&format!("{}", z))
-}
--- a/libs/utils/src/id.rs
+++ b/libs/utils/src/id.rs
@@ -265,6 +265,26 @@ impl fmt::Display for TenantTimelineId {
    }
 }

+impl FromStr for TenantTimelineId {
+    type Err = anyhow::Error;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        let mut parts = s.split('/');
+        let tenant_id = parts
+            .next()
+            .ok_or_else(|| anyhow::anyhow!("TenantTimelineId must contain tenant_id"))?
+            .parse()?;
+        let timeline_id = parts
+            .next()
+            .ok_or_else(|| anyhow::anyhow!("TenantTimelineId must contain timeline_id"))?
+            .parse()?;
+        if parts.next().is_some() {
+            anyhow::bail!("TenantTimelineId must contain only tenant_id and timeline_id");
+        }
+        Ok(TenantTimelineId::new(tenant_id, timeline_id))
+    }
+}
+
 // Unique ID of a storage node (safekeeper or pageserver). Supposed to be issued
 // by the console.
 #[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Hash, Debug, Serialize, Deserialize)]
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -54,6 +54,12 @@ pub mod measured_stream;
 pub mod serde_percent;
 pub mod serde_regex;

+pub mod pageserver_feedback;
+
+pub mod tracing_span_assert;
+
+pub mod rate_limit;
+
 /// use with fail::cfg("$name", "return(2000)")
 #[macro_export]
 macro_rules! failpoint_sleep_millis_async {
--- a/libs/utils/src/logging.rs
+++ b/libs/utils/src/logging.rs
@@ -1,6 +1,7 @@
 use std::str::FromStr;

 use anyhow::Context;
+use once_cell::sync::Lazy;
 use strum_macros::{EnumString, EnumVariantNames};

 #[derive(EnumString, EnumVariantNames, Eq, PartialEq, Debug, Clone, Copy)]
@@ -23,24 +24,81 @@ impl LogFormat {
    }
 }

-pub fn init(log_format: LogFormat) -> anyhow::Result<()> {
-    let default_filter_str = "info";
+static TRACING_EVENT_COUNT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
+    metrics::register_int_counter_vec!(
+        "libmetrics_tracing_event_count",
+        "Number of tracing events, by level",
+        &["level"]
+    )
+    .expect("failed to define metric")
+});

+struct TracingEventCountLayer(&'static metrics::IntCounterVec);
+
+impl<S> tracing_subscriber::layer::Layer<S> for TracingEventCountLayer
+where
+    S: tracing::Subscriber,
+{
+    fn on_event(
+        &self,
+        event: &tracing::Event<'_>,
+        _ctx: tracing_subscriber::layer::Context<'_, S>,
+    ) {
+        let level = event.metadata().level();
+        let level = match *level {
+            tracing::Level::ERROR => "error",
+            tracing::Level::WARN => "warn",
+            tracing::Level::INFO => "info",
+            tracing::Level::DEBUG => "debug",
+            tracing::Level::TRACE => "trace",
+        };
+        self.0.with_label_values(&[level]).inc();
+    }
+}
+
+/// Whether to add the `tracing_error` crate's `ErrorLayer`
+/// to the global tracing subscriber.
+///
+pub enum TracingErrorLayerEnablement {
+    /// Do not add the `ErrorLayer`.
+    Disabled,
+    /// Add the `ErrorLayer` with the filter specified by RUST_LOG, defaulting to `info` if `RUST_LOG` is unset.
+    EnableWithRustLogFilter,
+}
+
+pub fn init(
+    log_format: LogFormat,
+    tracing_error_layer_enablement: TracingErrorLayerEnablement,
+) -> anyhow::Result<()> {
    // We fall back to printing all spans at info-level or above if
    // the RUST_LOG environment variable is not set.
-    let env_filter = tracing_subscriber::EnvFilter::try_from_default_env()
-        .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new(default_filter_str));
+    let rust_log_env_filter = || {
+        tracing_subscriber::EnvFilter::try_from_default_env()
+            .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new("info"))
+    };

-    let base_logger = tracing_subscriber::fmt()
-        .with_env_filter(env_filter)
-        .with_target(false)
-        .with_ansi(atty::is(atty::Stream::Stdout))
-        .with_writer(std::io::stdout);
-
-    match log_format {
-        LogFormat::Json => base_logger.json().init(),
-        LogFormat::Plain => base_logger.init(),
-        LogFormat::Test => base_logger.with_test_writer().init(),
+    // NB: the order of the with() calls does not matter.
+    // See https://docs.rs/tracing-subscriber/0.3.16/tracing_subscriber/layer/index.html#per-layer-filtering
+    use tracing_subscriber::prelude::*;
+    let r = tracing_subscriber::registry();
+    let r = r.with({
+        let log_layer = tracing_subscriber::fmt::layer()
+            .with_target(false)
+            .with_ansi(atty::is(atty::Stream::Stdout))
+            .with_writer(std::io::stdout);
+        let log_layer = match log_format {
+            LogFormat::Json => log_layer.json().boxed(),
+            LogFormat::Plain => log_layer.boxed(),
+            LogFormat::Test => log_layer.with_test_writer().boxed(),
+        };
+        log_layer.with_filter(rust_log_env_filter())
+    });
+    let r = r.with(TracingEventCountLayer(&TRACING_EVENT_COUNT).with_filter(rust_log_env_filter()));
+    match tracing_error_layer_enablement {
+        TracingErrorLayerEnablement::EnableWithRustLogFilter => r
+            .with(tracing_error::ErrorLayer::default().with_filter(rust_log_env_filter()))
+            .init(),
+        TracingErrorLayerEnablement::Disabled => r.init(),
    }

    Ok(())
@@ -157,3 +215,33 @@ impl std::fmt::Debug for PrettyLocation<'_, '_> {
        <Self as std::fmt::Display>::fmt(self, f)
    }
 }
+
+#[cfg(test)]
+mod tests {
+    use metrics::{core::Opts, IntCounterVec};
+
+    use super::TracingEventCountLayer;
+
+    #[test]
+    fn tracing_event_count_metric() {
+        let counter_vec =
+            IntCounterVec::new(Opts::new("testmetric", "testhelp"), &["level"]).unwrap();
+        let counter_vec = Box::leak(Box::new(counter_vec)); // make it 'static
+        let layer = TracingEventCountLayer(counter_vec);
+        use tracing_subscriber::prelude::*;
+
+        tracing::subscriber::with_default(tracing_subscriber::registry().with(layer), || {
+            tracing::trace!("foo");
+            tracing::debug!("foo");
+            tracing::info!("foo");
+            tracing::warn!("foo");
+            tracing::error!("foo");
+        });
+
+        assert_eq!(counter_vec.with_label_values(&["trace"]).get(), 1);
+        assert_eq!(counter_vec.with_label_values(&["debug"]).get(), 1);
+        assert_eq!(counter_vec.with_label_values(&["info"]).get(), 1);
+        assert_eq!(counter_vec.with_label_values(&["warn"]).get(), 1);
+        assert_eq!(counter_vec.with_label_values(&["error"]).get(), 1);
+    }
+}
--- a/libs/utils/src/lsn.rs
+++ b/libs/utils/src/lsn.rs
@@ -62,29 +62,48 @@ impl Lsn {
    }

    /// Compute the offset into a segment
+    #[inline]
    pub fn segment_offset(self, seg_sz: usize) -> usize {
        (self.0 % seg_sz as u64) as usize
    }

    /// Compute LSN of the segment start.
+    #[inline]
    pub fn segment_lsn(self, seg_sz: usize) -> Lsn {
        Lsn(self.0 - (self.0 % seg_sz as u64))
    }

    /// Compute the segment number
+    #[inline]
    pub fn segment_number(self, seg_sz: usize) -> u64 {
        self.0 / seg_sz as u64
    }

    /// Compute the offset into a block
+    #[inline]
    pub fn block_offset(self) -> u64 {
        const BLCKSZ: u64 = XLOG_BLCKSZ as u64;
        self.0 % BLCKSZ
    }

+    /// Compute the block offset of the first byte of this Lsn within this
+    /// segment
+    #[inline]
+    pub fn page_lsn(self) -> Lsn {
+        Lsn(self.0 - self.block_offset())
+    }
+
+    /// Compute the block offset of the first byte of this Lsn within this
+    /// segment
+    #[inline]
+    pub fn page_offset_in_segment(self, seg_sz: usize) -> u64 {
+        (self.0 - self.block_offset()) - self.segment_lsn(seg_sz).0
+    }
+
    /// Compute the bytes remaining in this block
    ///
    /// If the LSN is already at the block boundary, it will return `XLOG_BLCKSZ`.
+    #[inline]
    pub fn remaining_in_block(self) -> u64 {
        const BLCKSZ: u64 = XLOG_BLCKSZ as u64;
        BLCKSZ - (self.0 % BLCKSZ)
--- a/libs/utils/src/pageserver_feedback.rs
+++ b/libs/utils/src/pageserver_feedback.rs
@@ -0,0 +1,214 @@
+use std::time::{Duration, SystemTime};
+
+use bytes::{Buf, BufMut, Bytes, BytesMut};
+use pq_proto::{read_cstr, PG_EPOCH};
+use serde::{Deserialize, Serialize};
+use serde_with::{serde_as, DisplayFromStr};
+use tracing::{trace, warn};
+
+use crate::lsn::Lsn;
+
+/// Feedback pageserver sends to safekeeper and safekeeper resends to compute.
+/// Serialized in custom flexible key/value format. In replication protocol, it
+/// is marked with NEON_STATUS_UPDATE_TAG_BYTE to differentiate from postgres
+/// Standby status update / Hot standby feedback messages.
+///
+/// serde Serialize is used only for human readable dump to json (e.g. in
+/// safekeepers debug_dump).
+#[serde_as]
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
+pub struct PageserverFeedback {
+    /// Last known size of the timeline. Used to enforce timeline size limit.
+    pub current_timeline_size: u64,
+    /// LSN last received and ingested by the pageserver. Controls backpressure.
+    #[serde_as(as = "DisplayFromStr")]
+    pub last_received_lsn: Lsn,
+    /// LSN up to which data is persisted by the pageserver to its local disc.
+    /// Controls backpressure.
+    #[serde_as(as = "DisplayFromStr")]
+    pub disk_consistent_lsn: Lsn,
+    /// LSN up to which data is persisted by the pageserver on s3; safekeepers
+    /// consider WAL before it can be removed.
+    #[serde_as(as = "DisplayFromStr")]
+    pub remote_consistent_lsn: Lsn,
+    // Serialize with RFC3339 format.
+    #[serde(with = "serde_systemtime")]
+    pub replytime: SystemTime,
+}
+
+// NOTE: Do not forget to increment this number when adding new fields to PageserverFeedback.
+// Do not remove previously available fields because this might be backwards incompatible.
+pub const PAGESERVER_FEEDBACK_FIELDS_NUMBER: u8 = 5;
+
+impl PageserverFeedback {
+    pub fn empty() -> PageserverFeedback {
+        PageserverFeedback {
+            current_timeline_size: 0,
+            last_received_lsn: Lsn::INVALID,
+            remote_consistent_lsn: Lsn::INVALID,
+            disk_consistent_lsn: Lsn::INVALID,
+            replytime: *PG_EPOCH,
+        }
+    }
+
+    // Serialize PageserverFeedback using custom format
+    // to support protocol extensibility.
+    //
+    // Following layout is used:
+    // char - number of key-value pairs that follow.
+    //
+    // key-value pairs:
+    // null-terminated string - key,
+    // uint32 - value length in bytes
+    // value itself
+    //
+    // TODO: change serialized fields names once all computes migrate to rename.
+    pub fn serialize(&self, buf: &mut BytesMut) {
+        buf.put_u8(PAGESERVER_FEEDBACK_FIELDS_NUMBER); // # of keys
+        buf.put_slice(b"current_timeline_size\0");
+        buf.put_i32(8);
+        buf.put_u64(self.current_timeline_size);
+
+        buf.put_slice(b"ps_writelsn\0");
+        buf.put_i32(8);
+        buf.put_u64(self.last_received_lsn.0);
+        buf.put_slice(b"ps_flushlsn\0");
+        buf.put_i32(8);
+        buf.put_u64(self.disk_consistent_lsn.0);
+        buf.put_slice(b"ps_applylsn\0");
+        buf.put_i32(8);
+        buf.put_u64(self.remote_consistent_lsn.0);
+
+        let timestamp = self
+            .replytime
+            .duration_since(*PG_EPOCH)
+            .expect("failed to serialize pg_replytime earlier than PG_EPOCH")
+            .as_micros() as i64;
+
+        buf.put_slice(b"ps_replytime\0");
+        buf.put_i32(8);
+        buf.put_i64(timestamp);
+    }
+
+    // Deserialize PageserverFeedback message
+    // TODO: change serialized fields names once all computes migrate to rename.
+    pub fn parse(mut buf: Bytes) -> PageserverFeedback {
+        let mut rf = PageserverFeedback::empty();
+        let nfields = buf.get_u8();
+        for _ in 0..nfields {
+            let key = read_cstr(&mut buf).unwrap();
+            match key.as_ref() {
+                b"current_timeline_size" => {
+                    let len = buf.get_i32();
+                    assert_eq!(len, 8);
+                    rf.current_timeline_size = buf.get_u64();
+                }
+                b"ps_writelsn" => {
+                    let len = buf.get_i32();
+                    assert_eq!(len, 8);
+                    rf.last_received_lsn = Lsn(buf.get_u64());
+                }
+                b"ps_flushlsn" => {
+                    let len = buf.get_i32();
+                    assert_eq!(len, 8);
+                    rf.disk_consistent_lsn = Lsn(buf.get_u64());
+                }
+                b"ps_applylsn" => {
+                    let len = buf.get_i32();
+                    assert_eq!(len, 8);
+                    rf.remote_consistent_lsn = Lsn(buf.get_u64());
+                }
+                b"ps_replytime" => {
+                    let len = buf.get_i32();
+                    assert_eq!(len, 8);
+                    let raw_time = buf.get_i64();
+                    if raw_time > 0 {
+                        rf.replytime = *PG_EPOCH + Duration::from_micros(raw_time as u64);
+                    } else {
+                        rf.replytime = *PG_EPOCH - Duration::from_micros(-raw_time as u64);
+                    }
+                }
+                _ => {
+                    let len = buf.get_i32();
+                    warn!(
+                        "PageserverFeedback parse. unknown key {} of len {len}. Skip it.",
+                        String::from_utf8_lossy(key.as_ref())
+                    );
+                    buf.advance(len as usize);
+                }
+            }
+        }
+        trace!("PageserverFeedback parsed is {:?}", rf);
+        rf
+    }
+}
+
+mod serde_systemtime {
+    use std::time::SystemTime;
+
+    use chrono::{DateTime, Utc};
+    use serde::{Deserialize, Deserializer, Serializer};
+
+    pub fn serialize<S>(ts: &SystemTime, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: Serializer,
+    {
+        let chrono_dt: DateTime<Utc> = (*ts).into();
+        serializer.serialize_str(&chrono_dt.to_rfc3339())
+    }
+
+    pub fn deserialize<'de, D>(deserializer: D) -> Result<SystemTime, D::Error>
+    where
+        D: Deserializer<'de>,
+    {
+        let time: String = Deserialize::deserialize(deserializer)?;
+        Ok(DateTime::parse_from_rfc3339(&time)
+            .map_err(serde::de::Error::custom)?
+            .into())
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_replication_feedback_serialization() {
+        let mut rf = PageserverFeedback::empty();
+        // Fill rf with some values
+        rf.current_timeline_size = 12345678;
+        // Set rounded time to be able to compare it with deserialized value,
+        // because it is rounded up to microseconds during serialization.
+        rf.replytime = *PG_EPOCH + Duration::from_secs(100_000_000);
+        let mut data = BytesMut::new();
+        rf.serialize(&mut data);
+
+        let rf_parsed = PageserverFeedback::parse(data.freeze());
+        assert_eq!(rf, rf_parsed);
+    }
+
+    #[test]
+    fn test_replication_feedback_unknown_key() {
+        let mut rf = PageserverFeedback::empty();
+        // Fill rf with some values
+        rf.current_timeline_size = 12345678;
+        // Set rounded time to be able to compare it with deserialized value,
+        // because it is rounded up to microseconds during serialization.
+        rf.replytime = *PG_EPOCH + Duration::from_secs(100_000_000);
+        let mut data = BytesMut::new();
+        rf.serialize(&mut data);
+
+        // Add an extra field to the buffer and adjust number of keys
+        if let Some(first) = data.first_mut() {
+            *first = PAGESERVER_FEEDBACK_FIELDS_NUMBER + 1;
+        }
+
+        data.put_slice(b"new_field_one\0");
+        data.put_i32(8);
+        data.put_u64(42);
+
+        // Parse serialized data and check that new field is not parsed
+        let rf_parsed = PageserverFeedback::parse(data.freeze());
+        assert_eq!(rf, rf_parsed);
+    }
+}
--- a/libs/utils/src/rate_limit.rs
+++ b/libs/utils/src/rate_limit.rs
@@ -0,0 +1,66 @@
+//! A helper to rate limit operations.
+
+use std::time::{Duration, Instant};
+
+pub struct RateLimit {
+    last: Option<Instant>,
+    interval: Duration,
+}
+
+impl RateLimit {
+    pub fn new(interval: Duration) -> Self {
+        Self {
+            last: None,
+            interval,
+        }
+    }
+
+    /// Call `f` if the rate limit allows.
+    /// Don't call it otherwise.
+    pub fn call<F: FnOnce()>(&mut self, f: F) {
+        let now = Instant::now();
+        match self.last {
+            Some(last) if now - last <= self.interval => {
+                // ratelimit
+            }
+            _ => {
+                self.last = Some(now);
+                f();
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::sync::atomic::AtomicUsize;
+
+    #[test]
+    fn basics() {
+        use super::RateLimit;
+        use std::sync::atomic::Ordering::Relaxed;
+        use std::time::Duration;
+
+        let called = AtomicUsize::new(0);
+        let mut f = RateLimit::new(Duration::from_millis(100));
+
+        let cl = || {
+            called.fetch_add(1, Relaxed);
+        };
+
+        f.call(cl);
+        assert_eq!(called.load(Relaxed), 1);
+        f.call(cl);
+        assert_eq!(called.load(Relaxed), 1);
+        f.call(cl);
+        assert_eq!(called.load(Relaxed), 1);
+        std::thread::sleep(Duration::from_millis(100));
+        f.call(cl);
+        assert_eq!(called.load(Relaxed), 2);
+        f.call(cl);
+        assert_eq!(called.load(Relaxed), 2);
+        std::thread::sleep(Duration::from_millis(100));
+        f.call(cl);
+        assert_eq!(called.load(Relaxed), 3);
+    }
+}
--- a/libs/utils/src/tracing_span_assert.rs
+++ b/libs/utils/src/tracing_span_assert.rs
@@ -0,0 +1,287 @@
+//! Assert that the current [`tracing::Span`] has a given set of fields.
+//!
+//! # Usage
+//!
+//! ```
+//! use tracing_subscriber::prelude::*;
+//! let registry = tracing_subscriber::registry()
+//!    .with(tracing_error::ErrorLayer::default());
+//!
+//! // Register the registry as the global subscriber.
+//! // In this example, we'll only use it as a thread-local subscriber.
+//! let _guard = tracing::subscriber::set_default(registry);
+//!
+//! // Then, in the main code:
+//!
+//! let span = tracing::info_span!("TestSpan", test_id = 1);
+//! let _guard = span.enter();
+//!
+//! // ... down the call stack
+//!
+//! use utils::tracing_span_assert::{check_fields_present, MultiNameExtractor};
+//! let extractor = MultiNameExtractor::new("TestExtractor", ["test", "test_id"]);
+//! match check_fields_present([&extractor]) {
+//!    Ok(()) => {},
+//!    Err(missing) => {
+//!        panic!("Missing fields: {:?}", missing.into_iter().map(|f| f.name() ).collect::<Vec<_>>());
+//!    }
+//! }
+//! ```
+//!
+//! Recommended reading: https://docs.rs/tracing-subscriber/0.3.16/tracing_subscriber/layer/index.html#per-layer-filtering
+//!
+
+use std::{
+    collections::HashSet,
+    fmt::{self},
+    hash::{Hash, Hasher},
+};
+
+pub enum ExtractionResult {
+    Present,
+    Absent,
+}
+
+pub trait Extractor: Send + Sync + std::fmt::Debug {
+    fn name(&self) -> &str;
+    fn extract(&self, fields: &tracing::field::FieldSet) -> ExtractionResult;
+}
+
+#[derive(Debug)]
+pub struct MultiNameExtractor<const L: usize> {
+    name: &'static str,
+    field_names: [&'static str; L],
+}
+
+impl<const L: usize> MultiNameExtractor<L> {
+    pub fn new(name: &'static str, field_names: [&'static str; L]) -> MultiNameExtractor<L> {
+        MultiNameExtractor { name, field_names }
+    }
+}
+impl<const L: usize> Extractor for MultiNameExtractor<L> {
+    fn name(&self) -> &str {
+        self.name
+    }
+    fn extract(&self, fields: &tracing::field::FieldSet) -> ExtractionResult {
+        if fields.iter().any(|f| self.field_names.contains(&f.name())) {
+            ExtractionResult::Present
+        } else {
+            ExtractionResult::Absent
+        }
+    }
+}
+
+struct MemoryIdentity<'a>(&'a dyn Extractor);
+
+impl<'a> MemoryIdentity<'a> {
+    fn as_ptr(&self) -> *const () {
+        self.0 as *const _ as *const ()
+    }
+}
+impl<'a> PartialEq for MemoryIdentity<'a> {
+    fn eq(&self, other: &Self) -> bool {
+        self.as_ptr() == other.as_ptr()
+    }
+}
+impl<'a> Eq for MemoryIdentity<'a> {}
+impl<'a> Hash for MemoryIdentity<'a> {
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        self.as_ptr().hash(state);
+    }
+}
+impl<'a> fmt::Debug for MemoryIdentity<'a> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{:p}: {}", self.as_ptr(), self.0.name())
+    }
+}
+
+/// The extractor names passed as keys to [`new`].
+pub fn check_fields_present<const L: usize>(
+    must_be_present: [&dyn Extractor; L],
+) -> Result<(), Vec<&dyn Extractor>> {
+    let mut missing: HashSet<MemoryIdentity> =
+        HashSet::from_iter(must_be_present.into_iter().map(|r| MemoryIdentity(r)));
+    let trace = tracing_error::SpanTrace::capture();
+    trace.with_spans(|md, _formatted_fields| {
+        missing.retain(|extractor| match extractor.0.extract(md.fields()) {
+            ExtractionResult::Present => false,
+            ExtractionResult::Absent => true,
+        });
+        !missing.is_empty() // continue walking up until we've found all missing
+    });
+    if missing.is_empty() {
+        Ok(())
+    } else {
+        Err(missing.into_iter().map(|mi| mi.0).collect())
+    }
+}
+
+#[cfg(test)]
+mod tests {
+
+    use tracing_subscriber::prelude::*;
+
+    use super::*;
+
+    struct Setup {
+        _current_thread_subscriber_guard: tracing::subscriber::DefaultGuard,
+        tenant_extractor: MultiNameExtractor<2>,
+        timeline_extractor: MultiNameExtractor<2>,
+    }
+
+    fn setup_current_thread() -> Setup {
+        let tenant_extractor = MultiNameExtractor::new("TenantId", ["tenant_id", "tenant"]);
+        let timeline_extractor = MultiNameExtractor::new("TimelineId", ["timeline_id", "timeline"]);
+
+        let registry = tracing_subscriber::registry()
+            .with(tracing_subscriber::fmt::layer())
+            .with(tracing_error::ErrorLayer::default());
+
+        let guard = tracing::subscriber::set_default(registry);
+
+        Setup {
+            _current_thread_subscriber_guard: guard,
+            tenant_extractor,
+            timeline_extractor,
+        }
+    }
+
+    fn assert_missing(missing: Vec<&dyn Extractor>, expected: Vec<&dyn Extractor>) {
+        let missing: HashSet<MemoryIdentity> =
+            HashSet::from_iter(missing.into_iter().map(MemoryIdentity));
+        let expected: HashSet<MemoryIdentity> =
+            HashSet::from_iter(expected.into_iter().map(MemoryIdentity));
+        assert_eq!(missing, expected);
+    }
+
+    #[test]
+    fn positive_one_level() {
+        let setup = setup_current_thread();
+        let span = tracing::info_span!("root", tenant_id = "tenant-1", timeline_id = "timeline-1");
+        let _guard = span.enter();
+        check_fields_present([&setup.tenant_extractor, &setup.timeline_extractor]).unwrap();
+    }
+
+    #[test]
+    fn negative_one_level() {
+        let setup = setup_current_thread();
+        let span = tracing::info_span!("root", timeline_id = "timeline-1");
+        let _guard = span.enter();
+        let missing =
+            check_fields_present([&setup.tenant_extractor, &setup.timeline_extractor]).unwrap_err();
+        assert_missing(missing, vec![&setup.tenant_extractor]);
+    }
+
+    #[test]
+    fn positive_multiple_levels() {
+        let setup = setup_current_thread();
+
+        let span = tracing::info_span!("root");
+        let _guard = span.enter();
+
+        let span = tracing::info_span!("child", tenant_id = "tenant-1");
+        let _guard = span.enter();
+
+        let span = tracing::info_span!("grandchild", timeline_id = "timeline-1");
+        let _guard = span.enter();
+
+        check_fields_present([&setup.tenant_extractor, &setup.timeline_extractor]).unwrap();
+    }
+
+    #[test]
+    fn negative_multiple_levels() {
+        let setup = setup_current_thread();
+
+        let span = tracing::info_span!("root");
+        let _guard = span.enter();
+
+        let span = tracing::info_span!("child", timeline_id = "timeline-1");
+        let _guard = span.enter();
+
+        let missing = check_fields_present([&setup.tenant_extractor]).unwrap_err();
+        assert_missing(missing, vec![&setup.tenant_extractor]);
+    }
+
+    #[test]
+    fn positive_subset_one_level() {
+        let setup = setup_current_thread();
+        let span = tracing::info_span!("root", tenant_id = "tenant-1", timeline_id = "timeline-1");
+        let _guard = span.enter();
+        check_fields_present([&setup.tenant_extractor]).unwrap();
+    }
+
+    #[test]
+    fn positive_subset_multiple_levels() {
+        let setup = setup_current_thread();
+
+        let span = tracing::info_span!("root");
+        let _guard = span.enter();
+
+        let span = tracing::info_span!("child", tenant_id = "tenant-1");
+        let _guard = span.enter();
+
+        let span = tracing::info_span!("grandchild", timeline_id = "timeline-1");
+        let _guard = span.enter();
+
+        check_fields_present([&setup.tenant_extractor]).unwrap();
+    }
+
+    #[test]
+    fn negative_subset_one_level() {
+        let setup = setup_current_thread();
+        let span = tracing::info_span!("root", timeline_id = "timeline-1");
+        let _guard = span.enter();
+        let missing = check_fields_present([&setup.tenant_extractor]).unwrap_err();
+        assert_missing(missing, vec![&setup.tenant_extractor]);
+    }
+
+    #[test]
+    fn negative_subset_multiple_levels() {
+        let setup = setup_current_thread();
+
+        let span = tracing::info_span!("root");
+        let _guard = span.enter();
+
+        let span = tracing::info_span!("child", timeline_id = "timeline-1");
+        let _guard = span.enter();
+
+        let missing = check_fields_present([&setup.tenant_extractor]).unwrap_err();
+        assert_missing(missing, vec![&setup.tenant_extractor]);
+    }
+
+    #[test]
+    fn tracing_error_subscriber_not_set_up() {
+        // no setup
+
+        let span = tracing::info_span!("foo", e = "some value");
+        let _guard = span.enter();
+
+        let extractor = MultiNameExtractor::new("E", ["e"]);
+        let missing = check_fields_present([&extractor]).unwrap_err();
+        assert_missing(missing, vec![&extractor]);
+    }
+
+    #[test]
+    #[should_panic]
+    fn panics_if_tracing_error_subscriber_has_wrong_filter() {
+        let r = tracing_subscriber::registry().with({
+            tracing_error::ErrorLayer::default().with_filter(
+                tracing_subscriber::filter::dynamic_filter_fn(|md, _| {
+                    if md.is_span() && *md.level() == tracing::Level::INFO {
+                        return false;
+                    }
+                    true
+                }),
+            )
+        });
+
+        let _guard = tracing::subscriber::set_default(r);
+
+        let span = tracing::info_span!("foo", e = "some value");
+        let _guard = span.enter();
+
+        let extractor = MultiNameExtractor::new("E", ["e"]);
+        let missing = check_fields_present([&extractor]).unwrap_err();
+        assert_missing(missing, vec![&extractor]);
+    }
+}
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -52,6 +52,7 @@ sync_wrapper.workspace = true
 tokio-tar.workspace = true
 thiserror.workspace = true
 tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] }
+tokio-io-timeout.workspace = true
 tokio-postgres.workspace = true
 tokio-util.workspace = true
 toml_edit = { workspace = true, features = [ "serde" ] }
--- a/pageserver/benches/bench_layer_map.rs
+++ b/pageserver/benches/bench_layer_map.rs
@@ -13,7 +13,7 @@ use std::time::Instant;

 use utils::lsn::Lsn;

-use criterion::{criterion_group, criterion_main, Criterion};
+use criterion::{black_box, criterion_group, criterion_main, Criterion};

 fn build_layer_map(filename_dump: PathBuf) -> LayerMap<LayerDescriptor> {
    let mut layer_map = LayerMap::<LayerDescriptor>::default();
@@ -33,7 +33,7 @@ fn build_layer_map(filename_dump: PathBuf) -> LayerMap<LayerDescriptor> {
        min_lsn = min(min_lsn, lsn_range.start);
        max_lsn = max(max_lsn, Lsn(lsn_range.end.0 - 1));

-        updates.insert_historic(Arc::new(layer)).unwrap();
+        updates.insert_historic(Arc::new(layer));
    }

    println!("min: {min_lsn}, max: {max_lsn}");
@@ -114,7 +114,7 @@ fn bench_from_captest_env(c: &mut Criterion) {
    c.bench_function("captest_uniform_queries", |b| {
        b.iter(|| {
            for q in queries.clone().into_iter() {
-                layer_map.search(q.0, q.1);
+                black_box(layer_map.search(q.0, q.1));
            }
        });
    });
@@ -122,11 +122,11 @@ fn bench_from_captest_env(c: &mut Criterion) {
    // test with a key that corresponds to the RelDir entry. See pgdatadir_mapping.rs.
    c.bench_function("captest_rel_dir_query", |b| {
        b.iter(|| {
-            let result = layer_map.search(
+            let result = black_box(layer_map.search(
                Key::from_hex("000000067F00008000000000000000000001").unwrap(),
                // This LSN is higher than any of the LSNs in the tree
                Lsn::from_str("D0/80208AE1").unwrap(),
-            );
+            ));
            result.unwrap();
        });
    });
@@ -183,7 +183,7 @@ fn bench_from_real_project(c: &mut Criterion) {
    group.bench_function("uniform_queries", |b| {
        b.iter(|| {
            for q in queries.clone().into_iter() {
-                layer_map.search(q.0, q.1);
+                black_box(layer_map.search(q.0, q.1));
            }
        });
    });
@@ -215,7 +215,7 @@ fn bench_sequential(c: &mut Criterion) {
            is_incremental: false,
            short_id: format!("Layer {}", i),
        };
-        updates.insert_historic(Arc::new(layer)).unwrap();
+        updates.insert_historic(Arc::new(layer));
    }
    updates.flush();
    println!("Finished layer map init in {:?}", now.elapsed());
@@ -232,7 +232,7 @@ fn bench_sequential(c: &mut Criterion) {
    group.bench_function("uniform_queries", |b| {
        b.iter(|| {
            for q in queries.clone().into_iter() {
-                layer_map.search(q.0, q.1);
+                black_box(layer_map.search(q.0, q.1));
            }
        });
    });
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -463,9 +463,13 @@ where
        let wal_file_path = format!("pg_wal/{}", wal_file_name);
        let header = new_tar_header(&wal_file_path, WAL_SEGMENT_SIZE as u64)?;

-        let wal_seg =
-            postgres_ffi::generate_wal_segment(segno, system_identifier, self.timeline.pg_version)
-                .map_err(|e| anyhow!(e).context("Failed generating wal segment"))?;
+        let wal_seg = postgres_ffi::generate_wal_segment(
+            segno,
+            system_identifier,
+            self.timeline.pg_version,
+            self.lsn,
+        )
+        .map_err(|e| anyhow!(e).context("Failed generating wal segment"))?;
        ensure!(wal_seg.len() == WAL_SEGMENT_SIZE);
        self.ar.append(&header, &wal_seg[..]).await?;
        Ok(())
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -25,6 +25,7 @@ use pageserver::{
    virtual_file,
 };
 use postgres_backend::AuthType;
+use utils::logging::TracingErrorLayerEnablement;
 use utils::signals::ShutdownSignals;
 use utils::{
    auth::JwtAuth, logging, project_git_version, sentry_init::init_sentry, signals::Signal,
@@ -86,8 +87,19 @@ fn main() -> anyhow::Result<()> {
        }
    };

-    // Initialize logging, which must be initialized before the custom panic hook is installed.
-    logging::init(conf.log_format)?;
+    // Initialize logging.
+    //
+    // It must be initialized before the custom panic hook is installed below.
+    //
+    // Regarding tracing_error enablement: at this time, we only use the
+    // tracing_error crate to debug_assert that log spans contain tenant and timeline ids.
+    // See `debug_assert_current_span_has_tenant_and_timeline_id` in the timeline module
+    let tracing_error_layer_enablement = if cfg!(debug_assertions) {
+        TracingErrorLayerEnablement::EnableWithRustLogFilter
+    } else {
+        TracingErrorLayerEnablement::Disabled
+    };
+    logging::init(conf.log_format, tracing_error_layer_enablement)?;

    // mind the order required here: 1. logging, 2. panic_hook, 3. sentry.
    // disarming this hook on pageserver, because we never tear down tracing.
@@ -226,6 +238,7 @@ fn start_pageserver(
    );
    set_build_info_metric(GIT_VERSION);
    set_launch_timestamp_metric(launch_ts);
+    pageserver::preinitialize_metrics();

    // If any failpoints were set from FAILPOINTS environment variable,
    // print them to the log for debugging purposes
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -6,6 +6,7 @@

 use anyhow::{anyhow, bail, ensure, Context, Result};
 use remote_storage::{RemotePath, RemoteStorageConfig};
+use serde::de::IntoDeserializer;
 use std::env;
 use storage_broker::Uri;
 use utils::crashsafe::path_with_suffix_extension;
@@ -62,7 +63,6 @@ pub mod defaults {
    pub const DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL: &str = "1 hour";
    pub const DEFAULT_METRIC_COLLECTION_ENDPOINT: Option<reqwest::Url> = None;
    pub const DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL: &str = "10 min";
-    pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour";

    ///
    /// Default built-in configuration file.
@@ -91,7 +91,6 @@ pub mod defaults {
 #cached_metric_collection_interval = '{DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL}'
 #synthetic_size_calculation_interval = '{DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL}'

-#evictions_low_residence_duration_metric_threshold = '{DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD}'

 #disk_usage_based_eviction = {{ max_usage_pct = .., min_avail_bytes = .., period = "10s"}}

@@ -108,6 +107,7 @@ pub mod defaults {
 #pitr_interval = '{DEFAULT_PITR_INTERVAL}'

 #min_resident_size_override = .. # in bytes
+#evictions_low_residence_duration_metric_threshold = '{DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD}'

 # [remote_storage]

@@ -182,9 +182,6 @@ pub struct PageServerConf {
    pub metric_collection_endpoint: Option<Url>,
    pub synthetic_size_calculation_interval: Duration,

-    // See the corresponding metric's help string.
-    pub evictions_low_residence_duration_metric_threshold: Duration,
-
    pub disk_usage_based_eviction: Option<DiskUsageEvictionTaskConfig>,

    pub test_remote_failures: u64,
@@ -257,8 +254,6 @@ struct PageServerConfigBuilder {
    metric_collection_endpoint: BuilderValue<Option<Url>>,
    synthetic_size_calculation_interval: BuilderValue<Duration>,

-    evictions_low_residence_duration_metric_threshold: BuilderValue<Duration>,
-
    disk_usage_based_eviction: BuilderValue<Option<DiskUsageEvictionTaskConfig>>,

    test_remote_failures: BuilderValue<u64>,
@@ -316,11 +311,6 @@ impl Default for PageServerConfigBuilder {
            .expect("cannot parse default synthetic size calculation interval")),
            metric_collection_endpoint: Set(DEFAULT_METRIC_COLLECTION_ENDPOINT),

-            evictions_low_residence_duration_metric_threshold: Set(humantime::parse_duration(
-                DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD,
-            )
-            .expect("cannot parse DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD")),
-
            disk_usage_based_eviction: Set(None),

            test_remote_failures: Set(0),
@@ -438,10 +428,6 @@ impl PageServerConfigBuilder {
        self.test_remote_failures = BuilderValue::Set(fail_first);
    }

-    pub fn evictions_low_residence_duration_metric_threshold(&mut self, value: Duration) {
-        self.evictions_low_residence_duration_metric_threshold = BuilderValue::Set(value);
-    }
-
    pub fn disk_usage_based_eviction(&mut self, value: Option<DiskUsageEvictionTaskConfig>) {
        self.disk_usage_based_eviction = BuilderValue::Set(value);
    }
@@ -525,11 +511,6 @@ impl PageServerConfigBuilder {
            synthetic_size_calculation_interval: self
                .synthetic_size_calculation_interval
                .ok_or(anyhow!("missing synthetic_size_calculation_interval"))?,
-            evictions_low_residence_duration_metric_threshold: self
-                .evictions_low_residence_duration_metric_threshold
-                .ok_or(anyhow!(
-                    "missing evictions_low_residence_duration_metric_threshold"
-                ))?,
            disk_usage_based_eviction: self
                .disk_usage_based_eviction
                .ok_or(anyhow!("missing disk_usage_based_eviction"))?,
@@ -721,12 +702,12 @@ impl PageServerConf {
                "synthetic_size_calculation_interval" =>
                    builder.synthetic_size_calculation_interval(parse_toml_duration(key, item)?),
                "test_remote_failures" => builder.test_remote_failures(parse_toml_u64(key, item)?),
-                "evictions_low_residence_duration_metric_threshold" => builder.evictions_low_residence_duration_metric_threshold(parse_toml_duration(key, item)?),
                "disk_usage_based_eviction" => {
                    tracing::info!("disk_usage_based_eviction: {:#?}", &item);
                    builder.disk_usage_based_eviction(
-                    toml_edit::de::from_item(item.clone())
-                    .context("parse disk_usage_based_eviction")?)
+                        deserialize_from_item("disk_usage_based_eviction", item)
+                            .context("parse disk_usage_based_eviction")?
+                    )
                },
                "ondemand_download_behavior_treat_error_as_warn" => builder.ondemand_download_behavior_treat_error_as_warn(parse_toml_bool(key, item)?),
                _ => bail!("unrecognized pageserver option '{key}'"),
@@ -827,18 +808,25 @@ impl PageServerConf {

        if let Some(eviction_policy) = item.get("eviction_policy") {
            t_conf.eviction_policy = Some(
-                toml_edit::de::from_item(eviction_policy.clone())
+                deserialize_from_item("eviction_policy", eviction_policy)
                    .context("parse eviction_policy")?,
            );
        }

        if let Some(item) = item.get("min_resident_size_override") {
            t_conf.min_resident_size_override = Some(
-                toml_edit::de::from_item(item.clone())
+                deserialize_from_item("min_resident_size_override", item)
                    .context("parse min_resident_size_override")?,
            );
        }

+        if let Some(item) = item.get("evictions_low_residence_duration_metric_threshold") {
+            t_conf.evictions_low_residence_duration_metric_threshold = Some(parse_toml_duration(
+                "evictions_low_residence_duration_metric_threshold",
+                item,
+            )?);
+        }
+
        Ok(t_conf)
    }

@@ -877,10 +865,6 @@ impl PageServerConf {
            cached_metric_collection_interval: Duration::from_secs(60 * 60),
            metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT,
            synthetic_size_calculation_interval: Duration::from_secs(60),
-            evictions_low_residence_duration_metric_threshold: humantime::parse_duration(
-                defaults::DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD,
-            )
-            .unwrap(),
            disk_usage_based_eviction: None,
            test_remote_failures: 0,
            ondemand_download_behavior_treat_error_as_warn: false,
@@ -938,6 +922,18 @@ where
    })
 }

+fn deserialize_from_item<T>(name: &str, item: &Item) -> anyhow::Result<T>
+where
+    T: serde::de::DeserializeOwned,
+{
+    // ValueDeserializer::new is not public, so use the ValueDeserializer's documented way
+    let deserializer = match item.clone().into_value() {
+        Ok(value) => value.into_deserializer(),
+        Err(item) => anyhow::bail!("toml_edit::Item '{item}' is not a toml_edit::Value"),
+    };
+    T::deserialize(deserializer).with_context(|| format!("deserializing item for node {name}"))
+}
+
 /// Configurable semaphore permits setting.
 ///
 /// Does not allow semaphore permits to be zero, because at runtime initially zero permits and empty
@@ -1004,9 +1000,10 @@ mod tests {

    use remote_storage::{RemoteStorageKind, S3Config};
    use tempfile::{tempdir, TempDir};
+    use utils::serde_percent::Percent;

    use super::*;
-    use crate::DEFAULT_PG_VERSION;
+    use crate::{tenant::config::EvictionPolicy, DEFAULT_PG_VERSION};

    const ALL_BASE_VALUES_TOML: &str = r#"
 # Initial configuration file created by 'pageserver --init'
@@ -1029,8 +1026,6 @@ cached_metric_collection_interval = '22200 s'
 metric_collection_endpoint = 'http://localhost:80/metrics'
 synthetic_size_calculation_interval = '333 s'

-evictions_low_residence_duration_metric_threshold = '444 s'
-
 log_format = 'json'

 "#;
@@ -1087,9 +1082,6 @@ log_format = 'json'
                synthetic_size_calculation_interval: humantime::parse_duration(
                    defaults::DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL
                )?,
-                evictions_low_residence_duration_metric_threshold: humantime::parse_duration(
-                    defaults::DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD
-                )?,
                disk_usage_based_eviction: None,
                test_remote_failures: 0,
                ondemand_download_behavior_treat_error_as_warn: false,
@@ -1144,7 +1136,6 @@ log_format = 'json'
                cached_metric_collection_interval: Duration::from_secs(22200),
                metric_collection_endpoint: Some(Url::parse("http://localhost:80/metrics")?),
                synthetic_size_calculation_interval: Duration::from_secs(333),
-                evictions_low_residence_duration_metric_threshold: Duration::from_secs(444),
                disk_usage_based_eviction: None,
                test_remote_failures: 0,
                ondemand_download_behavior_treat_error_as_warn: false,
@@ -1310,6 +1301,71 @@ trace_read_requests = {trace_read_requests}"#,
        Ok(())
    }

+    #[test]
+    fn eviction_pageserver_config_parse() -> anyhow::Result<()> {
+        let tempdir = tempdir()?;
+        let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?;
+
+        let pageserver_conf_toml = format!(
+            r#"pg_distrib_dir = "{}"
+metric_collection_endpoint = "http://sample.url"
+metric_collection_interval = "10min"
+id = 222
+
+[disk_usage_based_eviction]
+max_usage_pct = 80
+min_avail_bytes = 0
+period = "10s"
+
+[tenant_config]
+evictions_low_residence_duration_metric_threshold = "20m"
+
+[tenant_config.eviction_policy]
+kind = "LayerAccessThreshold"
+period = "20m"
+threshold = "20m"
+"#,
+            pg_distrib_dir.display(),
+        );
+        let toml: Document = pageserver_conf_toml.parse()?;
+        let conf = PageServerConf::parse_and_validate(&toml, &workdir)?;
+
+        assert_eq!(conf.pg_distrib_dir, pg_distrib_dir);
+        assert_eq!(
+            conf.metric_collection_endpoint,
+            Some("http://sample.url".parse().unwrap())
+        );
+        assert_eq!(
+            conf.metric_collection_interval,
+            Duration::from_secs(10 * 60)
+        );
+        assert_eq!(
+            conf.default_tenant_conf
+                .evictions_low_residence_duration_metric_threshold,
+            Duration::from_secs(20 * 60)
+        );
+        assert_eq!(conf.id, NodeId(222));
+        assert_eq!(
+            conf.disk_usage_based_eviction,
+            Some(DiskUsageEvictionTaskConfig {
+                max_usage_pct: Percent::new(80).unwrap(),
+                min_avail_bytes: 0,
+                period: Duration::from_secs(10),
+                #[cfg(feature = "testing")]
+                mock_statvfs: None,
+            })
+        );
+        match &conf.default_tenant_conf.eviction_policy {
+            EvictionPolicy::NoEviction => panic!("Unexpected eviction opolicy tenant settings"),
+            EvictionPolicy::LayerAccessThreshold(eviction_thresold) => {
+                assert_eq!(eviction_thresold.period, Duration::from_secs(20 * 60));
+                assert_eq!(eviction_thresold.threshold, Duration::from_secs(20 * 60));
+            }
+        }
+
+        Ok(())
+    }
+
    fn prepare_fs(tempdir: &TempDir) -> anyhow::Result<(PathBuf, PathBuf)> {
        let tempdir_path = tempdir.path();

--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -520,6 +520,43 @@ paths:
              schema:
                $ref: "#/components/schemas/Error"

+  /v1/tenant/{tenant_id}/synthetic_size:
+    parameters:
+      - name: tenant_id
+        in: path
+        required: true
+        schema:
+          type: string
+          format: hex
+    get:
+      description: |
+        Calculate tenant's synthetic size
+      responses:
+        "200":
+          description: Tenant's synthetic size
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/SyntheticSizeResponse"
+        "401":
+          description: Unauthorized Error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/UnauthorizedError"
+        "403":
+          description: Forbidden Error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ForbiddenError"
+        "500":
+          description: Generic operation error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/Error"
+
  /v1/tenant/{tenant_id}/size:
    parameters:
      - name: tenant_id
@@ -948,6 +985,84 @@ components:
        latest_gc_cutoff_lsn:
          type: string
          format: hex
+
+    SyntheticSizeResponse:
+      type: object
+      required:
+        - id
+        - size
+        - segment_sizes
+        - inputs
+      properties:
+        id:
+          type: string
+          format: hex
+        size:
+          type: integer
+        segment_sizes:
+          type: array
+          items:
+            $ref: "#/components/schemas/SegmentSize"
+        inputs:
+          type: object
+          properties:
+            segments:
+              type: array
+              items:
+                $ref: "#/components/schemas/SegmentData"
+            timeline_inputs:
+              type: array
+              items:
+                $ref: "#/components/schemas/TimelineInput"
+
+    SegmentSize:
+      type: object
+      required:
+        - method
+        - accum_size
+      properties:
+        method:
+          type: string
+        accum_size:
+          type: integer
+
+    SegmentData:
+      type: object
+      required:
+        - segment
+      properties:
+        segment:
+          type: object
+          required:
+            - lsn
+          properties:
+            parent:
+              type: integer
+            lsn:
+              type: integer
+            size:
+              type: integer
+            needed:
+              type: boolean
+        timeline_id:
+          type: string
+          format: hex
+        kind:
+          type: string
+
+    TimelineInput:
+      type: object
+      required:
+        - timeline_id
+      properties:
+        ancestor_id:
+          type: string
+        ancestor_lsn:
+          type: string
+        timeline_id:
+          type: string
+          format: hex
+
    Error:
      type: object
      required:
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -691,16 +691,6 @@ pub fn html_response(status: StatusCode, data: String) -> Result<Response<Body>,
    Ok(response)
 }

-// Helper function to standardize the error messages we produce on bad durations
-//
-// Intended to be used with anyhow's `with_context`, e.g.:
-//
-//   let value = result.with_context(bad_duration("name", &value))?;
-//
-fn bad_duration<'a>(field_name: &'static str, value: &'a str) -> impl 'a + Fn() -> String {
-    move || format!("Cannot parse `{field_name}` duration {value:?}")
-}
-
 async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
    check_permission(&request, None)?;

@@ -708,78 +698,7 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo

    let request_data: TenantCreateRequest = json_request(&mut request).await?;

-    let mut tenant_conf = TenantConfOpt::default();
-    if let Some(gc_period) = request_data.gc_period {
-        tenant_conf.gc_period = Some(
-            humantime::parse_duration(&gc_period)
-                .with_context(bad_duration("gc_period", &gc_period))
-                .map_err(ApiError::BadRequest)?,
-        );
-    }
-    tenant_conf.gc_horizon = request_data.gc_horizon;
-    tenant_conf.image_creation_threshold = request_data.image_creation_threshold;
-
-    if let Some(pitr_interval) = request_data.pitr_interval {
-        tenant_conf.pitr_interval = Some(
-            humantime::parse_duration(&pitr_interval)
-                .with_context(bad_duration("pitr_interval", &pitr_interval))
-                .map_err(ApiError::BadRequest)?,
-        );
-    }
-
-    if let Some(walreceiver_connect_timeout) = request_data.walreceiver_connect_timeout {
-        tenant_conf.walreceiver_connect_timeout = Some(
-            humantime::parse_duration(&walreceiver_connect_timeout)
-                .with_context(bad_duration(
-                    "walreceiver_connect_timeout",
-                    &walreceiver_connect_timeout,
-                ))
-                .map_err(ApiError::BadRequest)?,
-        );
-    }
-    if let Some(lagging_wal_timeout) = request_data.lagging_wal_timeout {
-        tenant_conf.lagging_wal_timeout = Some(
-            humantime::parse_duration(&lagging_wal_timeout)
-                .with_context(bad_duration("lagging_wal_timeout", &lagging_wal_timeout))
-                .map_err(ApiError::BadRequest)?,
-        );
-    }
-    if let Some(max_lsn_wal_lag) = request_data.max_lsn_wal_lag {
-        tenant_conf.max_lsn_wal_lag = Some(max_lsn_wal_lag);
-    }
-    if let Some(trace_read_requests) = request_data.trace_read_requests {
-        tenant_conf.trace_read_requests = Some(trace_read_requests);
-    }
-
-    tenant_conf.checkpoint_distance = request_data.checkpoint_distance;
-    if let Some(checkpoint_timeout) = request_data.checkpoint_timeout {
-        tenant_conf.checkpoint_timeout = Some(
-            humantime::parse_duration(&checkpoint_timeout)
-                .with_context(bad_duration("checkpoint_timeout", &checkpoint_timeout))
-                .map_err(ApiError::BadRequest)?,
-        );
-    }
-
-    tenant_conf.compaction_target_size = request_data.compaction_target_size;
-    tenant_conf.compaction_threshold = request_data.compaction_threshold;
-
-    if let Some(compaction_period) = request_data.compaction_period {
-        tenant_conf.compaction_period = Some(
-            humantime::parse_duration(&compaction_period)
-                .with_context(bad_duration("compaction_period", &compaction_period))
-                .map_err(ApiError::BadRequest)?,
-        );
-    }
-
-    if let Some(eviction_policy) = request_data.eviction_policy {
-        tenant_conf.eviction_policy = Some(
-            serde_json::from_value(eviction_policy)
-                .context("parse field `eviction_policy`")
-                .map_err(ApiError::BadRequest)?,
-        );
-    }
-
-    tenant_conf.min_resident_size_override = request_data.min_resident_size_override;
+    let tenant_conf = TenantConfOpt::try_from(&request_data).map_err(ApiError::BadRequest)?;

    let target_tenant_id = request_data
        .new_tenant_id
@@ -847,72 +766,7 @@ async fn update_tenant_config_handler(
    let tenant_id = request_data.tenant_id;
    check_permission(&request, Some(tenant_id))?;

-    let mut tenant_conf = TenantConfOpt::default();
-    if let Some(gc_period) = request_data.gc_period {
-        tenant_conf.gc_period = Some(
-            humantime::parse_duration(&gc_period)
-                .with_context(bad_duration("gc_period", &gc_period))
-                .map_err(ApiError::BadRequest)?,
-        );
-    }
-    tenant_conf.gc_horizon = request_data.gc_horizon;
-    tenant_conf.image_creation_threshold = request_data.image_creation_threshold;
-
-    if let Some(pitr_interval) = request_data.pitr_interval {
-        tenant_conf.pitr_interval = Some(
-            humantime::parse_duration(&pitr_interval)
-                .with_context(bad_duration("pitr_interval", &pitr_interval))
-                .map_err(ApiError::BadRequest)?,
-        );
-    }
-    if let Some(walreceiver_connect_timeout) = request_data.walreceiver_connect_timeout {
-        tenant_conf.walreceiver_connect_timeout = Some(
-            humantime::parse_duration(&walreceiver_connect_timeout)
-                .with_context(bad_duration(
-                    "walreceiver_connect_timeout",
-                    &walreceiver_connect_timeout,
-                ))
-                .map_err(ApiError::BadRequest)?,
-        );
-    }
-    if let Some(lagging_wal_timeout) = request_data.lagging_wal_timeout {
-        tenant_conf.lagging_wal_timeout = Some(
-            humantime::parse_duration(&lagging_wal_timeout)
-                .with_context(bad_duration("lagging_wal_timeout", &lagging_wal_timeout))
-                .map_err(ApiError::BadRequest)?,
-        );
-    }
-    tenant_conf.max_lsn_wal_lag = request_data.max_lsn_wal_lag;
-    tenant_conf.trace_read_requests = request_data.trace_read_requests;
-
-    tenant_conf.checkpoint_distance = request_data.checkpoint_distance;
-    if let Some(checkpoint_timeout) = request_data.checkpoint_timeout {
-        tenant_conf.checkpoint_timeout = Some(
-            humantime::parse_duration(&checkpoint_timeout)
-                .with_context(bad_duration("checkpoint_timeout", &checkpoint_timeout))
-                .map_err(ApiError::BadRequest)?,
-        );
-    }
-    tenant_conf.compaction_target_size = request_data.compaction_target_size;
-    tenant_conf.compaction_threshold = request_data.compaction_threshold;
-
-    if let Some(compaction_period) = request_data.compaction_period {
-        tenant_conf.compaction_period = Some(
-            humantime::parse_duration(&compaction_period)
-                .with_context(bad_duration("compaction_period", &compaction_period))
-                .map_err(ApiError::BadRequest)?,
-        );
-    }
-
-    if let Some(eviction_policy) = request_data.eviction_policy {
-        tenant_conf.eviction_policy = Some(
-            serde_json::from_value(eviction_policy)
-                .context("parse field `eviction_policy`")
-                .map_err(ApiError::BadRequest)?,
-        );
-    }
-
-    tenant_conf.min_resident_size_override = request_data.min_resident_size_override;
+    let tenant_conf = TenantConfOpt::try_from(&request_data).map_err(ApiError::BadRequest)?;

    let state = get_state(&request);
    mgr::set_new_tenant_config(state.conf, tenant_conf, tenant_id)
@@ -1175,6 +1029,37 @@ async fn handler_404(_: Request<Body>) -> Result<Response<Body>, ApiError> {
    )
 }

+#[cfg(feature = "testing")]
+async fn post_tracing_event_handler(mut r: Request<Body>) -> Result<Response<Body>, ApiError> {
+    #[derive(Debug, serde::Deserialize)]
+    #[serde(rename_all = "lowercase")]
+    enum Level {
+        Error,
+        Warn,
+        Info,
+        Debug,
+        Trace,
+    }
+    #[derive(Debug, serde::Deserialize)]
+    struct Request {
+        level: Level,
+        message: String,
+    }
+    let body: Request = json_request(&mut r)
+        .await
+        .map_err(|_| ApiError::BadRequest(anyhow::anyhow!("invalid JSON body")))?;
+
+    match body.level {
+        Level::Error => tracing::error!(?body.message),
+        Level::Warn => tracing::warn!(?body.message),
+        Level::Info => tracing::info!(?body.message),
+        Level::Debug => tracing::debug!(?body.message),
+        Level::Trace => tracing::trace!(?body.message),
+    }
+
+    json_response(StatusCode::OK, ())
+}
+
 pub fn make_router(
    conf: &'static PageServerConf,
    launch_ts: &'static LaunchTimestamp,
@@ -1315,5 +1200,9 @@ pub fn make_router(
            testing_api!("set tenant state to broken", handle_tenant_break),
        )
        .get("/v1/panic", |r| RequestSpan(always_panic_handler).handle(r))
+        .post(
+            "/v1/tracing/event",
+            testing_api!("emit a tracing event", post_tracing_event_handler),
+        )
        .any(handler_404))
 }
--- a/pageserver/src/import_datadir.rs
+++ b/pageserver/src/import_datadir.rs
@@ -114,7 +114,7 @@ async fn import_rel(
    path: &Path,
    spcoid: Oid,
    dboid: Oid,
-    reader: &mut (impl AsyncRead + Send + Sync + Unpin),
+    reader: &mut (impl AsyncRead + Unpin),
    len: usize,
    ctx: &RequestContext,
 ) -> anyhow::Result<()> {
@@ -200,7 +200,7 @@ async fn import_slru(
    modification: &mut DatadirModification<'_>,
    slru: SlruKind,
    path: &Path,
-    reader: &mut (impl AsyncRead + Send + Sync + Unpin),
+    reader: &mut (impl AsyncRead + Unpin),
    len: usize,
    ctx: &RequestContext,
 ) -> anyhow::Result<()> {
@@ -612,8 +612,8 @@ async fn import_file(
    Ok(None)
 }

-async fn read_all_bytes(reader: &mut (impl AsyncRead + Send + Sync + Unpin)) -> Result<Bytes> {
+async fn read_all_bytes(reader: &mut (impl AsyncRead + Unpin)) -> Result<Bytes> {
    let mut buf: Vec<u8> = vec![];
    reader.read_to_end(&mut buf).await?;
-    Ok(Bytes::copy_from_slice(&buf[..]))
+    Ok(Bytes::from(buf))
 }
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -44,6 +44,8 @@ pub const DELTA_FILE_MAGIC: u16 = 0x5A61;

 static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]);

+pub use crate::metrics::preinitialize_metrics;
+
 pub async fn shutdown_pageserver(exit_code: i32) {
    // Shut down the libpq endpoint task. This prevents new connections from
    // being accepted.
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -205,6 +205,15 @@ static EVICTIONS_WITH_LOW_RESIDENCE_DURATION: Lazy<IntCounterVec> = Lazy::new(||
    .expect("failed to define a metric")
 });

+pub static UNEXPECTED_ONDEMAND_DOWNLOADS: Lazy<IntCounter> = Lazy::new(|| {
+    register_int_counter!(
+        "pageserver_unexpected_ondemand_downloads_count",
+        "Number of unexpected on-demand downloads. \
+         We log more context for each increment, so, forgo any labels in this metric.",
+    )
+    .expect("failed to define a metric")
+});
+
 /// Each [`Timeline`]'s  [`EVICTIONS_WITH_LOW_RESIDENCE_DURATION`] metric.
 #[derive(Debug)]
 pub struct EvictionsWithLowResidenceDuration {
@@ -257,19 +266,54 @@ impl EvictionsWithLowResidenceDuration {
        }
    }

+    pub fn change_threshold(
+        &mut self,
+        tenant_id: &str,
+        timeline_id: &str,
+        new_threshold: Duration,
+    ) {
+        if new_threshold == self.threshold {
+            return;
+        }
+        let mut with_new =
+            EvictionsWithLowResidenceDurationBuilder::new(self.data_source, new_threshold)
+                .build(tenant_id, timeline_id);
+        std::mem::swap(self, &mut with_new);
+        with_new.remove(tenant_id, timeline_id);
+    }
+
    // This could be a `Drop` impl, but, we need the `tenant_id` and `timeline_id`.
    fn remove(&mut self, tenant_id: &str, timeline_id: &str) {
        let Some(_counter) = self.counter.take() else {
            return;
        };
-        EVICTIONS_WITH_LOW_RESIDENCE_DURATION
-            .remove_label_values(&[
-                tenant_id,
-                timeline_id,
-                self.data_source,
-                &Self::threshold_label_value(self.threshold),
-            ])
-            .expect("we own the metric, no-one else should remove it");
+
+        let threshold = Self::threshold_label_value(self.threshold);
+
+        let removed = EVICTIONS_WITH_LOW_RESIDENCE_DURATION.remove_label_values(&[
+            tenant_id,
+            timeline_id,
+            self.data_source,
+            &threshold,
+        ]);
+
+        match removed {
+            Err(e) => {
+                // this has been hit in staging as
+                // <https://neondatabase.sentry.io/issues/4142396994/>, but we don't know how.
+                // because we can be in the drop path already, don't risk:
+                // - "double-panic => illegal instruction" or
+                // - future "drop panick => abort"
+                //
+                // so just nag: (the error has the labels)
+                tracing::warn!("failed to remove EvictionsWithLowResidenceDuration, it was already removed? {e:#?}");
+            }
+            Ok(()) => {
+                // to help identify cases where we double-remove the same values, let's log all
+                // deletions?
+                tracing::info!("removed EvictionsWithLowResidenceDuration with {tenant_id}, {timeline_id}, {}, {threshold}", self.data_source);
+            }
+        }
    }
 }

@@ -334,11 +378,6 @@ pub static LIVE_CONNECTIONS_COUNT: Lazy<IntGaugeVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub static NUM_ONDISK_LAYERS: Lazy<IntGauge> = Lazy::new(|| {
-    register_int_gauge!("pageserver_ondisk_layers", "Number of layers on-disk")
-        .expect("failed to define a metric")
-});
-
 // remote storage metrics

 /// NB: increment _after_ recording the current value into [`REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST`].
@@ -369,6 +408,26 @@ static REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST: Lazy<HistogramVec> = Lazy::new
    .expect("failed to define a metric")
 });

+static REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "pageserver_remote_timeline_client_bytes_started",
+        "Incremented by the number of bytes associated with a remote timeline client operation. \
+         The increment happens when the operation is scheduled.",
+        &["tenant_id", "timeline_id", "file_kind", "op_kind"],
+    )
+    .expect("failed to define a metric")
+});
+
+static REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "pageserver_remote_timeline_client_bytes_finished",
+        "Incremented by the number of bytes associated with a remote timeline client operation. \
+         The increment happens when the operation finishes (regardless of success/failure/shutdown).",
+        &["tenant_id", "timeline_id", "file_kind", "op_kind"],
+    )
+    .expect("failed to define a metric")
+});
+
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 pub enum RemoteOpKind {
    Upload,
@@ -419,6 +478,56 @@ pub static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
    .expect("Failed to register tenant_task_events metric")
 });

+// walreceiver metrics
+
+pub static WALRECEIVER_STARTED_CONNECTIONS: Lazy<IntCounter> = Lazy::new(|| {
+    register_int_counter!(
+        "pageserver_walreceiver_started_connections_total",
+        "Number of started walreceiver connections"
+    )
+    .expect("failed to define a metric")
+});
+
+pub static WALRECEIVER_ACTIVE_MANAGERS: Lazy<IntGauge> = Lazy::new(|| {
+    register_int_gauge!(
+        "pageserver_walreceiver_active_managers",
+        "Number of active walreceiver managers"
+    )
+    .expect("failed to define a metric")
+});
+
+pub static WALRECEIVER_SWITCHES: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "pageserver_walreceiver_switches_total",
+        "Number of walreceiver manager change_connection calls",
+        &["reason"]
+    )
+    .expect("failed to define a metric")
+});
+
+pub static WALRECEIVER_BROKER_UPDATES: Lazy<IntCounter> = Lazy::new(|| {
+    register_int_counter!(
+        "pageserver_walreceiver_broker_updates_total",
+        "Number of received broker updates in walreceiver"
+    )
+    .expect("failed to define a metric")
+});
+
+pub static WALRECEIVER_CANDIDATES_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "pageserver_walreceiver_candidates_events_total",
+        "Number of walreceiver candidate events",
+        &["event"]
+    )
+    .expect("failed to define a metric")
+});
+
+pub static WALRECEIVER_CANDIDATES_ADDED: Lazy<IntCounter> =
+    Lazy::new(|| WALRECEIVER_CANDIDATES_EVENTS.with_label_values(&["add"]));
+
+pub static WALRECEIVER_CANDIDATES_REMOVED: Lazy<IntCounter> =
+    Lazy::new(|| WALRECEIVER_CANDIDATES_EVENTS.with_label_values(&["remove"]));
+
 // Metrics collected on WAL redo operations
 //
 // We collect the time spent in actual WAL redo ('redo'), and time waiting
@@ -589,7 +698,7 @@ pub struct TimelineMetrics {
    pub num_persistent_files_created: IntCounter,
    pub persistent_bytes_written: IntCounter,
    pub evictions: IntCounter,
-    pub evictions_with_low_residence_duration: EvictionsWithLowResidenceDuration,
+    pub evictions_with_low_residence_duration: std::sync::RwLock<EvictionsWithLowResidenceDuration>,
 }

 impl TimelineMetrics {
@@ -656,7 +765,9 @@ impl TimelineMetrics {
            num_persistent_files_created,
            persistent_bytes_written,
            evictions,
-            evictions_with_low_residence_duration,
+            evictions_with_low_residence_duration: std::sync::RwLock::new(
+                evictions_with_low_residence_duration,
+            ),
        }
    }
 }
@@ -675,6 +786,8 @@ impl Drop for TimelineMetrics {
        let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, timeline_id]);
        let _ = EVICTIONS.remove_label_values(&[tenant_id, timeline_id]);
        self.evictions_with_low_residence_duration
+            .write()
+            .unwrap()
            .remove(tenant_id, timeline_id);
        for op in STORAGE_TIME_OPERATIONS {
            let _ =
@@ -719,6 +832,8 @@ pub struct RemoteTimelineClientMetrics {
    remote_operation_time: Mutex<HashMap<(&'static str, &'static str, &'static str), Histogram>>,
    calls_unfinished_gauge: Mutex<HashMap<(&'static str, &'static str), IntGauge>>,
    calls_started_hist: Mutex<HashMap<(&'static str, &'static str), Histogram>>,
+    bytes_started_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
+    bytes_finished_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
 }

 impl RemoteTimelineClientMetrics {
@@ -729,6 +844,8 @@ impl RemoteTimelineClientMetrics {
            remote_operation_time: Mutex::new(HashMap::default()),
            calls_unfinished_gauge: Mutex::new(HashMap::default()),
            calls_started_hist: Mutex::new(HashMap::default()),
+            bytes_started_counter: Mutex::new(HashMap::default()),
+            bytes_finished_counter: Mutex::new(HashMap::default()),
            remote_physical_size_gauge: Mutex::new(None),
        }
    }
@@ -767,6 +884,7 @@ impl RemoteTimelineClientMetrics {
        });
        metric.clone()
    }
+
    fn calls_unfinished_gauge(
        &self,
        file_kind: &RemoteOpFileKind,
@@ -808,32 +926,125 @@ impl RemoteTimelineClientMetrics {
        });
        metric.clone()
    }
+
+    fn bytes_started_counter(
+        &self,
+        file_kind: &RemoteOpFileKind,
+        op_kind: &RemoteOpKind,
+    ) -> IntCounter {
+        // XXX would be nice to have an upgradable RwLock
+        let mut guard = self.bytes_started_counter.lock().unwrap();
+        let key = (file_kind.as_str(), op_kind.as_str());
+        let metric = guard.entry(key).or_insert_with(move || {
+            REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER
+                .get_metric_with_label_values(&[
+                    &self.tenant_id.to_string(),
+                    &self.timeline_id.to_string(),
+                    key.0,
+                    key.1,
+                ])
+                .unwrap()
+        });
+        metric.clone()
+    }
+
+    fn bytes_finished_counter(
+        &self,
+        file_kind: &RemoteOpFileKind,
+        op_kind: &RemoteOpKind,
+    ) -> IntCounter {
+        // XXX would be nice to have an upgradable RwLock
+        let mut guard = self.bytes_finished_counter.lock().unwrap();
+        let key = (file_kind.as_str(), op_kind.as_str());
+        let metric = guard.entry(key).or_insert_with(move || {
+            REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER
+                .get_metric_with_label_values(&[
+                    &self.tenant_id.to_string(),
+                    &self.timeline_id.to_string(),
+                    key.0,
+                    key.1,
+                ])
+                .unwrap()
+        });
+        metric.clone()
+    }
+}
+
+#[cfg(test)]
+impl RemoteTimelineClientMetrics {
+    pub fn get_bytes_started_counter_value(
+        &self,
+        file_kind: &RemoteOpFileKind,
+        op_kind: &RemoteOpKind,
+    ) -> Option<u64> {
+        let guard = self.bytes_started_counter.lock().unwrap();
+        let key = (file_kind.as_str(), op_kind.as_str());
+        guard.get(&key).map(|counter| counter.get())
+    }
+
+    pub fn get_bytes_finished_counter_value(
+        &self,
+        file_kind: &RemoteOpFileKind,
+        op_kind: &RemoteOpKind,
+    ) -> Option<u64> {
+        let guard = self.bytes_finished_counter.lock().unwrap();
+        let key = (file_kind.as_str(), op_kind.as_str());
+        guard.get(&key).map(|counter| counter.get())
+    }
 }

 /// See [`RemoteTimelineClientMetrics::call_begin`].
 #[must_use]
-pub(crate) struct RemoteTimelineClientCallMetricGuard(Option<IntGauge>);
+pub(crate) struct RemoteTimelineClientCallMetricGuard {
+    /// Decremented on drop.
+    calls_unfinished_metric: Option<IntGauge>,
+    /// If Some(), this references the bytes_finished metric, and we increment it by the given `u64` on drop.
+    bytes_finished: Option<(IntCounter, u64)>,
+}

 impl RemoteTimelineClientCallMetricGuard {
-    /// Consume this guard object without decrementing the metric.
-    /// The caller vouches to do this manually, so that the prior increment of the gauge will cancel out.
+    /// Consume this guard object without performing the metric updates it would do on `drop()`.
+    /// The caller vouches to do the metric updates manually.
    pub fn will_decrement_manually(mut self) {
-        self.0 = None; // prevent drop() from decrementing
+        let RemoteTimelineClientCallMetricGuard {
+            calls_unfinished_metric,
+            bytes_finished,
+        } = &mut self;
+        calls_unfinished_metric.take();
+        bytes_finished.take();
    }
 }

 impl Drop for RemoteTimelineClientCallMetricGuard {
    fn drop(&mut self) {
-        if let RemoteTimelineClientCallMetricGuard(Some(guard)) = self {
+        let RemoteTimelineClientCallMetricGuard {
+            calls_unfinished_metric,
+            bytes_finished,
+        } = self;
+        if let Some(guard) = calls_unfinished_metric.take() {
            guard.dec();
        }
+        if let Some((bytes_finished_metric, value)) = bytes_finished {
+            bytes_finished_metric.inc_by(*value);
+        }
    }
 }

+/// The enum variants communicate to the [`RemoteTimelineClientMetrics`] whether to
+/// track the byte size of this call in applicable metric(s).
+pub(crate) enum RemoteTimelineClientMetricsCallTrackSize {
+    /// Do not account for this call's byte size in any metrics.
+    /// The `reason` field is there to make the call sites self-documenting
+    /// about why they don't need the metric.
+    DontTrackSize { reason: &'static str },
+    /// Track the byte size of the call in applicable metric(s).
+    Bytes(u64),
+}
+
 impl RemoteTimelineClientMetrics {
-    /// Increment the metrics that track ongoing calls to the remote timeline client instance.
+    /// Update the metrics that change when a call to the remote timeline client instance starts.
    ///
-    /// Drop the returned guard object once the operation is finished to decrement the values.
+    /// Drop the returned guard object once the operation is finished to updates corresponding metrics that track completions.
    /// Or, use [`RemoteTimelineClientCallMetricGuard::will_decrement_manually`] and [`call_end`] if that
    /// is more suitable.
    /// Never do both.
@@ -841,24 +1052,51 @@ impl RemoteTimelineClientMetrics {
        &self,
        file_kind: &RemoteOpFileKind,
        op_kind: &RemoteOpKind,
+        size: RemoteTimelineClientMetricsCallTrackSize,
    ) -> RemoteTimelineClientCallMetricGuard {
-        let unfinished_metric = self.calls_unfinished_gauge(file_kind, op_kind);
+        let calls_unfinished_metric = self.calls_unfinished_gauge(file_kind, op_kind);
        self.calls_started_hist(file_kind, op_kind)
-            .observe(unfinished_metric.get() as f64);
-        unfinished_metric.inc();
-        RemoteTimelineClientCallMetricGuard(Some(unfinished_metric))
+            .observe(calls_unfinished_metric.get() as f64);
+        calls_unfinished_metric.inc(); // NB: inc after the histogram, see comment on underlying metric
+
+        let bytes_finished = match size {
+            RemoteTimelineClientMetricsCallTrackSize::DontTrackSize { reason: _reason } => {
+                // nothing to do
+                None
+            }
+            RemoteTimelineClientMetricsCallTrackSize::Bytes(size) => {
+                self.bytes_started_counter(file_kind, op_kind).inc_by(size);
+                let finished_counter = self.bytes_finished_counter(file_kind, op_kind);
+                Some((finished_counter, size))
+            }
+        };
+        RemoteTimelineClientCallMetricGuard {
+            calls_unfinished_metric: Some(calls_unfinished_metric),
+            bytes_finished,
+        }
    }

-    /// Manually decrement the metric instead of using the guard object.
+    /// Manually udpate the metrics that track completions, instead of using the guard object.
    /// Using the guard object is generally preferable.
    /// See [`call_begin`] for more context.
-    pub(crate) fn call_end(&self, file_kind: &RemoteOpFileKind, op_kind: &RemoteOpKind) {
-        let unfinished_metric = self.calls_unfinished_gauge(file_kind, op_kind);
+    pub(crate) fn call_end(
+        &self,
+        file_kind: &RemoteOpFileKind,
+        op_kind: &RemoteOpKind,
+        size: RemoteTimelineClientMetricsCallTrackSize,
+    ) {
+        let calls_unfinished_metric = self.calls_unfinished_gauge(file_kind, op_kind);
        debug_assert!(
-            unfinished_metric.get() > 0,
+            calls_unfinished_metric.get() > 0,
            "begin and end should cancel out"
        );
-        unfinished_metric.dec();
+        calls_unfinished_metric.dec();
+        match size {
+            RemoteTimelineClientMetricsCallTrackSize::DontTrackSize { reason: _reason } => {}
+            RemoteTimelineClientMetricsCallTrackSize::Bytes(size) => {
+                self.bytes_finished_counter(file_kind, op_kind).inc_by(size);
+            }
+        }
    }
 }

@@ -871,6 +1109,8 @@ impl Drop for RemoteTimelineClientMetrics {
            remote_operation_time,
            calls_unfinished_gauge,
            calls_started_hist,
+            bytes_started_counter,
+            bytes_finished_counter,
        } = self;
        for ((a, b, c), _) in remote_operation_time.get_mut().unwrap().drain() {
            let _ = REMOTE_OPERATION_TIME.remove_label_values(&[tenant_id, timeline_id, a, b, c]);
@@ -891,6 +1131,22 @@ impl Drop for RemoteTimelineClientMetrics {
                b,
            ]);
        }
+        for ((a, b), _) in bytes_started_counter.get_mut().unwrap().drain() {
+            let _ = REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER.remove_label_values(&[
+                tenant_id,
+                timeline_id,
+                a,
+                b,
+            ]);
+        }
+        for ((a, b), _) in bytes_finished_counter.get_mut().unwrap().drain() {
+            let _ = REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER.remove_label_values(&[
+                tenant_id,
+                timeline_id,
+                a,
+                b,
+            ]);
+        }
        {
            let _ = remote_physical_size_gauge; // use to avoid 'unused' warning in desctructuring above
            let _ = REMOTE_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
@@ -954,3 +1210,10 @@ impl<F: Future<Output = Result<O, E>>, O, E> Future for MeasuredRemoteOp<F> {
        poll_result
    }
 }
+
+pub fn preinitialize_metrics() {
+    // We want to alert on this metric increasing.
+    // Initialize it eagerly, so that our alert rule can distinguish absence of the metric from metric value 0.
+    assert_eq!(UNEXPECTED_ONDEMAND_DOWNLOADS.get(), 0);
+    UNEXPECTED_ONDEMAND_DOWNLOADS.reset();
+}
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -20,7 +20,6 @@ use pageserver_api::models::{
    PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetPageResponse,
    PagestreamNblocksRequest, PagestreamNblocksResponse,
 };
-use postgres_backend::PostgresBackendTCP;
 use postgres_backend::{self, is_expected_io_error, AuthType, PostgresBackend, QueryError};
 use pq_proto::framed::ConnectionError;
 use pq_proto::FeStartupPacket;
@@ -32,6 +31,7 @@ use std::str;
 use std::str::FromStr;
 use std::sync::Arc;
 use std::time::Duration;
+use tokio::io::{AsyncRead, AsyncWrite};
 use tokio_util::io::StreamReader;
 use tracing::*;
 use utils::id::ConnectionId;
@@ -57,7 +57,10 @@ use crate::trace::Tracer;
 use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
 use postgres_ffi::BLCKSZ;

-fn copyin_stream(pgb: &mut PostgresBackendTCP) -> impl Stream<Item = io::Result<Bytes>> + '_ {
+fn copyin_stream<IO>(pgb: &mut PostgresBackend<IO>) -> impl Stream<Item = io::Result<Bytes>> + '_
+where
+    IO: AsyncRead + AsyncWrite + Unpin,
+{
    async_stream::try_stream! {
        loop {
            let msg = tokio::select! {
@@ -65,8 +68,8 @@ fn copyin_stream(pgb: &mut PostgresBackendTCP) -> impl Stream<Item = io::Result<

                _ = task_mgr::shutdown_watcher() => {
                    // We were requested to shut down.
-                    let msg = format!("pageserver is shutting down");
-                    let _ = pgb.write_message_noflush(&BeMessage::ErrorResponse(&msg, None));
+                    let msg = "pageserver is shutting down";
+                    let _ = pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, None));
                    Err(QueryError::Other(anyhow::anyhow!(msg)))
                }

@@ -125,7 +128,7 @@ fn copyin_stream(pgb: &mut PostgresBackendTCP) -> impl Stream<Item = io::Result<
 ///
 /// XXX: Currently, any trailing data after the EOF marker prints a warning.
 /// Perhaps it should be a hard error?
-async fn read_tar_eof(mut reader: (impl tokio::io::AsyncRead + Unpin)) -> anyhow::Result<()> {
+async fn read_tar_eof(mut reader: (impl AsyncRead + Unpin)) -> anyhow::Result<()> {
    use tokio::io::AsyncReadExt;
    let mut buf = [0u8; 512];

@@ -245,12 +248,23 @@ async fn page_service_conn_main(
        .set_nodelay(true)
        .context("could not set TCP_NODELAY")?;

+    let peer_addr = socket.peer_addr().context("get peer address")?;
+
+    // setup read timeout of 10 minutes. the timeout is rather arbitrary for requirements:
+    // - long enough for most valid compute connections
+    // - less than infinite to stop us from "leaking" connections to long-gone computes
+    //
+    // no write timeout is used, because the kernel is assumed to error writes after some time.
+    let mut socket = tokio_io_timeout::TimeoutReader::new(socket);
+    socket.set_timeout(Some(std::time::Duration::from_secs(60 * 10)));
+    let socket = std::pin::pin!(socket);
+
    // XXX: pgbackend.run() should take the connection_ctx,
    // and create a child per-query context when it invokes process_query.
    // But it's in a shared crate, so, we store connection_ctx inside PageServerHandler
    // and create the per-query context in process_query ourselves.
    let mut conn_handler = PageServerHandler::new(conf, auth, connection_ctx);
-    let pgbackend = PostgresBackend::new(socket, auth_type, None)?;
+    let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?;

    match pgbackend
        .run(&mut conn_handler, task_mgr::shutdown_watcher)
@@ -332,13 +346,16 @@ impl PageServerHandler {
    }

    #[instrument(skip(self, pgb, ctx))]
-    async fn handle_pagerequests(
+    async fn handle_pagerequests<IO>(
        &self,
-        pgb: &mut PostgresBackendTCP,
+        pgb: &mut PostgresBackend<IO>,
        tenant_id: TenantId,
        timeline_id: TimelineId,
        ctx: RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), QueryError>
+    where
+        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
+    {
        // NOTE: pagerequests handler exits when connection is closed,
        //       so there is no need to reset the association
        task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
@@ -381,7 +398,9 @@ impl PageServerHandler {
                Some(FeMessage::CopyData(bytes)) => bytes,
                Some(FeMessage::Terminate) => break,
                Some(m) => {
-                    anyhow::bail!("unexpected message: {m:?} during COPY");
+                    return Err(QueryError::Other(anyhow::anyhow!(
+                        "unexpected message: {m:?} during COPY"
+                    )));
                }
                None => break, // client disconnected
            };
@@ -436,16 +455,19 @@ impl PageServerHandler {

    #[allow(clippy::too_many_arguments)]
    #[instrument(skip(self, pgb, ctx))]
-    async fn handle_import_basebackup(
+    async fn handle_import_basebackup<IO>(
        &self,
-        pgb: &mut PostgresBackendTCP,
+        pgb: &mut PostgresBackend<IO>,
        tenant_id: TenantId,
        timeline_id: TimelineId,
        base_lsn: Lsn,
        _end_lsn: Lsn,
        pg_version: u32,
        ctx: RequestContext,
-    ) -> Result<(), QueryError> {
+    ) -> Result<(), QueryError>
+    where
+        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
+    {
        task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
        // Create empty timeline
        info!("creating new timeline");
@@ -486,15 +508,18 @@ impl PageServerHandler {
    }

    #[instrument(skip(self, pgb, ctx))]
-    async fn handle_import_wal(
+    async fn handle_import_wal<IO>(
        &self,
-        pgb: &mut PostgresBackendTCP,
+        pgb: &mut PostgresBackend<IO>,
        tenant_id: TenantId,
        timeline_id: TimelineId,
        start_lsn: Lsn,
        end_lsn: Lsn,
        ctx: RequestContext,
-    ) -> Result<(), QueryError> {
+    ) -> Result<(), QueryError>
+    where
+        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
+    {
        task_mgr::associate_with(Some(tenant_id), Some(timeline_id));

        let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?;
@@ -690,16 +715,21 @@ impl PageServerHandler {

    #[allow(clippy::too_many_arguments)]
    #[instrument(skip(self, pgb, ctx))]
-    async fn handle_basebackup_request(
+    async fn handle_basebackup_request<IO>(
        &mut self,
-        pgb: &mut PostgresBackendTCP,
+        pgb: &mut PostgresBackend<IO>,
        tenant_id: TenantId,
        timeline_id: TimelineId,
        lsn: Option<Lsn>,
        prev_lsn: Option<Lsn>,
        full_backup: bool,
        ctx: RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> anyhow::Result<()>
+    where
+        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
+    {
+        let started = std::time::Instant::now();
+
        // check that the timeline exists
        let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?;
        let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
@@ -712,6 +742,8 @@ impl PageServerHandler {
                .context("invalid basebackup lsn")?;
        }

+        let lsn_awaited_after = started.elapsed();
+
        // switch client to COPYOUT
        pgb.write_message_noflush(&BeMessage::CopyOutResponse)?;
        pgb.flush().await?;
@@ -732,7 +764,17 @@ impl PageServerHandler {

        pgb.write_message_noflush(&BeMessage::CopyDone)?;
        pgb.flush().await?;
-        info!("basebackup complete");
+
+        let basebackup_after = started
+            .elapsed()
+            .checked_sub(lsn_awaited_after)
+            .unwrap_or(Duration::ZERO);
+
+        info!(
+            lsn_await_millis = lsn_awaited_after.as_millis(),
+            basebackup_millis = basebackup_after.as_millis(),
+            "basebackup complete"
+        );

        Ok(())
    }
@@ -756,10 +798,13 @@ impl PageServerHandler {
 }

 #[async_trait::async_trait]
-impl postgres_backend::Handler<tokio::net::TcpStream> for PageServerHandler {
+impl<IO> postgres_backend::Handler<IO> for PageServerHandler
+where
+    IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
+{
    fn check_auth_jwt(
        &mut self,
-        _pgb: &mut PostgresBackendTCP,
+        _pgb: &mut PostgresBackend<IO>,
        jwt_response: &[u8],
    ) -> Result<(), QueryError> {
        // this unwrap is never triggered, because check_auth_jwt only called when auth_type is NeonJWT
@@ -787,7 +832,7 @@ impl postgres_backend::Handler<tokio::net::TcpStream> for PageServerHandler {

    fn startup(
        &mut self,
-        _pgb: &mut PostgresBackendTCP,
+        _pgb: &mut PostgresBackend<IO>,
        _sm: &FeStartupPacket,
    ) -> Result<(), QueryError> {
        Ok(())
@@ -795,7 +840,7 @@ impl postgres_backend::Handler<tokio::net::TcpStream> for PageServerHandler {

    async fn process_query(
        &mut self,
-        pgb: &mut PostgresBackendTCP,
+        pgb: &mut PostgresBackend<IO>,
        query_string: &str,
    ) -> Result<(), QueryError> {
        let ctx = self.connection_ctx.attached_child();
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -118,6 +118,10 @@ pub struct Tenant {
    // Global pageserver config parameters
    pub conf: &'static PageServerConf,

+    /// The value creation timestamp, used to measure activation delay, see:
+    /// <https://github.com/neondatabase/neon/issues/4025>
+    loading_started_at: Instant,
+
    state: watch::Sender<TenantState>,

    // Overridden tenant-specific config parameters.
@@ -1476,7 +1480,7 @@ impl Tenant {
                TenantState::Loading | TenantState::Attaching => {
                    *current_state = TenantState::Active;

-                    info!("Activating tenant {}", self.tenant_id);
+                    debug!(tenant_id = %self.tenant_id, "Activating tenant");

                    let timelines_accessor = self.timelines.lock().unwrap();
                    let not_broken_timelines = timelines_accessor
@@ -1487,12 +1491,17 @@ impl Tenant {
                    // down when they notice that the tenant is inactive.
                    tasks::start_background_loops(self.tenant_id);

+                    let mut activated_timelines = 0;
+                    let mut timelines_broken_during_activation = 0;
+
                    for timeline in not_broken_timelines {
                        match timeline
                            .activate(ctx)
                            .context("timeline activation for activating tenant")
                        {
-                            Ok(()) => {}
+                            Ok(()) => {
+                                activated_timelines += 1;
+                            }
                            Err(e) => {
                                error!(
                                    "Failed to activate timeline {}: {:#}",
@@ -1503,9 +1512,26 @@ impl Tenant {
                                    "failed to activate timeline {}: {}",
                                    timeline.timeline_id, e
                                ));
+
+                                timelines_broken_during_activation += 1;
                            }
                        }
                    }
+
+                    let elapsed = self.loading_started_at.elapsed();
+                    let total_timelines = timelines_accessor.len();
+
+                    // log a lot of stuff, because some tenants sometimes suffer from user-visible
+                    // times to activate. see https://github.com/neondatabase/neon/issues/4025
+                    info!(
+                        since_creation_millis = elapsed.as_millis(),
+                        tenant_id = %self.tenant_id,
+                        activated_timelines,
+                        timelines_broken_during_activation,
+                        total_timelines,
+                        post_state = <&'static str>::from(&*current_state),
+                        "activation attempt finished"
+                    );
                }
            }
        });
@@ -1735,6 +1761,13 @@ impl Tenant {

    pub fn set_new_tenant_config(&self, new_tenant_conf: TenantConfOpt) {
        *self.tenant_conf.write().unwrap() = new_tenant_conf;
+        // Don't hold self.timelines.lock() during the notifies.
+        // There's no risk of deadlock right now, but there could be if we consolidate
+        // mutexes in struct Timeline in the future.
+        let timelines = self.list_timelines();
+        for timeline in timelines {
+            timeline.tenant_conf_updated();
+        }
    }

    fn create_timeline_data(
@@ -1805,6 +1838,9 @@ impl Tenant {
        Tenant {
            tenant_id,
            conf,
+            // using now here is good enough approximation to catch tenants with really long
+            // activation times.
+            loading_started_at: Instant::now(),
            tenant_conf: Arc::new(RwLock::new(tenant_conf)),
            timelines: Mutex::new(HashMap::new()),
            gc_cs: tokio::sync::Mutex::new(()),
@@ -1887,7 +1923,7 @@ impl Tenant {
            .to_string();

            // Convert the config to a toml file.
-            conf_content += &toml_edit::easy::to_string(&tenant_conf)?;
+            conf_content += &toml_edit::ser::to_string(&tenant_conf)?;

            let mut target_config_file = VirtualFile::open_with_options(
                target_config_path,
@@ -2815,6 +2851,9 @@ pub mod harness {
                trace_read_requests: Some(tenant_conf.trace_read_requests),
                eviction_policy: Some(tenant_conf.eviction_policy),
                min_resident_size_override: tenant_conf.min_resident_size_override,
+                evictions_low_residence_duration_metric_threshold: Some(
+                    tenant_conf.evictions_low_residence_duration_metric_threshold,
+                ),
            }
        }
    }
@@ -2847,7 +2886,13 @@ pub mod harness {
            };

            LOG_HANDLE.get_or_init(|| {
-                logging::init(logging::LogFormat::Test).expect("Failed to init test logging")
+                logging::init(
+                    logging::LogFormat::Test,
+                    // enable it in case in case the tests exercise code paths that use
+                    // debug_assert_current_span_has_tenant_and_timeline_id
+                    logging::TracingErrorLayerEnablement::EnableWithRustLogFilter,
+                )
+                .expect("Failed to init test logging")
            });

            let repo_dir = PageServerConf::test_repo_dir(test_name);
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -8,6 +8,8 @@
 //! We cannot use global or default config instead, because wrong settings
 //! may lead to a data loss.
 //!
+use anyhow::Context;
+use pageserver_api::models::{TenantConfigRequest, TenantCreateRequest};
 use serde::{Deserialize, Serialize};
 use std::num::NonZeroU64;
 use std::time::Duration;
@@ -39,6 +41,7 @@ pub mod defaults {
    pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "2 seconds";
    pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "3 seconds";
    pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 10 * 1024 * 1024;
+    pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour";
 }

 /// Per-tenant configuration options
@@ -93,6 +96,9 @@ pub struct TenantConf {
    pub trace_read_requests: bool,
    pub eviction_policy: EvictionPolicy,
    pub min_resident_size_override: Option<u64>,
+    // See the corresponding metric's help string.
+    #[serde(with = "humantime_serde")]
+    pub evictions_low_residence_duration_metric_threshold: Duration,
 }

 /// Same as TenantConf, but this struct preserves the information about
@@ -164,6 +170,11 @@ pub struct TenantConfOpt {
    #[serde(skip_serializing_if = "Option::is_none")]
    #[serde(default)]
    pub min_resident_size_override: Option<u64>,
+
+    #[serde(skip_serializing_if = "Option::is_none")]
+    #[serde(with = "humantime_serde")]
+    #[serde(default)]
+    pub evictions_low_residence_duration_metric_threshold: Option<Duration>,
 }

 #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
@@ -228,6 +239,9 @@ impl TenantConfOpt {
            min_resident_size_override: self
                .min_resident_size_override
                .or(global_conf.min_resident_size_override),
+            evictions_low_residence_duration_metric_threshold: self
+                .evictions_low_residence_duration_metric_threshold
+                .unwrap_or(global_conf.evictions_low_residence_duration_metric_threshold),
        }
    }
 }
@@ -260,10 +274,187 @@ impl Default for TenantConf {
            trace_read_requests: false,
            eviction_policy: EvictionPolicy::NoEviction,
            min_resident_size_override: None,
+            evictions_low_residence_duration_metric_threshold: humantime::parse_duration(
+                DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD,
+            )
+            .expect("cannot parse default evictions_low_residence_duration_metric_threshold"),
        }
    }
 }

+// Helper function to standardize the error messages we produce on bad durations
+//
+// Intended to be used with anyhow's `with_context`, e.g.:
+//
+//   let value = result.with_context(bad_duration("name", &value))?;
+//
+fn bad_duration<'a>(field_name: &'static str, value: &'a str) -> impl 'a + Fn() -> String {
+    move || format!("Cannot parse `{field_name}` duration {value:?}")
+}
+
+impl TryFrom<&'_ TenantCreateRequest> for TenantConfOpt {
+    type Error = anyhow::Error;
+
+    fn try_from(request_data: &TenantCreateRequest) -> Result<Self, Self::Error> {
+        let mut tenant_conf = TenantConfOpt::default();
+
+        if let Some(gc_period) = &request_data.gc_period {
+            tenant_conf.gc_period = Some(
+                humantime::parse_duration(gc_period)
+                    .with_context(bad_duration("gc_period", gc_period))?,
+            );
+        }
+        tenant_conf.gc_horizon = request_data.gc_horizon;
+        tenant_conf.image_creation_threshold = request_data.image_creation_threshold;
+
+        if let Some(pitr_interval) = &request_data.pitr_interval {
+            tenant_conf.pitr_interval = Some(
+                humantime::parse_duration(pitr_interval)
+                    .with_context(bad_duration("pitr_interval", pitr_interval))?,
+            );
+        }
+
+        if let Some(walreceiver_connect_timeout) = &request_data.walreceiver_connect_timeout {
+            tenant_conf.walreceiver_connect_timeout = Some(
+                humantime::parse_duration(walreceiver_connect_timeout).with_context(
+                    bad_duration("walreceiver_connect_timeout", walreceiver_connect_timeout),
+                )?,
+            );
+        }
+        if let Some(lagging_wal_timeout) = &request_data.lagging_wal_timeout {
+            tenant_conf.lagging_wal_timeout = Some(
+                humantime::parse_duration(lagging_wal_timeout)
+                    .with_context(bad_duration("lagging_wal_timeout", lagging_wal_timeout))?,
+            );
+        }
+        if let Some(max_lsn_wal_lag) = request_data.max_lsn_wal_lag {
+            tenant_conf.max_lsn_wal_lag = Some(max_lsn_wal_lag);
+        }
+        if let Some(trace_read_requests) = request_data.trace_read_requests {
+            tenant_conf.trace_read_requests = Some(trace_read_requests);
+        }
+
+        tenant_conf.checkpoint_distance = request_data.checkpoint_distance;
+        if let Some(checkpoint_timeout) = &request_data.checkpoint_timeout {
+            tenant_conf.checkpoint_timeout = Some(
+                humantime::parse_duration(checkpoint_timeout)
+                    .with_context(bad_duration("checkpoint_timeout", checkpoint_timeout))?,
+            );
+        }
+
+        tenant_conf.compaction_target_size = request_data.compaction_target_size;
+        tenant_conf.compaction_threshold = request_data.compaction_threshold;
+
+        if let Some(compaction_period) = &request_data.compaction_period {
+            tenant_conf.compaction_period = Some(
+                humantime::parse_duration(compaction_period)
+                    .with_context(bad_duration("compaction_period", compaction_period))?,
+            );
+        }
+
+        if let Some(eviction_policy) = &request_data.eviction_policy {
+            tenant_conf.eviction_policy = Some(
+                serde::Deserialize::deserialize(eviction_policy)
+                    .context("parse field `eviction_policy`")?,
+            );
+        }
+
+        tenant_conf.min_resident_size_override = request_data.min_resident_size_override;
+
+        if let Some(evictions_low_residence_duration_metric_threshold) =
+            &request_data.evictions_low_residence_duration_metric_threshold
+        {
+            tenant_conf.evictions_low_residence_duration_metric_threshold = Some(
+                humantime::parse_duration(evictions_low_residence_duration_metric_threshold)
+                    .with_context(bad_duration(
+                        "evictions_low_residence_duration_metric_threshold",
+                        evictions_low_residence_duration_metric_threshold,
+                    ))?,
+            );
+        }
+
+        Ok(tenant_conf)
+    }
+}
+
+impl TryFrom<&'_ TenantConfigRequest> for TenantConfOpt {
+    type Error = anyhow::Error;
+
+    fn try_from(request_data: &TenantConfigRequest) -> Result<Self, Self::Error> {
+        let mut tenant_conf = TenantConfOpt::default();
+        if let Some(gc_period) = &request_data.gc_period {
+            tenant_conf.gc_period = Some(
+                humantime::parse_duration(gc_period)
+                    .with_context(bad_duration("gc_period", gc_period))?,
+            );
+        }
+        tenant_conf.gc_horizon = request_data.gc_horizon;
+        tenant_conf.image_creation_threshold = request_data.image_creation_threshold;
+
+        if let Some(pitr_interval) = &request_data.pitr_interval {
+            tenant_conf.pitr_interval = Some(
+                humantime::parse_duration(pitr_interval)
+                    .with_context(bad_duration("pitr_interval", pitr_interval))?,
+            );
+        }
+        if let Some(walreceiver_connect_timeout) = &request_data.walreceiver_connect_timeout {
+            tenant_conf.walreceiver_connect_timeout = Some(
+                humantime::parse_duration(walreceiver_connect_timeout).with_context(
+                    bad_duration("walreceiver_connect_timeout", walreceiver_connect_timeout),
+                )?,
+            );
+        }
+        if let Some(lagging_wal_timeout) = &request_data.lagging_wal_timeout {
+            tenant_conf.lagging_wal_timeout = Some(
+                humantime::parse_duration(lagging_wal_timeout)
+                    .with_context(bad_duration("lagging_wal_timeout", lagging_wal_timeout))?,
+            );
+        }
+        tenant_conf.max_lsn_wal_lag = request_data.max_lsn_wal_lag;
+        tenant_conf.trace_read_requests = request_data.trace_read_requests;
+
+        tenant_conf.checkpoint_distance = request_data.checkpoint_distance;
+        if let Some(checkpoint_timeout) = &request_data.checkpoint_timeout {
+            tenant_conf.checkpoint_timeout = Some(
+                humantime::parse_duration(checkpoint_timeout)
+                    .with_context(bad_duration("checkpoint_timeout", checkpoint_timeout))?,
+            );
+        }
+        tenant_conf.compaction_target_size = request_data.compaction_target_size;
+        tenant_conf.compaction_threshold = request_data.compaction_threshold;
+
+        if let Some(compaction_period) = &request_data.compaction_period {
+            tenant_conf.compaction_period = Some(
+                humantime::parse_duration(compaction_period)
+                    .with_context(bad_duration("compaction_period", compaction_period))?,
+            );
+        }
+
+        if let Some(eviction_policy) = &request_data.eviction_policy {
+            tenant_conf.eviction_policy = Some(
+                serde::Deserialize::deserialize(eviction_policy)
+                    .context("parse field `eviction_policy`")?,
+            );
+        }
+
+        tenant_conf.min_resident_size_override = request_data.min_resident_size_override;
+
+        if let Some(evictions_low_residence_duration_metric_threshold) =
+            &request_data.evictions_low_residence_duration_metric_threshold
+        {
+            tenant_conf.evictions_low_residence_duration_metric_threshold = Some(
+                humantime::parse_duration(evictions_low_residence_duration_metric_threshold)
+                    .with_context(bad_duration(
+                        "evictions_low_residence_duration_metric_threshold",
+                        evictions_low_residence_duration_metric_threshold,
+                    ))?,
+            );
+        }
+
+        Ok(tenant_conf)
+    }
+}
+
 #[cfg(test)]
 mod tests {
    use super::*;
@@ -275,9 +466,9 @@ mod tests {
            ..TenantConfOpt::default()
        };

-        let toml_form = toml_edit::easy::to_string(&small_conf).unwrap();
+        let toml_form = toml_edit::ser::to_string(&small_conf).unwrap();
        assert_eq!(toml_form, "gc_horizon = 42\n");
-        assert_eq!(small_conf, toml_edit::easy::from_str(&toml_form).unwrap());
+        assert_eq!(small_conf, toml_edit::de::from_str(&toml_form).unwrap());

        let json_form = serde_json::to_string(&small_conf).unwrap();
        assert_eq!(json_form, "{\"gc_horizon\":42}");
--- a/pageserver/src/tenant/layer_map.rs
+++ b/pageserver/src/tenant/layer_map.rs
@@ -48,11 +48,10 @@ mod layer_coverage;

 use crate::context::RequestContext;
 use crate::keyspace::KeyPartitioning;
-use crate::metrics::NUM_ONDISK_LAYERS;
 use crate::repository::Key;
 use crate::tenant::storage_layer::InMemoryLayer;
 use crate::tenant::storage_layer::Layer;
-use anyhow::{bail, Result};
+use anyhow::Result;
 use std::collections::VecDeque;
 use std::ops::Range;
 use std::sync::Arc;
@@ -126,7 +125,7 @@ where
    ///
    /// Insert an on-disk layer.
    ///
-    pub fn insert_historic(&mut self, layer: Arc<L>) -> anyhow::Result<()> {
+    pub fn insert_historic(&mut self, layer: Arc<L>) {
        self.layer_map.insert_historic_noflush(layer)
    }

@@ -274,22 +273,16 @@ where
    ///
    /// Helper function for BatchedUpdates::insert_historic
    ///
-    pub(self) fn insert_historic_noflush(&mut self, layer: Arc<L>) -> anyhow::Result<()> {
-        let key = historic_layer_coverage::LayerKey::from(&*layer);
-        if self.historic.contains(&key) {
-            bail!(
-                "Attempt to insert duplicate layer {} in layer map",
-                layer.short_id()
-            );
-        }
-        self.historic.insert(key, Arc::clone(&layer));
+    pub(self) fn insert_historic_noflush(&mut self, layer: Arc<L>) {
+        // TODO: See #3869, resulting #4088, attempted fix and repro #4094
+        self.historic.insert(
+            historic_layer_coverage::LayerKey::from(&*layer),
+            Arc::clone(&layer),
+        );

        if Self::is_l0(&layer) {
            self.l0_delta_layers.push(layer);
        }
-
-        NUM_ONDISK_LAYERS.inc();
-        Ok(())
    }

    ///
@@ -314,8 +307,6 @@ where
                "failed to locate removed historic layer from l0_delta_layers"
            );
        }
-
-        NUM_ONDISK_LAYERS.dec();
    }

    pub(self) fn replace_historic_noflush(
@@ -843,7 +834,7 @@ mod tests {

            let expected_in_counts = (1, usize::from(expected_l0));

-            map.batch_update().insert_historic(remote.clone()).unwrap();
+            map.batch_update().insert_historic(remote.clone());
            assert_eq!(count_layer_in(&map, &remote), expected_in_counts);

            let replaced = map
--- a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs
+++ b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs
@@ -417,14 +417,6 @@ impl<Value: Clone> BufferedHistoricLayerCoverage<Value> {
        }
    }

-    pub fn contains(&self, layer_key: &LayerKey) -> bool {
-        match self.buffer.get(layer_key) {
-            Some(None) => false,                         // layer remove was buffered
-            Some(_) => true,                             // layer insert was buffered
-            None => self.layers.contains_key(layer_key), // no buffered ops for this layer
-        }
-    }
-
    pub fn insert(&mut self, layer_key: LayerKey, value: Value) {
        self.buffer.insert(layer_key, Some(value));
    }
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -219,7 +219,8 @@ use utils::lsn::Lsn;

 use crate::metrics::{
    MeasureRemoteOp, RemoteOpFileKind, RemoteOpKind, RemoteTimelineClientMetrics,
-    REMOTE_ONDEMAND_DOWNLOADED_BYTES, REMOTE_ONDEMAND_DOWNLOADED_LAYERS,
+    RemoteTimelineClientMetricsCallTrackSize, REMOTE_ONDEMAND_DOWNLOADED_BYTES,
+    REMOTE_ONDEMAND_DOWNLOADED_LAYERS,
 };
 use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
 use crate::{
@@ -367,9 +368,13 @@ impl RemoteTimelineClient {

    /// Download index file
    pub async fn download_index_file(&self) -> Result<IndexPart, DownloadError> {
-        let _unfinished_gauge_guard = self
-            .metrics
-            .call_begin(&RemoteOpFileKind::Index, &RemoteOpKind::Download);
+        let _unfinished_gauge_guard = self.metrics.call_begin(
+            &RemoteOpFileKind::Index,
+            &RemoteOpKind::Download,
+            crate::metrics::RemoteTimelineClientMetricsCallTrackSize::DontTrackSize {
+                reason: "no need for a downloads gauge",
+            },
+        );

        download::download_index_part(
            self.conf,
@@ -398,9 +403,13 @@ impl RemoteTimelineClient {
        layer_metadata: &LayerFileMetadata,
    ) -> anyhow::Result<u64> {
        let downloaded_size = {
-            let _unfinished_gauge_guard = self
-                .metrics
-                .call_begin(&RemoteOpFileKind::Layer, &RemoteOpKind::Download);
+            let _unfinished_gauge_guard = self.metrics.call_begin(
+                &RemoteOpFileKind::Layer,
+                &RemoteOpKind::Download,
+                crate::metrics::RemoteTimelineClientMetricsCallTrackSize::DontTrackSize {
+                    reason: "no need for a downloads gauge",
+                },
+            );
            download::download_layer_file(
                self.conf,
                &self.storage_impl,
@@ -886,11 +895,32 @@ impl RemoteTimelineClient {
    fn calls_unfinished_metric_impl(
        &self,
        op: &UploadOp,
-    ) -> Option<(RemoteOpFileKind, RemoteOpKind)> {
+    ) -> Option<(
+        RemoteOpFileKind,
+        RemoteOpKind,
+        RemoteTimelineClientMetricsCallTrackSize,
+    )> {
+        use RemoteTimelineClientMetricsCallTrackSize::DontTrackSize;
        let res = match op {
-            UploadOp::UploadLayer(_, _) => (RemoteOpFileKind::Layer, RemoteOpKind::Upload),
-            UploadOp::UploadMetadata(_, _) => (RemoteOpFileKind::Index, RemoteOpKind::Upload),
-            UploadOp::Delete(file_kind, _) => (*file_kind, RemoteOpKind::Delete),
+            UploadOp::UploadLayer(_, m) => (
+                RemoteOpFileKind::Layer,
+                RemoteOpKind::Upload,
+                RemoteTimelineClientMetricsCallTrackSize::Bytes(m.file_size()),
+            ),
+            UploadOp::UploadMetadata(_, _) => (
+                RemoteOpFileKind::Index,
+                RemoteOpKind::Upload,
+                DontTrackSize {
+                    reason: "metadata uploads are tiny",
+                },
+            ),
+            UploadOp::Delete(file_kind, _) => (
+                *file_kind,
+                RemoteOpKind::Delete,
+                DontTrackSize {
+                    reason: "should we track deletes? positive or negative sign?",
+                },
+            ),
            UploadOp::Barrier(_) => {
                // we do not account these
                return None;
@@ -900,20 +930,20 @@ impl RemoteTimelineClient {
    }

    fn calls_unfinished_metric_begin(&self, op: &UploadOp) {
-        let (file_kind, op_kind) = match self.calls_unfinished_metric_impl(op) {
+        let (file_kind, op_kind, track_bytes) = match self.calls_unfinished_metric_impl(op) {
            Some(x) => x,
            None => return,
        };
-        let guard = self.metrics.call_begin(&file_kind, &op_kind);
+        let guard = self.metrics.call_begin(&file_kind, &op_kind, track_bytes);
        guard.will_decrement_manually(); // in unfinished_ops_metric_end()
    }

    fn calls_unfinished_metric_end(&self, op: &UploadOp) {
-        let (file_kind, op_kind) = match self.calls_unfinished_metric_impl(op) {
+        let (file_kind, op_kind, track_bytes) = match self.calls_unfinished_metric_impl(op) {
            Some(x) => x,
            None => return,
        };
-        self.metrics.call_end(&file_kind, &op_kind);
+        self.metrics.call_end(&file_kind, &op_kind, track_bytes);
    }

    fn stop(&self) {
@@ -981,11 +1011,19 @@ impl RemoteTimelineClient {
 mod tests {
    use super::*;
    use crate::{
-        tenant::harness::{TenantHarness, TIMELINE_ID},
+        context::RequestContext,
+        tenant::{
+            harness::{TenantHarness, TIMELINE_ID},
+            Tenant,
+        },
        DEFAULT_PG_VERSION,
    };
    use remote_storage::{RemoteStorageConfig, RemoteStorageKind};
-    use std::{collections::HashSet, path::Path};
+    use std::{
+        collections::HashSet,
+        path::{Path, PathBuf},
+    };
+    use tokio::runtime::EnterGuard;
    use utils::lsn::Lsn;

    pub(super) fn dummy_contents(name: &str) -> Vec<u8> {
@@ -1034,39 +1072,80 @@ mod tests {
        assert_eq!(found, expected);
    }

+    struct TestSetup {
+        runtime: &'static tokio::runtime::Runtime,
+        entered_runtime: EnterGuard<'static>,
+        harness: TenantHarness<'static>,
+        tenant: Arc<Tenant>,
+        tenant_ctx: RequestContext,
+        remote_fs_dir: PathBuf,
+        client: Arc<RemoteTimelineClient>,
+    }
+
+    impl TestSetup {
+        fn new(test_name: &str) -> anyhow::Result<Self> {
+            // Use a current-thread runtime in the test
+            let runtime = Box::leak(Box::new(
+                tokio::runtime::Builder::new_current_thread()
+                    .enable_all()
+                    .build()?,
+            ));
+            let entered_runtime = runtime.enter();
+
+            let test_name = Box::leak(Box::new(format!("remote_timeline_client__{test_name}")));
+            let harness = TenantHarness::create(test_name)?;
+            let (tenant, ctx) = runtime.block_on(harness.load());
+            // create an empty timeline directory
+            let timeline =
+                tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?;
+            let _ = timeline.initialize(&ctx).unwrap();
+
+            let remote_fs_dir = harness.conf.workdir.join("remote_fs");
+            std::fs::create_dir_all(remote_fs_dir)?;
+            let remote_fs_dir = std::fs::canonicalize(harness.conf.workdir.join("remote_fs"))?;
+
+            let storage_config = RemoteStorageConfig {
+                max_concurrent_syncs: std::num::NonZeroUsize::new(
+                    remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS,
+                )
+                .unwrap(),
+                max_sync_errors: std::num::NonZeroU32::new(
+                    remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS,
+                )
+                .unwrap(),
+                storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()),
+            };
+
+            let storage = GenericRemoteStorage::from_config(&storage_config).unwrap();
+
+            let client = Arc::new(RemoteTimelineClient {
+                conf: harness.conf,
+                runtime,
+                tenant_id: harness.tenant_id,
+                timeline_id: TIMELINE_ID,
+                storage_impl: storage,
+                upload_queue: Mutex::new(UploadQueue::Uninitialized),
+                metrics: Arc::new(RemoteTimelineClientMetrics::new(
+                    &harness.tenant_id,
+                    &TIMELINE_ID,
+                )),
+            });
+
+            Ok(Self {
+                runtime,
+                entered_runtime,
+                harness,
+                tenant,
+                tenant_ctx: ctx,
+                remote_fs_dir,
+                client,
+            })
+        }
+    }
+
    // Test scheduling
    #[test]
    fn upload_scheduling() -> anyhow::Result<()> {
-        // Use a current-thread runtime in the test
-        let runtime = Box::leak(Box::new(
-            tokio::runtime::Builder::new_current_thread()
-                .enable_all()
-                .build()?,
-        ));
-        let _entered = runtime.enter();
-
-        let harness = TenantHarness::create("upload_scheduling")?;
-        let (tenant, ctx) = runtime.block_on(harness.load());
-        let _timeline =
-            tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?;
-        let timeline_path = harness.timeline_path(&TIMELINE_ID);
-
-        let remote_fs_dir = harness.conf.workdir.join("remote_fs");
-        std::fs::create_dir_all(remote_fs_dir)?;
-        let remote_fs_dir = std::fs::canonicalize(harness.conf.workdir.join("remote_fs"))?;
-
-        let storage_config = RemoteStorageConfig {
-            max_concurrent_syncs: std::num::NonZeroUsize::new(
-                remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS,
-            )
-            .unwrap(),
-            max_sync_errors: std::num::NonZeroU32::new(
-                remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS,
-            )
-            .unwrap(),
-            storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()),
-        };
-
        // Test outline:
        //
        // Schedule upload of a bunch of layers. Check that they are started immediately, not queued
@@ -1081,21 +1160,19 @@ mod tests {
        // Schedule another deletion. Check that it's launched immediately.
        // Schedule index upload. Check that it's queued

-        println!("workdir: {}", harness.conf.workdir.display());
-
-        let storage_impl = GenericRemoteStorage::from_config(&storage_config)?;
-        let client = Arc::new(RemoteTimelineClient {
-            conf: harness.conf,
+        let TestSetup {
            runtime,
-            tenant_id: harness.tenant_id,
-            timeline_id: TIMELINE_ID,
-            storage_impl,
-            upload_queue: Mutex::new(UploadQueue::Uninitialized),
-            metrics: Arc::new(RemoteTimelineClientMetrics::new(
-                &harness.tenant_id,
-                &TIMELINE_ID,
-            )),
-        });
+            entered_runtime: _entered_runtime,
+            harness,
+            tenant: _tenant,
+            tenant_ctx: _tenant_ctx,
+            remote_fs_dir,
+            client,
+        } = TestSetup::new("upload_scheduling").unwrap();
+
+        let timeline_path = harness.timeline_path(&TIMELINE_ID);
+
+        println!("workdir: {}", harness.conf.workdir.display());

        let remote_timeline_dir =
            remote_fs_dir.join(timeline_path.strip_prefix(&harness.conf.workdir)?);
@@ -1216,4 +1293,90 @@ mod tests {

        Ok(())
    }
+
+    #[test]
+    fn bytes_unfinished_gauge_for_layer_file_uploads() -> anyhow::Result<()> {
+        // Setup
+
+        let TestSetup {
+            runtime,
+            harness,
+            client,
+            ..
+        } = TestSetup::new("metrics")?;
+
+        let metadata = dummy_metadata(Lsn(0x10));
+        client.init_upload_queue_for_empty_remote(&metadata)?;
+
+        let timeline_path = harness.timeline_path(&TIMELINE_ID);
+
+        let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
+        let content_1 = dummy_contents("foo");
+        std::fs::write(
+            timeline_path.join(layer_file_name_1.file_name()),
+            &content_1,
+        )?;
+
+        #[derive(Debug, PartialEq)]
+        struct BytesStartedFinished {
+            started: Option<usize>,
+            finished: Option<usize>,
+        }
+        let get_bytes_started_stopped = || {
+            let started = client
+                .metrics
+                .get_bytes_started_counter_value(&RemoteOpFileKind::Layer, &RemoteOpKind::Upload)
+                .map(|v| v.try_into().unwrap());
+            let stopped = client
+                .metrics
+                .get_bytes_finished_counter_value(&RemoteOpFileKind::Layer, &RemoteOpKind::Upload)
+                .map(|v| v.try_into().unwrap());
+            BytesStartedFinished {
+                started,
+                finished: stopped,
+            }
+        };
+
+        // Test
+
+        let init = get_bytes_started_stopped();
+
+        client.schedule_layer_file_upload(
+            &layer_file_name_1,
+            &LayerFileMetadata::new(content_1.len() as u64),
+        )?;
+
+        let pre = get_bytes_started_stopped();
+
+        runtime.block_on(client.wait_completion())?;
+
+        let post = get_bytes_started_stopped();
+
+        // Validate
+
+        assert_eq!(
+            init,
+            BytesStartedFinished {
+                started: None,
+                finished: None
+            }
+        );
+        assert_eq!(
+            pre,
+            BytesStartedFinished {
+                started: Some(content_1.len()),
+                // assert that the _finished metric is created eagerly so that subtractions work on first sample
+                finished: Some(0),
+            }
+        );
+        assert_eq!(
+            post,
+            BytesStartedFinished {
+                started: Some(content_1.len()),
+                finished: Some(content_1.len())
+            }
+        );
+
+        Ok(())
+    }
 }
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -16,6 +16,7 @@ use tracing::{info, warn};

 use crate::config::PageServerConf;
 use crate::tenant::storage_layer::LayerFileName;
+use crate::tenant::timeline::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS};
 use remote_storage::{DownloadError, GenericRemoteStorage};
 use utils::crashsafe::path_with_suffix_extension;
@@ -43,6 +44,8 @@ pub async fn download_layer_file<'a>(
    layer_file_name: &'a LayerFileName,
    layer_metadata: &'a LayerFileMetadata,
 ) -> Result<u64, DownloadError> {
+    debug_assert_current_span_has_tenant_and_timeline_id();
+
    let timeline_path = conf.timeline_path(&timeline_id, &tenant_id);

    let local_path = timeline_path.join(layer_file_name.file_name());
@@ -154,7 +157,7 @@ pub async fn download_layer_file<'a>(
        .with_context(|| format!("Could not fsync layer file {}", local_path.display(),))
        .map_err(DownloadError::Other)?;

-    tracing::info!("download complete: {}", local_path.display());
+    tracing::debug!("download complete: {}", local_path.display());

    Ok(bytes_amount)
 }
--- a/pageserver/src/tenant/remote_timeline_client/upload.rs
+++ b/pageserver/src/tenant/remote_timeline_client/upload.rs
@@ -74,7 +74,7 @@ pub(super) async fn upload_timeline_layer<'a>(
    })?;

    storage
-        .upload(Box::new(source_file), fs_size, &storage_path, None)
+        .upload(source_file, fs_size, &storage_path, None)
        .await
        .with_context(|| {
            format!(
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -13,9 +13,9 @@ use crate::task_mgr::TaskKind;
 use crate::walrecord::NeonWalRecord;
 use anyhow::Result;
 use bytes::Bytes;
-use either::Either;
 use enum_map::EnumMap;
 use enumset::EnumSet;
+use once_cell::sync::Lazy;
 use pageserver_api::models::LayerAccessKind;
 use pageserver_api::models::{
    HistoricLayerInfo, LayerResidenceEvent, LayerResidenceEventReason, LayerResidenceStatus,
@@ -23,8 +23,10 @@ use pageserver_api::models::{
 use std::ops::Range;
 use std::path::PathBuf;
 use std::sync::{Arc, Mutex};
-use std::time::{SystemTime, UNIX_EPOCH};
+use std::time::{Duration, SystemTime, UNIX_EPOCH};
+use tracing::warn;
 use utils::history_buffer::HistoryBufferWithDropCounter;
+use utils::rate_limit::RateLimit;

 use utils::{
    id::{TenantId, TimelineId},
@@ -37,6 +39,8 @@ pub use image_layer::{ImageLayer, ImageLayerWriter};
 pub use inmemory_layer::InMemoryLayer;
 pub use remote_layer::RemoteLayer;

+use super::layer_map::BatchedUpdates;
+
 pub fn range_overlaps<T>(a: &Range<T>, b: &Range<T>) -> bool
 where
    T: PartialOrd<T>,
@@ -158,41 +162,82 @@ impl LayerAccessStatFullDetails {
 }

 impl LayerAccessStats {
-    pub(crate) fn for_loading_layer(status: LayerResidenceStatus) -> Self {
-        let new = LayerAccessStats(Mutex::new(LayerAccessStatsLocked::default()));
-        new.record_residence_event(status, LayerResidenceEventReason::LayerLoad);
-        new
+    /// Create an empty stats object.
+    ///
+    /// The caller is responsible for recording a residence event
+    /// using [`record_residence_event`] before calling `latest_activity`.
+    /// If they don't, [`latest_activity`] will return `None`.
+    pub(crate) fn empty_will_record_residence_event_later() -> Self {
+        LayerAccessStats(Mutex::default())
    }

-    pub(crate) fn for_new_layer_file() -> Self {
+    /// Create an empty stats object and record a [`LayerLoad`] event with the given residence status.
+    ///
+    /// See [`record_residence_event`] for why you need to do this while holding the layer map lock.
+    pub(crate) fn for_loading_layer<L>(
+        layer_map_lock_held_witness: &BatchedUpdates<'_, L>,
+        status: LayerResidenceStatus,
+    ) -> Self
+    where
+        L: ?Sized + Layer,
+    {
        let new = LayerAccessStats(Mutex::new(LayerAccessStatsLocked::default()));
        new.record_residence_event(
-            LayerResidenceStatus::Resident,
-            LayerResidenceEventReason::LayerCreate,
+            layer_map_lock_held_witness,
+            status,
+            LayerResidenceEventReason::LayerLoad,
        );
        new
    }

    /// Creates a clone of `self` and records `new_status` in the clone.
-    /// The `new_status` is not recorded in `self`
-    pub(crate) fn clone_for_residence_change(
+    ///
+    /// The `new_status` is not recorded in `self`.
+    ///
+    /// See [`record_residence_event`] for why you need to do this while holding the layer map lock.
+    pub(crate) fn clone_for_residence_change<L>(
        &self,
+        layer_map_lock_held_witness: &BatchedUpdates<'_, L>,
        new_status: LayerResidenceStatus,
-    ) -> LayerAccessStats {
+    ) -> LayerAccessStats
+    where
+        L: ?Sized + Layer,
+    {
        let clone = {
            let inner = self.0.lock().unwrap();
            inner.clone()
        };
        let new = LayerAccessStats(Mutex::new(clone));
-        new.record_residence_event(new_status, LayerResidenceEventReason::ResidenceChange);
+        new.record_residence_event(
+            layer_map_lock_held_witness,
+            new_status,
+            LayerResidenceEventReason::ResidenceChange,
+        );
        new
    }

-    fn record_residence_event(
+    /// Record a change in layer residency.
+    ///
+    /// Recording the event must happen while holding the layer map lock to
+    /// ensure that latest-activity-threshold-based layer eviction (eviction_task.rs)
+    /// can do an "imitate access" to this layer, before it observes `now-latest_activity() > threshold`.
+    ///
+    /// If we instead recorded the residence event with a timestamp from before grabbing the layer map lock,
+    /// the following race could happen:
+    ///
+    /// - Compact: Write out an L1 layer from several L0 layers. This records residence event LayerCreate with the current timestamp.
+    /// - Eviction: imitate access logical size calculation. This accesses the L0 layers because the L1 layer is not yet in the layer map.
+    /// - Compact: Grab layer map lock, add the new L1 to layer map and remove the L0s, release layer map lock.
+    /// - Eviction: observes the new L1 layer whose only activity timestamp is the LayerCreate event.
+    ///
+    pub(crate) fn record_residence_event<L>(
        &self,
+        _layer_map_lock_held_witness: &BatchedUpdates<'_, L>,
        status: LayerResidenceStatus,
        reason: LayerResidenceEventReason,
-    ) {
+    ) where
+        L: ?Sized + Layer,
+    {
        let mut locked = self.0.lock().unwrap();
        locked.iter_mut().for_each(|inner| {
            inner
@@ -255,24 +300,37 @@ impl LayerAccessStats {
        ret
    }

-    fn most_recent_access_or_residence_event(
-        &self,
-    ) -> Either<LayerAccessStatFullDetails, LayerResidenceEvent> {
+    /// Get the latest access timestamp, falling back to latest residence event.
+    ///
+    /// This function can only return `None` if there has not yet been a call to the
+    /// [`record_residence_event`] method. That would generally be considered an
+    /// implementation error. This function logs a rate-limited warning in that case.
+    ///
+    /// TODO: use type system to avoid the need for `fallback`.
+    /// The approach in https://github.com/neondatabase/neon/pull/3775
+    /// could be used to enforce that a residence event is recorded
+    /// before a layer is added to the layer map. We could also have
+    /// a layer wrapper type that holds the LayerAccessStats, and ensure
+    /// that that type can only be produced by inserting into the layer map.
+    pub(crate) fn latest_activity(&self) -> Option<SystemTime> {
        let locked = self.0.lock().unwrap();
        let inner = &locked.for_eviction_policy;
        match inner.last_accesses.recent() {
-            Some(a) => Either::Left(*a),
+            Some(a) => Some(a.when),
            None => match inner.last_residence_changes.recent() {
-                Some(e) => Either::Right(e.clone()),
-                None => unreachable!("constructors for LayerAccessStats ensure that there's always a residence change event"),
-            }
-        }
-    }
-
-    pub(crate) fn latest_activity(&self) -> SystemTime {
-        match self.most_recent_access_or_residence_event() {
-            Either::Left(mra) => mra.when,
-            Either::Right(re) => re.timestamp,
+                Some(e) => Some(e.timestamp),
+                None => {
+                    static WARN_RATE_LIMIT: Lazy<Mutex<(usize, RateLimit)>> =
+                        Lazy::new(|| Mutex::new((0, RateLimit::new(Duration::from_secs(10)))));
+                    let mut guard = WARN_RATE_LIMIT.lock().unwrap();
+                    guard.0 += 1;
+                    let occurences = guard.0;
+                    guard.1.call(move || {
+                        warn!(parent: None, occurences, "latest_activity not available, this is an implementation bug, using fallback value");
+                    });
+                    None
+                }
+            },
        }
    }
 }
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -57,7 +57,7 @@ use utils::{

 use super::{
    DeltaFileName, Layer, LayerAccessStats, LayerAccessStatsReset, LayerFileName, LayerIter,
-    LayerKeyIter, LayerResidenceStatus, PathOrConf,
+    LayerKeyIter, PathOrConf,
 };

 ///
@@ -637,7 +637,7 @@ impl DeltaLayer {
            key_range: summary.key_range,
            lsn_range: summary.lsn_range,
            file_size: metadata.len(),
-            access_stats: LayerAccessStats::for_loading_layer(LayerResidenceStatus::Resident),
+            access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
            inner: RwLock::new(DeltaLayerInner {
                loaded: false,
                file: None,
@@ -808,7 +808,7 @@ impl DeltaLayerWriterInner {
            key_range: self.key_start..key_end,
            lsn_range: self.lsn_range.clone(),
            file_size: metadata.len(),
-            access_stats: LayerAccessStats::for_new_layer_file(),
+            access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
            inner: RwLock::new(DeltaLayerInner {
                loaded: false,
                file: None,
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -53,7 +53,7 @@ use utils::{
 };

 use super::filename::{ImageFileName, LayerFileName};
-use super::{Layer, LayerAccessStatsReset, LayerIter, LayerResidenceStatus, PathOrConf};
+use super::{Layer, LayerAccessStatsReset, LayerIter, PathOrConf};

 ///
 /// Header stored in the beginning of the file
@@ -438,7 +438,7 @@ impl ImageLayer {
            key_range: summary.key_range,
            lsn: summary.lsn,
            file_size: metadata.len(),
-            access_stats: LayerAccessStats::for_loading_layer(LayerResidenceStatus::Resident),
+            access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
            inner: RwLock::new(ImageLayerInner {
                file: None,
                loaded: false,
@@ -598,7 +598,7 @@ impl ImageLayerWriterInner {
            key_range: self.key_range.clone(),
            lsn: self.lsn,
            file_size: metadata.len(),
-            access_stats: LayerAccessStats::for_new_layer_file(),
+            access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
            inner: RwLock::new(ImageLayerInner {
                loaded: false,
                file: None,
--- a/pageserver/src/tenant/storage_layer/remote_layer.rs
+++ b/pageserver/src/tenant/storage_layer/remote_layer.rs
@@ -4,6 +4,7 @@
 use crate::config::PageServerConf;
 use crate::context::RequestContext;
 use crate::repository::Key;
+use crate::tenant::layer_map::BatchedUpdates;
 use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
 use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
 use anyhow::{bail, Result};
@@ -246,11 +247,15 @@ impl RemoteLayer {
    }

    /// Create a Layer struct representing this layer, after it has been downloaded.
-    pub fn create_downloaded_layer(
+    pub fn create_downloaded_layer<L>(
        &self,
+        layer_map_lock_held_witness: &BatchedUpdates<'_, L>,
        conf: &'static PageServerConf,
        file_size: u64,
-    ) -> Arc<dyn PersistentLayer> {
+    ) -> Arc<dyn PersistentLayer>
+    where
+        L: ?Sized + Layer,
+    {
        if self.is_delta {
            let fname = DeltaFileName {
                key_range: self.key_range.clone(),
@@ -262,8 +267,10 @@ impl RemoteLayer {
                self.tenantid,
                &fname,
                file_size,
-                self.access_stats
-                    .clone_for_residence_change(LayerResidenceStatus::Resident),
+                self.access_stats.clone_for_residence_change(
+                    layer_map_lock_held_witness,
+                    LayerResidenceStatus::Resident,
+                ),
            ))
        } else {
            let fname = ImageFileName {
@@ -276,8 +283,10 @@ impl RemoteLayer {
                self.tenantid,
                &fname,
                file_size,
-                self.access_stats
-                    .clone_for_residence_change(LayerResidenceStatus::Resident),
+                self.access_stats.clone_for_residence_change(
+                    layer_map_lock_held_witness,
+                    LayerResidenceStatus::Resident,
+                ),
            ))
        }
    }
--- a/Show More
+++ b/Show More