From 8fcba150db3cdd0de74837896bf76c0317b046d5 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Wed, 14 Dec 2022 18:05:05 +0000
Subject: [PATCH 001/132] test_seqscans: temporarily disable remote test
 (#3101)

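Temporarily disable `test_seqscans` for remote projects; it takes up
too much space and time there. We can try to re-enable it after switching
to per-test projects.

Also fix the `tbl_ize` -> `tbl_size` column alias typo in the
shared_buffers size-check queries of several tests.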
---
 test_runner/performance/test_seqscans.py              | 11 ++++++-----
 test_runner/regress/test_old_request_lsn.py           |  2 +-
 test_runner/regress/test_pageserver_restart.py        |  4 ++--
 .../regress/test_walredo_not_left_behind_on_detach.py |  2 +-
 4 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/test_runner/performance/test_seqscans.py b/test_runner/performance/test_seqscans.py
index a61d64553d..bd84724405 100644
--- a/test_runner/performance/test_seqscans.py
+++ b/test_runner/performance/test_seqscans.py
@@ -22,15 +22,16 @@ from pytest_lazyfixture import lazy_fixture  # type: ignore
     ],
 )
 @pytest.mark.parametrize(
-    "env, scale",
+    "env,scale",
     [
         # Run on all envs. Use 200x larger table on remote cluster to make sure
         # it doesn't fit in shared buffers, which are larger on remote than local.
         pytest.param(lazy_fixture("neon_compare"), 1, id="neon"),
         pytest.param(lazy_fixture("vanilla_compare"), 1, id="vanilla"),
-        pytest.param(
-            lazy_fixture("remote_compare"), 200, id="remote", marks=pytest.mark.remote_cluster
-        ),
+        # Re-enable after switching to per-test projects created via API
+        # pytest.param(
+        #     lazy_fixture("remote_compare"), 200, id="remote", marks=pytest.mark.remote_cluster
+        # ),
     ],
 )
 def test_seqscans(env: PgCompare, scale: int, rows: int, iters: int, workers: int):
@@ -45,7 +46,7 @@ def test_seqscans(env: PgCompare, scale: int, rows: int, iters: int, workers: in
             # Verify that the table is larger than shared_buffers
             cur.execute(
                 """
-            select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('t') as tbl_ize
+            select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('t') as tbl_size
             from pg_settings where name = 'shared_buffers'
             """
             )
diff --git a/test_runner/regress/test_old_request_lsn.py b/test_runner/regress/test_old_request_lsn.py
index 3e387bb6cc..1e81d8ba60 100644
--- a/test_runner/regress/test_old_request_lsn.py
+++ b/test_runner/regress/test_old_request_lsn.py
@@ -45,7 +45,7 @@ def test_old_request_lsn(neon_env_builder: NeonEnvBuilder):
     # will cause GetPage requests.
     cur.execute(
         """
-        select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_ize
+        select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_size
         from pg_settings where name = 'shared_buffers'
     """
     )
diff --git a/test_runner/regress/test_pageserver_restart.py b/test_runner/regress/test_pageserver_restart.py
index e48815906b..6388e979e5 100644
--- a/test_runner/regress/test_pageserver_restart.py
+++ b/test_runner/regress/test_pageserver_restart.py
@@ -32,7 +32,7 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder):
     # Verify that the table is larger than shared_buffers
     cur.execute(
         """
-        select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_ize
+        select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_size
         from pg_settings where name = 'shared_buffers'
     """
     )
@@ -115,7 +115,7 @@ def test_pageserver_chaos(neon_env_builder: NeonEnvBuilder):
             # Verify that the table is larger than shared_buffers
             cur.execute(
                 """
-            select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_ize
+            select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_size
             from pg_settings where name = 'shared_buffers'
             """
             )
diff --git a/test_runner/regress/test_walredo_not_left_behind_on_detach.py b/test_runner/regress/test_walredo_not_left_behind_on_detach.py
index aaaa8893a5..24045e2eb7 100644
--- a/test_runner/regress/test_walredo_not_left_behind_on_detach.py
+++ b/test_runner/regress/test_walredo_not_left_behind_on_detach.py
@@ -65,7 +65,7 @@ def test_walredo_not_left_behind_on_detach(neon_env_builder: NeonEnvBuilder):
     # Verify that the table is larger than shared_buffers
     cur.execute(
         """
-        select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_ize
+        select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_size
         from pg_settings where name = 'shared_buffers'
     """
     )

From 4132ae9dfeefbdcca27f79a04779db8f7f6d164f Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Tue, 13 Dec 2022 12:23:47 -0500
Subject: [PATCH 002/132] always remove RemoteTimelineClient's metrics when
 dropping it

---
 pageserver/src/metrics.rs                  | 108 ++++++++++++++++++---
 pageserver/src/storage_sync2.rs            |  25 +++--
 test_runner/fixtures/metrics.py            |   8 ++
 test_runner/regress/test_remote_storage.py |   7 +-
 test_runner/regress/test_tenants.py        |  26 ++++-
 5 files changed, 144 insertions(+), 30 deletions(-)

diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index 454ff01f0e..2f1a98e4c5 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -201,7 +201,7 @@ pub static NUM_ONDISK_LAYERS: Lazy<IntGauge> = Lazy::new(|| {
 
 // remote storage metrics
 
-pub static REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS: Lazy<IntGaugeVec> = Lazy::new(|| {
+static REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS: Lazy<IntGaugeVec> = Lazy::new(|| {
     register_int_gauge_vec!(
         "pageserver_remote_upload_queue_unfinished_tasks",
         "Number of tasks in the upload queue that are not finished yet.",
@@ -210,14 +210,14 @@ pub static REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS: Lazy<IntGaugeVec> = Lazy::new(|
     .expect("failed to define a metric")
 });
 
-#[derive(Debug, Clone, Copy)]
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 pub enum RemoteOpKind {
     Upload,
     Download,
     Delete,
 }
 impl RemoteOpKind {
-    pub fn as_str(&self) -> &str {
+    pub fn as_str(&self) -> &'static str {
         match self {
             Self::Upload => "upload",
             Self::Download => "download",
@@ -226,13 +226,13 @@ impl RemoteOpKind {
     }
 }
 
-#[derive(Debug, Clone, Copy)]
+#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)]
 pub enum RemoteOpFileKind {
     Layer,
     Index,
 }
 impl RemoteOpFileKind {
-    pub fn as_str(&self) -> &str {
+    pub fn as_str(&self) -> &'static str {
         match self {
             Self::Layer => "layer",
             Self::Index => "index",
@@ -491,10 +491,94 @@ pub fn remove_tenant_metrics(tenant_id: &TenantId) {
 
 use futures::Future;
 use pin_project_lite::pin_project;
+use std::collections::HashMap;
 use std::pin::Pin;
+use std::sync::{Arc, Mutex};
 use std::task::{Context, Poll};
 use std::time::Instant;
 
+pub struct RemoteTimelineClientMetrics {
+    tenant_id: String,
+    timeline_id: String,
+    remote_operation_time: Mutex<HashMap<(&'static str, &'static str, &'static str), Histogram>>,
+    unfinished_tasks: Mutex<HashMap<(&'static str, &'static str), IntGauge>>,
+}
+
+impl RemoteTimelineClientMetrics {
+    pub fn new(tenant_id: &TenantId, timeline_id: &TimelineId) -> Self {
+        RemoteTimelineClientMetrics {
+            tenant_id: tenant_id.to_string(),
+            timeline_id: timeline_id.to_string(),
+            remote_operation_time: Mutex::new(HashMap::default()),
+            unfinished_tasks: Mutex::new(HashMap::default()),
+        }
+    }
+    pub fn remote_operation_time(
+        &self,
+        file_kind: &RemoteOpFileKind,
+        op_kind: &RemoteOpKind,
+        status: &'static str,
+    ) -> Histogram {
+        // XXX would be nice to have an upgradable RwLock
+        let mut guard = self.remote_operation_time.lock().unwrap();
+        let key = (file_kind.as_str(), op_kind.as_str(), status);
+        let metric = guard.entry(key).or_insert_with(move || {
+            REMOTE_OPERATION_TIME
+                .get_metric_with_label_values(&[
+                    &self.tenant_id.to_string(),
+                    &self.timeline_id.to_string(),
+                    key.0,
+                    key.1,
+                    key.2,
+                ])
+                .unwrap()
+        });
+        metric.clone()
+    }
+    pub fn unfinished_tasks(
+        &self,
+        file_kind: &RemoteOpFileKind,
+        op_kind: &RemoteOpKind,
+    ) -> IntGauge {
+        // XXX would be nice to have an upgradable RwLock
+        let mut guard = self.unfinished_tasks.lock().unwrap();
+        let key = (file_kind.as_str(), op_kind.as_str());
+        let metric = guard.entry(key).or_insert_with(move || {
+            REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS
+                .get_metric_with_label_values(&[
+                    &self.tenant_id.to_string(),
+                    &self.timeline_id.to_string(),
+                    key.0,
+                    key.1,
+                ])
+                .unwrap()
+        });
+        metric.clone()
+    }
+}
+
+impl Drop for RemoteTimelineClientMetrics {
+    fn drop(&mut self) {
+        let RemoteTimelineClientMetrics {
+            tenant_id,
+            timeline_id,
+            remote_operation_time,
+            unfinished_tasks,
+        } = self;
+        for ((a, b, c), _) in remote_operation_time.get_mut().unwrap().drain() {
+            let _ = REMOTE_OPERATION_TIME.remove_label_values(&[tenant_id, timeline_id, a, b, c]);
+        }
+        for ((a, b), _) in unfinished_tasks.get_mut().unwrap().drain() {
+            let _ = REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS.remove_label_values(&[
+                tenant_id,
+                timeline_id,
+                a,
+                b,
+            ]);
+        }
+    }
+}
+
 /// Wrapper future that measures the time spent by a remote storage operation,
 /// and records the time and success/failure as a prometheus metric.
 pub trait MeasureRemoteOp: Sized {
@@ -504,6 +588,7 @@ pub trait MeasureRemoteOp: Sized {
         timeline_id: TimelineId,
         file_kind: RemoteOpFileKind,
         op: RemoteOpKind,
+        metrics: Arc<RemoteTimelineClientMetrics>,
     ) -> MeasuredRemoteOp<Self> {
         let start = Instant::now();
         MeasuredRemoteOp {
@@ -513,6 +598,7 @@ pub trait MeasureRemoteOp: Sized {
             file_kind,
             op,
             start,
+            metrics,
         }
     }
 }
@@ -529,6 +615,7 @@ pin_project! {
         file_kind: RemoteOpFileKind,
         op: RemoteOpKind,
         start: Instant,
+        metrics: Arc<RemoteTimelineClientMetrics>,
     }
 }
 
@@ -541,15 +628,8 @@ impl<F: Future<Output = Result<O, E>>, O, E> Future for MeasuredRemoteOp<F> {
         if let Poll::Ready(ref res) = poll_result {
             let duration = this.start.elapsed();
             let status = if res.is_ok() { &"success" } else { &"failure" };
-            REMOTE_OPERATION_TIME
-                .get_metric_with_label_values(&[
-                    &this.tenant_id.to_string(),
-                    &this.timeline_id.to_string(),
-                    this.file_kind.as_str(),
-                    this.op.as_str(),
-                    status,
-                ])
-                .unwrap()
+            this.metrics
+                .remote_operation_time(this.file_kind, this.op, status)
                 .observe(duration.as_secs_f64());
         }
         poll_result
diff --git a/pageserver/src/storage_sync2.rs b/pageserver/src/storage_sync2.rs
index 7cc0eac2bf..cebec4d615 100644
--- a/pageserver/src/storage_sync2.rs
+++ b/pageserver/src/storage_sync2.rs
@@ -210,10 +210,9 @@ use utils::lsn::Lsn;
 
 use self::index::IndexPart;
 
-use crate::metrics::MeasureRemoteOp;
 use crate::metrics::RemoteOpFileKind;
 use crate::metrics::RemoteOpKind;
-use crate::metrics::REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS;
+use crate::metrics::{MeasureRemoteOp, RemoteTimelineClientMetrics};
 use crate::tenant::filename::LayerFileName;
 use crate::{
     config::PageServerConf,
@@ -256,6 +255,8 @@ pub struct RemoteTimelineClient {
 
     upload_queue: Mutex<UploadQueue>,
 
+    metrics: Arc<RemoteTimelineClientMetrics>,
+
     storage_impl: GenericRemoteStorage,
 }
 
@@ -501,6 +502,7 @@ impl RemoteTimelineClient {
             self.timeline_id,
             RemoteOpFileKind::Index,
             RemoteOpKind::Download,
+            Arc::clone(&self.metrics),
         )
         .await
     }
@@ -528,6 +530,7 @@ impl RemoteTimelineClient {
             self.timeline_id,
             RemoteOpFileKind::Layer,
             RemoteOpKind::Download,
+            Arc::clone(&self.metrics),
         )
         .await?;
 
@@ -847,6 +850,7 @@ impl RemoteTimelineClient {
                         self.timeline_id,
                         RemoteOpFileKind::Layer,
                         RemoteOpKind::Upload,
+                        Arc::clone(&self.metrics),
                     )
                     .await
                 }
@@ -863,6 +867,7 @@ impl RemoteTimelineClient {
                         self.timeline_id,
                         RemoteOpFileKind::Index,
                         RemoteOpKind::Upload,
+                        Arc::clone(&self.metrics),
                     )
                     .await
                 }
@@ -877,6 +882,7 @@ impl RemoteTimelineClient {
                             self.timeline_id,
                             *metric_file_kind,
                             RemoteOpKind::Delete,
+                            Arc::clone(&self.metrics),
                         )
                         .await
                 }
@@ -977,14 +983,8 @@ impl RemoteTimelineClient {
                 return;
             }
         };
-        REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS
-            .get_metric_with_label_values(&[
-                &self.tenant_id.to_string(),
-                &self.timeline_id.to_string(),
-                file_kind.as_str(),
-                op_kind.as_str(),
-            ])
-            .unwrap()
+        self.metrics
+            .unfinished_tasks(&file_kind, &op_kind)
             .add(delta)
     }
 
@@ -1068,6 +1068,7 @@ pub fn create_remote_timeline_client(
         timeline_id,
         storage_impl: remote_storage,
         upload_queue: Mutex::new(UploadQueue::Uninitialized),
+        metrics: Arc::new(RemoteTimelineClientMetrics::new(&tenant_id, &timeline_id)),
     })
 }
 
@@ -1180,6 +1181,10 @@ mod tests {
             timeline_id: TIMELINE_ID,
             storage_impl,
             upload_queue: Mutex::new(UploadQueue::Uninitialized),
+            metrics: Arc::new(RemoteTimelineClientMetrics::new(
+                &harness.tenant_id,
+                &TIMELINE_ID,
+            )),
         });
 
         let remote_timeline_dir =
diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py
index 86ab4425ed..17b2b71df2 100644
--- a/test_runner/fixtures/metrics.py
+++ b/test_runner/fixtures/metrics.py
@@ -39,6 +39,13 @@ def parse_metrics(text: str, name: str = "") -> Metrics:
     return metrics
 
 
+PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS: Tuple[str, ...] = (
+    "pageserver_remote_upload_queue_unfinished_tasks",
+    "pageserver_remote_operation_seconds_bucket",
+    "pageserver_remote_operation_seconds_count",
+    "pageserver_remote_operation_seconds_sum",
+)
+
 PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
     "pageserver_current_logical_size",
     "pageserver_current_physical_size",
@@ -62,4 +69,5 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
     "pageserver_wait_lsn_seconds_sum",
     "pageserver_created_persistent_files_total",
     "pageserver_written_persistent_bytes_total",
+    *PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS,
 )
diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py
index 7152bc8b6a..d8f8298fa6 100644
--- a/test_runner/regress/test_remote_storage.py
+++ b/test_runner/regress/test_remote_storage.py
@@ -384,7 +384,8 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue(
             metrics,
             re.MULTILINE,
         )
-        assert matches
+        if matches is None:
+            return None
         return int(matches[1])
 
     pg = env.postgres.create_start("main", tenant_id=tenant_id)
@@ -436,8 +437,8 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue(
 
     assert not timeline_path.exists()
 
-    # timeline deletion should kill ongoing uploads
-    assert get_queued_count(file_kind="index", op_kind="upload") == 0
+    # timeline deletion should kill ongoing uploads, so the metric will be gone
+    assert get_queued_count(file_kind="index", op_kind="upload") is None
 
     # timeline deletion should be unblocking checkpoint ops
     checkpoint_thread.join(2.0)
diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py
index 0b20afefc3..9477ae3c25 100644
--- a/test_runner/regress/test_tenants.py
+++ b/test_runner/regress/test_tenants.py
@@ -7,7 +7,11 @@ from typing import List
 
 import pytest
 from fixtures.log_helper import log
-from fixtures.metrics import PAGESERVER_PER_TENANT_METRICS, parse_metrics
+from fixtures.metrics import (
+    PAGESERVER_PER_TENANT_METRICS,
+    PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS,
+    parse_metrics,
+)
 from fixtures.neon_fixtures import (
     NeonEnv,
     NeonEnvBuilder,
@@ -157,9 +161,21 @@ def test_metrics_normal_work(neon_env_builder: NeonEnvBuilder):
         )
 
 
-def test_pageserver_metrics_removed_after_detach(neon_env_builder: NeonEnvBuilder):
+@pytest.mark.parametrize(
+    "remote_storage_kind",
+    # exercise both the code paths where remote_storage=None and remote_storage=Some(...)
+    [RemoteStorageKind.NOOP, RemoteStorageKind.MOCK_S3],
+)
+def test_pageserver_metrics_removed_after_detach(
+    neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
+):
     """Tests that when a tenant is detached, the tenant specific metrics are not left behind"""
 
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=remote_storage_kind,
+        test_name="test_pageserver_metrics_removed_after_detach",
+    )
+
     neon_env_builder.num_safekeepers = 3
 
     env = neon_env_builder.init_start()
@@ -192,7 +208,11 @@ def test_pageserver_metrics_removed_after_detach(neon_env_builder: NeonEnvBuilde
 
     for tenant in [tenant_1, tenant_2]:
         pre_detach_samples = set([x.name for x in get_ps_metric_samples_for_tenant(tenant)])
-        assert pre_detach_samples == set(PAGESERVER_PER_TENANT_METRICS)
+        expected = set(PAGESERVER_PER_TENANT_METRICS)
+        if remote_storage_kind == RemoteStorageKind.NOOP:
+            # if there's no remote storage configured, we don't expose the remote timeline client metrics
+            expected -= set(PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS)
+        assert pre_detach_samples == expected
 
         env.pageserver.http_client().tenant_detach(tenant)
 

From c04c201520c30f56700626365788bf2e28395040 Mon Sep 17 00:00:00 2001
From: Sergey Melnikov <sergey@neon.tech>
Date: Wed, 14 Dec 2022 21:28:14 +0100
Subject: [PATCH 003/132] Push proxy metrics to Victoria Metrics (#3106)

---
 .../dev-eu-west-1-zeta.neon-proxy-scram.yaml  | 25 +++++++++++++++++++
 .../dev-us-east-2-beta.neon-proxy-link.yaml   | 25 +++++++++++++++++++
 ...s-east-2-beta.neon-proxy-scram-legacy.yaml | 25 +++++++++++++++++++
 .../dev-us-east-2-beta.neon-proxy-scram.yaml  | 25 +++++++++++++++++++
 .../helm-values/neon-stress.proxy-scram.yaml  | 25 +++++++++++++++++++
 .github/helm-values/neon-stress.proxy.yaml    | 25 +++++++++++++++++++
 ...-southeast-1-epsilon.neon-proxy-scram.yaml | 25 +++++++++++++++++++
 ...d-eu-central-1-gamma.neon-proxy-scram.yaml | 25 +++++++++++++++++++
 ...prod-us-east-2-delta.neon-proxy-scram.yaml | 25 +++++++++++++++++++
 .../prod-us-west-2-eta.neon-proxy-scram.yaml  | 25 +++++++++++++++++++
 .../helm-values/production.proxy-scram.yaml   | 25 +++++++++++++++++++
 .github/helm-values/production.proxy.yaml     | 25 +++++++++++++++++++
 .github/helm-values/staging.proxy-scram.yaml  | 25 +++++++++++++++++++
 .github/helm-values/staging.proxy.yaml        | 25 +++++++++++++++++++
 14 files changed, 350 insertions(+)

diff --git a/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml b/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml
index f89eea5972..ae9c1f2e40 100644
--- a/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml
+++ b/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml
@@ -30,3 +30,28 @@ exposedService:
 #    enabled: true
 #    selector:
 #      release: kube-prometheus-stack
+
+extraManifests:
+  - apiVersion: operator.victoriametrics.com/v1beta1
+    kind: VMServiceScrape
+    metadata:
+      name: "{{ include \"neon-proxy.fullname\" . }}"
+      labels:
+        helm.sh/chart: neon-proxy-{{ .Chart.Version }}
+        app.kubernetes.io/name: neon-proxy
+        app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}"
+        app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
+        app.kubernetes.io/managed-by: Helm
+      namespace: "{{ .Release.Namespace }}"
+    spec:
+      selector:
+        matchLabels:
+          app.kubernetes.io/name: "neon-proxy"
+      endpoints:
+        - port: http
+          path: /metrics
+          interval: 10s
+          scrapeTimeout: 10s
+      namespaceSelector:
+        matchNames:
+          - "{{ .Release.Namespace }}"
diff --git a/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml b/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml
index eeb025277b..093fac146a 100644
--- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml
+++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml
@@ -38,3 +38,28 @@ exposedService:
 #    enabled: true
 #    selector:
 #      release: kube-prometheus-stack
+
+extraManifests:
+  - apiVersion: operator.victoriametrics.com/v1beta1
+    kind: VMServiceScrape
+    metadata:
+      name: "{{ include \"neon-proxy.fullname\" . }}"
+      labels:
+        helm.sh/chart: neon-proxy-{{ .Chart.Version }}
+        app.kubernetes.io/name: neon-proxy
+        app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}"
+        app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
+        app.kubernetes.io/managed-by: Helm
+      namespace: "{{ .Release.Namespace }}"
+    spec:
+      selector:
+        matchLabels:
+          app.kubernetes.io/name: "neon-proxy"
+      endpoints:
+        - port: http
+          path: /metrics
+          interval: 10s
+          scrapeTimeout: 10s
+      namespaceSelector:
+        matchNames:
+          - "{{ .Release.Namespace }}"
diff --git a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml
index ed710bc196..a2f932e4fb 100644
--- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml
+++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml
@@ -30,3 +30,28 @@ exposedService:
 #    enabled: true
 #    selector:
 #      release: kube-prometheus-stack
+
+extraManifests:
+  - apiVersion: operator.victoriametrics.com/v1beta1
+    kind: VMServiceScrape
+    metadata:
+      name: "{{ include \"neon-proxy.fullname\" . }}"
+      labels:
+        helm.sh/chart: neon-proxy-{{ .Chart.Version }}
+        app.kubernetes.io/name: neon-proxy
+        app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}"
+        app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
+        app.kubernetes.io/managed-by: Helm
+      namespace: "{{ .Release.Namespace }}"
+    spec:
+      selector:
+        matchLabels:
+          app.kubernetes.io/name: "neon-proxy"
+      endpoints:
+        - port: http
+          path: /metrics
+          interval: 10s
+          scrapeTimeout: 10s
+      namespaceSelector:
+        matchNames:
+          - "{{ .Release.Namespace }}"
diff --git a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml
index ba0109c1eb..1138536e94 100644
--- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml
+++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml
@@ -30,3 +30,28 @@ exposedService:
 #    enabled: true
 #    selector:
 #      release: kube-prometheus-stack
+
+extraManifests:
+  - apiVersion: operator.victoriametrics.com/v1beta1
+    kind: VMServiceScrape
+    metadata:
+      name: "{{ include \"neon-proxy.fullname\" . }}"
+      labels:
+        helm.sh/chart: neon-proxy-{{ .Chart.Version }}
+        app.kubernetes.io/name: neon-proxy
+        app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}"
+        app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
+        app.kubernetes.io/managed-by: Helm
+      namespace: "{{ .Release.Namespace }}"
+    spec:
+      selector:
+        matchLabels:
+          app.kubernetes.io/name: "neon-proxy"
+      endpoints:
+        - port: http
+          path: /metrics
+          interval: 10s
+          scrapeTimeout: 10s
+      namespaceSelector:
+        matchNames:
+          - "{{ .Release.Namespace }}"
diff --git a/.github/helm-values/neon-stress.proxy-scram.yaml b/.github/helm-values/neon-stress.proxy-scram.yaml
index dea47304a0..ed580349fc 100644
--- a/.github/helm-values/neon-stress.proxy-scram.yaml
+++ b/.github/helm-values/neon-stress.proxy-scram.yaml
@@ -25,3 +25,28 @@ metrics:
     enabled: true
     selector:
       release: kube-prometheus-stack
+
+extraManifests:
+  - apiVersion: operator.victoriametrics.com/v1beta1
+    kind: VMServiceScrape
+    metadata:
+      name: "{{ include \"neon-proxy.fullname\" . }}"
+      labels:
+        helm.sh/chart: neon-proxy-{{ .Chart.Version }}
+        app.kubernetes.io/name: neon-proxy
+        app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}"
+        app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
+        app.kubernetes.io/managed-by: Helm
+      namespace: "{{ .Release.Namespace }}"
+    spec:
+      selector:
+        matchLabels:
+          app.kubernetes.io/name: "neon-proxy"
+      endpoints:
+        - port: http
+          path: /metrics
+          interval: 10s
+          scrapeTimeout: 10s
+      namespaceSelector:
+        matchNames:
+          - "{{ .Release.Namespace }}"
diff --git a/.github/helm-values/neon-stress.proxy.yaml b/.github/helm-values/neon-stress.proxy.yaml
index c3ecf6c743..94270ced09 100644
--- a/.github/helm-values/neon-stress.proxy.yaml
+++ b/.github/helm-values/neon-stress.proxy.yaml
@@ -34,3 +34,28 @@ metrics:
     enabled: true
     selector:
       release: kube-prometheus-stack
+
+extraManifests:
+  - apiVersion: operator.victoriametrics.com/v1beta1
+    kind: VMServiceScrape
+    metadata:
+      name: "{{ include \"neon-proxy.fullname\" . }}"
+      labels:
+        helm.sh/chart: neon-proxy-{{ .Chart.Version }}
+        app.kubernetes.io/name: neon-proxy
+        app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}"
+        app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
+        app.kubernetes.io/managed-by: Helm
+      namespace: "{{ .Release.Namespace }}"
+    spec:
+      selector:
+        matchLabels:
+          app.kubernetes.io/name: "neon-proxy"
+      endpoints:
+        - port: http
+          path: /metrics
+          interval: 10s
+          scrapeTimeout: 10s
+      namespaceSelector:
+        matchNames:
+          - "{{ .Release.Namespace }}"
diff --git a/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml b/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml
index a37a37406c..4e4aff1f9e 100644
--- a/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml
+++ b/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml
@@ -30,3 +30,28 @@ exposedService:
 #    enabled: true
 #    selector:
 #      release: kube-prometheus-stack
+
+extraManifests:
+  - apiVersion: operator.victoriametrics.com/v1beta1
+    kind: VMServiceScrape
+    metadata:
+      name: "{{ include \"neon-proxy.fullname\" . }}"
+      labels:
+        helm.sh/chart: neon-proxy-{{ .Chart.Version }}
+        app.kubernetes.io/name: neon-proxy
+        app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}"
+        app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
+        app.kubernetes.io/managed-by: Helm
+      namespace: "{{ .Release.Namespace }}"
+    spec:
+      selector:
+        matchLabels:
+          app.kubernetes.io/name: "neon-proxy"
+      endpoints:
+        - port: http
+          path: /metrics
+          interval: 10s
+          scrapeTimeout: 10s
+      namespaceSelector:
+        matchNames:
+          - "{{ .Release.Namespace }}"
diff --git a/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml b/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml
index 69d00a7e9c..94290a87e1 100644
--- a/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml
+++ b/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml
@@ -30,3 +30,28 @@ exposedService:
 #    enabled: true
 #    selector:
 #      release: kube-prometheus-stack
+
+extraManifests:
+  - apiVersion: operator.victoriametrics.com/v1beta1
+    kind: VMServiceScrape
+    metadata:
+      name: "{{ include \"neon-proxy.fullname\" . }}"
+      labels:
+        helm.sh/chart: neon-proxy-{{ .Chart.Version }}
+        app.kubernetes.io/name: neon-proxy
+        app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}"
+        app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
+        app.kubernetes.io/managed-by: Helm
+      namespace: "{{ .Release.Namespace }}"
+    spec:
+      selector:
+        matchLabels:
+          app.kubernetes.io/name: "neon-proxy"
+      endpoints:
+        - port: http
+          path: /metrics
+          interval: 10s
+          scrapeTimeout: 10s
+      namespaceSelector:
+        matchNames:
+          - "{{ .Release.Namespace }}"
diff --git a/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml b/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml
index 19d91fa4dc..1a4023708b 100644
--- a/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml
+++ b/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml
@@ -30,3 +30,28 @@ exposedService:
 #    enabled: true
 #    selector:
 #      release: kube-prometheus-stack
+
+extraManifests:
+  - apiVersion: operator.victoriametrics.com/v1beta1
+    kind: VMServiceScrape
+    metadata:
+      name: "{{ include \"neon-proxy.fullname\" . }}"
+      labels:
+        helm.sh/chart: neon-proxy-{{ .Chart.Version }}
+        app.kubernetes.io/name: neon-proxy
+        app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}"
+        app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
+        app.kubernetes.io/managed-by: Helm
+      namespace: "{{ .Release.Namespace }}"
+    spec:
+      selector:
+        matchLabels:
+          app.kubernetes.io/name: "neon-proxy"
+      endpoints:
+        - port: http
+          path: /metrics
+          interval: 10s
+          scrapeTimeout: 10s
+      namespaceSelector:
+        matchNames:
+          - "{{ .Release.Namespace }}"
diff --git a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml
index f148188c48..2942d6a2aa 100644
--- a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml
+++ b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml
@@ -30,3 +30,28 @@ exposedService:
 #    enabled: true
 #    selector:
 #      release: kube-prometheus-stack
+
+extraManifests:
+  - apiVersion: operator.victoriametrics.com/v1beta1
+    kind: VMServiceScrape
+    metadata:
+      name: "{{ include \"neon-proxy.fullname\" . }}"
+      labels:
+        helm.sh/chart: neon-proxy-{{ .Chart.Version }}
+        app.kubernetes.io/name: neon-proxy
+        app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}"
+        app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
+        app.kubernetes.io/managed-by: Helm
+      namespace: "{{ .Release.Namespace }}"
+    spec:
+      selector:
+        matchLabels:
+          app.kubernetes.io/name: "neon-proxy"
+      endpoints:
+        - port: http
+          path: /metrics
+          interval: 10s
+          scrapeTimeout: 10s
+      namespaceSelector:
+        matchNames:
+          - "{{ .Release.Namespace }}"
diff --git a/.github/helm-values/production.proxy-scram.yaml b/.github/helm-values/production.proxy-scram.yaml
index 399bc6d21b..c7143cd61a 100644
--- a/.github/helm-values/production.proxy-scram.yaml
+++ b/.github/helm-values/production.proxy-scram.yaml
@@ -23,3 +23,28 @@ metrics:
     enabled: true
     selector:
       release: kube-prometheus-stack
+
+extraManifests:
+  - apiVersion: operator.victoriametrics.com/v1beta1
+    kind: VMServiceScrape
+    metadata:
+      name: "{{ include \"neon-proxy.fullname\" . }}"
+      labels:
+        helm.sh/chart: neon-proxy-{{ .Chart.Version }}
+        app.kubernetes.io/name: neon-proxy
+        app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}"
+        app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
+        app.kubernetes.io/managed-by: Helm
+      namespace: "{{ .Release.Namespace }}"
+    spec:
+      selector:
+        matchLabels:
+          app.kubernetes.io/name: "neon-proxy"
+      endpoints:
+        - port: http
+          path: /metrics
+          interval: 10s
+          scrapeTimeout: 10s
+      namespaceSelector:
+        matchNames:
+          - "{{ .Release.Namespace }}"
diff --git a/.github/helm-values/production.proxy.yaml b/.github/helm-values/production.proxy.yaml
index 9db68c1044..dbaf3cd096 100644
--- a/.github/helm-values/production.proxy.yaml
+++ b/.github/helm-values/production.proxy.yaml
@@ -32,3 +32,28 @@ metrics:
     enabled: true
     selector:
       release: kube-prometheus-stack
+
+extraManifests:
+  - apiVersion: operator.victoriametrics.com/v1beta1
+    kind: VMServiceScrape
+    metadata:
+      name: "{{ include \"neon-proxy.fullname\" . }}"
+      labels:
+        helm.sh/chart: neon-proxy-{{ .Chart.Version }}
+        app.kubernetes.io/name: neon-proxy
+        app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}"
+        app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
+        app.kubernetes.io/managed-by: Helm
+      namespace: "{{ .Release.Namespace }}"
+    spec:
+      selector:
+        matchLabels:
+          app.kubernetes.io/name: "neon-proxy"
+      endpoints:
+        - port: http
+          path: /metrics
+          interval: 10s
+          scrapeTimeout: 10s
+      namespaceSelector:
+        matchNames:
+          - "{{ .Release.Namespace }}"
diff --git a/.github/helm-values/staging.proxy-scram.yaml b/.github/helm-values/staging.proxy-scram.yaml
index f249df3612..66f9921c9a 100644
--- a/.github/helm-values/staging.proxy-scram.yaml
+++ b/.github/helm-values/staging.proxy-scram.yaml
@@ -30,3 +30,28 @@ metrics:
     enabled: true
     selector:
       release: kube-prometheus-stack
+
+extraManifests:
+  - apiVersion: operator.victoriametrics.com/v1beta1
+    kind: VMServiceScrape
+    metadata:
+      name: "{{ include \"neon-proxy.fullname\" . }}"
+      labels:
+        helm.sh/chart: neon-proxy-{{ .Chart.Version }}
+        app.kubernetes.io/name: neon-proxy
+        app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}"
+        app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
+        app.kubernetes.io/managed-by: Helm
+      namespace: "{{ .Release.Namespace }}"
+    spec:
+      selector:
+        matchLabels:
+          app.kubernetes.io/name: "neon-proxy"
+      endpoints:
+        - port: http
+          path: /metrics
+          interval: 10s
+          scrapeTimeout: 10s
+      namespaceSelector:
+        matchNames:
+          - "{{ .Release.Namespace }}"
diff --git a/.github/helm-values/staging.proxy.yaml b/.github/helm-values/staging.proxy.yaml
index 62b4c4a595..a22082e625 100644
--- a/.github/helm-values/staging.proxy.yaml
+++ b/.github/helm-values/staging.proxy.yaml
@@ -30,3 +30,28 @@ metrics:
     enabled: true
     selector:
       release: kube-prometheus-stack
+
+extraManifests:
+  - apiVersion: operator.victoriametrics.com/v1beta1
+    kind: VMServiceScrape
+    metadata:
+      name: "{{ include \"neon-proxy.fullname\" . }}"
+      labels:
+        helm.sh/chart: neon-proxy-{{ .Chart.Version }}
+        app.kubernetes.io/name: neon-proxy
+        app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}"
+        app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
+        app.kubernetes.io/managed-by: Helm
+      namespace: "{{ .Release.Namespace }}"
+    spec:
+      selector:
+        matchLabels:
+          app.kubernetes.io/name: "neon-proxy"
+      endpoints:
+        - port: http
+          path: /metrics
+          interval: 10s
+          scrapeTimeout: 10s
+      namespaceSelector:
+        matchNames:
+          - "{{ .Release.Namespace }}"

From bf3ac2be2d5cdff317ddd105dd68d13523382b19 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Wed, 14 Dec 2022 08:53:54 -0500
Subject: [PATCH 004/132] add remote_physical_size metric

We do the accounting exclusively after updating remote IndexPart successfully.
This is cleaner & more robust than doing it upon completion of
individual layer file uploads / deletions since we can use .set()
instead of add()/sub().

NB: Originally, this work was intended to be part of #3013 but it
turns out that it's completely orthogonal.
So, spin it out into this PR for easier review.
Since this change is additive, it won't break anything.
---
 pageserver/src/metrics.rs             | 30 +++++++++++++++++++++++++
 pageserver/src/storage_sync2.rs       | 32 +++++++++++++++++++++++++--
 pageserver/src/storage_sync2/index.rs |  2 +-
 test_runner/fixtures/metrics.py       |  1 +
 4 files changed, 62 insertions(+), 3 deletions(-)

diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index 2f1a98e4c5..308f9cd4eb 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -96,6 +96,16 @@ static CURRENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
     .expect("failed to define a metric")
 });
 
+static REMOTE_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
+    register_uint_gauge_vec!(
+        "pageserver_remote_physical_size",
+        "The size of the layer files present in the remote storage that are listed in the the remote index_part.json.",
+        // Corollary: If any files are missing from the index part, they won't be included here.
+        &["tenant_id", "timeline_id"]
+    )
+    .expect("failed to define a metric")
+});
+
 static CURRENT_LOGICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
     register_uint_gauge_vec!(
         "pageserver_current_logical_size",
@@ -500,6 +510,7 @@ use std::time::Instant;
 pub struct RemoteTimelineClientMetrics {
     tenant_id: String,
     timeline_id: String,
+    remote_physical_size_gauge: Mutex<Option<UIntGauge>>,
     remote_operation_time: Mutex<HashMap<(&'static str, &'static str, &'static str), Histogram>>,
     unfinished_tasks: Mutex<HashMap<(&'static str, &'static str), IntGauge>>,
 }
@@ -511,8 +522,22 @@ impl RemoteTimelineClientMetrics {
             timeline_id: timeline_id.to_string(),
             remote_operation_time: Mutex::new(HashMap::default()),
             unfinished_tasks: Mutex::new(HashMap::default()),
+            remote_physical_size_gauge: Mutex::new(None),
         }
     }
+    pub fn remote_physical_size_gauge(&self) -> UIntGauge {
+        let mut guard = self.remote_physical_size_gauge.lock().unwrap();
+        guard
+            .get_or_insert_with(|| {
+                REMOTE_PHYSICAL_SIZE
+                    .get_metric_with_label_values(&[
+                        &self.tenant_id.to_string(),
+                        &self.timeline_id.to_string(),
+                    ])
+                    .unwrap()
+            })
+            .clone()
+    }
     pub fn remote_operation_time(
         &self,
         file_kind: &RemoteOpFileKind,
@@ -562,6 +587,7 @@ impl Drop for RemoteTimelineClientMetrics {
         let RemoteTimelineClientMetrics {
             tenant_id,
             timeline_id,
+            remote_physical_size_gauge,
             remote_operation_time,
             unfinished_tasks,
         } = self;
@@ -576,6 +602,10 @@ impl Drop for RemoteTimelineClientMetrics {
                 b,
             ]);
         }
+        {
+            let _ = remote_physical_size_gauge; // use to avoid 'unused' warning in desctructuring above
+            let _ = REMOTE_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
+        }
     }
 }
 
diff --git a/pageserver/src/storage_sync2.rs b/pageserver/src/storage_sync2.rs
index cebec4d615..89bbc34227 100644
--- a/pageserver/src/storage_sync2.rs
+++ b/pageserver/src/storage_sync2.rs
@@ -460,6 +460,7 @@ impl RemoteTimelineClient {
     pub fn init_upload_queue(&self, index_part: &IndexPart) -> anyhow::Result<()> {
         let mut upload_queue = self.upload_queue.lock().unwrap();
         upload_queue.initialize_with_current_remote_index_part(index_part)?;
+        self.update_remote_physical_size_gauge(Some(index_part));
         Ok(())
     }
 
@@ -471,6 +472,7 @@ impl RemoteTimelineClient {
     ) -> anyhow::Result<()> {
         let mut upload_queue = self.upload_queue.lock().unwrap();
         upload_queue.initialize_empty_remote(local_metadata)?;
+        self.update_remote_physical_size_gauge(None);
         Ok(())
     }
 
@@ -482,6 +484,20 @@ impl RemoteTimelineClient {
         }
     }
 
+    fn update_remote_physical_size_gauge(&self, current_remote_index_part: Option<&IndexPart>) {
+        let size: u64 = if let Some(current_remote_index_part) = current_remote_index_part {
+            current_remote_index_part
+                .layer_metadata
+                .iter()
+                // If we don't have the file size for the layer, don't account for it in the metric.
+                .map(|(_, ilmd)| ilmd.file_size.unwrap_or(0))
+                .sum()
+        } else {
+            0
+        };
+        self.metrics.remote_physical_size_gauge().set(size);
+    }
+
     //
     // Download operations.
     //
@@ -543,6 +559,14 @@ impl RemoteTimelineClient {
             let upload_queue = guard.initialized_mut()?;
             if let Some(upgraded) = upload_queue.latest_files.get_mut(layer_file_name) {
                 upgraded.merge(&new_metadata);
+                // If we don't do an index file upload inbetween here and restart,
+                // the value will go back down after pageserver restart, since we will
+                // have lost this data point.
+                // But, we upload index part fairly frequently, and restart pageserver rarely.
+                // So, by accounting eagerly, we present a most-of-the-time-more-accurate value sooner.
+                self.metrics
+                    .remote_physical_size_gauge()
+                    .add(downloaded_size);
             } else {
                 // The file should exist, since we just downloaded it.
                 warn!(
@@ -855,7 +879,7 @@ impl RemoteTimelineClient {
                     .await
                 }
                 UploadOp::UploadMetadata(ref index_part, _lsn) => {
-                    upload::upload_index_part(
+                    let res = upload::upload_index_part(
                         self.conf,
                         &self.storage_impl,
                         self.tenant_id,
@@ -869,7 +893,11 @@ impl RemoteTimelineClient {
                         RemoteOpKind::Upload,
                         Arc::clone(&self.metrics),
                     )
-                    .await
+                    .await;
+                    if res.is_ok() {
+                        self.update_remote_physical_size_gauge(Some(index_part));
+                    }
+                    res
                 }
                 UploadOp::Delete(metric_file_kind, ref layer_file_name) => {
                     let path = &self
diff --git a/pageserver/src/storage_sync2/index.rs b/pageserver/src/storage_sync2/index.rs
index 82487339ee..ed4ed10189 100644
--- a/pageserver/src/storage_sync2/index.rs
+++ b/pageserver/src/storage_sync2/index.rs
@@ -232,7 +232,7 @@ impl IndexPart {
 /// Serialized form of [`LayerFileMetadata`].
 #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, Default)]
 pub struct IndexLayerMetadata {
-    file_size: Option<u64>,
+    pub(super) file_size: Option<u64>,
 }
 
 impl From<&'_ LayerFileMetadata> for IndexLayerMetadata {
diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py
index 17b2b71df2..5fe6c43528 100644
--- a/test_runner/fixtures/metrics.py
+++ b/test_runner/fixtures/metrics.py
@@ -44,6 +44,7 @@ PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS: Tuple[str, ...] = (
     "pageserver_remote_operation_seconds_bucket",
     "pageserver_remote_operation_seconds_count",
     "pageserver_remote_operation_seconds_sum",
+    "pageserver_remote_physical_size",
 )
 
 PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (

From 10cd64cf8dd8b4280882ac3ba0d89182ac1b3b14 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Wed, 14 Dec 2022 19:26:06 +0100
Subject: [PATCH 005/132] make TaskHandle::next_task_event cancellation-safe

If we get cancelled before jh.await returns, we've already take()n the join
handle, so the task's result is dropped on the floor.

Fix it by setting self.join_handle = None only after the .await has completed.

fixes https://github.com/neondatabase/neon/issues/3104
---
 pageserver/src/walreceiver.rs | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/pageserver/src/walreceiver.rs b/pageserver/src/walreceiver.rs
index e627e9ecd0..74ede7c213 100644
--- a/pageserver/src/walreceiver.rs
+++ b/pageserver/src/walreceiver.rs
@@ -126,15 +126,21 @@ impl<E: Clone> TaskHandle<E> {
         match self.events_receiver.changed().await {
             Ok(()) => TaskEvent::Update((self.events_receiver.borrow()).clone()),
             Err(_task_channel_part_dropped) => {
-                TaskEvent::End(match self.join_handle.take() {
+                TaskEvent::End(match self.join_handle.as_mut() {
                     Some(jh) => {
                         if !jh.is_finished() {
                             warn!("sender is dropped while join handle is still alive");
                         }
 
-                        jh.await
+                        let res = jh
+                            .await
                             .map_err(|e| anyhow::anyhow!("Failed to join task: {e}"))
-                            .and_then(|x| x)
+                            .and_then(|x| x);
+
+                        // For cancellation-safety, drop join_handle only after successful .await.
+                        self.join_handle = None;
+
+                        res
                     }
                     None => {
                         // Another option is to have an enum, join handle or result and give away the reference to it

From 397b60feabd132cfe4401e8b4f2c1cf11c25a71c Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Tue, 6 Dec 2022 10:27:26 -0500
Subject: [PATCH 006/132] common abstraction for waiting for SK commit_lsn to
 reach PS

---
 test_runner/fixtures/neon_fixtures.py         | 31 +++++++++++++++++++
 .../test_tenants_with_remote_storage.py       | 14 +++------
 2 files changed, 35 insertions(+), 10 deletions(-)

diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 818853a4ac..3a3ee94425 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -3084,3 +3084,34 @@ def fork_at_current_lsn(
     """
     current_lsn = pg.safe_psql("SELECT pg_current_wal_lsn()")[0][0]
     return env.neon_cli.create_branch(new_branch_name, ancestor_branch_name, tenant_id, current_lsn)
+
+
+def wait_for_sk_commit_lsn_to_arrive_at_pageserver_last_record_lsn(
+    tenant_id: TenantId,
+    timeline_id: TimelineId,
+    safekeepers: List[Safekeeper],
+    pageserver: NeonPageserver,
+):
+    sk_commit_lsns = [
+        sk.http_client().timeline_status(tenant_id, timeline_id).commit_lsn for sk in safekeepers
+    ]
+    lsn = max(sk_commit_lsns)
+    ps_http = pageserver.http_client()
+    wait_for_last_record_lsn(ps_http, tenant_id, timeline_id, lsn)
+    return lsn
+
+
+def wait_for_sk_commit_lsn_to_reach_remote_storage(
+    tenant_id: TenantId,
+    timeline_id: TimelineId,
+    safekeepers: List[Safekeeper],
+    pageserver: NeonPageserver,
+):
+    lsn = wait_for_sk_commit_lsn_to_arrive_at_pageserver_last_record_lsn(
+        tenant_id, timeline_id, safekeepers, pageserver
+    )
+    ps_http = pageserver.http_client()
+    # force a checkpoint to trigger upload
+    ps_http.timeline_checkpoint(tenant_id, timeline_id)
+    wait_for_upload(ps_http, tenant_id, timeline_id, lsn)
+    return lsn
diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py
index afc413f3e3..57aaa70559 100644
--- a/test_runner/regress/test_tenants_with_remote_storage.py
+++ b/test_runner/regress/test_tenants_with_remote_storage.py
@@ -24,6 +24,7 @@ from fixtures.neon_fixtures import (
     assert_no_in_progress_downloads_for_tenant,
     available_remote_storages,
     wait_for_last_record_lsn,
+    wait_for_sk_commit_lsn_to_reach_remote_storage,
     wait_for_upload,
 )
 from fixtures.types import Lsn, TenantId, TimelineId
@@ -161,16 +162,9 @@ def test_tenants_attached_after_download(
     ##### Stop the pageserver, erase its layer file to force it being downloaded from S3
     env.postgres.stop_all()
 
-    sk_commit_lsns = [
-        sk.http_client().timeline_status(tenant_id, timeline_id).commit_lsn
-        for sk in env.safekeepers
-    ]
-    log.info("wait for pageserver to process all the WAL")
-    wait_for_last_record_lsn(client, tenant_id, timeline_id, max(sk_commit_lsns))
-    log.info("wait for it to reach remote storage")
-    pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
-    wait_for_upload(client, tenant_id, timeline_id, max(sk_commit_lsns))
-    log.info("latest safekeeper_commit_lsn reached remote storage")
+    wait_for_sk_commit_lsn_to_reach_remote_storage(
+        tenant_id, timeline_id, env.safekeepers, env.pageserver
+    )
 
     detail_before = client.timeline_detail(
         tenant_id, timeline_id, include_non_incremental_physical_size=True

From 807b110946ee603aa64b363b6041f75edd822f97 Mon Sep 17 00:00:00 2001
From: MMeent <matthias@neon.tech>
Date: Thu, 15 Dec 2022 18:06:17 +0100
Subject: [PATCH 007/132] Update Makefile configuration: (#3011)

- Use only one templated section for most postgres-versioned steps
- Clean up neon_walredo, too, when running neon-pg-ext-clean
- Depend on the various cleanup steps for `clean` instead of manually
executing those cleanup steps.
---
 Makefile | 199 +++++++++++++++++++++++--------------------------------
 1 file changed, 84 insertions(+), 115 deletions(-)

diff --git a/Makefile b/Makefile
index 4711dc1c7d..92a4532684 100644
--- a/Makefile
+++ b/Makefile
@@ -61,146 +61,115 @@ all: neon postgres neon-pg-ext
 #
 # The 'postgres_ffi' depends on the Postgres headers.
 .PHONY: neon
-neon: postgres-v14-headers postgres-v15-headers
+neon: postgres-headers
 	+@echo "Compiling Neon"
 	$(CARGO_CMD_PREFIX) cargo build $(CARGO_BUILD_FLAGS)
 
 ### PostgreSQL parts
-# The rules are duplicated for Postgres v14 and 15. We may want to refactor
+# Some rules are duplicated for Postgres v14 and 15. We may want to refactor
 # to avoid the duplication in the future, but it's tolerable for now.
 #
-$(POSTGRES_INSTALL_DIR)/build/v14/config.status:
-	+@echo "Configuring Postgres v14 build"
-	mkdir -p $(POSTGRES_INSTALL_DIR)/build/v14
-	(cd $(POSTGRES_INSTALL_DIR)/build/v14 && \
-	env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-v14/configure \
+$(POSTGRES_INSTALL_DIR)/build/%/config.status:
+	+@echo "Configuring Postgres $* build"
+	mkdir -p $(POSTGRES_INSTALL_DIR)/build/$*
+	(cd $(POSTGRES_INSTALL_DIR)/build/$* && \
+	env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-$*/configure \
 		CFLAGS='$(PG_CFLAGS)' \
 		$(PG_CONFIGURE_OPTS) \
-		--prefix=$(abspath $(POSTGRES_INSTALL_DIR))/v14 > configure.log)
-
-$(POSTGRES_INSTALL_DIR)/build/v15/config.status:
-	+@echo "Configuring Postgres v15 build"
-	mkdir -p $(POSTGRES_INSTALL_DIR)/build/v15
-	(cd $(POSTGRES_INSTALL_DIR)/build/v15 && \
-	env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-v15/configure \
-		CFLAGS='$(PG_CFLAGS)' \
-		$(PG_CONFIGURE_OPTS) \
-		--prefix=$(abspath $(POSTGRES_INSTALL_DIR))/v15 > configure.log)
+		--prefix=$(abspath $(POSTGRES_INSTALL_DIR))/$* > configure.log)
 
 # nicer alias to run 'configure'
-.PHONY: postgres-v14-configure
-postgres-v14-configure: $(POSTGRES_INSTALL_DIR)/build/v14/config.status
-
-.PHONY: postgres-v15-configure
-postgres-v15-configure: $(POSTGRES_INSTALL_DIR)/build/v15/config.status
+# Note: I've been unable to use templates for this part of our configuration.
+# I'm not sure why it wouldn't work, but this is the only place (apart from
+# the "build-all-versions" entry points) where direct mention of PostgreSQL
+# versions is used.
+.PHONY: postgres-configure-v15
+postgres-configure-v15: $(POSTGRES_INSTALL_DIR)/build/v15/config.status
+.PHONY: postgres-configure-v14
+postgres-configure-v14: $(POSTGRES_INSTALL_DIR)/build/v14/config.status
 
 # Install the PostgreSQL header files into $(POSTGRES_INSTALL_DIR)/<version>/include
-.PHONY: postgres-v14-headers
-postgres-v14-headers: postgres-v14-configure
-	+@echo "Installing PostgreSQL v14 headers"
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/src/include MAKELEVEL=0 install
-
-.PHONY: postgres-v15-headers
-postgres-v15-headers: postgres-v15-configure
-	+@echo "Installing PostgreSQL v15 headers"
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/src/include MAKELEVEL=0 install
+.PHONY: postgres-headers-%
+postgres-headers-%: postgres-configure-%
+	+@echo "Installing PostgreSQL $* headers"
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/src/include MAKELEVEL=0 install
 
 # Compile and install PostgreSQL
-.PHONY: postgres-v14
-postgres-v14: postgres-v14-configure \
-		  postgres-v14-headers # to prevent `make install` conflicts with neon's `postgres-headers`
-	+@echo "Compiling PostgreSQL v14"
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14 MAKELEVEL=0 install
-	+@echo "Compiling libpq v14"
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/src/interfaces/libpq install
-	+@echo "Compiling pg_prewarm v14"
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/contrib/pg_prewarm install
-	+@echo "Compiling pg_buffercache v14"
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/contrib/pg_buffercache install
-	+@echo "Compiling pageinspect v14"
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/contrib/pageinspect install
+.PHONY: postgres-%
+postgres-%: postgres-configure-% \
+		  postgres-headers-% # to prevent `make install` conflicts with neon's `postgres-headers`
+	+@echo "Compiling PostgreSQL $*"
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$* MAKELEVEL=0 install
+	+@echo "Compiling libpq $*"
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/src/interfaces/libpq install
+	+@echo "Compiling pg_prewarm $*"
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_prewarm install
+	+@echo "Compiling pg_buffercache $*"
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_buffercache install
+	+@echo "Compiling pageinspect $*"
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pageinspect install
 
-.PHONY: postgres-v15
-postgres-v15: postgres-v15-configure \
-		  postgres-v15-headers # to prevent `make install` conflicts with neon's `postgres-headers`
-	+@echo "Compiling PostgreSQL v15"
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15 MAKELEVEL=0 install
-	+@echo "Compiling libpq v15"
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/src/interfaces/libpq install
-	+@echo "Compiling pg_prewarm v15"
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/contrib/pg_prewarm install
-	+@echo "Compiling pg_buffercache v15"
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/contrib/pg_buffercache install
-	+@echo "Compiling pageinspect v15"
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/contrib/pageinspect install
+.PHONY: postgres-clean-%
+postgres-clean-%:
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$* MAKELEVEL=0 clean
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_buffercache clean
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pageinspect clean
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/src/interfaces/libpq clean
 
-# shorthand to build all Postgres versions
-postgres: postgres-v14 postgres-v15
+.PHONY: neon-pg-ext-%
+neon-pg-ext-%: postgres-%
+	+@echo "Compiling neon $*"
+	mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-$*
+	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
+		-C $(POSTGRES_INSTALL_DIR)/build/neon-$* \
+		-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile install
+	+@echo "Compiling neon_walredo $*"
+	mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-walredo-$*
+	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
+		-C $(POSTGRES_INSTALL_DIR)/build/neon-walredo-$* \
+		-f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile install
+	+@echo "Compiling neon_test_utils $*"
+	mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-$*
+	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
+		-C $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-$* \
+		-f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile install
 
-.PHONY: postgres-v14-clean
-postgres-v14-clean:
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14 MAKELEVEL=0 clean
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/contrib/pg_buffercache clean
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/contrib/pageinspect clean
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/src/interfaces/libpq clean
+.PHONY: neon-pg-ext-clean-%
+neon-pg-ext-clean-%:
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/pgxn/neon-$* -f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile clean
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/pgxn/neon_walredo-$* -f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile clean
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/pgxn/neon_test_utils-$* -f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile clean
 
-.PHONY: postgres-v15-clean
-postgres-v15-clean:
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15 MAKELEVEL=0 clean
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/contrib/pg_buffercache clean
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/contrib/pageinspect clean
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/src/interfaces/libpq clean
-
-neon-pg-ext-v14: postgres-v14
-	+@echo "Compiling neon v14"
-	mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-v14
-	(cd $(POSTGRES_INSTALL_DIR)/build/neon-v14 && \
-	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v14/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
-		-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile install)
-	+@echo "Compiling neon_walredo v14"
-	mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-walredo-v14
-	(cd $(POSTGRES_INSTALL_DIR)/build/neon-walredo-v14 && \
-	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v14/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
-		-f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile install)
-	+@echo "Compiling neon_test_utils" v14
-	mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v14
-	(cd $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v14 && \
-	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v14/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
-		-f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile install)
-
-neon-pg-ext-v15: postgres-v15
-	+@echo "Compiling neon v15"
-	mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-v15
-	(cd $(POSTGRES_INSTALL_DIR)/build/neon-v15 && \
-	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v15/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
-		-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile install)
-	+@echo "Compiling neon_walredo v15"
-	mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-walredo-v15
-	(cd $(POSTGRES_INSTALL_DIR)/build/neon-walredo-v15 && \
-	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v15/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
-		-f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile install)
-	+@echo "Compiling neon_test_utils" v15
-	mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v15
-	(cd $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v15 && \
-	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v15/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
-		-f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile install)
+.PHONY: neon-pg-ext
+neon-pg-ext: \
+	neon-pg-ext-v14 \
+	neon-pg-ext-v15
 
 .PHONY: neon-pg-ext-clean
-	$(MAKE) -C $(ROOT_PROJECT_DIR)/pgxn/neon clean
-	$(MAKE) -C $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils clean
+neon-pg-ext-clean: \
+	neon-pg-ext-clean-v14 \
+	neon-pg-ext-clean-v15
 
-neon-pg-ext: neon-pg-ext-v14 neon-pg-ext-v15
-postgres-headers: postgres-v14-headers postgres-v15-headers
-postgres-clean: postgres-v14-clean postgres-v15-clean
+# shorthand to build all Postgres versions
+.PHONY: postgres
+postgres: \
+	postgres-v14 \
+	postgres-v15
+
+.PHONY: postgres-headers
+postgres-headers: \
+	postgres-headers-v14 \
+	postgres-headers-v15
+
+.PHONY: postgres-clean
+postgres-clean: \
+	postgres-clean-v14 \
+	postgres-clean-v15
 
 # This doesn't remove the effects of 'configure'.
 .PHONY: clean
-clean:
-	cd $(POSTGRES_INSTALL_DIR)/build/v14 && $(MAKE) clean
-	cd $(POSTGRES_INSTALL_DIR)/build/v15 && $(MAKE) clean
+clean: postgres-clean neon-pg-ext-clean
 	$(CARGO_CMD_PREFIX) cargo clean
-	cd pgxn/neon && $(MAKE) clean
-	cd pgxn/neon_test_utils && $(MAKE) clean
 
 # This removes everything
 .PHONY: distclean

From b58f7710ff4d3c8772dee43e98a7166678706177 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Wed, 14 Dec 2022 23:05:19 +0100
Subject: [PATCH 008/132] seqwait: different error messages per variant

Would have been handy to get slightly more details in
https://github.com/neondatabase/neon/issues/3109

refs https://github.com/neondatabase/neon/issues/3109
---
 libs/utils/src/seqwait.rs | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/libs/utils/src/seqwait.rs b/libs/utils/src/seqwait.rs
index bf330a482c..e3f0b505da 100644
--- a/libs/utils/src/seqwait.rs
+++ b/libs/utils/src/seqwait.rs
@@ -11,11 +11,13 @@ use tokio::time::timeout;
 
 /// An error happened while waiting for a number
 #[derive(Debug, PartialEq, Eq, thiserror::Error)]
-#[error("SeqWaitError")]
 pub enum SeqWaitError {
     /// The wait timeout was reached
+    #[error("seqwait timeout was reached")]
     Timeout,
+
     /// [`SeqWait::shutdown`] was called
+    #[error("SeqWait::shutdown was called")]
     Shutdown,
 }
 

From 70ce01d84d155b2622d4b0857d20abdcbe7a5b87 Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Fri, 16 Dec 2022 00:42:30 +0300
Subject: [PATCH 009/132] Deploy broker with L4 LB in new env. (#3125)

This seems to fix the issue with missing keepalives.
---
 .../ansible/prod.ap-southeast-1.hosts.yaml    |  2 +-
 .github/ansible/prod.eu-central-1.hosts.yaml  |  2 +-
 .github/ansible/prod.us-east-2.hosts.yaml     |  2 +-
 .github/ansible/prod.us-west-2.hosts.yaml     |  2 +-
 .github/ansible/staging.eu-west-1.hosts.yaml  |  2 +-
 .github/ansible/staging.us-east-2.hosts.yaml  |  2 +-
 ...ev-eu-west-1-zeta.neon-storage-broker.yaml | 33 ++++++++-----------
 ...ev-us-east-2-beta.neon-storage-broker.yaml | 33 ++++++++-----------
 ...utheast-1-epsilon.neon-storage-broker.yaml | 33 ++++++++-----------
 ...u-central-1-gamma.neon-storage-broker.yaml | 33 ++++++++-----------
 ...d-us-east-2-delta.neon-storage-broker.yaml | 33 ++++++++-----------
 ...rod-us-west-2-eta.neon-storage-broker.yaml | 33 ++++++++-----------
 .github/workflows/build_and_test.yml          |  4 +--
 13 files changed, 92 insertions(+), 122 deletions(-)

diff --git a/.github/ansible/prod.ap-southeast-1.hosts.yaml b/.github/ansible/prod.ap-southeast-1.hosts.yaml
index bcc7bb3b16..648029c120 100644
--- a/.github/ansible/prod.ap-southeast-1.hosts.yaml
+++ b/.github/ansible/prod.ap-southeast-1.hosts.yaml
@@ -3,7 +3,7 @@ storage:
     bucket_name: neon-prod-storage-ap-southeast-1
     bucket_region: ap-southeast-1
     console_mgmt_base_url: http://console-release.local
-    broker_endpoint: https://storage-broker.epsilon.ap-southeast-1.internal.aws.neon.tech:443
+    broker_endpoint: http://storage-broker-lb.epsilon.ap-southeast-1.internal.aws.neon.tech:50051
     pageserver_config_stub:
       pg_distrib_dir: /usr/local
       remote_storage:
diff --git a/.github/ansible/prod.eu-central-1.hosts.yaml b/.github/ansible/prod.eu-central-1.hosts.yaml
index 2b372d0fcb..c285a9f3b6 100644
--- a/.github/ansible/prod.eu-central-1.hosts.yaml
+++ b/.github/ansible/prod.eu-central-1.hosts.yaml
@@ -3,7 +3,7 @@ storage:
     bucket_name: neon-prod-storage-eu-central-1
     bucket_region: eu-central-1
     console_mgmt_base_url: http://console-release.local
-    broker_endpoint: https://storage-broker.gamma.eu-central-1.internal.aws.neon.tech:443
+    broker_endpoint: http://storage-broker-lb.gamma.eu-central-1.internal.aws.neon.tech:50051
     pageserver_config_stub:
       pg_distrib_dir: /usr/local
       remote_storage:
diff --git a/.github/ansible/prod.us-east-2.hosts.yaml b/.github/ansible/prod.us-east-2.hosts.yaml
index 7a4002ec88..1753068b8c 100644
--- a/.github/ansible/prod.us-east-2.hosts.yaml
+++ b/.github/ansible/prod.us-east-2.hosts.yaml
@@ -3,7 +3,7 @@ storage:
     bucket_name: neon-prod-storage-us-east-2
     bucket_region: us-east-2
     console_mgmt_base_url: http://console-release.local
-    broker_endpoint: https://storage-broker.delta.us-east-2.internal.aws.neon.tech:443
+    broker_endpoint: http://storage-broker-lb.delta.us-east-2.internal.aws.neon.tech:50051
     pageserver_config_stub:
       pg_distrib_dir: /usr/local
       remote_storage:
diff --git a/.github/ansible/prod.us-west-2.hosts.yaml b/.github/ansible/prod.us-west-2.hosts.yaml
index 682ee5994d..7d6e49bf9c 100644
--- a/.github/ansible/prod.us-west-2.hosts.yaml
+++ b/.github/ansible/prod.us-west-2.hosts.yaml
@@ -3,7 +3,7 @@ storage:
     bucket_name: neon-prod-storage-us-west-2
     bucket_region: us-west-2
     console_mgmt_base_url: http://console-release.local
-    broker_endpoint: https://storage-broker.eta.us-west-2.internal.aws.neon.tech:443
+    broker_endpoint: http://storage-broker-lb.eta.us-west-2.internal.aws.neon.tech:50051
     pageserver_config_stub:
       pg_distrib_dir: /usr/local
       remote_storage:
diff --git a/.github/ansible/staging.eu-west-1.hosts.yaml b/.github/ansible/staging.eu-west-1.hosts.yaml
index 90f00175b0..cfcc3a9ae8 100644
--- a/.github/ansible/staging.eu-west-1.hosts.yaml
+++ b/.github/ansible/staging.eu-west-1.hosts.yaml
@@ -3,7 +3,7 @@ storage:
     bucket_name: neon-dev-storage-eu-west-1
     bucket_region: eu-west-1
     console_mgmt_base_url: http://console-staging.local
-    broker_endpoint: https://storage-broker.zeta.eu-west-1.internal.aws.neon.build:443
+    broker_endpoint: http://storage-broker-lb.zeta.eu-west-1.internal.aws.neon.build:50051
     pageserver_config_stub:
       pg_distrib_dir: /usr/local
       remote_storage:
diff --git a/.github/ansible/staging.us-east-2.hosts.yaml b/.github/ansible/staging.us-east-2.hosts.yaml
index d2b7fae12a..78a4582e57 100644
--- a/.github/ansible/staging.us-east-2.hosts.yaml
+++ b/.github/ansible/staging.us-east-2.hosts.yaml
@@ -3,7 +3,7 @@ storage:
     bucket_name: neon-staging-storage-us-east-2
     bucket_region: us-east-2
     console_mgmt_base_url: http://console-staging.local
-    broker_endpoint: https://storage-broker.beta.us-east-2.internal.aws.neon.build:443
+    broker_endpoint: http://storage-broker-lb.beta.us-east-2.internal.aws.neon.build:50051
     pageserver_config_stub:
       pg_distrib_dir: /usr/local
       remote_storage:
diff --git a/.github/helm-values/dev-eu-west-1-zeta.neon-storage-broker.yaml b/.github/helm-values/dev-eu-west-1-zeta.neon-storage-broker.yaml
index e876367a18..c6e682f571 100644
--- a/.github/helm-values/dev-eu-west-1-zeta.neon-storage-broker.yaml
+++ b/.github/helm-values/dev-eu-west-1-zeta.neon-storage-broker.yaml
@@ -3,27 +3,22 @@ podLabels:
   neon_env: staging
   neon_service: storage-broker
 
-ingress:
-  enabled: true
+# Use L4 LB
+service:
+  # service.annotations -- Annotations to add to the service
   annotations:
-    kubernetes.io/ingress.class: nginx-internal
-    nginx.ingress.kubernetes.io/backend-protocol: "GRPC"
-    nginx.ingress.kubernetes.io/ssl-redirect: "true"
-    nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
-    # we have basically infinite streams, disable body size limit
-    nginx.ingress.kubernetes.io/proxy-body-size: "0"
-    cert-manager.io/cluster-issuer: "cert-manager-clusterissuer"
-
-  hosts:
-    - host: storage-broker.zeta.eu-west-1.internal.aws.neon.build
-      paths:
-        - path: /
-          pathType: Prefix
-  tls:
-    - hosts:
-        - storage-broker.zeta.eu-west-1.internal.aws.neon.build
-      secretName: storage-broker-tls
+    service.beta.kubernetes.io/aws-load-balancer-type: external  # use newer AWS Load Balancer Controller
+    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
+    service.beta.kubernetes.io/aws-load-balancer-scheme: internal  # deploy LB to private subnet
+    # assign service to this name at external-dns
+    external-dns.alpha.kubernetes.io/hostname: storage-broker-lb.zeta.eu-west-1.internal.aws.neon.build
+  # service.type -- Service type
+  type: LoadBalancer
+  # service.port -- broker listen port
+  port: 50051
 
+ingress:
+  enabled: false
 
 metrics:
   enabled: false
diff --git a/.github/helm-values/dev-us-east-2-beta.neon-storage-broker.yaml b/.github/helm-values/dev-us-east-2-beta.neon-storage-broker.yaml
index dcf4b99de2..c7682d24c0 100644
--- a/.github/helm-values/dev-us-east-2-beta.neon-storage-broker.yaml
+++ b/.github/helm-values/dev-us-east-2-beta.neon-storage-broker.yaml
@@ -3,27 +3,22 @@ podLabels:
   neon_env: staging
   neon_service: storage-broker
 
-ingress:
-  enabled: true
+# Use L4 LB
+service:
+  # service.annotations -- Annotations to add to the service
   annotations:
-    kubernetes.io/ingress.class: nginx-internal
-    nginx.ingress.kubernetes.io/backend-protocol: "GRPC"
-    nginx.ingress.kubernetes.io/ssl-redirect: "true"
-    nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
-    # we have basically infinite streams, disable body size limit
-    nginx.ingress.kubernetes.io/proxy-body-size: "0"
-    cert-manager.io/cluster-issuer: "cert-manager-clusterissuer"
-
-  hosts:
-    - host: storage-broker.beta.us-east-2.internal.aws.neon.build
-      paths:
-        - path: /
-          pathType: Prefix
-  tls:
-    - hosts:
-        - storage-broker.beta.us-east-2.internal.aws.neon.build
-      secretName: storage-broker-tls
+    service.beta.kubernetes.io/aws-load-balancer-type: external  # use newer AWS Load Balancer Controller
+    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
+    service.beta.kubernetes.io/aws-load-balancer-scheme: internal  # deploy LB to private subnet
+    # assign service to this name at external-dns
+    external-dns.alpha.kubernetes.io/hostname: storage-broker-lb.beta.us-east-2.internal.aws.neon.build
+  # service.type -- Service type
+  type: LoadBalancer
+  # service.port -- broker listen port
+  port: 50051
 
+ingress:
+  enabled: false
 
 metrics:
   enabled: false
diff --git a/.github/helm-values/prod-ap-southeast-1-epsilon.neon-storage-broker.yaml b/.github/helm-values/prod-ap-southeast-1-epsilon.neon-storage-broker.yaml
index 0abc6ebaa1..92b1777d0b 100644
--- a/.github/helm-values/prod-ap-southeast-1-epsilon.neon-storage-broker.yaml
+++ b/.github/helm-values/prod-ap-southeast-1-epsilon.neon-storage-broker.yaml
@@ -3,27 +3,22 @@ podLabels:
   neon_env: production
   neon_service: storage-broker
 
-ingress:
-  enabled: true
+# Use L4 LB
+service:
+  # service.annotations -- Annotations to add to the service
   annotations:
-    kubernetes.io/ingress.class: nginx-internal
-    nginx.ingress.kubernetes.io/backend-protocol: "GRPC"
-    nginx.ingress.kubernetes.io/ssl-redirect: "true"
-    nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
-    # we have basically infinite streams, disable body size limit
-    nginx.ingress.kubernetes.io/proxy-body-size: "0"
-    cert-manager.io/cluster-issuer: "cert-manager-clusterissuer"
-
-  hosts:
-    - host: storage-broker.epsilon.ap-southeast-1.internal.aws.neon.tech
-      paths:
-        - path: /
-          pathType: Prefix
-  tls:
-    - hosts:
-        - storage-broker.epsilon.ap-southeast-1.internal.aws.neon.tech
-      secretName: storage-broker-tls
+    service.beta.kubernetes.io/aws-load-balancer-type: external  # use newer AWS Load Balancer Controller
+    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
+    service.beta.kubernetes.io/aws-load-balancer-scheme: internal  # deploy LB to private subnet
+    # assign service to this name at external-dns
+    external-dns.alpha.kubernetes.io/hostname: storage-broker-lb.epsilon.ap-southeast-1.internal.aws.neon.tech
+  # service.type -- Service type
+  type: LoadBalancer
+  # service.port -- broker listen port
+  port: 50051
 
+ingress:
+  enabled: false
 
 metrics:
   enabled: false
diff --git a/.github/helm-values/prod-eu-central-1-gamma.neon-storage-broker.yaml b/.github/helm-values/prod-eu-central-1-gamma.neon-storage-broker.yaml
index d44a3eab5c..f89df4533a 100644
--- a/.github/helm-values/prod-eu-central-1-gamma.neon-storage-broker.yaml
+++ b/.github/helm-values/prod-eu-central-1-gamma.neon-storage-broker.yaml
@@ -3,27 +3,22 @@ podLabels:
   neon_env: production
   neon_service: storage-broker
 
-ingress:
-  enabled: true
+# Use L4 LB
+service:
+  # service.annotations -- Annotations to add to the service
   annotations:
-    kubernetes.io/ingress.class: nginx-internal
-    nginx.ingress.kubernetes.io/backend-protocol: "GRPC"
-    nginx.ingress.kubernetes.io/ssl-redirect: "true"
-    nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
-    # we have basically infinite streams, disable body size limit
-    nginx.ingress.kubernetes.io/proxy-body-size: "0"
-    cert-manager.io/cluster-issuer: "cert-manager-clusterissuer"
-
-  hosts:
-    - host: storage-broker.gamma.eu-central-1.internal.aws.neon.tech
-      paths:
-        - path: /
-          pathType: Prefix
-  tls:
-    - hosts:
-        - storage-broker.gamma.eu-central-1.internal.aws.neon.tech
-      secretName: storage-broker-tls
+    service.beta.kubernetes.io/aws-load-balancer-type: external  # use newer AWS Load Balancer Controller
+    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
+    service.beta.kubernetes.io/aws-load-balancer-scheme: internal  # deploy LB to private subnet
+    # assign service to this name at external-dns
+    external-dns.alpha.kubernetes.io/hostname: storage-broker-lb.gamma.eu-central-1.internal.aws.neon.tech
+  # service.type -- Service type
+  type: LoadBalancer
+  # service.port -- broker listen port
+  port: 50051
 
+ingress:
+  enabled: false
 
 metrics:
   enabled: false
diff --git a/.github/helm-values/prod-us-east-2-delta.neon-storage-broker.yaml b/.github/helm-values/prod-us-east-2-delta.neon-storage-broker.yaml
index b9eeff5681..8cbc1af7cf 100644
--- a/.github/helm-values/prod-us-east-2-delta.neon-storage-broker.yaml
+++ b/.github/helm-values/prod-us-east-2-delta.neon-storage-broker.yaml
@@ -3,27 +3,22 @@ podLabels:
   neon_env: production
   neon_service: storage-broker
 
-ingress:
-  enabled: true
+# Use L4 LB
+service:
+  # service.annotations -- Annotations to add to the service
   annotations:
-    kubernetes.io/ingress.class: nginx-internal
-    nginx.ingress.kubernetes.io/backend-protocol: "GRPC"
-    nginx.ingress.kubernetes.io/ssl-redirect: "true"
-    nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
-    # we have basically infinite streams, disable body size limit
-    nginx.ingress.kubernetes.io/proxy-body-size: "0"
-    cert-manager.io/cluster-issuer: "cert-manager-clusterissuer"
-
-  hosts:
-    - host: storage-broker.delta.us-east-2.internal.aws.neon.tech
-      paths:
-        - path: /
-          pathType: Prefix
-  tls:
-    - hosts:
-        - storage-broker.delta.us-east-2.internal.aws.neon.tech
-      secretName: storage-broker-tls
+    service.beta.kubernetes.io/aws-load-balancer-type: external  # use newer AWS Load Balancer Controller
+    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
+    service.beta.kubernetes.io/aws-load-balancer-scheme: internal  # deploy LB to private subnet
+    # assign service to this name at external-dns
+    external-dns.alpha.kubernetes.io/hostname: storage-broker-lb.delta.us-east-2.internal.aws.neon.tech
+  # service.type -- Service type
+  type: LoadBalancer
+  # service.port -- broker listen port
+  port: 50051
 
+ingress:
+  enabled: false
 
 metrics:
   enabled: false
diff --git a/.github/helm-values/prod-us-west-2-eta.neon-storage-broker.yaml b/.github/helm-values/prod-us-west-2-eta.neon-storage-broker.yaml
index 249f76303a..8a7488948d 100644
--- a/.github/helm-values/prod-us-west-2-eta.neon-storage-broker.yaml
+++ b/.github/helm-values/prod-us-west-2-eta.neon-storage-broker.yaml
@@ -3,27 +3,22 @@ podLabels:
   neon_env: production
   neon_service: storage-broker
 
-ingress:
-  enabled: true
+# Use L4 LB
+service:
+  # service.annotations -- Annotations to add to the service
   annotations:
-    kubernetes.io/ingress.class: nginx-internal
-    nginx.ingress.kubernetes.io/backend-protocol: "GRPC"
-    nginx.ingress.kubernetes.io/ssl-redirect: "true"
-    nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
-    # we have basically infinite streams, disable body size limit
-    nginx.ingress.kubernetes.io/proxy-body-size: "0"
-    cert-manager.io/cluster-issuer: "cert-manager-clusterissuer"
-
-  hosts:
-    - host: storage-broker.eta.us-west-2.internal.aws.neon.tech
-      paths:
-        - path: /
-          pathType: Prefix
-  tls:
-    - hosts:
-        - storage-broker.eta.us-west-2.internal.aws.neon.tech
-      secretName: storage-broker-tls
+    service.beta.kubernetes.io/aws-load-balancer-type: external  # use newer AWS Load Balancer Controller
+    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
+    service.beta.kubernetes.io/aws-load-balancer-scheme: internal  # deploy LB to private subnet
+    # assign service to this name at external-dns
+    external-dns.alpha.kubernetes.io/hostname: storage-broker-lb.eta.us-west-2.internal.aws.neon.tech
+  # service.type -- Service type
+  type: LoadBalancer
+  # service.port -- broker listen port
+  port: 50051
 
+ingress:
+  enabled: false
 
 metrics:
   enabled: false
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 7a887cbece..43b855a2b0 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -1072,7 +1072,7 @@ jobs:
 
       - name: Deploy storage-broker
         run:
-          helm upgrade neon-storage-broker neondatabase/neon-storage-broker --namespace neon-storage-broker --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${{ needs.tag.outputs.build-tag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s
+          helm upgrade neon-storage-broker-lb neondatabase/neon-storage-broker --namespace neon-storage-broker-lb --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${{ needs.tag.outputs.build-tag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s
 
   deploy-proxy-prod-new:
     runs-on: prod
@@ -1149,7 +1149,7 @@ jobs:
 
       - name: Deploy storage-broker
         run:
-          helm upgrade neon-storage-broker neondatabase/neon-storage-broker --namespace neon-storage-broker --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${{ needs.tag.outputs.build-tag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s
+          helm upgrade neon-storage-broker-lb neondatabase/neon-storage-broker --namespace neon-storage-broker-lb --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${{ needs.tag.outputs.build-tag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s
 
   promote-compatibility-data:
     runs-on: [ self-hosted, dev, x64 ]

From 6dec85b19d63fd18b7f64d65d131fae6842f39f0 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Thu, 15 Dec 2022 12:39:46 +0200
Subject: [PATCH 010/132] Redefine the timeline_gc API to not perform a forced
 compaction

Previously, the /v1/tenant/:tenant_id/timeline/:timeline_id/do_gc API
call performed a flush and compaction on the timeline before
GC. Change it not to do that, and change all the tests that used that
API to perform compaction explicitly.

The compaction happens at a slightly different point now. Previously,
the code performed the `refresh_gc_info_internal` step first, and only
then did compaction on all the timelines. I don't think that was what
was originally intended here. Presumably the idea with compaction was
to make some old layer files available for GC. But if we're going to
flush the current in-memory layer to disk, surely you would want to
include the newly-written layer in the compaction too. I guess this
didn't make any difference to the tests in practice, but in any case,
the tests now perform the flush and compaction before any of the GC
steps.

Some of the tests might not need the compaction at all, but I didn't
try hard to determine which ones do. I left it out of a few tests
that intentionally call do_gc with an invalid tenant or timeline ID,
though.
---
 pageserver/src/tenant.rs                    | 24 +++++----------------
 pageserver/src/tenant_mgr.rs                |  2 +-
 pageserver/src/tenant_tasks.rs              |  2 +-
 test_runner/fixtures/compare_fixtures.py    |  1 +
 test_runner/regress/test_branch_and_gc.py   |  2 ++
 test_runner/regress/test_branch_behind.py   |  1 +
 test_runner/regress/test_gc_aggressive.py   |  9 ++++----
 test_runner/regress/test_import.py          |  1 +
 test_runner/regress/test_old_request_lsn.py |  1 +
 test_runner/regress/test_pitr_gc.py         |  1 +
 test_runner/regress/test_tenant_detach.py   |  1 +
 test_runner/regress/test_timeline_size.py   |  1 -
 12 files changed, 20 insertions(+), 26 deletions(-)

diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 4fcb1e3ba3..0e59b43dda 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -1164,7 +1164,6 @@ impl Tenant {
         target_timeline_id: Option<TimelineId>,
         horizon: u64,
         pitr: Duration,
-        checkpoint_before_gc: bool,
     ) -> anyhow::Result<GcResult> {
         anyhow::ensure!(
             self.is_active(),
@@ -1179,7 +1178,7 @@ impl Tenant {
             let _timer = STORAGE_TIME
                 .with_label_values(&["gc", &self.tenant_id.to_string(), &timeline_str])
                 .start_timer();
-            self.gc_iteration_internal(target_timeline_id, horizon, pitr, checkpoint_before_gc)
+            self.gc_iteration_internal(target_timeline_id, horizon, pitr)
                 .await
         }
     }
@@ -1778,7 +1777,6 @@ impl Tenant {
         target_timeline_id: Option<TimelineId>,
         horizon: u64,
         pitr: Duration,
-        checkpoint_before_gc: bool,
     ) -> anyhow::Result<GcResult> {
         let mut totals: GcResult = Default::default();
         let now = Instant::now();
@@ -1805,18 +1803,6 @@ impl Tenant {
                 // made.
                 break;
             }
-
-            // If requested, force flush all in-memory layers to disk first,
-            // so that they too can be garbage collected. That's
-            // used in tests, so we want as deterministic results as possible.
-            if checkpoint_before_gc {
-                timeline.checkpoint(CheckpointConfig::Forced).await?;
-                info!(
-                    "timeline {} checkpoint_before_gc done",
-                    timeline.timeline_id
-                );
-            }
-
             let result = timeline.gc().await?;
             totals += result;
         }
@@ -2877,7 +2863,7 @@ mod tests {
         // and compaction works. But it does set the 'cutoff' point so that the cross check
         // below should fail.
         tenant
-            .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)
+            .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO)
             .await?;
 
         // try to branch at lsn 25, should fail because we already garbage collected the data
@@ -2933,7 +2919,7 @@ mod tests {
         let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?;
         make_some_layers(tline.as_ref(), Lsn(0x20)).await?;
 
-        repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?;
+        repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO)?;
         let latest_gc_cutoff_lsn = tline.get_latest_gc_cutoff_lsn();
         assert!(*latest_gc_cutoff_lsn > Lsn(0x25));
         match tline.get(*TEST_KEY, Lsn(0x25)) {
@@ -2960,7 +2946,7 @@ mod tests {
             .expect("Should have a local timeline");
         // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50
         tenant
-            .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)
+            .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO)
             .await?;
         assert!(newtline.get(*TEST_KEY, Lsn(0x25)).is_ok());
 
@@ -2985,7 +2971,7 @@ mod tests {
 
         // run gc on parent
         tenant
-            .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)
+            .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO)
             .await?;
 
         // Check that the data is still accessible on the branch.
diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs
index f4f1eba717..615dcce4a1 100644
--- a/pageserver/src/tenant_mgr.rs
+++ b/pageserver/src/tenant_mgr.rs
@@ -496,7 +496,7 @@ pub async fn immediate_gc(
         async move {
             fail::fail_point!("immediate_gc_task_pre");
             let result = tenant
-                .gc_iteration(Some(timeline_id), gc_horizon, pitr, true)
+                .gc_iteration(Some(timeline_id), gc_horizon, pitr)
                 .instrument(info_span!("manual_gc", tenant = %tenant_id, timeline = %timeline_id))
                 .await;
                 // FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it
diff --git a/pageserver/src/tenant_tasks.rs b/pageserver/src/tenant_tasks.rs
index d3aec933c2..d71f244725 100644
--- a/pageserver/src/tenant_tasks.rs
+++ b/pageserver/src/tenant_tasks.rs
@@ -127,7 +127,7 @@ async fn gc_loop(tenant_id: TenantId) {
             } else {
                 // Run gc
                 if gc_horizon > 0 {
-                    if let Err(e) = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), false).await
+                    if let Err(e) = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval()).await
                     {
                         sleep_duration = wait_duration;
                         error!("Gc failed, retrying in {:?}: {e:?}", sleep_duration);
diff --git a/test_runner/fixtures/compare_fixtures.py b/test_runner/fixtures/compare_fixtures.py
index 291f924379..530e5afaab 100644
--- a/test_runner/fixtures/compare_fixtures.py
+++ b/test_runner/fixtures/compare_fixtures.py
@@ -115,6 +115,7 @@ class NeonCompare(PgCompare):
         return self._pg_bin
 
     def flush(self):
+        self.pageserver_http_client.timeline_checkpoint(self.env.initial_tenant, self.timeline)
         self.pageserver_http_client.timeline_gc(self.env.initial_tenant, self.timeline, 0)
 
     def compact(self):
diff --git a/test_runner/regress/test_branch_and_gc.py b/test_runner/regress/test_branch_and_gc.py
index dfbf956568..cc807b7ff3 100644
--- a/test_runner/regress/test_branch_and_gc.py
+++ b/test_runner/regress/test_branch_and_gc.py
@@ -84,6 +84,7 @@ def test_branch_and_gc(neon_simple_env: NeonEnv):
 
     # Set the GC horizon so that lsn1 is inside the horizon, which means
     # we can create a new branch starting from lsn1.
+    pageserver_http_client.timeline_checkpoint(tenant, timeline_main)
     pageserver_http_client.timeline_gc(tenant, timeline_main, lsn2 - lsn1 + 1024)
 
     env.neon_cli.create_branch(
@@ -156,6 +157,7 @@ def test_branch_creation_before_gc(neon_simple_env: NeonEnv):
     # branch creation task but the individual timeline GC iteration happens *after*
     # the branch creation task.
     pageserver_http_client.configure_failpoints(("before-timeline-gc", "sleep(2000)"))
+    pageserver_http_client.timeline_checkpoint(tenant, b0)
 
     def do_gc():
         pageserver_http_client.timeline_gc(tenant, b0, 0)
diff --git a/test_runner/regress/test_branch_behind.py b/test_runner/regress/test_branch_behind.py
index a841e3ced2..d19f6a7d39 100644
--- a/test_runner/regress/test_branch_behind.py
+++ b/test_runner/regress/test_branch_behind.py
@@ -109,6 +109,7 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder):
 
     # check that we cannot create branch based on garbage collected data
     with env.pageserver.http_client() as pageserver_http:
+        pageserver_http.timeline_checkpoint(env.initial_tenant, timeline)
         gc_result = pageserver_http.timeline_gc(env.initial_tenant, timeline, 0)
         print_gc_result(gc_result)
 
diff --git a/test_runner/regress/test_gc_aggressive.py b/test_runner/regress/test_gc_aggressive.py
index 332bef225f..92855899f0 100644
--- a/test_runner/regress/test_gc_aggressive.py
+++ b/test_runner/regress/test_gc_aggressive.py
@@ -35,12 +35,13 @@ async def gc(env: NeonEnv, timeline: TimelineId):
 
     loop = asyncio.get_running_loop()
 
+    def do_gc():
+        pageserver_http.timeline_checkpoint(env.initial_tenant, timeline)
+        pageserver_http.timeline_gc(env.initial_tenant, timeline, 0)
+
     with concurrent.futures.ThreadPoolExecutor() as pool:
         while updates_performed < updates_to_perform:
-            await loop.run_in_executor(
-                pool, lambda: pageserver_http.timeline_gc(env.initial_tenant, timeline, 0)
-            )
-
+            await loop.run_in_executor(pool, do_gc)
 
 # At the same time, run UPDATEs and GC
 async def update_and_gc(env: NeonEnv, pg: Postgres, timeline: TimelineId):
diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py
index 1a99d13a0b..fb1bc4839e 100644
--- a/test_runner/regress/test_import.py
+++ b/test_runner/regress/test_import.py
@@ -306,6 +306,7 @@ def _import(
 
     # Check that gc works
     pageserver_http = env.pageserver.http_client()
+    pageserver_http.timeline_checkpoint(tenant, timeline)
     pageserver_http.timeline_gc(tenant, timeline, 0)
 
     return tar_output_file
diff --git a/test_runner/regress/test_old_request_lsn.py b/test_runner/regress/test_old_request_lsn.py
index 1e81d8ba60..9885a811e1 100644
--- a/test_runner/regress/test_old_request_lsn.py
+++ b/test_runner/regress/test_old_request_lsn.py
@@ -59,6 +59,7 @@ def test_old_request_lsn(neon_env_builder: NeonEnvBuilder):
     # Make a lot of updates on a single row, generating a lot of WAL. Trigger
     # garbage collections so that the page server will remove old page versions.
     for i in range(10):
+        pageserver_http.timeline_checkpoint(env.initial_tenant, timeline)
         gc_result = pageserver_http.timeline_gc(env.initial_tenant, timeline, 0)
         print_gc_result(gc_result)
 
diff --git a/test_runner/regress/test_pitr_gc.py b/test_runner/regress/test_pitr_gc.py
index d8b7256577..fe4fbc0927 100644
--- a/test_runner/regress/test_pitr_gc.py
+++ b/test_runner/regress/test_pitr_gc.py
@@ -52,6 +52,7 @@ def test_pitr_gc(neon_env_builder: NeonEnvBuilder):
 
     # run GC
     with env.pageserver.http_client() as pageserver_http:
+        pageserver_http.timeline_checkpoint(env.initial_tenant, timeline)
         pageserver_http.timeline_compact(env.initial_tenant, timeline)
         # perform aggressive GC. Data still should be kept because of the PITR setting.
         gc_result = pageserver_http.timeline_gc(env.initial_tenant, timeline, 0)
diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py
index 59811c565c..ce1e334bfa 100644
--- a/test_runner/regress/test_tenant_detach.py
+++ b/test_runner/regress/test_tenant_detach.py
@@ -24,6 +24,7 @@ def do_gc_target(
     """Hack to unblock main, see https://github.com/neondatabase/neon/issues/2211"""
     try:
         log.info("sending gc http request")
+        pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
         pageserver_http.timeline_gc(tenant_id, timeline_id, 0)
     except Exception as e:
         log.error("do_gc failed: %s", e)
diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py
index cef1f365cd..4b70c2ea18 100644
--- a/test_runner/regress/test_timeline_size.py
+++ b/test_runner/regress/test_timeline_size.py
@@ -326,7 +326,6 @@ def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder):
 
     wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id)
     pageserver_http.timeline_checkpoint(env.initial_tenant, new_timeline_id)
-
     pageserver_http.timeline_gc(env.initial_tenant, new_timeline_id, gc_horizon=None)
 
     assert_physical_size(env, env.initial_tenant, new_timeline_id)

From c262390214109d46be2c230f1d52948e7134f76d Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Thu, 15 Dec 2022 12:39:50 +0200
Subject: [PATCH 011/132] Don't upload index file when GC doesn't remove
 anything.

I saw an excessive number of index file upload operations in
production, even when nothing on the timeline changes. It was because
our GC schedules index file upload if the GC cutoff LSN is advanced,
even if the GC had nothing else to do. The GC cutoff LSN marches
steadily forwards, even when there is no user activity on the
timeline, when the cutoff is determined by the time-based PITR
interval setting. To dial that down, only schedule index file upload
when GC is about to actually remove something.
---
 pageserver/src/tenant/timeline.rs         | 51 +++++++-------
 test_runner/regress/test_gc_aggressive.py | 86 ++++++++++++++++++++++-
 2 files changed, 111 insertions(+), 26 deletions(-)

diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index a746fd9bf8..cc6583dcf6 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -2487,9 +2487,6 @@ impl Timeline {
             );
             write_guard.store_and_unlock(new_gc_cutoff).wait();
         }
-        // Persist the new GC cutoff value in the metadata file, before
-        // we actually remove anything.
-        self.update_metadata_file(self.disk_consistent_lsn.load(), HashMap::new())?;
 
         info!("GC starting");
 
@@ -2600,19 +2597,33 @@ impl Timeline {
             layers_to_remove.push(Arc::clone(&l));
         }
 
-        // Actually delete the layers from disk and remove them from the map.
-        // (couldn't do this in the loop above, because you cannot modify a collection
-        // while iterating it. BTreeMap::retain() would be another option)
-        let mut layer_names_to_delete = Vec::with_capacity(layers_to_remove.len());
-        for doomed_layer in layers_to_remove {
-            let path = doomed_layer.local_path();
-            self.metrics
-                .current_physical_size_gauge
-                .sub(path.metadata()?.len());
-            layer_names_to_delete.push(doomed_layer.filename());
-            doomed_layer.delete()?;
-            layers.remove_historic(doomed_layer);
-            result.layers_removed += 1;
+        if !layers_to_remove.is_empty() {
+            // Persist the new GC cutoff value in the metadata file, before
+            // we actually remove anything.
+            self.update_metadata_file(self.disk_consistent_lsn.load(), HashMap::new())?;
+
+            // Actually delete the layers from disk and remove them from the map.
+            // (couldn't do this in the loop above, because you cannot modify a collection
+            // while iterating it. BTreeMap::retain() would be another option)
+            let mut layer_names_to_delete = Vec::with_capacity(layers_to_remove.len());
+            for doomed_layer in layers_to_remove {
+                let path = doomed_layer.local_path();
+                self.metrics
+                    .current_physical_size_gauge
+                    .sub(path.metadata()?.len());
+                layer_names_to_delete.push(doomed_layer.filename());
+                doomed_layer.delete()?;
+                layers.remove_historic(doomed_layer);
+                result.layers_removed += 1;
+            }
+
+            if result.layers_removed != 0 {
+                fail_point!("after-timeline-gc-removed-layers");
+            }
+
+            if let Some(remote_client) = &self.remote_client {
+                remote_client.schedule_layer_file_deletion(&layer_names_to_delete)?;
+            }
         }
 
         info!(
@@ -2620,14 +2631,6 @@ impl Timeline {
             result.layers_removed, new_gc_cutoff
         );
 
-        if result.layers_removed != 0 {
-            fail_point!("after-timeline-gc-removed-layers");
-        }
-
-        if let Some(remote_client) = &self.remote_client {
-            remote_client.schedule_layer_file_deletion(&layer_names_to_delete)?;
-        }
-
         result.elapsed = now.elapsed()?;
         Ok(result)
     }
diff --git a/test_runner/regress/test_gc_aggressive.py b/test_runner/regress/test_gc_aggressive.py
index 92855899f0..b9d012fa36 100644
--- a/test_runner/regress/test_gc_aggressive.py
+++ b/test_runner/regress/test_gc_aggressive.py
@@ -2,9 +2,17 @@ import asyncio
 import concurrent.futures
 import random
 
+import pytest
 from fixtures.log_helper import log
-from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, Postgres
-from fixtures.types import TimelineId
+from fixtures.metrics import parse_metrics
+from fixtures.neon_fixtures import (
+    NeonEnv,
+    NeonEnvBuilder,
+    Postgres,
+    RemoteStorageKind,
+    wait_for_last_flush_lsn,
+)
+from fixtures.types import TenantId, TimelineId
 from fixtures.utils import query_scalar
 
 # Test configuration
@@ -43,6 +51,7 @@ async def gc(env: NeonEnv, timeline: TimelineId):
         while updates_performed < updates_to_perform:
             await loop.run_in_executor(pool, do_gc)
 
+
 # At the same time, run UPDATEs and GC
 async def update_and_gc(env: NeonEnv, pg: Postgres, timeline: TimelineId):
     workers = []
@@ -88,3 +97,76 @@ def test_gc_aggressive(neon_env_builder: NeonEnvBuilder):
         r = cur.fetchone()
         assert r is not None
         assert r == (num_rows, updates_to_perform)
+
+
+# Test that GC does not re-upload the index file when it has nothing to remove.
+@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
+def test_gc_index_upload(neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind):
+
+    # Disable time-based PITR; we will use LSN-based thresholds in the manual GC calls
+    neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}"
+
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=remote_storage_kind,
+        test_name="test_gc_index_upload",
+    )
+
+    env = neon_env_builder.init_start()
+    env.neon_cli.create_branch("test_gc_index_upload", "main")
+    pg = env.postgres.create_start("test_gc_index_upload")
+
+    pageserver_http = env.pageserver.http_client()
+
+    pg_conn = pg.connect()
+    cur = pg_conn.cursor()
+
+    tenant_id = TenantId(query_scalar(cur, "SHOW neon.tenant_id"))
+    timeline_id = TimelineId(query_scalar(cur, "SHOW neon.timeline_id"))
+
+    cur.execute("CREATE TABLE foo (id int, counter int, t text)")
+    cur.execute(
+        """
+        INSERT INTO foo
+        SELECT g, 0, 'long string to consume some space' || g
+        FROM generate_series(1, 100000) g
+        """
+    )
+
+    # Helper function that gets the number of given kind of remote ops from the metrics
+    def get_num_remote_ops(file_kind: str, op_kind: str) -> int:
+        ps_metrics = parse_metrics(env.pageserver.http_client().get_metrics(), "pageserver")
+        total = 0.0
+        for sample in ps_metrics.query_all(
+            name="pageserver_remote_operation_seconds_count",
+            filter={
+                "tenant_id": str(tenant_id),
+                "timeline_id": str(timeline_id),
+                "file_kind": str(file_kind),
+                "op_kind": str(op_kind),
+            },
+        ):
+            total += sample[2]
+        return int(total)
+
+    # Sanity check that the metric works
+    wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id)
+    pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
+    pageserver_http.timeline_gc(tenant_id, timeline_id, 10000)
+    before = get_num_remote_ops("index", "upload")
+    assert before > 0
+
+    # Run many cycles of GC. Then check that the number of index file
+    # uploads didn't grow much. In particular we don't want to re-upload the
+    # index file on every GC iteration, when it has no work to do.
+    #
+    # On each iteration, we use a slightly smaller GC horizon, so that the GC
+    # at least needs to check if it has work to do.
+    for i in range(100):
+        cur.execute("INSERT INTO foo VALUES (0, 0, 'foo')")
+        pageserver_http.timeline_gc(tenant_id, timeline_id, 10000 - i * 32)
+        num_index_uploads = get_num_remote_ops("index", "upload")
+        log.info(f"{num_index_uploads} index uploads after GC iteration {i}")
+
+    after = num_index_uploads
+    log.info(f"{after-before} new index uploads during test")
+    assert after - before < 5

From e14bbb889a24c06ea34314c1c864eec2c10c0d4e Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Fri, 16 Dec 2022 12:55:12 +0300
Subject: [PATCH 012/132] Enable broker client keepalives. (#3127)

Should fix stale connections.

ref https://github.com/neondatabase/neon/issues/3108
---
 pageserver/src/config.rs                 | 19 ++++++++++++
 pageserver/src/walreceiver.rs            | 11 ++++---
 safekeeper/src/bin/safekeeper.rs         |  4 +++
 safekeeper/src/broker.rs                 |  2 +-
 safekeeper/src/lib.rs                    |  2 ++
 storage_broker/benches/rps.rs            |  6 ++--
 storage_broker/src/bin/storage_broker.rs | 38 ++++++++++++++----------
 storage_broker/src/lib.rs                |  9 +++++-
 8 files changed, 66 insertions(+), 25 deletions(-)

diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index 48e9f32276..9971ddc0f7 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -137,6 +137,7 @@ pub struct PageServerConf {
 
     /// Storage broker endpoints to connect to.
     pub broker_endpoint: Uri,
+    pub broker_keepalive_interval: Duration,
 
     pub log_format: LogFormat,
 
@@ -215,6 +216,7 @@ struct PageServerConfigBuilder {
 
     profiling: BuilderValue<ProfilingConfig>,
     broker_endpoint: BuilderValue<Uri>,
+    broker_keepalive_interval: BuilderValue<Duration>,
 
     log_format: BuilderValue<LogFormat>,
 
@@ -247,6 +249,10 @@ impl Default for PageServerConfigBuilder {
             broker_endpoint: Set(storage_broker::DEFAULT_ENDPOINT
                 .parse()
                 .expect("failed to parse default broker endpoint")),
+            broker_keepalive_interval: Set(humantime::parse_duration(
+                storage_broker::DEFAULT_KEEPALIVE_INTERVAL,
+            )
+            .expect("cannot parse default keepalive interval")),
             log_format: Set(LogFormat::from_str(DEFAULT_LOG_FORMAT).unwrap()),
 
             concurrent_tenant_size_logical_size_queries: Set(ConfigurableSemaphore::default()),
@@ -310,6 +316,10 @@ impl PageServerConfigBuilder {
         self.broker_endpoint = BuilderValue::Set(broker_endpoint)
     }
 
+    pub fn broker_keepalive_interval(&mut self, broker_keepalive_interval: Duration) {
+        self.broker_keepalive_interval = BuilderValue::Set(broker_keepalive_interval)
+    }
+
     pub fn id(&mut self, node_id: NodeId) {
         self.id = BuilderValue::Set(node_id)
     }
@@ -365,6 +375,9 @@ impl PageServerConfigBuilder {
             broker_endpoint: self
                 .broker_endpoint
                 .ok_or(anyhow!("No broker endpoints provided"))?,
+            broker_keepalive_interval: self
+                .broker_keepalive_interval
+                .ok_or(anyhow!("No broker keepalive interval provided"))?,
             log_format: self.log_format.ok_or(anyhow!("missing log_format"))?,
             concurrent_tenant_size_logical_size_queries: self
                 .concurrent_tenant_size_logical_size_queries
@@ -532,6 +545,7 @@ impl PageServerConf {
                 "id" => builder.id(NodeId(parse_toml_u64(key, item)?)),
                 "profiling" => builder.profiling(parse_toml_from_str(key, item)?),
                 "broker_endpoint" => builder.broker_endpoint(parse_toml_string(key, item)?.parse().context("failed to parse broker endpoint")?),
+                "broker_keepalive_interval" => builder.broker_keepalive_interval(parse_toml_duration(key, item)?),
                 "log_format" => builder.log_format(
                     LogFormat::from_config(&parse_toml_string(key, item)?)?
                 ),
@@ -659,6 +673,7 @@ impl PageServerConf {
             profiling: ProfilingConfig::Disabled,
             default_tenant_conf: TenantConf::dummy_conf(),
             broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(),
+            broker_keepalive_interval: Duration::from_secs(5000),
             log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(),
             concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
         }
@@ -829,6 +844,9 @@ log_format = 'json'
                 profiling: ProfilingConfig::Disabled,
                 default_tenant_conf: TenantConf::default(),
                 broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(),
+                broker_keepalive_interval: humantime::parse_duration(
+                    storage_broker::DEFAULT_KEEPALIVE_INTERVAL
+                )?,
                 log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(),
                 concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
             },
@@ -872,6 +890,7 @@ log_format = 'json'
                 profiling: ProfilingConfig::Disabled,
                 default_tenant_conf: TenantConf::default(),
                 broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(),
+                broker_keepalive_interval: Duration::from_secs(5),
                 log_format: LogFormat::Json,
                 concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
             },
diff --git a/pageserver/src/walreceiver.rs b/pageserver/src/walreceiver.rs
index 74ede7c213..aaf46579a7 100644
--- a/pageserver/src/walreceiver.rs
+++ b/pageserver/src/walreceiver.rs
@@ -44,10 +44,13 @@ pub async fn init_broker_client(conf: &'static PageServerConf) -> anyhow::Result
     let broker_endpoint = conf.broker_endpoint.clone();
 
     // Note: we do not attempt connecting here (but validate endpoints sanity).
-    let broker_client = storage_broker::connect(broker_endpoint.clone()).context(format!(
-        "Failed to create broker client to {}",
-        &conf.broker_endpoint
-    ))?;
+    let broker_client =
+        storage_broker::connect(broker_endpoint.clone(), conf.broker_keepalive_interval).context(
+            format!(
+                "Failed to create broker client to {}",
+                &conf.broker_endpoint
+            ),
+        )?;
 
     if BROKER_CLIENT.set(broker_client).is_err() {
         panic!("broker already initialized");
diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs
index cab5053b5b..275253d1d4 100644
--- a/safekeeper/src/bin/safekeeper.rs
+++ b/safekeeper/src/bin/safekeeper.rs
@@ -82,6 +82,9 @@ struct Args {
     /// established; plaintext otherwise.
     #[arg(long, default_value = DEFAULT_ENDPOINT, verbatim_doc_comment)]
     broker_endpoint: Uri,
+    /// Broker keepalive interval.
+    #[arg(long, value_parser= humantime::parse_duration, default_value = storage_broker::DEFAULT_KEEPALIVE_INTERVAL)]
+    broker_keepalive_interval: Duration,
     /// Peer safekeeper is considered dead after not receiving heartbeats from
     /// it during this period passed as a human readable duration.
     #[arg(long, value_parser= humantime::parse_duration, default_value = DEFAULT_HEARTBEAT_TIMEOUT)]
@@ -142,6 +145,7 @@ fn main() -> anyhow::Result<()> {
         listen_http_addr: args.listen_http,
         no_sync: args.no_sync,
         broker_endpoint: args.broker_endpoint,
+        broker_keepalive_interval: args.broker_keepalive_interval,
         heartbeat_timeout: args.heartbeat_timeout,
         remote_storage: args.remote_storage,
         max_offloader_lag_bytes: args.max_offloader_lag,
diff --git a/safekeeper/src/broker.rs b/safekeeper/src/broker.rs
index df2dc92efe..92f35bf51f 100644
--- a/safekeeper/src/broker.rs
+++ b/safekeeper/src/broker.rs
@@ -66,7 +66,7 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> {
 
 /// Subscribe and fetch all the interesting data from the broker.
 async fn pull_loop(conf: SafeKeeperConf) -> Result<()> {
-    let mut client = storage_broker::connect(conf.broker_endpoint)?;
+    let mut client = storage_broker::connect(conf.broker_endpoint, conf.broker_keepalive_interval)?;
 
     // TODO: subscribe only to local timelines instead of all
     let request = SubscribeSafekeeperInfoRequest {
diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs
index 60a1911068..5decfe64de 100644
--- a/safekeeper/src/lib.rs
+++ b/safekeeper/src/lib.rs
@@ -51,6 +51,7 @@ pub struct SafeKeeperConf {
     pub listen_http_addr: String,
     pub no_sync: bool,
     pub broker_endpoint: Uri,
+    pub broker_keepalive_interval: Duration,
     pub heartbeat_timeout: Duration,
     pub remote_storage: Option<RemoteStorageConfig>,
     pub max_offloader_lag_bytes: u64,
@@ -83,6 +84,7 @@ impl SafeKeeperConf {
             broker_endpoint: storage_broker::DEFAULT_ENDPOINT
                 .parse()
                 .expect("failed to parse default broker endpoint"),
+            broker_keepalive_interval: Duration::from_secs(5),
             backup_runtime_threads: None,
             wal_backup_enabled: true,
             auth_validation_public_key_path: None,
diff --git a/storage_broker/benches/rps.rs b/storage_broker/benches/rps.rs
index 73141318b8..1262bd9333 100644
--- a/storage_broker/benches/rps.rs
+++ b/storage_broker/benches/rps.rs
@@ -88,7 +88,7 @@ fn tli_from_u64(i: u64) -> Vec<u8> {
 async fn subscribe(client: Option<BrokerClientChannel>, counter: Arc<AtomicU64>, i: u64) {
     let mut client = match client {
         Some(c) => c,
-        None => storage_broker::connect(DEFAULT_ENDPOINT).unwrap(),
+        None => storage_broker::connect(DEFAULT_ENDPOINT, Duration::from_secs(5)).unwrap(),
     };
 
     let key = SubscriptionKey::TenantTimelineId(ProtoTenantTimelineId {
@@ -112,7 +112,7 @@ async fn subscribe(client: Option<BrokerClientChannel>, counter: Arc<AtomicU64>,
 async fn publish(client: Option<BrokerClientChannel>, n_keys: u64) {
     let mut client = match client {
         Some(c) => c,
-        None => storage_broker::connect(DEFAULT_ENDPOINT).unwrap(),
+        None => storage_broker::connect(DEFAULT_ENDPOINT, Duration::from_secs(5)).unwrap(),
     };
     let mut counter: u64 = 0;
 
@@ -152,7 +152,7 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
     }
     let h = tokio::spawn(progress_reporter(counters.clone()));
 
-    let c = storage_broker::connect(DEFAULT_ENDPOINT).unwrap();
+    let c = storage_broker::connect(DEFAULT_ENDPOINT, Duration::from_secs(5)).unwrap();
 
     for i in 0..args.num_subs {
         let c = Some(c.clone());
diff --git a/storage_broker/src/bin/storage_broker.rs b/storage_broker/src/bin/storage_broker.rs
index fdf2637b4d..6d80e96bf1 100644
--- a/storage_broker/src/bin/storage_broker.rs
+++ b/storage_broker/src/bin/storage_broker.rs
@@ -39,7 +39,9 @@ use storage_broker::metrics::{NUM_PUBS, NUM_SUBS_ALL, NUM_SUBS_TIMELINE};
 use storage_broker::proto::broker_service_server::{BrokerService, BrokerServiceServer};
 use storage_broker::proto::subscribe_safekeeper_info_request::SubscriptionKey as ProtoSubscriptionKey;
 use storage_broker::proto::{SafekeeperTimelineInfo, SubscribeSafekeeperInfoRequest};
-use storage_broker::{parse_proto_ttid, EitherBody, DEFAULT_LISTEN_ADDR};
+use storage_broker::{
+    parse_proto_ttid, EitherBody, DEFAULT_KEEPALIVE_INTERVAL, DEFAULT_LISTEN_ADDR,
+};
 use utils::id::TenantTimelineId;
 use utils::logging::{self, LogFormat};
 use utils::project_git_version;
@@ -47,8 +49,8 @@ use utils::sentry_init::{init_sentry, release_name};
 
 project_git_version!(GIT_VERSION);
 
-const DEFAULT_CHAN_SIZE: usize = 128;
-const DEFAULT_HTTP2_KEEPALIVE_INTERVAL: &str = "5000ms";
+const DEFAULT_CHAN_SIZE: usize = 32;
+const DEFAULT_ALL_KEYS_CHAN_SIZE: usize = 16384;
 
 #[derive(Parser, Debug)]
 #[command(version = GIT_VERSION, about = "Broker for neon storage nodes communication", long_about = None)]
@@ -56,11 +58,14 @@ struct Args {
     /// Endpoint to listen on.
     #[arg(short, long, default_value = DEFAULT_LISTEN_ADDR)]
     listen_addr: SocketAddr,
-    /// Size of the queue to the subscriber.
+    /// Size of the queue to the per timeline subscriber.
     #[arg(long, default_value_t = DEFAULT_CHAN_SIZE)]
-    chan_size: usize,
+    timeline_chan_size: usize,
+    /// Size of the queue to the all keys subscriber.
+    #[arg(long, default_value_t = DEFAULT_ALL_KEYS_CHAN_SIZE)]
+    all_keys_chan_size: usize,
     /// HTTP/2 keepalive interval.
-    #[arg(long, value_parser= humantime::parse_duration, default_value = DEFAULT_HTTP2_KEEPALIVE_INTERVAL)]
+    #[arg(long, value_parser= humantime::parse_duration, default_value = DEFAULT_KEEPALIVE_INTERVAL)]
     http2_keepalive_interval: Duration,
     /// Format for logging, either 'plain' or 'json'.
     #[arg(long, default_value = "plain")]
@@ -108,7 +113,7 @@ struct SharedState {
 }
 
 impl SharedState {
-    pub fn new(chan_size: usize) -> Self {
+    pub fn new(all_keys_chan_size: usize) -> Self {
         SharedState {
             next_pub_id: 0,
             num_pubs: 0,
@@ -116,7 +121,7 @@ impl SharedState {
             num_subs_to_timelines: 0,
             chans_to_timeline_subs: HashMap::new(),
             num_subs_to_all: 0,
-            chan_to_all_subs: broadcast::channel(chan_size).0,
+            chan_to_all_subs: broadcast::channel(all_keys_chan_size).0,
         }
     }
 
@@ -139,7 +144,7 @@ impl SharedState {
     pub fn register_subscriber(
         &mut self,
         sub_key: SubscriptionKey,
-        chan_size: usize,
+        timeline_chan_size: usize,
     ) -> (SubId, broadcast::Receiver<SafekeeperTimelineInfo>) {
         let sub_id = self.next_sub_id;
         self.next_sub_id += 1;
@@ -158,7 +163,7 @@ impl SharedState {
                     self.chans_to_timeline_subs
                         .entry(ttid)
                         .or_insert(ChanToTimelineSub {
-                            chan: broadcast::channel(chan_size).0,
+                            chan: broadcast::channel(timeline_chan_size).0,
                             num_subscribers: 0,
                         });
                 chan_to_timeline_sub.num_subscribers += 1;
@@ -200,7 +205,7 @@ impl SharedState {
 #[derive(Clone)]
 struct Registry {
     shared_state: Arc<RwLock<SharedState>>,
-    chan_size: usize,
+    timeline_chan_size: usize,
 }
 
 impl Registry {
@@ -232,7 +237,7 @@ impl Registry {
         let (sub_id, sub_rx) = self
             .shared_state
             .write()
-            .register_subscriber(sub_key, self.chan_size);
+            .register_subscriber(sub_key, self.timeline_chan_size);
         info!(
             "subscription started id={}, key={:?}, addr={:?}",
             sub_id, sub_key, remote_addr
@@ -369,7 +374,8 @@ impl BrokerService for Broker {
                     Err(RecvError::Lagged(skipped_msg)) => {
                         missed_msgs += skipped_msg;
                         if let Poll::Ready(_) = futures::poll!(Box::pin(warn_interval.tick())) {
-                            warn!("dropped {} messages, channel is full", missed_msgs);
+                            warn!("subscription id={}, key={:?} addr={:?} dropped {} messages, channel is full",
+                                subscriber.id, subscriber.key, subscriber.remote_addr, missed_msgs);
                             missed_msgs = 0;
                         }
                     }
@@ -427,8 +433,8 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
     info!("version: {GIT_VERSION}");
 
     let registry = Registry {
-        shared_state: Arc::new(RwLock::new(SharedState::new(args.chan_size))),
-        chan_size: args.chan_size,
+        shared_state: Arc::new(RwLock::new(SharedState::new(args.all_keys_chan_size))),
+        timeline_chan_size: args.timeline_chan_size,
     };
     let storage_broker_impl = Broker {
         registry: registry.clone(),
@@ -522,7 +528,7 @@ mod tests {
     async fn test_registry() {
         let registry = Registry {
             shared_state: Arc::new(RwLock::new(SharedState::new(16))),
-            chan_size: 16,
+            timeline_chan_size: 16,
         };
 
         // subscribe to timeline 2
diff --git a/storage_broker/src/lib.rs b/storage_broker/src/lib.rs
index 0629caa2fb..d12a79a69f 100644
--- a/storage_broker/src/lib.rs
+++ b/storage_broker/src/lib.rs
@@ -1,6 +1,7 @@
 use hyper::body::HttpBody;
 use std::pin::Pin;
 use std::task::{Context, Poll};
+use std::time::Duration;
 use tonic::codegen::StdError;
 use tonic::transport::{ClientTlsConfig, Endpoint};
 use tonic::{transport::Channel, Code, Status};
@@ -26,6 +27,8 @@ pub use hyper::Uri;
 pub const DEFAULT_LISTEN_ADDR: &str = "127.0.0.1:50051";
 pub const DEFAULT_ENDPOINT: &str = const_format::formatcp!("http://{DEFAULT_LISTEN_ADDR}");
 
+pub const DEFAULT_KEEPALIVE_INTERVAL: &str = "5000 ms";
+
 // BrokerServiceClient charged with tonic provided Channel transport; helps to
 // avoid depending on tonic directly in user crates.
 pub type BrokerClientChannel = BrokerServiceClient<Channel>;
@@ -33,7 +36,7 @@ pub type BrokerClientChannel = BrokerServiceClient<Channel>;
 // Create connection object configured to run TLS if schema starts with https://
 // and plain text otherwise. Connection is lazy, only endpoint sanity is
 // validated here.
-pub fn connect<U>(endpoint: U) -> anyhow::Result<BrokerClientChannel>
+pub fn connect<U>(endpoint: U, keepalive_interval: Duration) -> anyhow::Result<BrokerClientChannel>
 where
     U: std::convert::TryInto<Uri>,
     U::Error: std::error::Error + Send + Sync + 'static,
@@ -46,6 +49,10 @@ where
         let tls = ClientTlsConfig::new();
         tonic_endpoint = tonic_endpoint.tls_config(tls)?;
     }
+    tonic_endpoint = tonic_endpoint
+        .http2_keep_alive_interval(keepalive_interval)
+        .keep_alive_while_idle(true);
+    //  keep_alive_timeout is 20s by default on both client and server side
     let channel = tonic_endpoint.connect_lazy();
     Ok(BrokerClientChannel::new(channel))
 }

From b688a538e3ff843f2acf1b33948aa519b5477ce4 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Fri, 16 Dec 2022 13:40:01 +0200
Subject: [PATCH 013/132] fix(remote_storage): use cached credentials (#3128)

IMDSv2 has limits, and if we query it on every s3 interaction we are
going to go over those limits. Changes the s3_bucket client
configuration to use:
- ChainCredentialsProvider to handle env variables or imds usage
- LazyCachingCredentialsProvider to actually cache any credentials

Related: https://github.com/awslabs/aws-sdk-rust/issues/629
Possibly related: https://github.com/neondatabase/neon/issues/3118
---
 libs/remote_storage/src/s3_bucket.rs | 47 +++++++++++-----------------
 1 file changed, 18 insertions(+), 29 deletions(-)

diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs
index ab1e5da6c5..740f3753d8 100644
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -4,14 +4,13 @@
 //! allowing multiple api users to independently work with the same S3 bucket, if
 //! their bucket prefixes are both specified and different.
 
-use std::env::var;
 use std::sync::Arc;
-use std::time::Duration;
 
 use anyhow::Context;
 use aws_config::{
-    environment::credentials::EnvironmentVariableCredentialsProvider, imds,
-    imds::credentials::ImdsCredentialsProvider, meta::credentials::provide_credentials_fn,
+    environment::credentials::EnvironmentVariableCredentialsProvider,
+    imds::credentials::ImdsCredentialsProvider,
+    meta::credentials::{CredentialsProviderChain, LazyCachingCredentialsProvider},
 };
 use aws_sdk_s3::{
     config::Config,
@@ -20,7 +19,6 @@ use aws_sdk_s3::{
     Client, Endpoint, Region,
 };
 use aws_smithy_http::body::SdkBody;
-use aws_types::credentials::{CredentialsError, ProvideCredentials};
 use hyper::Body;
 use tokio::{io, sync::Semaphore};
 use tokio_util::io::ReaderStream;
@@ -31,8 +29,6 @@ use crate::{
     Download, DownloadError, RemotePath, RemoteStorage, S3Config, REMOTE_STORAGE_PREFIX_SEPARATOR,
 };
 
-const DEFAULT_IMDS_TIMEOUT: Duration = Duration::from_secs(10);
-
 pub(super) mod metrics {
     use metrics::{register_int_counter_vec, IntCounterVec};
     use once_cell::sync::Lazy;
@@ -122,30 +118,23 @@ impl S3Bucket {
             "Creating s3 remote storage for S3 bucket {}",
             aws_config.bucket_name
         );
+
+        let credentials_provider = {
+            // uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"
+            let env_creds = EnvironmentVariableCredentialsProvider::new();
+            // uses imds v2
+            let imds = ImdsCredentialsProvider::builder().build();
+
+            // finally add caching.
+            // this might change in future, see https://github.com/awslabs/aws-sdk-rust/issues/629
+            LazyCachingCredentialsProvider::builder()
+                .load(CredentialsProviderChain::first_try("env", env_creds).or_else("imds", imds))
+                .build()
+        };
+
         let mut config_builder = Config::builder()
             .region(Region::new(aws_config.bucket_region.clone()))
-            .credentials_provider(provide_credentials_fn(|| async {
-                match var("AWS_ACCESS_KEY_ID").is_ok() && var("AWS_SECRET_ACCESS_KEY").is_ok() {
-                    true => {
-                        EnvironmentVariableCredentialsProvider::new()
-                            .provide_credentials()
-                            .await
-                    }
-                    false => {
-                        let imds_client = imds::Client::builder()
-                            .connect_timeout(DEFAULT_IMDS_TIMEOUT)
-                            .read_timeout(DEFAULT_IMDS_TIMEOUT)
-                            .build()
-                            .await
-                            .map_err(CredentialsError::unhandled)?;
-                        ImdsCredentialsProvider::builder()
-                            .imds_client(imds_client)
-                            .build()
-                            .provide_credentials()
-                            .await
-                    }
-                }
-            }));
+            .credentials_provider(credentials_provider);
 
         if let Some(custom_endpoint) = aws_config.endpoint.clone() {
             let endpoint = Endpoint::immutable(

From 8d39fcdf728d9929cee5416371bf10b997ca5e2a Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Fri, 16 Dec 2022 13:23:36 +0000
Subject: [PATCH 014/132] pgbench-compare: don't run neon-captest-new (#3130)

Do not run Nightly Benchmarks on `neon-captest-new`.
This is a temporary solution to avoid spikes in the storage we consume
during the test run. To collect data for the default instance, we could
run tests weekly (i.e. not daily).
---
 .github/workflows/benchmarking.yml | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml
index e3e0f1e820..07e111b67c 100644
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -18,6 +18,7 @@ on:
       region_id:
         description: 'Use a particular region. If not set the default region will be used'
         required: false
+        default: 'aws-us-east-2'
       save_perf_report:
         type: boolean
         description: 'Publish perf report or not. If not set, the report is published only for the main branch'
@@ -115,13 +116,10 @@ jobs:
         # neon-captest-prefetch: Same, with prefetching enabled (new project)
         # rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs
         # rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage
-        platform: [ neon-captest-new, neon-captest-reuse, neon-captest-prefetch, rds-postgres ]
+        platform: [ neon-captest-reuse, neon-captest-prefetch, rds-postgres ]
         db_size: [ 10gb ]
         runner: [ us-east-2 ]
         include:
-          - platform: neon-captest-new
-            db_size: 50gb
-            runner: us-east-2
           - platform: neon-captest-prefetch
             db_size: 50gb
             runner: us-east-2

From c86c0c08ef769269c4b827967525a35880d14413 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Fri, 16 Dec 2022 17:19:47 +0200
Subject: [PATCH 015/132] task_mgr: use CancellationToken instead of
 shutdown_rx (#3124)

This should help us in the future to have more freedom with spawning
tasks and cancelling things, most importantly blocking tasks (assuming
the CancellationToken::is_cancelled is performant enough).
CancellationToken allows creation of hierarchical cancellations, which
would also simplify the task_mgr shutdown operation, rendering it
unnecessary.
---
 pageserver/src/task_mgr.rs | 55 +++++++++++++++++++++-----------------
 1 file changed, 30 insertions(+), 25 deletions(-)

diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs
index 3325ce01d4..91719fb3af 100644
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -25,7 +25,6 @@
 //! the current task has been requested to shut down. You can use that with
 //! Tokio select!().
 //!
-//!
 //! TODO: This would be a good place to also handle panics in a somewhat sane way.
 //! Depending on what task panics, we might want to kill the whole server, or
 //! only a single tenant or timeline.
@@ -43,9 +42,9 @@ use std::sync::{Arc, Mutex};
 
 use futures::FutureExt;
 use tokio::runtime::Runtime;
-use tokio::sync::watch;
 use tokio::task::JoinHandle;
 use tokio::task_local;
+use tokio_util::sync::CancellationToken;
 
 use tracing::{debug, error, info, warn};
 
@@ -146,11 +145,10 @@ static TASKS: Lazy<Mutex<HashMap<u64, Arc<PageServerTask>>>> =
     Lazy::new(|| Mutex::new(HashMap::new()));
 
 task_local! {
-    // There is a Tokio watch channel for each task, which can be used to signal the
-    // task that it needs to shut down. This task local variable holds the receiving
-    // end of the channel. The sender is kept in the global registry, so that anyone
-    // can send the signal to request task shutdown.
-    static SHUTDOWN_RX: watch::Receiver<bool>;
+    // This is a cancellation token which will be cancelled when a task needs to shut down. The
+    // root token is kept in the global registry, so that anyone can send the signal to request
+    // task shutdown.
+    static SHUTDOWN_TOKEN: CancellationToken;
 
     // Each task holds reference to its own PageServerTask here.
     static CURRENT_TASK: Arc<PageServerTask>;
@@ -226,8 +224,8 @@ struct PageServerTask {
 
     name: String,
 
-    // To request task shutdown, send 'true' to the channel to notify the task.
-    shutdown_tx: watch::Sender<bool>,
+    // To request task shutdown, just cancel this token.
+    cancel: CancellationToken,
 
     mutable: Mutex<MutableTaskState>,
 }
@@ -247,13 +245,13 @@ pub fn spawn<F>(
 where
     F: Future<Output = anyhow::Result<()>> + Send + 'static,
 {
-    let (shutdown_tx, shutdown_rx) = watch::channel(false);
+    let cancel = CancellationToken::new();
     let task_id = NEXT_TASK_ID.fetch_add(1, Ordering::Relaxed);
     let task = Arc::new(PageServerTask {
         task_id: PageserverTaskId(task_id),
         kind,
         name: name.to_string(),
-        shutdown_tx,
+        cancel: cancel.clone(),
         mutable: Mutex::new(MutableTaskState {
             tenant_id,
             timeline_id,
@@ -271,7 +269,7 @@ where
         task_name,
         task_id,
         task_cloned,
-        shutdown_rx,
+        cancel,
         shutdown_process_on_error,
         future,
     ));
@@ -288,7 +286,7 @@ async fn task_wrapper<F>(
     task_name: String,
     task_id: u64,
     task: Arc<PageServerTask>,
-    shutdown_rx: watch::Receiver<bool>,
+    shutdown_token: CancellationToken,
     shutdown_process_on_error: bool,
     future: F,
 ) where
@@ -296,9 +294,9 @@ async fn task_wrapper<F>(
 {
     debug!("Starting task '{}'", task_name);
 
-    let result = SHUTDOWN_RX
+    let result = SHUTDOWN_TOKEN
         .scope(
-            shutdown_rx,
+            shutdown_token,
             CURRENT_TASK.scope(task, {
                 // We use AssertUnwindSafe here so that the payload function
                 // doesn't need to be UnwindSafe. We don't do anything after the
@@ -408,7 +406,7 @@ pub async fn shutdown_tasks(
                 && (tenant_id.is_none() || task_mut.tenant_id == tenant_id)
                 && (timeline_id.is_none() || task_mut.timeline_id == timeline_id)
             {
-                let _ = task.shutdown_tx.send_replace(true);
+                task.cancel.cancel();
                 victim_tasks.push(Arc::clone(task));
             }
         }
@@ -439,21 +437,28 @@ pub fn current_task_kind() -> Option<TaskKind> {
 /// A Future that can be used to check if the current task has been requested to
 /// shut down.
 pub async fn shutdown_watcher() {
-    let mut shutdown_rx = SHUTDOWN_RX
-        .try_with(|rx| rx.clone())
+    let token = SHUTDOWN_TOKEN
+        .try_with(|t| t.clone())
         .expect("shutdown_requested() called in an unexpected task or thread");
 
-    while !*shutdown_rx.borrow() {
-        if shutdown_rx.changed().await.is_err() {
-            break;
-        }
-    }
+    token.cancelled().await;
+}
+
+/// Clone the current task's cancellation token, which can be moved across tasks.
+///
+/// When the task which is currently executing is shut down, the cancellation token will be
+/// cancelled. It can however be moved to other tasks, such as `tokio::task::spawn_blocking` or
+/// `tokio::task::JoinSet::spawn`.
+pub fn shutdown_token() -> CancellationToken {
+    SHUTDOWN_TOKEN
+        .try_with(|t| t.clone())
+        .expect("shutdown_token() called in an unexpected task or thread")
 }
 
 /// Has the current task been requested to shut down?
 pub fn is_shutdown_requested() -> bool {
-    if let Ok(shutdown_rx) = SHUTDOWN_RX.try_with(|rx| rx.clone()) {
-        *shutdown_rx.borrow()
+    if let Ok(cancel) = SHUTDOWN_TOKEN.try_with(|t| t.clone()) {
+        cancel.is_cancelled()
     } else {
         if !cfg!(test) {
             warn!("is_shutdown_requested() called in an unexpected task or thread");

From 64775a0a756c693b23c84f54fddbdcce5b1d5f3c Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Fri, 16 Dec 2022 17:45:38 +0000
Subject: [PATCH 016/132] test_runner/performance: fix flush for NeonCompare
 (#3135)

Fix performance tests:
```
AttributeError: 'NeonCompare' object has no attribute 'pageserver_http'
```
---
 test_runner/fixtures/compare_fixtures.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test_runner/fixtures/compare_fixtures.py b/test_runner/fixtures/compare_fixtures.py
index 530e5afaab..fa488c4446 100644
--- a/test_runner/fixtures/compare_fixtures.py
+++ b/test_runner/fixtures/compare_fixtures.py
@@ -115,7 +115,7 @@ class NeonCompare(PgCompare):
         return self._pg_bin
 
     def flush(self):
-        self.pageserver_http.timeline_checkpoint(self.env.initial_tenant, self.timeline)
+        self.pageserver_http_client.timeline_checkpoint(self.env.initial_tenant, self.timeline)
         self.pageserver_http_client.timeline_gc(self.env.initial_tenant, self.timeline, 0)
 
     def compact(self):

From 83baf49487213c2e1f03ea5d6b3f71c8b3c9f49d Mon Sep 17 00:00:00 2001
From: Dmitry Ivanov <ivadmi5@gmail.com>
Date: Thu, 15 Dec 2022 18:10:43 +0300
Subject: [PATCH 017/132] [proxy] Forward compute connection params to client

This fixes all kinds of problems related to missing params,
like broken timestamps (due to `integer_datetimes`).

This solution is not ideal, but it will help. Meanwhile,
I'm going to dedicate some time to improving connection machinery.

Note that this **does not** fix problems with passing certain parameters
in a reverse direction, i.e. **from client to compute**. This is a
separate matter and will be dealt with in an upcoming PR.
---
 Cargo.lock                               |  8 +--
 Cargo.toml                               |  2 +-
 compute_tools/Cargo.toml                 |  4 +-
 control_plane/Cargo.toml                 |  2 +-
 libs/postgres_connection/Cargo.toml      |  4 +-
 libs/postgres_ffi/Cargo.toml             |  2 +-
 libs/postgres_ffi/wal_craft/Cargo.toml   |  2 +-
 libs/pq_proto/Cargo.toml                 |  2 +-
 libs/pq_proto/src/lib.rs                 | 64 +++++++++++++-----------
 libs/utils/src/postgres_backend.rs       | 10 ++--
 libs/utils/src/postgres_backend_async.rs | 10 ++--
 pageserver/Cargo.toml                    |  8 +--
 proxy/Cargo.toml                         |  2 +-
 proxy/src/auth/backend/link.rs           |  4 +-
 proxy/src/compute.rs                     | 52 ++++++++++---------
 proxy/src/proxy.rs                       | 18 ++++---
 proxy/src/proxy/tests.rs                 |  2 +-
 safekeeper/Cargo.toml                    |  6 +--
 test_runner/regress/test_proxy.py        | 30 +++++++++++
 19 files changed, 137 insertions(+), 95 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 913b39da0f..1eb27fb0f9 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2613,7 +2613,7 @@ dependencies = [
 [[package]]
 name = "postgres"
 version = "0.19.2"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=d052ee8b86fff9897c77b0fe89ea9daba0e1fa38#d052ee8b86fff9897c77b0fe89ea9daba0e1fa38"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=69c1ef71cd5418cf063d4ca21eadc3427980caea#69c1ef71cd5418cf063d4ca21eadc3427980caea"
 dependencies = [
  "bytes",
  "fallible-iterator",
@@ -2626,7 +2626,7 @@ dependencies = [
 [[package]]
 name = "postgres-protocol"
 version = "0.6.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=d052ee8b86fff9897c77b0fe89ea9daba0e1fa38#d052ee8b86fff9897c77b0fe89ea9daba0e1fa38"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=69c1ef71cd5418cf063d4ca21eadc3427980caea#69c1ef71cd5418cf063d4ca21eadc3427980caea"
 dependencies = [
  "base64",
  "byteorder",
@@ -2644,7 +2644,7 @@ dependencies = [
 [[package]]
 name = "postgres-types"
 version = "0.2.3"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=d052ee8b86fff9897c77b0fe89ea9daba0e1fa38#d052ee8b86fff9897c77b0fe89ea9daba0e1fa38"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=69c1ef71cd5418cf063d4ca21eadc3427980caea#69c1ef71cd5418cf063d4ca21eadc3427980caea"
 dependencies = [
  "bytes",
  "fallible-iterator",
@@ -4010,7 +4010,7 @@ dependencies = [
 [[package]]
 name = "tokio-postgres"
 version = "0.7.6"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=d052ee8b86fff9897c77b0fe89ea9daba0e1fa38#d052ee8b86fff9897c77b0fe89ea9daba0e1fa38"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=69c1ef71cd5418cf063d4ca21eadc3427980caea#69c1ef71cd5418cf063d4ca21eadc3427980caea"
 dependencies = [
  "async-trait",
  "byteorder",
diff --git a/Cargo.toml b/Cargo.toml
index 2f73215d3f..0e098d91ee 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -86,4 +86,4 @@ lto = true
 # This is only needed for proxy's tests.
 # TODO: we should probably fork `tokio-postgres-rustls` instead.
 [patch.crates-io]
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="69c1ef71cd5418cf063d4ca21eadc3427980caea" }
diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml
index a35cef197d..6240073cb3 100644
--- a/compute_tools/Cargo.toml
+++ b/compute_tools/Cargo.toml
@@ -12,12 +12,12 @@ futures = "0.3.13"
 hyper = { version = "0.14", features = ["full"] }
 log = { version = "0.4", features = ["std", "serde"] }
 notify = "5.0.0"
-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
+postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="69c1ef71cd5418cf063d4ca21eadc3427980caea" }
 regex = "1"
 serde = { version = "1.0", features = ["derive"] }
 serde_json = "1"
 tar = "0.4"
 tokio = { version = "1.17", features = ["macros", "rt", "rt-multi-thread"] }
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="69c1ef71cd5418cf063d4ca21eadc3427980caea" }
 url = "2.2.2"
 workspace_hack = { version = "0.1", path = "../workspace_hack" }
diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml
index 00b34aafb1..9d9d6a5f11 100644
--- a/control_plane/Cargo.toml
+++ b/control_plane/Cargo.toml
@@ -10,7 +10,7 @@ comfy-table = "6.1"
 git-version = "0.3.5"
 nix = "0.25"
 once_cell = "1.13.0"
-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
+postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "69c1ef71cd5418cf063d4ca21eadc3427980caea" }
 regex = "1"
 reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] }
 serde = { version = "1.0", features = ["derive"] }
diff --git a/libs/postgres_connection/Cargo.toml b/libs/postgres_connection/Cargo.toml
index 314f3c6f1c..25db64337d 100644
--- a/libs/postgres_connection/Cargo.toml
+++ b/libs/postgres_connection/Cargo.toml
@@ -8,8 +8,8 @@ edition = "2021"
 [dependencies]
 anyhow = "1.0"
 itertools = "0.10.3"
-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
+postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "69c1ef71cd5418cf063d4ca21eadc3427980caea" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="69c1ef71cd5418cf063d4ca21eadc3427980caea" }
 url = "2.2.2"
 workspace_hack = { version = "0.1", path = "../../workspace_hack" }
 
diff --git a/libs/postgres_ffi/Cargo.toml b/libs/postgres_ffi/Cargo.toml
index 01ff6ab60e..bafc587e80 100644
--- a/libs/postgres_ffi/Cargo.toml
+++ b/libs/postgres_ffi/Cargo.toml
@@ -21,7 +21,7 @@ workspace_hack = { version = "0.1", path = "../../workspace_hack" }
 
 [dev-dependencies]
 env_logger = "0.9"
-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
+postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="69c1ef71cd5418cf063d4ca21eadc3427980caea" }
 wal_craft = { path = "wal_craft" }
 
 [build-dependencies]
diff --git a/libs/postgres_ffi/wal_craft/Cargo.toml b/libs/postgres_ffi/wal_craft/Cargo.toml
index 4c35c5a650..3a22e9d789 100644
--- a/libs/postgres_ffi/wal_craft/Cargo.toml
+++ b/libs/postgres_ffi/wal_craft/Cargo.toml
@@ -11,7 +11,7 @@ clap = "4.0"
 env_logger = "0.9"
 log = "0.4"
 once_cell = "1.13.0"
-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
+postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="69c1ef71cd5418cf063d4ca21eadc3427980caea" }
 postgres_ffi = { path = "../" }
 tempfile = "3.2"
 workspace_hack = { version = "0.1", path = "../../../workspace_hack" }
diff --git a/libs/pq_proto/Cargo.toml b/libs/pq_proto/Cargo.toml
index 4d48e431b4..dc38abd64b 100644
--- a/libs/pq_proto/Cargo.toml
+++ b/libs/pq_proto/Cargo.toml
@@ -7,7 +7,7 @@ edition = "2021"
 anyhow = "1.0"
 bytes = "1.0.1"
 pin-project-lite = "0.2.7"
-postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
+postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="69c1ef71cd5418cf063d4ca21eadc3427980caea" }
 rand = "0.8.3"
 serde = { version = "1.0", features = ["derive"] }
 tokio = { version = "1.17", features = ["macros"] }
diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs
index 2e311dd6e3..0d698127b9 100644
--- a/libs/pq_proto/src/lib.rs
+++ b/libs/pq_proto/src/lib.rs
@@ -463,7 +463,10 @@ pub enum BeMessage<'a> {
     EncryptionResponse(bool),
     NoData,
     ParameterDescription,
-    ParameterStatus(BeParameterStatusMessage<'a>),
+    ParameterStatus {
+        name: &'a [u8],
+        value: &'a [u8],
+    },
     ParseComplete,
     ReadyForQuery,
     RowDescription(&'a [RowDescriptor<'a>]),
@@ -472,6 +475,28 @@ pub enum BeMessage<'a> {
     KeepAlive(WalSndKeepAlive),
 }
 
+/// Common shorthands.
+impl<'a> BeMessage<'a> {
+    /// A [`BeMessage::ParameterStatus`] holding the client encoding, i.e. UTF-8.
+    /// This is a sensible default, given that:
+    ///  * rust strings only support this encoding out of the box.
+    ///  * tokio-postgres, postgres-jdbc (and probably more) mandate it.
+    ///
+    /// TODO: do we need to report `server_encoding` as well?
+    pub const CLIENT_ENCODING: Self = Self::ParameterStatus {
+        name: b"client_encoding",
+        value: b"UTF8",
+    };
+
+    /// Build a [`BeMessage::ParameterStatus`] holding the server version.
+    pub fn server_version(version: &'a str) -> Self {
+        Self::ParameterStatus {
+            name: b"server_version",
+            value: version.as_bytes(),
+        }
+    }
+}
+
 #[derive(Debug)]
 pub enum BeAuthenticationSaslMessage<'a> {
     Methods(&'a [&'a str]),
@@ -485,12 +510,6 @@ pub enum BeParameterStatusMessage<'a> {
     ServerVersion(&'a str),
 }
 
-impl BeParameterStatusMessage<'static> {
-    pub fn encoding() -> BeMessage<'static> {
-        BeMessage::ParameterStatus(Self::Encoding("UTF8"))
-    }
-}
-
 // One row description in RowDescription packet.
 #[derive(Debug)]
 pub struct RowDescriptor<'a> {
@@ -587,14 +606,15 @@ fn write_body<R>(buf: &mut BytesMut, f: impl FnOnce(&mut BytesMut) -> R) -> R {
 }
 
 /// Safe write of s into buf as cstring (String in the protocol).
-fn write_cstr(s: &[u8], buf: &mut BytesMut) -> Result<(), io::Error> {
-    if s.contains(&0) {
+fn write_cstr(s: impl AsRef<[u8]>, buf: &mut BytesMut) -> Result<(), io::Error> {
+    let bytes = s.as_ref();
+    if bytes.contains(&0) {
         return Err(io::Error::new(
             io::ErrorKind::InvalidInput,
             "string contains embedded null",
         ));
     }
-    buf.put_slice(s);
+    buf.put_slice(bytes);
     buf.put_u8(0);
     Ok(())
 }
@@ -644,7 +664,7 @@ impl<'a> BeMessage<'a> {
                         Methods(methods) => {
                             buf.put_i32(10); // Specifies that SASL auth method is used.
                             for method in methods.iter() {
-                                write_cstr(method.as_bytes(), buf)?;
+                                write_cstr(method, buf)?;
                             }
                             buf.put_u8(0); // zero terminator for the list
                         }
@@ -759,7 +779,7 @@ impl<'a> BeMessage<'a> {
                     buf.put_slice(b"CXX000\0");
 
                     buf.put_u8(b'M'); // the message
-                    write_cstr(error_msg.as_bytes(), buf)?;
+                    write_cstr(error_msg, buf)?;
 
                     buf.put_u8(0); // terminator
                     Ok::<_, io::Error>(())
@@ -799,24 +819,12 @@ impl<'a> BeMessage<'a> {
                 buf.put_u8(response);
             }
 
-            BeMessage::ParameterStatus(param) => {
-                use std::io::{IoSlice, Write};
-                use BeParameterStatusMessage::*;
-
-                let [name, value] = match param {
-                    Encoding(name) => [b"client_encoding", name.as_bytes()],
-                    ServerVersion(version) => [b"server_version", version.as_bytes()],
-                };
-
-                // Parameter names and values are passed as null-terminated strings
-                let iov = &mut [name, b"\0", value, b"\0"].map(IoSlice::new);
-                let mut buffer = [0u8; 64]; // this should be enough
-                let cnt = buffer.as_mut().write_vectored(iov).unwrap();
-
+            BeMessage::ParameterStatus { name, value } => {
                 buf.put_u8(b'S');
                 write_body(buf, |buf| {
-                    buf.put_slice(&buffer[..cnt]);
-                });
+                    write_cstr(name, buf)?;
+                    write_cstr(value, buf)
+                })?;
             }
 
             BeMessage::ParameterDescription => {
diff --git a/libs/utils/src/postgres_backend.rs b/libs/utils/src/postgres_backend.rs
index 89f7197718..5b34c7adfb 100644
--- a/libs/utils/src/postgres_backend.rs
+++ b/libs/utils/src/postgres_backend.rs
@@ -6,7 +6,7 @@
 use crate::sock_split::{BidiStream, ReadStream, WriteStream};
 use anyhow::{bail, ensure, Context, Result};
 use bytes::{Bytes, BytesMut};
-use pq_proto::{BeMessage, BeParameterStatusMessage, FeMessage, FeStartupPacket};
+use pq_proto::{BeMessage, FeMessage, FeStartupPacket};
 use rand::Rng;
 use serde::{Deserialize, Serialize};
 use std::fmt;
@@ -361,11 +361,9 @@ impl PostgresBackend {
                         match self.auth_type {
                             AuthType::Trust => {
                                 self.write_message_noflush(&BeMessage::AuthenticationOk)?
-                                    .write_message_noflush(&BeParameterStatusMessage::encoding())?
+                                    .write_message_noflush(&BeMessage::CLIENT_ENCODING)?
                                     // The async python driver requires a valid server_version
-                                    .write_message_noflush(&BeMessage::ParameterStatus(
-                                        BeParameterStatusMessage::ServerVersion("14.1"),
-                                    ))?
+                                    .write_message_noflush(&BeMessage::server_version("14.1"))?
                                     .write_message(&BeMessage::ReadyForQuery)?;
                                 self.state = ProtoState::Established;
                             }
@@ -413,7 +411,7 @@ impl PostgresBackend {
                     }
                 }
                 self.write_message_noflush(&BeMessage::AuthenticationOk)?
-                    .write_message_noflush(&BeParameterStatusMessage::encoding())?
+                    .write_message_noflush(&BeMessage::CLIENT_ENCODING)?
                     .write_message(&BeMessage::ReadyForQuery)?;
                 self.state = ProtoState::Established;
             }
diff --git a/libs/utils/src/postgres_backend_async.rs b/libs/utils/src/postgres_backend_async.rs
index 376819027b..a22774c69e 100644
--- a/libs/utils/src/postgres_backend_async.rs
+++ b/libs/utils/src/postgres_backend_async.rs
@@ -6,7 +6,7 @@
 use crate::postgres_backend::AuthType;
 use anyhow::{bail, Context, Result};
 use bytes::{Bytes, BytesMut};
-use pq_proto::{BeMessage, BeParameterStatusMessage, FeMessage, FeStartupPacket};
+use pq_proto::{BeMessage, FeMessage, FeStartupPacket};
 use rand::Rng;
 use std::future::Future;
 use std::net::SocketAddr;
@@ -331,11 +331,9 @@ impl PostgresBackend {
                         match self.auth_type {
                             AuthType::Trust => {
                                 self.write_message(&BeMessage::AuthenticationOk)?
-                                    .write_message(&BeParameterStatusMessage::encoding())?
+                                    .write_message(&BeMessage::CLIENT_ENCODING)?
                                     // The async python driver requires a valid server_version
-                                    .write_message(&BeMessage::ParameterStatus(
-                                        BeParameterStatusMessage::ServerVersion("14.1"),
-                                    ))?
+                                    .write_message(&BeMessage::server_version("14.1"))?
                                     .write_message(&BeMessage::ReadyForQuery)?;
                                 self.state = ProtoState::Established;
                             }
@@ -384,7 +382,7 @@ impl PostgresBackend {
                     }
                 }
                 self.write_message(&BeMessage::AuthenticationOk)?
-                    .write_message(&BeParameterStatusMessage::encoding())?
+                    .write_message(&BeMessage::CLIENT_ENCODING)?
                     .write_message(&BeMessage::ReadyForQuery)?;
                 self.state = ProtoState::Established;
             }
diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml
index 54bbe4714d..9a9bb9bf08 100644
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -36,9 +36,9 @@ nix = "0.25"
 num-traits = "0.2.15"
 once_cell = "1.13.0"
 pin-project-lite = "0.2.7"
-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
-postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
-postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
+postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="69c1ef71cd5418cf063d4ca21eadc3427980caea" }
+postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="69c1ef71cd5418cf063d4ca21eadc3427980caea" }
+postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="69c1ef71cd5418cf063d4ca21eadc3427980caea" }
 pprof = { git = "https://github.com/neondatabase/pprof-rs.git", branch = "wallclock-profiling", features = ["flamegraph"], optional = true }
 rand = "0.8.3"
 regex = "1.4.5"
@@ -52,7 +52,7 @@ svg_fmt = "0.4.1"
 tar = "0.4.33"
 thiserror = "1.0"
 tokio = { version = "1.17", features = ["process", "sync", "macros", "fs", "rt", "io-util", "time"] }
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="69c1ef71cd5418cf063d4ca21eadc3427980caea" }
 tokio-util = { version = "0.7.3", features = ["io", "io-util"] }
 toml_edit = { version = "0.14", features = ["easy"] }
 tracing = "0.1.36"
diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml
index 14a5450d5e..68004e5fe2 100644
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -33,7 +33,7 @@ sha2 = "0.10.2"
 socket2 = "0.4.4"
 thiserror = "1.0.30"
 tokio = { version = "1.17", features = ["macros"] }
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="69c1ef71cd5418cf063d4ca21eadc3427980caea" }
 tokio-rustls = "0.23.0"
 tracing = "0.1.36"
 tracing-subscriber = { version = "0.3", features = ["env-filter"] }
diff --git a/proxy/src/auth/backend/link.rs b/proxy/src/auth/backend/link.rs
index 440a55f194..641519ac50 100644
--- a/proxy/src/auth/backend/link.rs
+++ b/proxy/src/auth/backend/link.rs
@@ -1,6 +1,6 @@
 use super::{AuthSuccess, NodeInfo};
 use crate::{auth, compute, error::UserFacingError, stream::PqStream, waiters};
-use pq_proto::{BeMessage as Be, BeParameterStatusMessage};
+use pq_proto::BeMessage as Be;
 use thiserror::Error;
 use tokio::io::{AsyncRead, AsyncWrite};
 use tracing::{info, info_span};
@@ -60,7 +60,7 @@ pub async fn handle_user(
         info!(parent: &span, "sending the auth URL to the user");
         client
             .write_message_noflush(&Be::AuthenticationOk)?
-            .write_message_noflush(&BeParameterStatusMessage::encoding())?
+            .write_message_noflush(&Be::CLIENT_ENCODING)?
             .write_message(&Be::NoticeResponse(&greeting))
             .await?;
 
diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs
index 4c5edb9673..71421a4a65 100644
--- a/proxy/src/compute.rs
+++ b/proxy/src/compute.rs
@@ -8,18 +8,17 @@ use tokio::net::TcpStream;
 use tokio_postgres::NoTls;
 use tracing::{error, info};
 
+const COULD_NOT_CONNECT: &str = "Could not connect to compute node";
+
 #[derive(Debug, Error)]
 pub enum ConnectionError {
     /// This error doesn't seem to reveal any secrets; for instance,
     /// [`tokio_postgres::error::Kind`] doesn't contain ip addresses and such.
-    #[error("Failed to connect to the compute node: {0}")]
+    #[error("{COULD_NOT_CONNECT}: {0}")]
     Postgres(#[from] tokio_postgres::Error),
 
-    #[error("Failed to connect to the compute node")]
-    FailedToConnectToCompute,
-
-    #[error("Failed to fetch compute node version")]
-    FailedToFetchPgVersion,
+    #[error("{COULD_NOT_CONNECT}: {0}")]
+    CouldNotConnect(#[from] io::Error),
 }
 
 impl UserFacingError for ConnectionError {
@@ -29,10 +28,10 @@ impl UserFacingError for ConnectionError {
             // This helps us drop irrelevant library-specific prefixes.
             // TODO: propagate severity level and other parameters.
             Postgres(err) => match err.as_db_error() {
-                Some(err) => err.message().to_string(),
+                Some(err) => err.message().to_owned(),
                 None => err.to_string(),
             },
-            other => other.to_string(),
+            _ => COULD_NOT_CONNECT.to_owned(),
         }
     }
 }
@@ -49,7 +48,7 @@ pub struct ConnCfg(pub tokio_postgres::Config);
 impl ConnCfg {
     /// Construct a new connection config.
     pub fn new() -> Self {
-        Self(tokio_postgres::Config::new())
+        Self(Default::default())
     }
 }
 
@@ -95,7 +94,7 @@ impl ConnCfg {
                 io::ErrorKind::Other,
                 format!(
                     "couldn't connect: bad compute config, \
-                        ports and hosts entries' count does not match: {:?}",
+                     ports and hosts entries' count does not match: {:?}",
                     self.0
                 ),
             ));
@@ -131,8 +130,8 @@ impl ConnCfg {
 pub struct PostgresConnection {
     /// Socket connected to a compute node.
     pub stream: TcpStream,
-    /// PostgreSQL version of this instance.
-    pub version: String,
+    /// PostgreSQL connection parameters.
+    pub params: std::collections::HashMap<String, String>,
 }
 
 impl ConnCfg {
@@ -156,6 +155,7 @@ impl ConnCfg {
             self.0.application_name(app_name);
         }
 
+        // TODO: This is especially ugly...
         if let Some(replication) = params.get("replication") {
             use tokio_postgres::config::ReplicationMode;
             match replication {
@@ -172,22 +172,24 @@ impl ConnCfg {
         // TODO: extend the list of the forwarded startup parameters.
         // Currently, tokio-postgres doesn't allow us to pass
         // arbitrary parameters, but the ones above are a good start.
+        //
+        // This and the reverse params problem can be better addressed
+        // in a bespoke connection machinery (a new library for that sake).
 
-        let (socket_addr, mut stream) = self
-            .connect_raw()
-            .await
-            .map_err(|_| ConnectionError::FailedToConnectToCompute)?;
-
-        // TODO: establish a secure connection to the DB
-        let (client, conn) = self.0.connect_raw(&mut stream, NoTls).await?;
-        let version = conn
-            .parameter("server_version")
-            .ok_or(ConnectionError::FailedToFetchPgVersion)?
-            .into();
-
+        // TODO: establish a secure connection to the DB.
+        let (socket_addr, mut stream) = self.connect_raw().await?;
+        let (client, connection) = self.0.connect_raw(&mut stream, NoTls).await?;
         info!("connected to user's compute node at {socket_addr}");
+
+        // This is very ugly but as of now there's no better way to
+        // extract the connection parameters from tokio-postgres' connection.
+        // TODO: solve this problem in a more elegant manner (e.g. the new library).
+        let params = connection.parameters;
+
+        // NB: CancelToken is supposed to hold socket_addr, but we use connect_raw.
+        // Yet another reason to rework the connection establishing code.
         let cancel_closure = CancelClosure::new(socket_addr, client.cancel_token());
-        let db = PostgresConnection { stream, version };
+        let db = PostgresConnection { stream, params };
 
         Ok((db, cancel_closure))
     }
diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs
index da3cb144e3..713388c625 100644
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -255,15 +255,21 @@ impl<S: AsyncRead + AsyncWrite + Unpin + Send> Client<'_, S> {
         // Note that we do this only (for the most part) after we've connected
         // to a compute (see above) which performs its own authentication.
         if !auth_result.reported_auth_ok {
-            stream
-                .write_message_noflush(&Be::AuthenticationOk)?
-                .write_message_noflush(&BeParameterStatusMessage::encoding())?;
+            stream.write_message_noflush(&Be::AuthenticationOk)?;
+        }
+
+        // Forward all postgres connection params to the client.
+        // Right now the implementation is very hacky and inefficient (ideally,
+        // we don't need an intermediate hashmap), but at least it should be correct.
+        for (name, value) in &db.params {
+            // TODO: Theoretically, this could result in a big pile of params...
+            stream.write_message_noflush(&Be::ParameterStatus {
+                name: name.as_bytes(),
+                value: value.as_bytes(),
+            })?;
         }
 
         stream
-            .write_message_noflush(&BeMessage::ParameterStatus(
-                BeParameterStatusMessage::ServerVersion(&db.version),
-            ))?
             .write_message_noflush(&Be::BackendKeyData(cancel_key_data))?
             .write_message(&BeMessage::ReadyForQuery)
             .await?;
diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs
index 24fbc57b99..2f023844d0 100644
--- a/proxy/src/proxy/tests.rs
+++ b/proxy/src/proxy/tests.rs
@@ -139,7 +139,7 @@ async fn dummy_proxy(
 
     stream
         .write_message_noflush(&Be::AuthenticationOk)?
-        .write_message_noflush(&BeParameterStatusMessage::encoding())?
+        .write_message_noflush(&Be::CLIENT_ENCODING)?
         .write_message(&BeMessage::ReadyForQuery)
         .await?;
 
diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml
index d11ef1711a..72a51ec443 100644
--- a/safekeeper/Cargo.toml
+++ b/safekeeper/Cargo.toml
@@ -20,8 +20,8 @@ hyper = "0.14"
 nix = "0.25"
 once_cell = "1.13.0"
 parking_lot = "0.12.1"
-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
-postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
+postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="69c1ef71cd5418cf063d4ca21eadc3427980caea" }
+postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="69c1ef71cd5418cf063d4ca21eadc3427980caea" }
 regex = "1.4.5"
 serde = { version = "1.0", features = ["derive"] }
 serde_json = "1"
@@ -29,7 +29,7 @@ serde_with = "2.0"
 signal-hook = "0.3.10"
 thiserror = "1"
 tokio = { version = "1.17", features = ["macros", "fs"] }
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="69c1ef71cd5418cf063d4ca21eadc3427980caea" }
 toml_edit = { version = "0.14", features = ["easy"] }
 tracing = "0.1.27"
 url = "2.2.2"
diff --git a/test_runner/regress/test_proxy.py b/test_runner/regress/test_proxy.py
index eab9505fbb..4d2b63d360 100644
--- a/test_runner/regress/test_proxy.py
+++ b/test_runner/regress/test_proxy.py
@@ -122,3 +122,33 @@ def test_auth_errors(static_proxy: NeonProxy):
     # Finally, check that the user can connect
     with static_proxy.connect(user="pinocchio", password="magic", options="project=irrelevant"):
         pass
+
+
+def test_forward_params_to_client(static_proxy: NeonProxy):
+    # A subset of parameters (GUCs) which postgres
+    # sends to the client during connection setup.
+    # Unfortunately, `GUC_REPORT` can't be queried.
+    # Proxy *should* forward them, otherwise client library
+    # might misbehave (e.g. parse timestamps incorrectly).
+    reported_params_subset = [
+        "client_encoding",
+        "integer_datetimes",
+        "is_superuser",
+        "server_encoding",
+        "server_version",
+        "session_authorization",
+        "standard_conforming_strings",
+    ]
+
+    query = """
+        select name, setting
+        from pg_catalog.pg_settings
+        where name = any(%s)
+    """
+
+    with static_proxy.connect(options="project=irrelevant") as conn:
+        with conn.cursor() as cur:
+            cur.execute(query, (reported_params_subset,))
+            for name, value in cur.fetchall():
+                # Check that proxy has forwarded this parameter.
+                assert conn.get_parameter_status(name) == value

From 3514e6e89a2ddd100057f5820a5cfd0f203cd3f3 Mon Sep 17 00:00:00 2001
From: MMeent <matthias@neon.tech>
Date: Fri, 16 Dec 2022 21:14:57 +0100
Subject: [PATCH 018/132] Use neon_nblocks instead of get_cached_relsize
 (#3132)

This prevents us from overwriting all blocks of a relation when we
extend the relation without first caching the size - get_cached_relsize
does not guarantee a correct result when it returns `false`.
---
 pgxn/neon/pagestore_smgr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c
index 73bf330baf..900f44ca10 100644
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -1669,7 +1669,7 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
 	 * (leaving holes). But this rule is violated in PG-15 where CreateAndCopyRelationData
 	 * call smgrextend for destination relation n using size of source relation
 	 */
-	get_cached_relsize(reln->smgr_rnode.node, forkNum, &n_blocks);
+	n_blocks = neon_nblocks(reln, forkNum);
 	while (n_blocks < blkno)
 		neon_wallog_page(reln, forkNum, n_blocks++, buffer, true);
 

From 61194ab2f430df3eb5969477bb1a1861456ec136 Mon Sep 17 00:00:00 2001
From: Dmitry Ivanov <ivadmi5@gmail.com>
Date: Fri, 16 Dec 2022 21:58:41 +0300
Subject: [PATCH 019/132] Update rust-postgres everywhere

I've rebased[1] Neon's fork of rust-postgres to incorporate
the latest upstream changes (including dependabot's fixes),
so we need to advance revs here as well.

[1] https://github.com/neondatabase/rust-postgres/commits/neon
---
 Cargo.lock                             | 53 +++++++++++++++-----------
 Cargo.toml                             |  2 +-
 compute_tools/Cargo.toml               |  4 +-
 control_plane/Cargo.toml               |  2 +-
 libs/postgres_connection/Cargo.toml    |  4 +-
 libs/postgres_ffi/Cargo.toml           |  2 +-
 libs/postgres_ffi/wal_craft/Cargo.toml |  2 +-
 libs/pq_proto/Cargo.toml               |  2 +-
 pageserver/Cargo.toml                  |  8 ++--
 proxy/Cargo.toml                       |  2 +-
 safekeeper/Cargo.toml                  |  6 +--
 11 files changed, 47 insertions(+), 40 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 1eb27fb0f9..665000746d 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -563,6 +563,12 @@ version = "0.13.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8"
 
+[[package]]
+name = "base64"
+version = "0.20.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0ea22880d78093b0cbe17c89f64a7d457941e65759157ec6cb31a31d652b05e5"
+
 [[package]]
 name = "bincode"
 version = "1.3.3"
@@ -1920,7 +1926,7 @@ version = "8.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "09f4f04699947111ec1733e71778d763555737579e44b85844cae8e1940a1828"
 dependencies = [
- "base64",
+ "base64 0.13.1",
  "pem",
  "ring",
  "serde",
@@ -2507,7 +2513,7 @@ version = "1.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "03c64931a1a212348ec4f3b4362585eca7159d0d09cbdf4a7f74f02173596fd4"
 dependencies = [
- "base64",
+ "base64 0.13.1",
 ]
 
 [[package]]
@@ -2528,18 +2534,18 @@ dependencies = [
 
 [[package]]
 name = "phf"
-version = "0.10.1"
+version = "0.11.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fabbf1ead8a5bcbc20f5f8b939ee3f5b0f6f281b6ad3468b84656b658b455259"
+checksum = "928c6535de93548188ef63bb7c4036bd415cd8f36ad25af44b9789b2ee72a48c"
 dependencies = [
  "phf_shared",
 ]
 
 [[package]]
 name = "phf_shared"
-version = "0.10.0"
+version = "0.11.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b6796ad771acdc0123d2a88dc428b5e38ef24456743ddb1744ed628f9815c096"
+checksum = "e1fb5f6f826b772a8d4c0394209441e7d37cbbb967ae9c7e0e8134365c9ee676"
 dependencies = [
  "siphasher",
 ]
@@ -2612,12 +2618,12 @@ dependencies = [
 
 [[package]]
 name = "postgres"
-version = "0.19.2"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=69c1ef71cd5418cf063d4ca21eadc3427980caea#69c1ef71cd5418cf063d4ca21eadc3427980caea"
+version = "0.19.4"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=43e6db254a97fdecbce33d8bc0890accfd74495e#43e6db254a97fdecbce33d8bc0890accfd74495e"
 dependencies = [
  "bytes",
  "fallible-iterator",
- "futures",
+ "futures-util",
  "log",
  "tokio",
  "tokio-postgres",
@@ -2626,9 +2632,9 @@ dependencies = [
 [[package]]
 name = "postgres-protocol"
 version = "0.6.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=69c1ef71cd5418cf063d4ca21eadc3427980caea#69c1ef71cd5418cf063d4ca21eadc3427980caea"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=43e6db254a97fdecbce33d8bc0890accfd74495e#43e6db254a97fdecbce33d8bc0890accfd74495e"
 dependencies = [
- "base64",
+ "base64 0.20.0",
  "byteorder",
  "bytes",
  "fallible-iterator",
@@ -2643,8 +2649,8 @@ dependencies = [
 
 [[package]]
 name = "postgres-types"
-version = "0.2.3"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=69c1ef71cd5418cf063d4ca21eadc3427980caea#69c1ef71cd5418cf063d4ca21eadc3427980caea"
+version = "0.2.4"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=43e6db254a97fdecbce33d8bc0890accfd74495e#43e6db254a97fdecbce33d8bc0890accfd74495e"
 dependencies = [
  "bytes",
  "fallible-iterator",
@@ -2868,7 +2874,7 @@ dependencies = [
  "anyhow",
  "async-trait",
  "atty",
- "base64",
+ "base64 0.13.1",
  "bstr",
  "bytes",
  "clap 4.0.29",
@@ -3078,7 +3084,7 @@ version = "0.11.13"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "68cc60575865c7831548863cc02356512e3f1dc2f3f82cb837d7fc4cc8f3c97c"
 dependencies = [
- "base64",
+ "base64 0.13.1",
  "bytes",
  "encoding_rs",
  "futures-core",
@@ -3261,7 +3267,7 @@ version = "1.0.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0864aeff53f8c05aa08d86e5ef839d3dfcf07aeba2db32f12db0ef716e87bd55"
 dependencies = [
- "base64",
+ "base64 0.13.1",
 ]
 
 [[package]]
@@ -3542,7 +3548,7 @@ version = "2.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "25bf4a5a814902cd1014dbccfa4d4560fb8432c779471e96e035602519f82eef"
 dependencies = [
- "base64",
+ "base64 0.13.1",
  "chrono",
  "hex",
  "indexmap",
@@ -4009,14 +4015,15 @@ dependencies = [
 
 [[package]]
 name = "tokio-postgres"
-version = "0.7.6"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=69c1ef71cd5418cf063d4ca21eadc3427980caea#69c1ef71cd5418cf063d4ca21eadc3427980caea"
+version = "0.7.7"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=43e6db254a97fdecbce33d8bc0890accfd74495e#43e6db254a97fdecbce33d8bc0890accfd74495e"
 dependencies = [
  "async-trait",
  "byteorder",
  "bytes",
  "fallible-iterator",
- "futures",
+ "futures-channel",
+ "futures-util",
  "log",
  "parking_lot 0.12.1",
  "percent-encoding",
@@ -4109,7 +4116,7 @@ dependencies = [
  "async-stream",
  "async-trait",
  "axum",
- "base64",
+ "base64 0.13.1",
  "bytes",
  "futures-core",
  "futures-util",
@@ -4351,7 +4358,7 @@ version = "2.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b97acb4c28a254fd7a4aeec976c46a7fa404eac4d7c134b30c75144846d7cb8f"
 dependencies = [
- "base64",
+ "base64 0.13.1",
  "chunked_transfer",
  "log",
  "native-tls",
@@ -4787,7 +4794,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e0ecbeb7b67ce215e40e3cc7f2ff902f94a223acf44995934763467e7b1febc8"
 dependencies = [
  "asn1-rs",
- "base64",
+ "base64 0.13.1",
  "data-encoding",
  "der-parser",
  "lazy_static",
diff --git a/Cargo.toml b/Cargo.toml
index 0e098d91ee..927900d5c8 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -86,4 +86,4 @@ lto = true
 # This is only needed for proxy's tests.
 # TODO: we should probably fork `tokio-postgres-rustls` instead.
 [patch.crates-io]
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="69c1ef71cd5418cf063d4ca21eadc3427980caea" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml
index 6240073cb3..c40d870649 100644
--- a/compute_tools/Cargo.toml
+++ b/compute_tools/Cargo.toml
@@ -12,12 +12,12 @@ futures = "0.3.13"
 hyper = { version = "0.14", features = ["full"] }
 log = { version = "0.4", features = ["std", "serde"] }
 notify = "5.0.0"
-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="69c1ef71cd5418cf063d4ca21eadc3427980caea" }
+postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
 regex = "1"
 serde = { version = "1.0", features = ["derive"] }
 serde_json = "1"
 tar = "0.4"
 tokio = { version = "1.17", features = ["macros", "rt", "rt-multi-thread"] }
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="69c1ef71cd5418cf063d4ca21eadc3427980caea" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
 url = "2.2.2"
 workspace_hack = { version = "0.1", path = "../workspace_hack" }
diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml
index 9d9d6a5f11..180508a01a 100644
--- a/control_plane/Cargo.toml
+++ b/control_plane/Cargo.toml
@@ -10,7 +10,7 @@ comfy-table = "6.1"
 git-version = "0.3.5"
 nix = "0.25"
 once_cell = "1.13.0"
-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "69c1ef71cd5418cf063d4ca21eadc3427980caea" }
+postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "43e6db254a97fdecbce33d8bc0890accfd74495e" }
 regex = "1"
 reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] }
 serde = { version = "1.0", features = ["derive"] }
diff --git a/libs/postgres_connection/Cargo.toml b/libs/postgres_connection/Cargo.toml
index 25db64337d..1924b260fa 100644
--- a/libs/postgres_connection/Cargo.toml
+++ b/libs/postgres_connection/Cargo.toml
@@ -8,8 +8,8 @@ edition = "2021"
 [dependencies]
 anyhow = "1.0"
 itertools = "0.10.3"
-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "69c1ef71cd5418cf063d4ca21eadc3427980caea" }
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="69c1ef71cd5418cf063d4ca21eadc3427980caea" }
+postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "43e6db254a97fdecbce33d8bc0890accfd74495e" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
 url = "2.2.2"
 workspace_hack = { version = "0.1", path = "../../workspace_hack" }
 
diff --git a/libs/postgres_ffi/Cargo.toml b/libs/postgres_ffi/Cargo.toml
index bafc587e80..59eec3de32 100644
--- a/libs/postgres_ffi/Cargo.toml
+++ b/libs/postgres_ffi/Cargo.toml
@@ -21,7 +21,7 @@ workspace_hack = { version = "0.1", path = "../../workspace_hack" }
 
 [dev-dependencies]
 env_logger = "0.9"
-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="69c1ef71cd5418cf063d4ca21eadc3427980caea" }
+postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
 wal_craft = { path = "wal_craft" }
 
 [build-dependencies]
diff --git a/libs/postgres_ffi/wal_craft/Cargo.toml b/libs/postgres_ffi/wal_craft/Cargo.toml
index 3a22e9d789..dd9f82a87a 100644
--- a/libs/postgres_ffi/wal_craft/Cargo.toml
+++ b/libs/postgres_ffi/wal_craft/Cargo.toml
@@ -11,7 +11,7 @@ clap = "4.0"
 env_logger = "0.9"
 log = "0.4"
 once_cell = "1.13.0"
-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="69c1ef71cd5418cf063d4ca21eadc3427980caea" }
+postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
 postgres_ffi = { path = "../" }
 tempfile = "3.2"
 workspace_hack = { version = "0.1", path = "../../../workspace_hack" }
diff --git a/libs/pq_proto/Cargo.toml b/libs/pq_proto/Cargo.toml
index dc38abd64b..76d8fbf28d 100644
--- a/libs/pq_proto/Cargo.toml
+++ b/libs/pq_proto/Cargo.toml
@@ -7,7 +7,7 @@ edition = "2021"
 anyhow = "1.0"
 bytes = "1.0.1"
 pin-project-lite = "0.2.7"
-postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="69c1ef71cd5418cf063d4ca21eadc3427980caea" }
+postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
 rand = "0.8.3"
 serde = { version = "1.0", features = ["derive"] }
 tokio = { version = "1.17", features = ["macros"] }
diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml
index 9a9bb9bf08..24642ca2f7 100644
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -36,9 +36,9 @@ nix = "0.25"
 num-traits = "0.2.15"
 once_cell = "1.13.0"
 pin-project-lite = "0.2.7"
-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="69c1ef71cd5418cf063d4ca21eadc3427980caea" }
-postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="69c1ef71cd5418cf063d4ca21eadc3427980caea" }
-postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="69c1ef71cd5418cf063d4ca21eadc3427980caea" }
+postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
+postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
+postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
 pprof = { git = "https://github.com/neondatabase/pprof-rs.git", branch = "wallclock-profiling", features = ["flamegraph"], optional = true }
 rand = "0.8.3"
 regex = "1.4.5"
@@ -52,7 +52,7 @@ svg_fmt = "0.4.1"
 tar = "0.4.33"
 thiserror = "1.0"
 tokio = { version = "1.17", features = ["process", "sync", "macros", "fs", "rt", "io-util", "time"] }
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="69c1ef71cd5418cf063d4ca21eadc3427980caea" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
 tokio-util = { version = "0.7.3", features = ["io", "io-util"] }
 toml_edit = { version = "0.14", features = ["easy"] }
 tracing = "0.1.36"
diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml
index 68004e5fe2..e630b2758d 100644
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -33,7 +33,7 @@ sha2 = "0.10.2"
 socket2 = "0.4.4"
 thiserror = "1.0.30"
 tokio = { version = "1.17", features = ["macros"] }
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="69c1ef71cd5418cf063d4ca21eadc3427980caea" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
 tokio-rustls = "0.23.0"
 tracing = "0.1.36"
 tracing-subscriber = { version = "0.3", features = ["env-filter"] }
diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml
index 72a51ec443..fbcb3f34f7 100644
--- a/safekeeper/Cargo.toml
+++ b/safekeeper/Cargo.toml
@@ -20,8 +20,8 @@ hyper = "0.14"
 nix = "0.25"
 once_cell = "1.13.0"
 parking_lot = "0.12.1"
-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="69c1ef71cd5418cf063d4ca21eadc3427980caea" }
-postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="69c1ef71cd5418cf063d4ca21eadc3427980caea" }
+postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
+postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
 regex = "1.4.5"
 serde = { version = "1.0", features = ["derive"] }
 serde_json = "1"
@@ -29,7 +29,7 @@ serde_with = "2.0"
 signal-hook = "0.3.10"
 thiserror = "1"
 tokio = { version = "1.17", features = ["macros", "fs"] }
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="69c1ef71cd5418cf063d4ca21eadc3427980caea" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
 toml_edit = { version = "0.14", features = ["easy"] }
 tracing = "0.1.27"
 url = "2.2.2"

From 12e6f443dae80f316832cf81d83b4f71eb17bbc9 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Sun, 18 Dec 2022 00:02:04 +0000
Subject: [PATCH 020/132] test_perf_pgbench: switch to server-side data
 generation (#3058)

To offload the network and reduce its impact, I suggest switching to
server-side data generation for the pgbench initialize workflow.
---
 test_runner/fixtures/benchmark_fixture.py    | 63 ++++++++++++--------
 test_runner/performance/test_perf_pgbench.py |  6 +-
 2 files changed, 43 insertions(+), 26 deletions(-)

diff --git a/test_runner/fixtures/benchmark_fixture.py b/test_runner/fixtures/benchmark_fixture.py
index 27fb0a60b2..b1489b7ab1 100644
--- a/test_runner/fixtures/benchmark_fixture.py
+++ b/test_runner/fixtures/benchmark_fixture.py
@@ -11,7 +11,7 @@ from datetime import datetime
 from pathlib import Path
 
 # Type-related stuff
-from typing import Callable, ClassVar, Iterator, Optional
+from typing import Callable, ClassVar, Dict, Iterator, Optional
 
 import pytest
 from _pytest.config import Config
@@ -135,23 +135,26 @@ class PgBenchRunResult:
 
 @dataclasses.dataclass
 class PgBenchInitResult:
-    REGEX: ClassVar[re.Pattern] = re.compile(  # type: ignore[type-arg]
-        r"done in (\d+\.\d+) s "
-        r"\("
-        r"(?:drop tables (\d+\.\d+) s)?(?:, )?"
-        r"(?:create tables (\d+\.\d+) s)?(?:, )?"
-        r"(?:client-side generate (\d+\.\d+) s)?(?:, )?"
-        r"(?:vacuum (\d+\.\d+) s)?(?:, )?"
-        r"(?:primary keys (\d+\.\d+) s)?(?:, )?"
-        r"\)\."
-    )
+    # Taken from https://github.com/postgres/postgres/blob/REL_15_1/src/bin/pgbench/pgbench.c#L5144-L5171
+    EXTRACTORS: ClassVar[Dict[str, re.Pattern]] = {  # type: ignore[type-arg]
+        "drop_tables": re.compile(r"drop tables (\d+\.\d+) s"),
+        "create_tables": re.compile(r"create tables (\d+\.\d+) s"),
+        "client_side_generate": re.compile(r"client-side generate (\d+\.\d+) s"),
+        "server_side_generate": re.compile(r"server-side generate (\d+\.\d+) s"),
+        "vacuum": re.compile(r"vacuum (\d+\.\d+) s"),
+        "primary_keys": re.compile(r"primary keys (\d+\.\d+) s"),
+        "foreign_keys": re.compile(r"foreign keys (\d+\.\d+) s"),
+        "total": re.compile(r"done in (\d+\.\d+) s"),  # Total time printed by pgbench
+    }
 
-    total: float
+    total: Optional[float]
     drop_tables: Optional[float]
     create_tables: Optional[float]
     client_side_generate: Optional[float]
+    server_side_generate: Optional[float]
     vacuum: Optional[float]
     primary_keys: Optional[float]
+    foreign_keys: Optional[float]
     duration: float
     start_timestamp: int
     end_timestamp: int
@@ -164,25 +167,35 @@ class PgBenchInitResult:
         start_timestamp: int,
         end_timestamp: int,
     ):
-        # Parses pgbench initialize output for default initialization steps (dtgvp)
+        # Parses pgbench initialize output
         # Example: done in 5.66 s (drop tables 0.05 s, create tables 0.31 s, client-side generate 2.01 s, vacuum 0.53 s, primary keys 0.38 s).
 
         last_line = stderr.splitlines()[-1]
 
-        if (m := cls.REGEX.match(last_line)) is not None:
-            total, drop_tables, create_tables, client_side_generate, vacuum, primary_keys = [
-                float(v) for v in m.groups() if v is not None
-            ]
-        else:
+        timings: Dict[str, Optional[float]] = {}
+        last_line_items = re.split(r"\(|\)|,", last_line)
+        for item in last_line_items:
+            for key, regex in cls.EXTRACTORS.items():
+                if (m := regex.match(item.strip())) is not None:
+                    if key in timings:
+                        raise RuntimeError(
+                            f"can't store pgbench results for repeated action `{key}`"
+                        )
+
+                    timings[key] = float(m.group(1))
+
+        if not timings or "total" not in timings:
             raise RuntimeError(f"can't parse pgbench initialize results from `{last_line}`")
 
         return cls(
-            total=total,
-            drop_tables=drop_tables,
-            create_tables=create_tables,
-            client_side_generate=client_side_generate,
-            vacuum=vacuum,
-            primary_keys=primary_keys,
+            total=timings["total"],
+            drop_tables=timings.get("drop_tables", 0.0),
+            create_tables=timings.get("create_tables", 0.0),
+            client_side_generate=timings.get("client_side_generate", 0.0),
+            server_side_generate=timings.get("server_side_generate", 0.0),
+            vacuum=timings.get("vacuum", 0.0),
+            primary_keys=timings.get("primary_keys", 0.0),
+            foreign_keys=timings.get("foreign_keys", 0.0),
             duration=duration,
             start_timestamp=start_timestamp,
             end_timestamp=end_timestamp,
@@ -326,8 +339,10 @@ class NeonBenchmarker:
             "drop_tables",
             "create_tables",
             "client_side_generate",
+            "server_side_generate",
             "vacuum",
             "primary_keys",
+            "foreign_keys",
         ]
         for metric in metrics:
             if (value := getattr(result, metric)) is not None:
diff --git a/test_runner/performance/test_perf_pgbench.py b/test_runner/performance/test_perf_pgbench.py
index 015cc40a72..50e5366c1e 100644
--- a/test_runner/performance/test_perf_pgbench.py
+++ b/test_runner/performance/test_perf_pgbench.py
@@ -15,7 +15,7 @@ from fixtures.utils import get_scale_for_db
 @enum.unique
 class PgBenchLoadType(enum.Enum):
     INIT = "init"
-    SIMPLE_UPDATE = "simple_update"
+    SIMPLE_UPDATE = "simple-update"
     SELECT_ONLY = "select-only"
 
 
@@ -94,7 +94,9 @@ def run_test_pgbench(env: PgCompare, scale: int, duration: int, workload_type: P
 
     if workload_type == PgBenchLoadType.INIT:
         # Run initialize
-        init_pgbench(env, ["pgbench", f"-s{scale}", "-i", connstr], password=password)
+        init_pgbench(
+            env, ["pgbench", f"-s{scale}", "-i", "-I", "dtGvp", connstr], password=password
+        )
 
     if workload_type == PgBenchLoadType.SIMPLE_UPDATE:
         # Run simple-update workload

From e23d5da51cf69935fad9ae1db83fb07dc6996181 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Mon, 19 Dec 2022 10:52:16 +0200
Subject: [PATCH 021/132] Tidy up and add comments to the pageserver startup
 code.

To make it more readable.
---
 pageserver/src/bin/pageserver.rs | 70 ++++++++++++++------------------
 1 file changed, 30 insertions(+), 40 deletions(-)

diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs
index 345f391e61..47e9382e6d 100644
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -201,8 +201,12 @@ fn initialize_config(
 }
 
 fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
+    // Initialize logging
     logging::init(conf.log_format)?;
+
+    // Print version to the log, and expose it as a prometheus metric too.
     info!("version: {}", version());
+    set_build_info_metric(GIT_VERSION);
 
     // If any failpoints were set from FAILPOINTS environment variable,
     // print them to the log for debugging purposes
@@ -218,38 +222,37 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
         )
     }
 
+    // Create and lock PID file. This ensures that there cannot be more than one
+    // pageserver process running at the same time.
     let lock_file_path = conf.workdir.join(PID_FILE_NAME);
     let lock_file =
         utils::pid_file::claim_for_current_process(&lock_file_path).context("claim pid file")?;
     info!("Claimed pid file at {lock_file_path:?}");
 
-    // ensure that the lock file is held even if the main thread of the process is panics
-    // we need to release the lock file only when the current process is gone
+    // Ensure that the lock file is held even if the main thread of the process panics.
+    // We need to release the lock file only when the process exits.
     std::mem::forget(lock_file);
 
-    // TODO: Check that it looks like a valid repository before going further
+    // Bind the HTTP and libpq ports early, so that if they are in use by some other
+    // process, we error out early.
+    let http_addr = &conf.listen_http_addr;
+    info!("Starting pageserver http handler on {http_addr}");
+    let http_listener = tcp_listener::bind(http_addr)?;
 
-    // bind sockets before daemonizing so we report errors early and do not return until we are listening
-    info!(
-        "Starting pageserver http handler on {}",
-        conf.listen_http_addr
-    );
-    let http_listener = tcp_listener::bind(conf.listen_http_addr.clone())?;
-
-    info!(
-        "Starting pageserver pg protocol handler on {}",
-        conf.listen_pg_addr
-    );
-    let pageserver_listener = tcp_listener::bind(conf.listen_pg_addr.clone())?;
+    let pg_addr = &conf.listen_pg_addr;
+    info!("Starting pageserver pg protocol handler on {pg_addr}");
+    let pageserver_listener = tcp_listener::bind(pg_addr)?;
 
+    // Install signal handlers
     let signals = signals::install_shutdown_handlers()?;
 
-    // start profiler (if enabled)
+    // Start profiler (if enabled)
     let profiler_guard = profiling::init_profiler(conf);
 
+    // Launch broker client
     WALRECEIVER_RUNTIME.block_on(pageserver::walreceiver::init_broker_client(conf))?;
 
-    // initialize authentication for incoming connections
+    // Initialize authentication for incoming connections
     let auth = match &conf.auth_type {
         AuthType::Trust | AuthType::MD5 => None,
         AuthType::NeonJWT => {
@@ -277,6 +280,7 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
         }
     };
 
+    // Set up remote storage client
     let remote_storage = conf
         .remote_storage_config
         .as_ref()
@@ -284,30 +288,18 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
         .transpose()
         .context("Failed to init generic remote storage")?;
 
-    let (init_result_sender, init_result_receiver) =
-        std::sync::mpsc::channel::<anyhow::Result<()>>();
-    let storage_for_spawn = remote_storage.clone();
-    let _handler = BACKGROUND_RUNTIME.spawn(async move {
-        let result = tenant_mgr::init_tenant_mgr(conf, storage_for_spawn).await;
-        init_result_sender.send(result)
-    });
-    match init_result_receiver.recv() {
-        Ok(init_result) => init_result.context("Failed to init tenant_mgr")?,
-        Err(_sender_dropped_err) => {
-            anyhow::bail!("Failed to init tenant_mgr: no init status was returned");
-        }
-    }
+    // Scan the local 'tenants/' directory and start loading the tenants
+    BACKGROUND_RUNTIME.block_on(tenant_mgr::init_tenant_mgr(conf, remote_storage.clone()))?;
 
-    // Spawn all HTTP related tasks in the MGMT_REQUEST_RUNTIME.
-    // bind before launching separate thread so the error reported before startup exits
-
-    // Create a Service from the router above to handle incoming requests.
+    // Start up the service to handle HTTP mgmt API request. We created the
+    // listener earlier already.
     {
         let _rt_guard = MGMT_REQUEST_RUNTIME.enter();
 
-        let router = http::make_router(conf, auth.clone(), remote_storage)?;
-        let service =
-            utils::http::RouterService::new(router.build().map_err(|err| anyhow!(err))?).unwrap();
+        let router = http::make_router(conf, auth.clone(), remote_storage)?
+            .build()
+            .map_err(|err| anyhow!(err))?;
+        let service = utils::http::RouterService::new(router).unwrap();
         let server = hyper::Server::from_tcp(http_listener)?
             .serve(service)
             .with_graceful_shutdown(task_mgr::shutdown_watcher());
@@ -327,7 +319,7 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
     }
 
     // Spawn a task to listen for libpq connections. It will spawn further tasks
-    // for each connection.
+    // for each connection. We created the listener earlier already.
     task_mgr::spawn(
         COMPUTE_REQUEST_RUNTIME.handle(),
         TaskKind::LibpqEndpointListener,
@@ -340,8 +332,6 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
         },
     );
 
-    set_build_info_metric(GIT_VERSION);
-
     // All started up! Now just sit and wait for shutdown signal.
     signals.handle(|signal| match signal {
         Signal::Quit => {

From c785a516aa6230a36fe48c9a96eb2a802125d5ad Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Mon, 19 Dec 2022 13:02:42 +0100
Subject: [PATCH 022/132] remove TimelineInfo.{Remote,Local} along with their
 types

follow-up of https://github.com/neondatabase/neon/pull/2615
which is neon.git: 538876650a0c303aeae4fac71336a3d62aa6da28

must be deployed after cloud.git change
https://github.com/neondatabase/cloud/issues/3232

fixes https://github.com/neondatabase/neon/issues/3041
---
 libs/pageserver_api/src/models.rs    | 23 ---------------------
 pageserver/src/http/openapi_spec.yml | 31 ----------------------------
 pageserver/src/http/routes.rs        | 16 ++------------
 3 files changed, 2 insertions(+), 68 deletions(-)

diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index e49b7051d2..586ce2a73a 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -203,29 +203,6 @@ pub struct TimelineInfo {
     pub pg_version: u32,
 
     pub state: TimelineState,
-
-    // Some of the above fields are duplicated in 'local' and 'remote', for backwards-
-    // compatility with older clients.
-    pub local: LocalTimelineInfo,
-    pub remote: RemoteTimelineInfo,
-}
-
-#[serde_as]
-#[derive(Debug, Serialize, Deserialize, Clone)]
-pub struct LocalTimelineInfo {
-    #[serde_as(as = "Option<DisplayFromStr>")]
-    pub ancestor_timeline_id: Option<TimelineId>,
-    #[serde_as(as = "Option<DisplayFromStr>")]
-    pub ancestor_lsn: Option<Lsn>,
-    pub current_logical_size: Option<u64>, // is None when timeline is Unloaded
-    pub current_physical_size: Option<u64>, // is None when timeline is Unloaded
-}
-
-#[serde_as]
-#[derive(Debug, Serialize, Deserialize, Clone)]
-pub struct RemoteTimelineInfo {
-    #[serde_as(as = "Option<DisplayFromStr>")]
-    pub remote_consistent_lsn: Option<Lsn>,
 }
 
 pub type ConfigureFailpointsRequest = Vec<FailpointConfig>;
diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml
index b372410c0d..67cf4ea326 100644
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -795,37 +795,6 @@ components:
         latest_gc_cutoff_lsn:
           type: string
           format: hex
-
-        # These 'local' and 'remote' fields just duplicate some of the fields
-        # above. They are kept for backwards-compatibility. They can be removed,
-        # when the control plane has been updated to look at the above fields
-        # directly.
-        local:
-          $ref: "#/components/schemas/LocalTimelineInfo"
-        remote:
-          $ref: "#/components/schemas/RemoteTimelineInfo"
-
-    LocalTimelineInfo:
-      type: object
-      properties:
-        ancestor_timeline_id:
-          type: string
-          format: hex
-        ancestor_lsn:
-          type: string
-          format: hex
-        current_logical_size:
-          type: integer
-        current_physical_size:
-          type: integer
-    RemoteTimelineInfo:
-      type: object
-      required:
-        - remote_consistent_lsn
-      properties:
-        remote_consistent_lsn:
-          type: string
-          format: hex
     Error:
       type: object
       required:
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 0ef555c4aa..40d2a0e0ef 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -7,8 +7,8 @@ use remote_storage::GenericRemoteStorage;
 use tracing::*;
 
 use super::models::{
-    LocalTimelineInfo, RemoteTimelineInfo, StatusResponse, TenantConfigRequest,
-    TenantCreateRequest, TenantCreateResponse, TenantInfo, TimelineCreateRequest, TimelineInfo,
+    StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TenantInfo,
+    TimelineCreateRequest, TimelineInfo,
 };
 use crate::pgdatadir_mapping::LsnForTimestamp;
 use crate::tenant::Timeline;
@@ -147,18 +147,6 @@ fn build_timeline_info_common(timeline: &Arc<Timeline>) -> anyhow::Result<Timeli
         pg_version: timeline.pg_version,
 
         state,
-
-        // Duplicate some fields in 'local' and 'remote' fields, for backwards-compatility
-        // with the control plane.
-        local: LocalTimelineInfo {
-            ancestor_timeline_id,
-            ancestor_lsn,
-            current_logical_size,
-            current_physical_size,
-        },
-        remote: RemoteTimelineInfo {
-            remote_consistent_lsn: Some(remote_consistent_lsn),
-        },
     };
     Ok(info)
 }

From ee2b5dc9ac0bebfa3d10da0ab0418c576581d949 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Thu, 15 Dec 2022 12:15:23 +0100
Subject: [PATCH 023/132] [1/4] initial logical size calculation: if it fails,
 retry on next call

Before this patch, if the task fails, we would not reset
self.initial_size_computation_started.
So, if it fails, we will return the approximate value forever.

In practice, it probably never failed because the local filesystem
is quite reliable.

But with on-demand download, the logical size calculation may need
to download layers, which is more likely to fail at times.
There will be internal retries with a timeout, but eventually,
the downloads will give up.
We want to retry in those cases.

While we're at it, also change the handling of the timeline state
watch so that we treat it as an error. Most likely, we'll not be
called again, but if we are, retrying is the right thing.
---
 pageserver/src/tenant/timeline.rs | 146 +++++++++++++++++++-----------
 1 file changed, 91 insertions(+), 55 deletions(-)

diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index cc6583dcf6..b61ef09c46 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -15,7 +15,7 @@ use std::collections::HashMap;
 use std::fs;
 use std::ops::{Deref, Range};
 use std::path::{Path, PathBuf};
-use std::sync::atomic::{AtomicBool, AtomicI64, Ordering as AtomicOrdering};
+use std::sync::atomic::{AtomicI64, Ordering as AtomicOrdering};
 use std::sync::{Arc, Mutex, MutexGuard, RwLock};
 use std::time::{Duration, Instant, SystemTime};
 
@@ -176,7 +176,7 @@ pub struct Timeline {
 
     /// Current logical size of the "datadir", at the last LSN.
     current_logical_size: LogicalSize,
-    initial_size_computation_started: AtomicBool,
+    initial_size_computation_state: Mutex<InitialLogicalSizeComputationState>,
 
     /// Information about the last processed message by the WAL receiver,
     /// or None if WAL receiver has not received anything for this timeline
@@ -189,6 +189,14 @@ pub struct Timeline {
     state: watch::Sender<TimelineState>,
 }
 
+#[derive(Debug, PartialEq, Eq, Clone, Copy)]
+enum InitialLogicalSizeComputationState {
+    NotStarted,
+    Running,
+    FailedWillRetryNextTime,
+    Success,
+}
+
 /// Internal structure to hold all data needed for logical size calculation.
 /// Calculation consists of two parts:
 /// 1.  Initial size calculation. That might take a long time, because it requires
@@ -804,7 +812,9 @@ impl Timeline {
                 // initial logical size is 0.
                 LogicalSize::empty_initial()
             },
-            initial_size_computation_started: AtomicBool::new(false),
+            initial_size_computation_state: Mutex::new(
+                InitialLogicalSizeComputationState::NotStarted,
+            ),
             partitioning: Mutex::new((KeyPartitioning::new(), Lsn(0))),
             repartition_threshold: 0,
 
@@ -1221,59 +1231,85 @@ impl Timeline {
     }
 
     fn try_spawn_size_init_task(self: &Arc<Self>, init_lsn: Lsn) {
-        // Atomically check if the timeline size calculation had already started.
-        // If the flag was not already set, this sets it.
-        if !self
-            .initial_size_computation_started
-            .swap(true, AtomicOrdering::SeqCst)
-        {
-            // We need to start the computation task.
-            let self_clone = Arc::clone(self);
-            task_mgr::spawn(
-                task_mgr::BACKGROUND_RUNTIME.handle(),
-                task_mgr::TaskKind::InitialLogicalSizeCalculation,
-                Some(self.tenant_id),
-                Some(self.timeline_id),
-                "initial size calculation",
-                false,
-                async move {
-                    let mut timeline_state_updates = self_clone.subscribe_for_state_updates();
-                    let self_calculation = Arc::clone(&self_clone);
-                    tokio::select! {
-                        calculation_result = spawn_blocking(move || self_calculation.calculate_logical_size(init_lsn)) => {
-                            let calculated_size = calculation_result
-                                .context("Failed to spawn calculation result task")?
-                                .context("Failed to calculate logical size")?;
-                            match self_clone.current_logical_size.initial_logical_size.set(calculated_size) {
-                                Ok(()) => info!("Successfully calculated initial logical size"),
-                                Err(existing_size) => error!("Tried to update initial timeline size value to {calculated_size}, but the size was already set to {existing_size}, not changing"),
-                            }
-                            Ok(())
-                        },
-                        new_event = async {
-                            loop {
-                                match timeline_state_updates.changed().await {
-                                    Ok(()) => {
-                                        let new_state = *timeline_state_updates.borrow();
-                                        match new_state {
-                                            // we're running this job for active timelines only
-                                            TimelineState::Active => continue,
-                                            TimelineState::Broken | TimelineState::Stopping | TimelineState::Suspended => return Some(new_state),
-                                        }
-                                    }
-                                    Err(_sender_dropped_error) => return None,
-                                }
-                            }
-                        } => {
-                            match new_event {
-                                Some(new_state) => info!("Timeline became inactive (new state: {new_state:?}), dropping current connections until it reactivates"),
-                                None => info!("Timeline dropped state updates sender, stopping init size calculation"),
-                            }
-                            Ok(())
-                        },
+        use InitialLogicalSizeComputationState::*;
+        let mut guard = self.initial_size_computation_state.lock().unwrap();
+        match *guard {
+            Running | Success => return,
+            NotStarted | FailedWillRetryNextTime => *guard = Running,
+        }
+        drop(guard);
+        // We need to start the computation task.
+        let self_clone = Arc::clone(self);
+        task_mgr::spawn(
+            task_mgr::BACKGROUND_RUNTIME.handle(),
+            task_mgr::TaskKind::InitialLogicalSizeCalculation,
+            Some(self.tenant_id),
+            Some(self.timeline_id),
+            "initial size calculation",
+            false,
+            async move {
+                let res = self_clone
+                    .initial_logical_size_calculation_task(init_lsn)
+                    .await;
+                // task_mgr will log the result
+                let new_state = match res {
+                    Ok(_) => Success,
+                    Err(_) => FailedWillRetryNextTime,
+                };
+                let mut state = self_clone.initial_size_computation_state.lock().unwrap();
+                if *state != Running {
+                    // Should be unreachable, but no reason to crash the pageserver. Don't touch anything.
+                    error!("expecting initial size computation task to be in state {Running:?}, got {state:?}")
+                } else {
+                    *state = new_state;
+                }
+                res
+            },
+        );
+    }
+
+    #[instrument(skip_all, fields(tenant = %self.tenant_id, timeline = %self.timeline_id))]
+    async fn initial_logical_size_calculation_task(
+        self: &Arc<Self>,
+        init_lsn: Lsn,
+    ) -> anyhow::Result<()> {
+        let mut timeline_state_updates = self.subscribe_for_state_updates();
+        let self_calculation = Arc::clone(self);
+        tokio::select! {
+            calculation_result = spawn_blocking(move || self_calculation.calculate_logical_size(init_lsn)) => {
+                let calculated_size = calculation_result
+                    .context("Failed to spawn calculation result task")?
+                    .context("Failed to calculate logical size")?;
+                match self.current_logical_size.initial_logical_size.set(calculated_size) {
+                    Ok(()) => (),
+                    Err(existing_size) => {
+                        // This shouldn't happen because we use self.initial_size_computation_running to ensure exlusivity here.
+                        // But if it happens, just complain & report success so there are no further retries.
+                        error!("Tried to update initial timeline size value to {calculated_size}, but the size was already set to {existing_size}, not changing")
                     }
-                }.instrument(info_span!("initial_logical_size_calculation", tenant = %self.tenant_id, timeline = %self.timeline_id)),
-            );
+                }
+                Ok(())
+            },
+            new_event = async {
+                loop {
+                    match timeline_state_updates.changed().await {
+                        Ok(()) => {
+                            let new_state = *timeline_state_updates.borrow();
+                            match new_state {
+                                // we're running this job for active timelines only
+                                TimelineState::Active => continue,
+                                TimelineState::Broken | TimelineState::Stopping | TimelineState::Suspended => return Some(new_state),
+                            }
+                        }
+                        Err(_sender_dropped_error) => return None,
+                    }
+                }
+            } => {
+                match new_event {
+                    Some(new_state) => anyhow::bail!("aborted because timeline became inactive (new state: {new_state:?})"),
+                    None => anyhow::bail!("aborted because state watch was dropped"), // can't happen, the sender is not dropped as long as the Timeline exists
+                }
+            },
         }
     }
 

From 40a3d508834e60860f8888ab68a357eba178c138 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Thu, 15 Dec 2022 15:17:13 +0100
Subject: [PATCH 024/132] [2/4] add test to show that tenant detach makes us
 leak running size calculation task

---
 pageserver/src/tenant/timeline.rs         | 21 ++++++
 test_runner/regress/test_timeline_size.py | 84 ++++++++++++++++++++++-
 2 files changed, 104 insertions(+), 1 deletion(-)

diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index b61ef09c46..e957878472 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -1321,6 +1321,27 @@ impl Timeline {
             "Calculating logical size for timeline {} at {}",
             self.timeline_id, up_to_lsn
         );
+        // These failpoints are used by python tests to ensure that we don't delete
+        // the timeline while the logical size computation is ongoing.
+        // The first failpoint is used to make this function pause.
+        // Then the python test initiates timeline delete operation in a thread.
+        // It waits for a few seconds, then arms the second failpoint and disables
+        // the first failpoint. The second failpoint prints an error if the timeline
+        // delete code has deleted the on-disk state while we're still running here.
+        // It shouldn't do that. If it does it anyway, the error will be caught
+        // by the test suite, highlighting the problem.
+        fail::fail_point!("timeline-calculate-logical-size-pause");
+        fail::fail_point!("timeline-calculate-logical-size-check-dir-exists", |_| {
+            if !self
+                .conf
+                .metadata_path(self.timeline_id, self.tenant_id)
+                .exists()
+            {
+                error!("timeline-calculate-logical-size-pre metadata file does not exist")
+            }
+            // need to return something
+            Ok(0)
+        });
         let timer = if up_to_lsn == self.initdb_lsn {
             if let Some(size) = self.current_logical_size.initialized_size() {
                 if size != 0 {
diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py
index 4b70c2ea18..e881608a44 100644
--- a/test_runner/regress/test_timeline_size.py
+++ b/test_runner/regress/test_timeline_size.py
@@ -1,6 +1,8 @@
 import math
+import queue
 import random
 import re
+import threading
 import time
 from contextlib import closing
 from pathlib import Path
@@ -11,6 +13,7 @@ from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     NeonEnv,
     NeonEnvBuilder,
+    PageserverApiException,
     PageserverHttpClient,
     PgBin,
     PortDistributor,
@@ -19,7 +22,7 @@ from fixtures.neon_fixtures import (
     wait_for_last_flush_lsn,
 )
 from fixtures.types import TenantId, TimelineId
-from fixtures.utils import get_timeline_dir_size
+from fixtures.utils import get_timeline_dir_size, wait_until
 
 
 def test_timeline_size(neon_simple_env: NeonEnv):
@@ -213,6 +216,85 @@ def test_timeline_size_quota(neon_env_builder: NeonEnvBuilder):
     ), "after the WAL is streamed, current_logical_size is expected to be calculated and to be equal its non-incremental value"
 
 
+def test_timeline_initial_logical_size_calculation_cancellation(neon_env_builder: NeonEnvBuilder):
+    env = neon_env_builder.init_start()
+    client = env.pageserver.http_client()
+
+    tenant_id, timeline_id = env.neon_cli.create_tenant()
+
+    # load in some data
+    pg = env.postgres.create_start("main", tenant_id=tenant_id)
+    pg.safe_psql_many(
+        [
+            "CREATE TABLE foo (x INTEGER)",
+            "INSERT INTO foo SELECT g FROM generate_series(1, 10000) g",
+        ]
+    )
+    wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id)
+    pg.stop()
+
+    # restart with failpoint inside initial size calculation task
+    env.pageserver.stop()
+    env.pageserver.start(
+        extra_env_vars={"FAILPOINTS": "timeline-calculate-logical-size-pause=pause"}
+    )
+
+    def tenant_active():
+        all_states = client.tenant_list()
+        [tenant] = [t for t in all_states if TenantId(t["id"]) == tenant_id]
+        assert tenant["state"] == "Active"
+
+    wait_until(30, 1, tenant_active)
+
+    # kick off initial size calculation task (the response we get here is the estimated size)
+    def assert_size_calculation_not_done():
+        details = client.timeline_detail(
+            tenant_id, timeline_id, include_non_incremental_logical_size=True
+        )
+        assert details["current_logical_size"] != details["current_logical_size_non_incremental"]
+
+    assert_size_calculation_not_done()
+    # ensure we're really stuck
+    time.sleep(5)
+    assert_size_calculation_not_done()
+
+    log.info(
+        "try to delete the timeline, this should cancel size computation tasks and wait for them to finish"
+    )
+    env.pageserver.allowed_errors.append(
+        f".*initial size calculation.*{tenant_id}.*{timeline_id}.*aborted because task_mgr shutdown requested"
+    )
+    delete_timeline_success: queue.Queue[bool] = queue.Queue(maxsize=1)
+
+    def delete_timeline_thread_fn():
+        try:
+            client.tenant_detach(tenant_id)
+            delete_timeline_success.put(True)
+        except PageserverApiException:
+            delete_timeline_success.put(False)
+            raise
+
+    delete_timeline_thread = threading.Thread(target=delete_timeline_thread_fn)
+    delete_timeline_thread.start()
+    # give it some time to settle in the state where it waits for size computation task
+    time.sleep(5)
+    assert (
+        not delete_timeline_success.empty()
+    ), "delete timeline should be stuck waiting for size computation task"
+
+    log.info(
+        "resume the size calculation. The failpoint checks that the timeline directory still exists."
+    )
+    client.configure_failpoints(("timeline-calculate-logical-size-check-dir-exists", "return"))
+    client.configure_failpoints(("timeline-calculate-logical-size-pause", "off"))
+
+    log.info("wait for delete timeline thread to finish and assert that it succeeded")
+    assert delete_timeline_success.get()
+
+    # if the implementation is incorrect, the teardown would complain about an error log
+    # message emitted by the code behind failpoint "timeline-calculate-logical-size-check-dir-exists"
+
+
 def test_timeline_physical_size_init(neon_simple_env: NeonEnv):
     env = neon_simple_env
     new_timeline_id = env.neon_cli.create_branch("test_timeline_physical_size_init")

From 38ebd6e7a00bb37b221095158e3b79c6a33ba5b5 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Thu, 15 Dec 2022 15:16:25 +0100
Subject: [PATCH 025/132] [3/4] make initial size estimation task sensitive to
 task_mgr shutdown requests

This exacerbates the problem pointed out in the previous commit.
Why? Because with this patch, deleting a timeline also exposes the issue.

Extend the test to expose the problem.
---
 pageserver/src/tenant/timeline.rs         |  3 +++
 test_runner/regress/test_timeline_size.py | 27 +++++++++++++++--------
 2 files changed, 21 insertions(+), 9 deletions(-)

diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index e957878472..b7f12609e6 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -1290,6 +1290,9 @@ impl Timeline {
                 }
                 Ok(())
             },
+            _ = task_mgr::shutdown_watcher() => {
+                anyhow::bail!("aborted because task_mgr shutdown requested");
+            }
             new_event = async {
                 loop {
                     match timeline_state_updates.changed().await {
diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py
index e881608a44..38660cefac 100644
--- a/test_runner/regress/test_timeline_size.py
+++ b/test_runner/regress/test_timeline_size.py
@@ -9,6 +9,7 @@ from pathlib import Path
 
 import psycopg2.errors
 import psycopg2.extras
+import pytest
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     NeonEnv,
@@ -216,7 +217,10 @@ def test_timeline_size_quota(neon_env_builder: NeonEnvBuilder):
     ), "after the WAL is streamed, current_logical_size is expected to be calculated and to be equal its non-incremental value"
 
 
-def test_timeline_initial_logical_size_calculation_cancellation(neon_env_builder: NeonEnvBuilder):
+@pytest.mark.parametrize("deletion_method", ["tenant_detach", "timeline_delete"])
+def test_timeline_initial_logical_size_calculation_cancellation(
+    neon_env_builder: NeonEnvBuilder, deletion_method: str
+):
     env = neon_env_builder.init_start()
     client = env.pageserver.http_client()
 
@@ -259,16 +263,20 @@ def test_timeline_initial_logical_size_calculation_cancellation(neon_env_builder
     assert_size_calculation_not_done()
 
     log.info(
-        "try to delete the timeline, this should cancel size computation tasks and wait for them to finish"
-    )
-    env.pageserver.allowed_errors.append(
-        f".*initial size calculation.*{tenant_id}.*{timeline_id}.*aborted because task_mgr shutdown requested"
+        f"try to delete the timeline using {deletion_method}, this should cancel size computation tasks and wait for them to finish"
     )
+    if deletion_method == "timeline_delete":
+        env.pageserver.allowed_errors.append(
+            f".*initial size calculation.*{tenant_id}.*{timeline_id}.*aborted because task_mgr shutdown requested"
+        )
     delete_timeline_success: queue.Queue[bool] = queue.Queue(maxsize=1)
 
     def delete_timeline_thread_fn():
         try:
-            client.tenant_detach(tenant_id)
+            if deletion_method == "tenant_detach":
+                client.tenant_detach(tenant_id)
+            elif deletion_method == "timeline_delete":
+                client.timeline_delete(tenant_id, timeline_id)
             delete_timeline_success.put(True)
         except PageserverApiException:
             delete_timeline_success.put(False)
@@ -278,9 +286,10 @@ def test_timeline_initial_logical_size_calculation_cancellation(neon_env_builder
     delete_timeline_thread.start()
     # give it some time to settle in the state where it waits for size computation task
     time.sleep(5)
-    assert (
-        not delete_timeline_success.empty()
-    ), "delete timeline should be stuck waiting for size computation task"
+    if not delete_timeline_success.empty():
+        assert (
+            False
+        ), f"test is broken, the {deletion_method} should be stuck waiting for size computation task, got result {delete_timeline_success.get()}"
 
     log.info(
         "resume the size calculation. The failpoint checks that the timeline directory still exists."

From 7db018e1477e721802d603496f4722012423aa7a Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Thu, 15 Dec 2022 17:20:38 +0100
Subject: [PATCH 026/132] [4/4] the fix: do not leak spawn_blocking() tasks
 from logical size calculation code

- Refactor logical_size_calculation_task, moving the pieces that are
  specific to try_spawn_size_init_task into that function.
  This allows us to spawn additional size calculation tasks that are not
  init size calculation tasks.

  - As part of this refactoring, stop logging cancellations as errors.
    They are part of regular operations.
    Logging them as errors was inadvertently introduced in an earlier commit

      427c1b2e9661161439e65aabc173d695cfc03ab4
      initial logical size calculation: if it fails, retry on next call

- Change tenant size model request code to spawn task_mgr tasks using
  the refactored logical_size_calculation_task function.
  Using a task_mgr task ensures that the calculation cannot outlive
  the timeline.
  - There are presumably still some subtle race conditions if a size
    request comes in at exactly the same time as a detach / delete
    request.
  - But that's the concern of a different area of the code (e.g., tenant_mgr)
    and requires holistic solutions, such as the proposed TenantGuard.

- Make size calculation cancellable using CancellationToken.
  This is more of a cherry on top.
  NB: the test code doesn't use this because we _must_ return from
  the failpoint, because the failpoint lib doesn't allow us to just
  continue execution in combination with executing the closure.

This commit fixes the tests introduced earlier in this patch series.
---
 pageserver/src/http/routes.rs             |   9 +-
 pageserver/src/pgdatadir_mapping.rs       |  22 ++-
 pageserver/src/tenant/size.rs             |  59 ++++---
 pageserver/src/tenant/timeline.rs         | 202 ++++++++++++++--------
 test_runner/regress/test_timeline_size.py |   4 -
 5 files changed, 187 insertions(+), 109 deletions(-)

diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 40d2a0e0ef..68a26b8098 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -4,6 +4,7 @@ use anyhow::{anyhow, Context, Result};
 use hyper::StatusCode;
 use hyper::{Body, Request, Response, Uri};
 use remote_storage::GenericRemoteStorage;
+use tokio_util::sync::CancellationToken;
 use tracing::*;
 
 use super::models::{
@@ -86,8 +87,14 @@ fn build_timeline_info(
 ) -> anyhow::Result<TimelineInfo> {
     let mut info = build_timeline_info_common(timeline)?;
     if include_non_incremental_logical_size {
+        // XXX we should be using spawn_ondemand_logical_size_calculation here.
+        // Otherwise, if someone deletes the timeline / detaches the tenant while
+        // we're executing this function, we will outlive the timeline on-disk state.
         info.current_logical_size_non_incremental =
-            Some(timeline.get_current_logical_size_non_incremental(info.last_record_lsn)?);
+            Some(timeline.get_current_logical_size_non_incremental(
+                info.last_record_lsn,
+                CancellationToken::new(),
+            )?);
     }
     if include_non_incremental_physical_size {
         info.current_physical_size_non_incremental =
diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs
index 0e334a63df..797ee9f436 100644
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -10,7 +10,7 @@ use crate::keyspace::{KeySpace, KeySpaceAccum};
 use crate::repository::*;
 use crate::tenant::Timeline;
 use crate::walrecord::NeonWalRecord;
-use anyhow::{bail, ensure, Result};
+use anyhow::{bail, ensure, Context, Result};
 use bytes::{Buf, Bytes};
 use pageserver_api::reltag::{RelTag, SlruKind};
 use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
@@ -19,6 +19,7 @@ use postgres_ffi::{Oid, TimestampTz, TransactionId};
 use serde::{Deserialize, Serialize};
 use std::collections::{hash_map, HashMap, HashSet};
 use std::ops::Range;
+use tokio_util::sync::CancellationToken;
 use tracing::{debug, trace, warn};
 use utils::{bin_ser::BeSer, lsn::Lsn};
 
@@ -33,6 +34,14 @@ pub enum LsnForTimestamp {
     NoData(Lsn),
 }
 
+#[derive(Debug, thiserror::Error)]
+pub enum CalculateLogicalSizeError {
+    #[error("cancelled")]
+    Cancelled,
+    #[error(transparent)]
+    Other(#[from] anyhow::Error),
+}
+
 ///
 /// This impl provides all the functionality to store PostgreSQL relations, SLRUs,
 /// and other special kinds of files, in a versioned key-value store. The
@@ -376,14 +385,21 @@ impl Timeline {
     ///
     /// Only relation blocks are counted currently. That excludes metadata,
     /// SLRUs, twophase files etc.
-    pub fn get_current_logical_size_non_incremental(&self, lsn: Lsn) -> Result<u64> {
+    pub fn get_current_logical_size_non_incremental(
+        &self,
+        lsn: Lsn,
+        cancel: CancellationToken,
+    ) -> std::result::Result<u64, CalculateLogicalSizeError> {
         // Fetch list of database dirs and iterate them
         let buf = self.get(DBDIR_KEY, lsn)?;
-        let dbdir = DbDirectory::des(&buf)?;
+        let dbdir = DbDirectory::des(&buf).context("deserialize db directory")?;
 
         let mut total_size: u64 = 0;
         for (spcnode, dbnode) in dbdir.dbdirs.keys() {
             for rel in self.list_rels(*spcnode, *dbnode, lsn)? {
+                if cancel.is_cancelled() {
+                    return Err(CalculateLogicalSizeError::Cancelled);
+                }
                 let relsize_key = rel_size_to_key(rel);
                 let mut buf = self.get(relsize_key, lsn)?;
                 let relsize = buf.get_u32_le();
diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs
index 24d9b2a10e..597461ce29 100644
--- a/pageserver/src/tenant/size.rs
+++ b/pageserver/src/tenant/size.rs
@@ -3,8 +3,11 @@ use std::collections::{HashMap, HashSet};
 use std::sync::Arc;
 
 use anyhow::Context;
+use tokio::sync::oneshot::error::RecvError;
 use tokio::sync::Semaphore;
 
+use crate::pgdatadir_mapping::CalculateLogicalSizeError;
+
 use super::Tenant;
 use utils::id::TimelineId;
 use utils::lsn::Lsn;
@@ -212,11 +215,30 @@ pub(super) async fn gather_inputs(
     let mut have_any_error = false;
 
     while let Some(res) = joinset.join_next().await {
-        // each of these come with Result<Result<_, JoinError>, JoinError>
+        // each of these come with Result<Result<_, RecvError>, JoinError>
         // because of spawn + spawn_blocking
-        let res = res.and_then(|inner| inner);
         match res {
-            Ok(TimelineAtLsnSizeResult(timeline, lsn, Ok(size))) => {
+            Err(join_error) if join_error.is_cancelled() => {
+                unreachable!("we are not cancelling any of the futures, nor should be");
+            }
+            Err(join_error) => {
+                // cannot really do anything, as this panic is likely a bug
+                error!("task that calls spawn_ondemand_logical_size_calculation panicked: {join_error:#}");
+                have_any_error = true;
+            }
+            Ok(Err(recv_result_error)) => {
+                // cannot really do anything: the result sender was dropped without sending, which is likely a bug
+                error!("failed to receive logical size query result: {recv_result_error:#}");
+                have_any_error = true;
+            }
+            Ok(Ok(TimelineAtLsnSizeResult(timeline, lsn, Err(error)))) => {
+                warn!(
+                    timeline_id=%timeline.timeline_id,
+                    "failed to calculate logical size at {lsn}: {error:#}"
+                );
+                have_any_error = true;
+            }
+            Ok(Ok(TimelineAtLsnSizeResult(timeline, lsn, Ok(size)))) => {
                 debug!(timeline_id=%timeline.timeline_id, %lsn, size, "size calculated");
 
                 logical_size_cache.insert((timeline.timeline_id, lsn), size);
@@ -228,21 +250,6 @@ pub(super) async fn gather_inputs(
                     command: Command::Update(size),
                 });
             }
-            Ok(TimelineAtLsnSizeResult(timeline, lsn, Err(error))) => {
-                warn!(
-                    timeline_id=%timeline.timeline_id,
-                    "failed to calculate logical size at {lsn}: {error:#}"
-                );
-                have_any_error = true;
-            }
-            Err(join_error) if join_error.is_cancelled() => {
-                unreachable!("we are not cancelling any of the futures, nor should be");
-            }
-            Err(join_error) => {
-                // cannot really do anything, as this panic is likely a bug
-                error!("logical size query panicked: {join_error:#}");
-                have_any_error = true;
-            }
         }
     }
 
@@ -351,7 +358,7 @@ enum LsnKind {
 struct TimelineAtLsnSizeResult(
     Arc<crate::tenant::Timeline>,
     utils::lsn::Lsn,
-    anyhow::Result<u64>,
+    Result<u64, CalculateLogicalSizeError>,
 );
 
 #[instrument(skip_all, fields(timeline_id=%timeline.timeline_id, lsn=%lsn))]
@@ -359,17 +366,15 @@ async fn calculate_logical_size(
     limit: Arc<tokio::sync::Semaphore>,
     timeline: Arc<crate::tenant::Timeline>,
     lsn: utils::lsn::Lsn,
-) -> Result<TimelineAtLsnSizeResult, tokio::task::JoinError> {
-    let permit = tokio::sync::Semaphore::acquire_owned(limit)
+) -> Result<TimelineAtLsnSizeResult, RecvError> {
+    let _permit = tokio::sync::Semaphore::acquire_owned(limit)
         .await
         .expect("global semaphore should not had been closed");
 
-    tokio::task::spawn_blocking(move || {
-        let _permit = permit;
-        let size_res = timeline.calculate_logical_size(lsn);
-        TimelineAtLsnSizeResult(timeline, lsn, size_res)
-    })
-    .await
+    let size_res = timeline
+        .spawn_ondemand_logical_size_calculation(lsn)
+        .await?;
+    Ok(TimelineAtLsnSizeResult(timeline, lsn, size_res))
 }
 
 #[test]
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index b7f12609e6..3373c52231 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -6,8 +6,9 @@ use fail::fail_point;
 use itertools::Itertools;
 use once_cell::sync::OnceCell;
 use pageserver_api::models::TimelineState;
-use tokio::sync::watch;
+use tokio::sync::{oneshot, watch, Semaphore, TryAcquireError};
 use tokio::task::spawn_blocking;
+use tokio_util::sync::CancellationToken;
 use tracing::*;
 
 use std::cmp::{max, min, Ordering};
@@ -36,9 +37,9 @@ use crate::tenant::{
 use crate::config::PageServerConf;
 use crate::keyspace::{KeyPartitioning, KeySpace};
 use crate::metrics::TimelineMetrics;
-use crate::pgdatadir_mapping::BlockNumber;
 use crate::pgdatadir_mapping::LsnForTimestamp;
 use crate::pgdatadir_mapping::{is_rel_fsm_block_key, is_rel_vm_block_key};
+use crate::pgdatadir_mapping::{BlockNumber, CalculateLogicalSizeError};
 use crate::tenant_config::TenantConfOpt;
 use pageserver_api::reltag::RelTag;
 
@@ -176,7 +177,6 @@ pub struct Timeline {
 
     /// Current logical size of the "datadir", at the last LSN.
     current_logical_size: LogicalSize,
-    initial_size_computation_state: Mutex<InitialLogicalSizeComputationState>,
 
     /// Information about the last processed message by the WAL receiver,
     /// or None if WAL receiver has not received anything for this timeline
@@ -189,14 +189,6 @@ pub struct Timeline {
     state: watch::Sender<TimelineState>,
 }
 
-#[derive(Debug, PartialEq, Eq, Clone, Copy)]
-enum InitialLogicalSizeComputationState {
-    NotStarted,
-    Running,
-    FailedWillRetryNextTime,
-    Success,
-}
-
 /// Internal structure to hold all data needed for logical size calculation.
 /// Calculation consists of two parts:
 /// 1.  Initial size calculation. That might take a long time, because it requires
@@ -210,6 +202,8 @@ struct LogicalSize {
     ///
     /// NOTE: initial size is not a constant and will change between restarts.
     initial_logical_size: OnceCell<u64>,
+    /// Semaphore to track ongoing calculation of `initial_logical_size`.
+    initial_size_computation: Arc<tokio::sync::Semaphore>,
     /// Latest Lsn that has its size uncalculated, could be absent for freshly created timelines.
     initial_part_end: Option<Lsn>,
     /// All other size changes after startup, combined together.
@@ -260,6 +254,8 @@ impl LogicalSize {
     fn empty_initial() -> Self {
         Self {
             initial_logical_size: OnceCell::with_value(0),
+            //  initial_logical_size already computed, so, don't admit any calculations
+            initial_size_computation: Arc::new(Semaphore::new(0)),
             initial_part_end: None,
             size_added_after_initial: AtomicI64::new(0),
         }
@@ -268,6 +264,7 @@ impl LogicalSize {
     fn deferred_initial(compute_to: Lsn) -> Self {
         Self {
             initial_logical_size: OnceCell::new(),
+            initial_size_computation: Arc::new(Semaphore::new(1)),
             initial_part_end: Some(compute_to),
             size_added_after_initial: AtomicI64::new(0),
         }
@@ -812,9 +809,6 @@ impl Timeline {
                 // initial logical size is 0.
                 LogicalSize::empty_initial()
             },
-            initial_size_computation_state: Mutex::new(
-                InitialLogicalSizeComputationState::NotStarted,
-            ),
             partitioning: Mutex::new((KeyPartitioning::new(), Lsn(0))),
             repartition_threshold: 0,
 
@@ -1231,13 +1225,21 @@ impl Timeline {
     }
 
     fn try_spawn_size_init_task(self: &Arc<Self>, init_lsn: Lsn) {
-        use InitialLogicalSizeComputationState::*;
-        let mut guard = self.initial_size_computation_state.lock().unwrap();
-        match *guard {
-            Running | Success => return,
-            NotStarted | FailedWillRetryNextTime => *guard = Running,
-        }
-        drop(guard);
+        let permit = match Arc::clone(&self.current_logical_size.initial_size_computation)
+            .try_acquire_owned()
+        {
+            Ok(permit) => permit,
+            Err(TryAcquireError::NoPermits) => {
+                // computation already ongoing or finished with success
+                return;
+            }
+            Err(TryAcquireError::Closed) => unreachable!("we never call close"),
+        };
+        debug_assert!(self
+            .current_logical_size
+            .initial_logical_size
+            .get()
+            .is_none());
         // We need to start the computation task.
         let self_clone = Arc::clone(self);
         task_mgr::spawn(
@@ -1247,79 +1249,131 @@ impl Timeline {
             Some(self.timeline_id),
             "initial size calculation",
             false,
+            // NB: don't log errors here, task_mgr will do that.
             async move {
-                let res = self_clone
-                    .initial_logical_size_calculation_task(init_lsn)
-                    .await;
-                // task_mgr will log the result
-                let new_state = match res {
-                    Ok(_) => Success,
-                    Err(_) => FailedWillRetryNextTime,
+                let calculated_size = match self_clone.logical_size_calculation_task(init_lsn).await
+                {
+                    Ok(s) => s,
+                    Err(CalculateLogicalSizeError::Cancelled) => {
+                        // Don't make noise, this is a common task.
+                        // In the unlikely case that there is another call to this function, we'll retry
+                        // because initial_logical_size is still None.
+                        info!("initial size calculation cancelled, likely timeline delete / tenant detach");
+                        return Ok(());
+                    }
+                    x @ Err(_) => x.context("Failed to calculate logical size")?,
                 };
-                let mut state = self_clone.initial_size_computation_state.lock().unwrap();
-                if *state != Running {
-                    // Should be unreachable, but no reason to crash the pageserver. Don't touch anything.
-                    error!("expecting initial size computation task to be in state {Running:?}, got {state:?}")
-                } else {
-                    *state = new_state;
-                }
-                res
-            },
-        );
-    }
-
-    #[instrument(skip_all, fields(tenant = %self.tenant_id, timeline = %self.timeline_id))]
-    async fn initial_logical_size_calculation_task(
-        self: &Arc<Self>,
-        init_lsn: Lsn,
-    ) -> anyhow::Result<()> {
-        let mut timeline_state_updates = self.subscribe_for_state_updates();
-        let self_calculation = Arc::clone(self);
-        tokio::select! {
-            calculation_result = spawn_blocking(move || self_calculation.calculate_logical_size(init_lsn)) => {
-                let calculated_size = calculation_result
-                    .context("Failed to spawn calculation result task")?
-                    .context("Failed to calculate logical size")?;
-                match self.current_logical_size.initial_logical_size.set(calculated_size) {
+                match self_clone
+                    .current_logical_size
+                    .initial_logical_size
+                    .set(calculated_size)
+                {
                     Ok(()) => (),
                     Err(existing_size) => {
-                        // This shouldn't happen because we use self.initial_size_computation_running to ensure exlusivity here.
+                        // This shouldn't happen because the semaphore is initialized with 1.
                         // But if it happens, just complain & report success so there are no further retries.
                         error!("Tried to update initial timeline size value to {calculated_size}, but the size was already set to {existing_size}, not changing")
                     }
                 }
+                // now that `initial_logical_size.is_some()`, reduce permit count to 0
+                // so that we prevent future callers from spawning this task
+                permit.forget();
                 Ok(())
             },
-            _ = task_mgr::shutdown_watcher() => {
-                anyhow::bail!("aborted because task_mgr shutdown requested");
-            }
-            new_event = async {
-                loop {
-                    match timeline_state_updates.changed().await {
-                        Ok(()) => {
-                            let new_state = *timeline_state_updates.borrow();
-                            match new_state {
-                                // we're running this job for active timelines only
-                                TimelineState::Active => continue,
-                                TimelineState::Broken | TimelineState::Stopping | TimelineState::Suspended => return Some(new_state),
+        );
+    }
+
+    pub fn spawn_ondemand_logical_size_calculation(
+        self: &Arc<Self>,
+        lsn: Lsn,
+    ) -> oneshot::Receiver<Result<u64, CalculateLogicalSizeError>> {
+        let (sender, receiver) = oneshot::channel();
+        let self_clone = Arc::clone(self);
+        task_mgr::spawn(
+            task_mgr::BACKGROUND_RUNTIME.handle(),
+            task_mgr::TaskKind::InitialLogicalSizeCalculation,
+            Some(self.tenant_id),
+            Some(self.timeline_id),
+            "ondemand logical size calculation",
+            false,
+            async move {
+                let res = self_clone.logical_size_calculation_task(lsn).await;
+                let _ = sender.send(res).ok();
+                Ok(()) // Receiver is responsible for handling errors
+            },
+        );
+        receiver
+    }
+
+    #[instrument(skip_all, fields(tenant = %self.tenant_id, timeline = %self.timeline_id))]
+    async fn logical_size_calculation_task(
+        self: &Arc<Self>,
+        init_lsn: Lsn,
+    ) -> Result<u64, CalculateLogicalSizeError> {
+        let mut timeline_state_updates = self.subscribe_for_state_updates();
+        let self_calculation = Arc::clone(self);
+        let cancel = CancellationToken::new();
+
+        let calculation = async {
+            let cancel = cancel.child_token();
+            spawn_blocking(move || self_calculation.calculate_logical_size(init_lsn, cancel))
+                .await
+                .context("Failed to spawn calculation result task")?
+        };
+        let timeline_state_cancellation = async {
+            loop {
+                match timeline_state_updates.changed().await {
+                    Ok(()) => {
+                        let new_state = *timeline_state_updates.borrow();
+                        match new_state {
+                            // we're running this job for active timelines only
+                            TimelineState::Active => continue,
+                            TimelineState::Broken
+                            | TimelineState::Stopping
+                            | TimelineState::Suspended => {
+                                break format!("aborted because timeline became inactive (new state: {new_state:?})")
                             }
                         }
-                        Err(_sender_dropped_error) => return None,
+                    }
+                    Err(_sender_dropped_error) => {
+                        // can't happen, the sender is not dropped as long as the Timeline exists
+                        break "aborted because state watch was dropped".to_string();
                     }
                 }
-            } => {
-                match new_event {
-                    Some(new_state) => anyhow::bail!("aborted because timeline became inactive (new state: {new_state:?})"),
-                    None => anyhow::bail!("aborted because state watch was dropped"), // can't happen, the sender is not dropped as long as the Timeline exists
+            }
+        };
+
+        let taskmgr_shutdown_cancellation = async {
+            task_mgr::shutdown_watcher().await;
+            "aborted because task_mgr shutdown requested".to_string()
+        };
+
+        tokio::pin!(calculation);
+        loop {
+            tokio::select! {
+                res = &mut calculation =>  { return res }
+                reason = timeline_state_cancellation => {
+                    debug!(reason = reason, "cancelling calculation");
+                    cancel.cancel();
+                    return calculation.await;
                 }
-            },
+                reason = taskmgr_shutdown_cancellation => {
+                    debug!(reason = reason, "cancelling calculation");
+                    cancel.cancel();
+                    return calculation.await;
+                }
+            }
         }
     }
 
     /// Calculate the logical size of the database at the latest LSN.
     ///
     /// NOTE: counted incrementally, includes ancestors, this can be a slow operation.
-    pub fn calculate_logical_size(&self, up_to_lsn: Lsn) -> anyhow::Result<u64> {
+    pub fn calculate_logical_size(
+        &self,
+        up_to_lsn: Lsn,
+        cancel: CancellationToken,
+    ) -> Result<u64, CalculateLogicalSizeError> {
         info!(
             "Calculating logical size for timeline {} at {}",
             self.timeline_id, up_to_lsn
@@ -1360,7 +1414,7 @@ impl Timeline {
         } else {
             self.metrics.logical_size_histo.start_timer()
         };
-        let logical_size = self.get_current_logical_size_non_incremental(up_to_lsn)?;
+        let logical_size = self.get_current_logical_size_non_incremental(up_to_lsn, cancel)?;
         debug!("calculated logical size: {logical_size}");
         timer.stop_and_record();
         Ok(logical_size)
diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py
index 38660cefac..523c946a68 100644
--- a/test_runner/regress/test_timeline_size.py
+++ b/test_runner/regress/test_timeline_size.py
@@ -265,10 +265,6 @@ def test_timeline_initial_logical_size_calculation_cancellation(
     log.info(
         f"try to delete the timeline using {deletion_method}, this should cancel size computation tasks and wait for them to finish"
     )
-    if deletion_method == "timeline_delete":
-        env.pageserver.allowed_errors.append(
-            f".*initial size calculation.*{tenant_id}.*{timeline_id}.*aborted because task_mgr shutdown requested"
-        )
     delete_timeline_success: queue.Queue[bool] = queue.Queue(maxsize=1)
 
     def delete_timeline_thread_fn():

From 49a211c98a357543ab78a320466787487f9d7ed5 Mon Sep 17 00:00:00 2001
From: Kirill Bulatov <kirill@neon.tech>
Date: Mon, 19 Dec 2022 15:38:41 +0200
Subject: [PATCH 027/132] Add neon_local test

---
 control_plane/src/bin/neon_local.rs        |  1 +
 test_runner/fixtures/neon_fixtures.py      |  6 ++++++
 test_runner/regress/test_neon_local_cli.py | 10 ++++++++++
 3 files changed, 17 insertions(+)
 create mode 100644 test_runner/regress/test_neon_local_cli.py

diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs
index f0c3b983f0..61b9445c6d 100644
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -900,6 +900,7 @@ fn cli() -> Command {
     let stop_mode_arg = Arg::new("stop-mode")
         .short('m')
         .value_parser(["fast", "immediate"])
+        .default_value("fast")
         .help("If 'immediate', don't flush repository data at shutdown")
         .required(false)
         .value_name("stop-mode");
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 3a3ee94425..b3e4809f24 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1742,6 +1742,12 @@ class NeonCli(AbstractNeonCli):
 
         return self.raw_cli(args, check_return_code=check_return_code)
 
+    def start(self, check_return_code=True) -> "subprocess.CompletedProcess[str]":
+        return self.raw_cli(["start"], check_return_code=check_return_code)
+
+    def stop(self, check_return_code=True) -> "subprocess.CompletedProcess[str]":
+        return self.raw_cli(["stop"], check_return_code=check_return_code)
+
 
 class WalCraft(AbstractNeonCli):
     """
diff --git a/test_runner/regress/test_neon_local_cli.py b/test_runner/regress/test_neon_local_cli.py
new file mode 100644
index 0000000000..6c7cdb6f7f
--- /dev/null
+++ b/test_runner/regress/test_neon_local_cli.py
@@ -0,0 +1,10 @@
+from fixtures.neon_fixtures import NeonEnvBuilder
+
+
+# Test that neon cli is able to start and stop all processes with the user defaults.
+# def test_neon_cli_basics(neon_simple_env: NeonEnv):
+def test_neon_cli_basics(neon_env_builder: NeonEnvBuilder):
+    env = neon_env_builder.init()
+
+    env.neon_cli.start()
+    env.neon_cli.stop()

From 9ddd1d75225afed164f79809e1575121409ed69d Mon Sep 17 00:00:00 2001
From: Kirill Bulatov <kirill@neon.tech>
Date: Mon, 19 Dec 2022 16:41:26 +0200
Subject: [PATCH 028/132] Stop all storage nodes on startup failure

---
 control_plane/src/bin/neon_local.rs | 48 ++++++++++++++++-------------
 1 file changed, 27 insertions(+), 21 deletions(-)

diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs
index 61b9445c6d..53fd3100c7 100644
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -747,7 +747,7 @@ fn get_safekeeper(env: &local_env::LocalEnv, id: NodeId) -> Result<SafekeeperNod
     if let Some(node) = env.safekeepers.iter().find(|node| node.id == id) {
         Ok(SafekeeperNode::from_env(env, node))
     } else {
-        bail!("could not find safekeeper '{}'", id)
+        bail!("could not find safekeeper {id}")
     }
 }
 
@@ -806,22 +806,22 @@ fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul
 }
 
 fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::Result<()> {
-    broker::start_broker_process(env)?;
-    let pageserver = PageServerNode::from_env(env);
-
     // Postgres nodes are not started automatically
 
+    broker::start_broker_process(env)?;
+
+    let pageserver = PageServerNode::from_env(env);
     if let Err(e) = pageserver.start(&pageserver_config_overrides(sub_match)) {
-        eprintln!("pageserver start failed: {e}");
-        try_stop_storage_broker_process(env);
+        eprintln!("pageserver {} start failed: {:#}", env.pageserver.id, e);
+        try_stop_all(env, true);
         exit(1);
     }
 
     for node in env.safekeepers.iter() {
         let safekeeper = SafekeeperNode::from_env(env, node);
         if let Err(e) = safekeeper.start() {
-            eprintln!("safekeeper '{}' start failed: {e}", safekeeper.id);
-            try_stop_storage_broker_process(env);
+            eprintln!("safekeeper {} start failed: {:#}", safekeeper.id, e);
+            try_stop_all(env, false);
             exit(1);
         }
     }
@@ -832,35 +832,41 @@ fn handle_stop_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<
     let immediate =
         sub_match.get_one::<String>("stop-mode").map(|s| s.as_str()) == Some("immediate");
 
+    try_stop_all(env, immediate);
+
+    Ok(())
+}
+
+fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
     let pageserver = PageServerNode::from_env(env);
 
     // Stop all compute nodes
-    let cplane = ComputeControlPlane::load(env.clone())?;
-    for (_k, node) in cplane.nodes {
-        if let Err(e) = node.stop(false) {
-            eprintln!("postgres stop failed: {}", e);
+    match ComputeControlPlane::load(env.clone()) {
+        Ok(cplane) => {
+            for (_k, node) in cplane.nodes {
+                if let Err(e) = node.stop(false) {
+                    eprintln!("postgres stop failed: {e:#}");
+                }
+            }
+        }
+        Err(e) => {
+            eprintln!("postgres stop failed, could not restore control plane data from env: {e:#}")
         }
     }
 
     if let Err(e) = pageserver.stop(immediate) {
-        eprintln!("pageserver stop failed: {}", e);
+        eprintln!("pageserver {} stop failed: {:#}", env.pageserver.id, e);
     }
 
     for node in env.safekeepers.iter() {
         let safekeeper = SafekeeperNode::from_env(env, node);
         if let Err(e) = safekeeper.stop(immediate) {
-            eprintln!("safekeeper '{}' stop failed: {}", safekeeper.id, e);
+            eprintln!("safekeeper {} stop failed: {:#}", safekeeper.id, e);
         }
     }
 
-    try_stop_storage_broker_process(env);
-
-    Ok(())
-}
-
-fn try_stop_storage_broker_process(env: &local_env::LocalEnv) {
     if let Err(e) = broker::stop_broker_process(env) {
-        eprintln!("neon broker stop failed: {e}");
+        eprintln!("neon broker stop failed: {e:#}");
     }
 }
 

From 3735aece562902ec3ff045a0f4e4f70662090bff Mon Sep 17 00:00:00 2001
From: Kirill Bulatov <kirill@neon.tech>
Date: Mon, 19 Dec 2022 16:42:01 +0200
Subject: [PATCH 029/132] Safekeeper: Always use workdir as a full path

---
 safekeeper/src/bin/safekeeper.rs | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs
index 275253d1d4..5ad88276e8 100644
--- a/safekeeper/src/bin/safekeeper.rs
+++ b/safekeeper/src/bin/safekeeper.rs
@@ -129,17 +129,22 @@ fn main() -> anyhow::Result<()> {
     logging::init(LogFormat::from_config(&args.log_format)?)?;
     info!("version: {GIT_VERSION}");
 
+    let args_workdir = &args.datadir;
+    let workdir = args_workdir.canonicalize().with_context(|| {
+        format!("Failed to get the absolute path for input workdir {args_workdir:?}")
+    })?;
+
     // Change into the data directory.
-    std::env::set_current_dir(&args.datadir)?;
+    std::env::set_current_dir(&workdir)?;
 
     // Set or read our ID.
-    let id = set_id(&args.datadir, args.id.map(NodeId))?;
+    let id = set_id(&workdir, args.id.map(NodeId))?;
     if args.init {
         return Ok(());
     }
 
     let conf = SafeKeeperConf {
-        workdir: args.datadir,
+        workdir,
         my_id: id,
         listen_pg_addr: args.listen_pg,
         listen_http_addr: args.listen_http,
@@ -308,7 +313,8 @@ fn set_id(workdir: &Path, given_id: Option<NodeId>) -> Result<NodeId> {
                 } else {
                     bail!("safekeeper id is not specified");
                 };
-                let mut f = File::create(&id_file_path)?;
+                let mut f = File::create(&id_file_path)
+                    .with_context(|| format!("Failed to create id file at {id_file_path:?}"))?;
                 f.write_all(my_id.to_string().as_bytes())?;
                 f.sync_all()?;
                 info!("initialized safekeeper id {}", my_id);

From 39f58038d1a03fd309ed0494b9cbcc1bef99bbef Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Mon, 19 Dec 2022 23:58:24 +0200
Subject: [PATCH 030/132] Don't upload index file in compaction, if there was
 nothing to do. (#3149)

This splits storage_sync2's `RemoteTimelineClient::schedule_index_upload`
into two (public) functions:
1. `schedule_index_upload_for_metadata_update`, for when the metadata
(e.g. disk_consistent_lsn or last_gc_cutoff) has changed, and

2. `schedule_index_upload_for_file_changes`, for when layer file uploads
or deletions have been scheduled.

We now keep track of whether there have been any uploads or deletions
since the last index-file upload, and skip the upload in
`schedule_index_upload_for_file_changes` if there haven't been any
changes. That allows us to call the function liberally in timeline.rs,
whenever layer file uploads or deletions might've been scheduled,
without starting a lot of unnecessary index file uploads.

GC was covered earlier by commit c262390214, but that missed that we
have the same problem with compaction.
---
 pageserver/src/storage_sync2.rs           | 103 ++++++++++++++++------
 pageserver/src/storage_sync2/index.rs     |  14 ++-
 pageserver/src/tenant/timeline.rs         |  29 ++++--
 test_runner/regress/test_gc_aggressive.py |   5 ++
 4 files changed, 114 insertions(+), 37 deletions(-)

diff --git a/pageserver/src/storage_sync2.rs b/pageserver/src/storage_sync2.rs
index 89bbc34227..14763985ab 100644
--- a/pageserver/src/storage_sync2.rs
+++ b/pageserver/src/storage_sync2.rs
@@ -32,7 +32,8 @@
 //! the corresponding remote operation with the timeline's [`RemoteTimelineClient`]:
 //!
 //! - [`RemoteTimelineClient::schedule_layer_file_upload`]  when we've created a new layer file.
-//! - [`RemoteTimelineClient::schedule_index_upload`] when we've updated the timeline metadata file.
+//! - [`RemoteTimelineClient::schedule_index_upload_for_metadata_update`] when we've updated the timeline metadata file.
+//! - [`RemoteTimelineClient::schedule_index_upload_for_file_changes`] to upload an updated index file, after we've scheduled file uploads
 //! - [`RemoteTimelineClient::schedule_layer_file_deletion`] when we've deleted one or more layer files.
 //!
 //! Internally, these functions create [`UploadOp`]s and put them in a queue.
@@ -290,6 +291,10 @@ struct UploadQueueInitialized {
     /// in-progress and queued operations
     latest_files: HashMap<LayerFileName, LayerFileMetadata>,
 
+    /// How many file uploads or deletions been scheduled, since the
+    /// last (scheduling of) metadata index upload?
+    latest_files_changes_since_metadata_upload_scheduled: u64,
+
     /// Metadata stored in the remote storage, taking into account all
     /// in-progress and queued operations.
     /// DANGER: do not return to outside world, e.g., safekeepers.
@@ -339,6 +344,7 @@ impl UploadQueue {
         let state = UploadQueueInitialized {
             // As described in the doc comment, it's ok for `latest_files` and `latest_metadata` to be ahead.
             latest_files: HashMap::new(),
+            latest_files_changes_since_metadata_upload_scheduled: 0,
             latest_metadata: metadata.clone(),
             // We haven't uploaded anything yet, so, `last_uploaded_consistent_lsn` must be 0 to prevent
             // safekeepers from garbage-collecting anything.
@@ -385,6 +391,7 @@ impl UploadQueue {
 
         let state = UploadQueueInitialized {
             latest_files: files,
+            latest_files_changes_since_metadata_upload_scheduled: 0,
             latest_metadata: index_part_metadata.clone(),
             last_uploaded_consistent_lsn: index_part_metadata.disk_consistent_lsn(),
             // what follows are boring default initializations
@@ -558,7 +565,9 @@ impl RemoteTimelineClient {
             let mut guard = self.upload_queue.lock().unwrap();
             let upload_queue = guard.initialized_mut()?;
             if let Some(upgraded) = upload_queue.latest_files.get_mut(layer_file_name) {
-                upgraded.merge(&new_metadata);
+                if upgraded.merge(&new_metadata) {
+                    upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
+                }
                 // If we don't do an index file upload inbetween here and restart,
                 // the value will go back down after pageserver restart, since we will
                 // have lost this data point.
@@ -583,14 +592,20 @@ impl RemoteTimelineClient {
     //
 
     ///
-    /// Launch an index-file upload operation in the background.
+    /// Launch an index-file upload operation in the background, with
+    /// updated metadata.
     ///
     /// The upload will be added to the queue immediately, but it
     /// won't be performed until all previosuly scheduled layer file
     /// upload operations have completed successfully.  This is to
     /// ensure that when the index file claims that layers X, Y and Z
-    /// exist in remote storage, they really do.
-    pub fn schedule_index_upload(
+    /// exist in remote storage, they really do. To wait for the upload
+    /// to complete, use `wait_completion`.
+    ///
+    /// If there were any changes to the list of files, i.e. if any
+    /// layer file uploads were scheduled, since the last index file
+    /// upload, those will be included too.
+    pub fn schedule_index_upload_for_metadata_update(
         self: &Arc<Self>,
         metadata: &TimelineMetadata,
     ) -> anyhow::Result<()> {
@@ -601,26 +616,60 @@ impl RemoteTimelineClient {
         // ahead of what's _actually_ on the remote during index upload.
         upload_queue.latest_metadata = metadata.clone();
 
+        let metadata_bytes = upload_queue.latest_metadata.to_bytes()?;
+        self.schedule_index_upload(upload_queue, metadata_bytes);
+
+        Ok(())
+    }
+
+    ///
+    /// Launch an index-file upload operation in the background, if necessary.
+    ///
+    /// Use this function to schedule the update of the index file after
+    /// scheduling file uploads or deletions. If no file uploads or deletions
+    /// have been scheduled since the last index file upload, this does
+    /// nothing.
+    ///
+    /// Like schedule_index_upload_for_metadata_update(), this merely adds
+    /// the upload to the upload queue and returns quickly.
+    pub fn schedule_index_upload_for_file_changes(self: &Arc<Self>) -> anyhow::Result<()> {
+        let mut guard = self.upload_queue.lock().unwrap();
+        let upload_queue = guard.initialized_mut()?;
+
+        if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 {
+            let metadata_bytes = upload_queue.latest_metadata.to_bytes()?;
+            self.schedule_index_upload(upload_queue, metadata_bytes);
+        }
+
+        Ok(())
+    }
+
+    /// Launch an index-file upload operation in the background (internal function)
+    fn schedule_index_upload(
+        self: &Arc<Self>,
+        upload_queue: &mut UploadQueueInitialized,
+        metadata_bytes: Vec<u8>,
+    ) {
+        info!(
+            "scheduling metadata upload with {} files ({} changed)",
+            upload_queue.latest_files.len(),
+            upload_queue.latest_files_changes_since_metadata_upload_scheduled,
+        );
+
         let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn();
 
         let index_part = IndexPart::new(
             upload_queue.latest_files.clone(),
             disk_consistent_lsn,
-            upload_queue.latest_metadata.to_bytes()?,
+            metadata_bytes,
         );
         let op = UploadOp::UploadMetadata(index_part, disk_consistent_lsn);
         self.update_upload_queue_unfinished_metric(1, &op);
         upload_queue.queued_operations.push_back(op);
-
-        info!(
-            "scheduled metadata upload with {} files",
-            upload_queue.latest_files.len()
-        );
+        upload_queue.latest_files_changes_since_metadata_upload_scheduled = 0;
 
         // Launch the task immediately, if possible
         self.launch_queued_tasks(upload_queue);
-
-        Ok(())
     }
 
     ///
@@ -644,6 +693,7 @@ impl RemoteTimelineClient {
         upload_queue
             .latest_files
             .insert(layer_file_name.clone(), layer_metadata.clone());
+        upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
 
         let op = UploadOp::UploadLayer(layer_file_name.clone(), layer_metadata.clone());
         self.update_upload_queue_unfinished_metric(1, &op);
@@ -662,8 +712,11 @@ impl RemoteTimelineClient {
     ///
     /// Launch a delete operation in the background.
     ///
-    /// The deletion won't actually be performed, until all preceding
-    /// upload operations have completed succesfully.
+    /// Note: This schedules an index file upload before the deletions.  The
+    /// deletion won't actually be performed, until any previously scheduled
+    /// upload operations, and the index file upload, have completed
+    /// succesfully.
+    ///
     pub fn schedule_layer_file_deletion(
         self: &Arc<Self>,
         names: &[LayerFileName],
@@ -674,7 +727,6 @@ impl RemoteTimelineClient {
         // Deleting layers doesn't affect the values stored in TimelineMetadata,
         // so we don't need update it. Just serialize it.
         let metadata_bytes = upload_queue.latest_metadata.to_bytes()?;
-        let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn();
 
         // Update the remote index file, removing the to-be-deleted files from the index,
         // before deleting the actual files.
@@ -686,16 +738,12 @@ impl RemoteTimelineClient {
         let no_bail_here = || {
             for name in names {
                 upload_queue.latest_files.remove(name);
+                upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
             }
 
-            let index_part = IndexPart::new(
-                upload_queue.latest_files.clone(),
-                disk_consistent_lsn,
-                metadata_bytes,
-            );
-            let op = UploadOp::UploadMetadata(index_part, disk_consistent_lsn);
-            self.update_upload_queue_unfinished_metric(1, &op);
-            upload_queue.queued_operations.push_back(op);
+            if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 {
+                self.schedule_index_upload(upload_queue, metadata_bytes);
+            }
 
             // schedule the actual deletions
             for name in names {
@@ -1244,15 +1292,19 @@ mod tests {
             assert!(upload_queue.queued_operations.is_empty());
             assert!(upload_queue.inprogress_tasks.len() == 2);
             assert!(upload_queue.num_inprogress_layer_uploads == 2);
+
+            // also check that `latest_file_changes` was updated
+            assert!(upload_queue.latest_files_changes_since_metadata_upload_scheduled == 2);
         }
 
         // Schedule upload of index. Check that it is queued
         let metadata = dummy_metadata(Lsn(0x20));
-        client.schedule_index_upload(&metadata)?;
+        client.schedule_index_upload_for_metadata_update(&metadata)?;
         {
             let mut guard = client.upload_queue.lock().unwrap();
             let upload_queue = guard.initialized_mut().unwrap();
             assert!(upload_queue.queued_operations.len() == 1);
+            assert!(upload_queue.latest_files_changes_since_metadata_upload_scheduled == 0);
         }
 
         // Wait for the uploads to finish
@@ -1288,6 +1340,7 @@ mod tests {
             assert!(upload_queue.inprogress_tasks.len() == 1);
             assert!(upload_queue.num_inprogress_layer_uploads == 1);
             assert!(upload_queue.num_inprogress_deletions == 0);
+            assert!(upload_queue.latest_files_changes_since_metadata_upload_scheduled == 0);
         }
         assert_remote_files(&["foo", "bar", "index_part.json"], &remote_timeline_dir);
 
diff --git a/pageserver/src/storage_sync2/index.rs b/pageserver/src/storage_sync2/index.rs
index ed4ed10189..bb58a34969 100644
--- a/pageserver/src/storage_sync2/index.rs
+++ b/pageserver/src/storage_sync2/index.rs
@@ -48,9 +48,17 @@ impl LayerFileMetadata {
     /// Metadata has holes due to version upgrades. This method is called to upgrade self with the
     /// other value.
     ///
-    /// This is called on the possibly outdated version.
-    pub fn merge(&mut self, other: &Self) {
-        self.file_size = other.file_size.or(self.file_size);
+    /// This is called on the possibly outdated version. Returns true if any changes
+    /// were made.
+    pub fn merge(&mut self, other: &Self) -> bool {
+        let mut changed = false;
+
+        if self.file_size != other.file_size {
+            self.file_size = other.file_size.or(self.file_size);
+            changed = true;
+        }
+
+        changed
     }
 }
 
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 3373c52231..b1f580c32f 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -589,6 +589,18 @@ impl Timeline {
                 let timer = self.metrics.compact_time_histo.start_timer();
                 self.compact_level0(target_file_size).await?;
                 timer.stop_and_record();
+
+                // If `create_image_layers' or `compact_level0` scheduled any
+                // uploads or deletions, but didn't update the index file yet,
+                // do it now.
+                //
+                // This isn't necessary for correctness, the remote state is
+                // consistent without the uploads and deletions, and we would
+                // update the index file on next flush iteration too. But it
+                // could take a while until that happens.
+                if let Some(remote_client) = &self.remote_client {
+                    remote_client.schedule_index_upload_for_file_changes()?;
+                }
             }
             Err(err) => {
                 // no partitioning? This is normal, if the timeline was just created
@@ -1215,9 +1227,7 @@ impl Timeline {
             remote_client
                 .schedule_layer_file_upload(layer_name, &LayerFileMetadata::new(layer_size))?;
         }
-        if !local_only_layers.is_empty() {
-            remote_client.schedule_index_upload(up_to_date_metadata)?;
-        }
+        remote_client.schedule_index_upload_for_file_changes()?;
 
         info!("Done");
 
@@ -1923,13 +1933,9 @@ impl Timeline {
 
         if let Some(remote_client) = &self.remote_client {
             for (path, layer_metadata) in layer_paths_to_upload {
-                remote_client
-                    .schedule_layer_file_upload(&path, &layer_metadata)
-                    .context("schedule_layer_file_upload")?;
+                remote_client.schedule_layer_file_upload(&path, &layer_metadata)?;
             }
-            remote_client
-                .schedule_index_upload(&metadata)
-                .context("schedule_layer_file_upload")?;
+            remote_client.schedule_index_upload_for_metadata_update(&metadata)?;
         }
 
         Ok(())
@@ -2398,6 +2404,11 @@ impl Timeline {
             deltas_to_compact,
         } = self.compact_level0_phase1(target_file_size).await?;
 
+        if new_layers.is_empty() && deltas_to_compact.is_empty() {
+            // nothing to do
+            return Ok(());
+        }
+
         // Before deleting any layers, we need to wait for their upload ops to finish.
         // See storage_sync module level comment on consistency.
         // Do it here because we don't want to hold self.layers.write() while waiting.
diff --git a/test_runner/regress/test_gc_aggressive.py b/test_runner/regress/test_gc_aggressive.py
index b9d012fa36..5f052bf81a 100644
--- a/test_runner/regress/test_gc_aggressive.py
+++ b/test_runner/regress/test_gc_aggressive.py
@@ -165,6 +165,11 @@ def test_gc_index_upload(neon_env_builder: NeonEnvBuilder, remote_storage_kind:
         cur.execute("INSERT INTO foo VALUES (0, 0, 'foo')")
         pageserver_http.timeline_gc(tenant_id, timeline_id, 10000 - i * 32)
         num_index_uploads = get_num_remote_ops("index", "upload")
+
+        # Also make sure that a no-op compaction doesn't upload the index
+        # file unnecessarily.
+        pageserver_http.timeline_compact(tenant_id, timeline_id)
+
         log.info(f"{num_index_uploads} index uploads after GC iteration {i}")
 
     after = num_index_uploads

From f9f57e211a69e8f67058dcb6841b59c7a5743924 Mon Sep 17 00:00:00 2001
From: Kirill Bulatov <kirill@neon.tech>
Date: Tue, 20 Dec 2022 01:55:59 +0200
Subject: [PATCH 031/132] Use local brokers

---
 test_runner/fixtures/neon_fixtures.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index b3e4809f24..59dd21f84c 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -286,7 +286,7 @@ def port_distributor(worker_base_port: int) -> PortDistributor:
     return PortDistributor(base_port=worker_base_port, port_number=WORKER_PORT_NUM)
 
 
-@pytest.fixture(scope="session")
+@pytest.fixture(scope="function")
 def default_broker(
     request: FixtureRequest,
     port_distributor: PortDistributor,
@@ -296,9 +296,8 @@ def default_broker(
     # multiple pytest sessions could get launched in parallel, get them different ports/datadirs
     client_port = port_distributor.get_port()
     broker_logfile = (
-        get_test_output_dir(request, top_output_dir) / f"storage_broker_{client_port}.log"
+        get_test_repo_dir(request, top_output_dir) / f"storage_broker.log"
     )
-    broker_logfile.parents[0].mkdir(exist_ok=True, parents=True)
 
     broker = NeonBroker(logfile=broker_logfile, port=client_port, neon_binpath=neon_binpath)
     yield broker
@@ -1012,7 +1011,7 @@ def _shared_simple_env(
 
     if os.environ.get("TEST_SHARED_FIXTURES") is None:
         # Create the environment in the per-test output directory
-        repo_dir = get_test_output_dir(request, top_output_dir) / "repo"
+        repo_dir = get_test_repo_dir(request, top_output_dir)
     else:
         # We're running shared fixtures. Share a single directory.
         repo_dir = top_output_dir / "shared_repo"
@@ -2791,6 +2790,9 @@ def get_test_output_dir(request: FixtureRequest, top_output_dir: Path) -> Path:
     return test_dir
 
 
+def get_test_repo_dir(request: FixtureRequest, top_output_dir: Path) -> Path:
+    return get_test_output_dir(request, top_output_dir) / "repo"
+
 def pytest_addoption(parser: Parser):
     parser.addoption(
         "--preserve-database-files",

From 56d8c25dc86145a3d2c72028d83b037eb673d9c0 Mon Sep 17 00:00:00 2001
From: Kirill Bulatov <kirill@neon.tech>
Date: Tue, 20 Dec 2022 01:57:36 +0200
Subject: [PATCH 032/132] Revert "Use local brokers"

This reverts commit f9f57e211a69e8f67058dcb6841b59c7a5743924.
---
 test_runner/fixtures/neon_fixtures.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 59dd21f84c..b3e4809f24 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -286,7 +286,7 @@ def port_distributor(worker_base_port: int) -> PortDistributor:
     return PortDistributor(base_port=worker_base_port, port_number=WORKER_PORT_NUM)
 
 
-@pytest.fixture(scope="function")
+@pytest.fixture(scope="session")
 def default_broker(
     request: FixtureRequest,
     port_distributor: PortDistributor,
@@ -296,8 +296,9 @@ def default_broker(
     # multiple pytest sessions could get launched in parallel, get them different ports/datadirs
     client_port = port_distributor.get_port()
     broker_logfile = (
-        get_test_repo_dir(request, top_output_dir) / f"storage_broker.log"
+        get_test_output_dir(request, top_output_dir) / f"storage_broker_{client_port}.log"
     )
+    broker_logfile.parents[0].mkdir(exist_ok=True, parents=True)
 
     broker = NeonBroker(logfile=broker_logfile, port=client_port, neon_binpath=neon_binpath)
     yield broker
@@ -1011,7 +1012,7 @@ def _shared_simple_env(
 
     if os.environ.get("TEST_SHARED_FIXTURES") is None:
         # Create the environment in the per-test output directory
-        repo_dir = get_test_repo_dir(request, top_output_dir)
+        repo_dir = get_test_output_dir(request, top_output_dir) / "repo"
     else:
         # We're running shared fixtures. Share a single directory.
         repo_dir = top_output_dir / "shared_repo"
@@ -2790,9 +2791,6 @@ def get_test_output_dir(request: FixtureRequest, top_output_dir: Path) -> Path:
     return test_dir
 
 
-def get_test_repo_dir(request: FixtureRequest, top_output_dir: Path) -> Path:
-    return get_test_output_dir(request, top_output_dir) / "repo"
-
 def pytest_addoption(parser: Parser):
     parser.addoption(
         "--preserve-database-files",

From 6ac9ecb074aceb7126666f29e45010fd7efc3dad Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Mon, 19 Dec 2022 20:50:02 +0200
Subject: [PATCH 033/132] Remove a few unnecessary checkpoint calls from unit
 tests.

The `make_some_layers` function performs a checkpoint already.
---
 pageserver/src/tenant.rs | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 0e59b43dda..64e214c5a2 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -2993,7 +2993,6 @@ mod tests {
                 .create_empty_timeline(TIMELINE_ID, Lsn(0x8000), DEFAULT_PG_VERSION)?
                 .initialize()?;
             make_some_layers(tline.as_ref(), Lsn(0x8000)).await?;
-            tline.checkpoint(CheckpointConfig::Forced).await?;
         }
 
         let tenant = harness.load().await;
@@ -3016,7 +3015,6 @@ mod tests {
                 .initialize()?;
 
             make_some_layers(tline.as_ref(), Lsn(0x20)).await?;
-            tline.checkpoint(CheckpointConfig::Forced).await?;
 
             tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?;
 
@@ -3025,7 +3023,6 @@ mod tests {
                 .expect("Should have a local timeline");
 
             make_some_layers(newtline.as_ref(), Lsn(0x60)).await?;
-            tline.checkpoint(CheckpointConfig::Forced).await?;
         }
 
         // check that both of them are initially unloaded

From 7b0d28bbdce81cf470800aa27ef239b47116380c Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Mon, 19 Dec 2022 21:09:41 +0200
Subject: [PATCH 034/132] Update outdated comment on Tenant::gc_iteration.

Commit 6dec85b19d removed the `checkpoint_before_gc` argument, but failed
to update the comment. Remove its description, and while we're at it, try
to explain better how the `horizon` and `pitr` arguments are used.
---
 pageserver/src/tenant.rs | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 64e214c5a2..edd7a3cb07 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -1154,11 +1154,15 @@ impl Tenant {
     /// this function is periodically called by gc task.
     /// also it can be explicitly requested through page server api 'do_gc' command.
     ///
-    /// 'target_timeline_id' specifies the timeline to GC, or None for all.
-    /// `horizon` specifies delta from last lsn to preserve all object versions (pitr interval).
-    /// `checkpoint_before_gc` parameter is used to force compaction of storage before GC
-    /// to make tests more deterministic.
-    /// TODO Do we still need it or we can call checkpoint explicitly in tests where needed?
+    /// `target_timeline_id` specifies the timeline to GC, or None for all.
+    ///
+    /// The `horizon` an `pitr` parameters determine how much WAL history needs to be retained.
+    /// Also known as the retention period, or the GC cutoff point. `horizon` specifies
+    /// the amount of history, as LSN difference from current latest LSN on each timeline.
+    /// `pitr` specifies the same as a time difference from the current time. The effective
+    /// GC cutoff point is determined conservatively by either `horizon` and `pitr`, whichever
+    /// requires more history to be retained.
+    //
     pub async fn gc_iteration(
         &self,
         target_timeline_id: Option<TimelineId>,

From cd7fdf2587625ba963ef36b0e1d878e505732988 Mon Sep 17 00:00:00 2001
From: Sergey Melnikov <sergey@neon.tech>
Date: Tue, 20 Dec 2022 12:03:42 +0100
Subject: [PATCH 035/132] Remove neon-stress configs (#3121)

---
 .github/ansible/neon-stress.hosts.yaml        | 32 ----------
 .../neon-stress.neon-storage-broker.yaml      | 56 -----------------
 .../helm-values/neon-stress.proxy-scram.yaml  | 52 ----------------
 .github/helm-values/neon-stress.proxy.yaml    | 61 -------------------
 4 files changed, 201 deletions(-)
 delete mode 100644 .github/ansible/neon-stress.hosts.yaml
 delete mode 100644 .github/helm-values/neon-stress.neon-storage-broker.yaml
 delete mode 100644 .github/helm-values/neon-stress.proxy-scram.yaml
 delete mode 100644 .github/helm-values/neon-stress.proxy.yaml

diff --git a/.github/ansible/neon-stress.hosts.yaml b/.github/ansible/neon-stress.hosts.yaml
deleted file mode 100644
index 5d5df5a6d5..0000000000
--- a/.github/ansible/neon-stress.hosts.yaml
+++ /dev/null
@@ -1,32 +0,0 @@
-storage:
-  vars:
-    bucket_name: neon-storage-ireland
-    bucket_region: eu-west-1
-    console_mgmt_base_url: http://neon-stress-console.local
-    broker_endpoint: http://storage-broker.neon-stress.local:50051
-    safekeeper_enable_s3_offload: 'false'
-    pageserver_config_stub:
-      pg_distrib_dir: /usr/local
-      remote_storage:
-        bucket_name: "{{ bucket_name }}"
-        bucket_region: "{{ bucket_region }}"
-        prefix_in_bucket: "{{ inventory_hostname }}"
-    safekeeper_s3_prefix: neon-stress/wal
-    hostname_suffix: ".local"
-    remote_user: admin
-    sentry_environment: development
-  children:
-    pageservers:
-      hosts:
-        neon-stress-ps-1:
-          console_region_id: aws-eu-west-1
-        neon-stress-ps-2:
-          console_region_id: aws-eu-west-1
-    safekeepers:
-      hosts:
-        neon-stress-sk-1:
-          console_region_id: aws-eu-west-1
-        neon-stress-sk-2:
-          console_region_id: aws-eu-west-1
-        neon-stress-sk-3:
-          console_region_id: aws-eu-west-1
diff --git a/.github/helm-values/neon-stress.neon-storage-broker.yaml b/.github/helm-values/neon-stress.neon-storage-broker.yaml
deleted file mode 100644
index e11e5d4214..0000000000
--- a/.github/helm-values/neon-stress.neon-storage-broker.yaml
+++ /dev/null
@@ -1,56 +0,0 @@
-# Helm chart values for neon-storage-broker
-podLabels:
-  neon_env: neon-stress
-  neon_service: storage-broker
-
-# Use L4 LB
-service:
-  # service.annotations -- Annotations to add to the service
-  annotations:
-    service.beta.kubernetes.io/aws-load-balancer-type: external  # use newer AWS Load Balancer Controller
-    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
-    service.beta.kubernetes.io/aws-load-balancer-scheme: internal  # deploy LB to private subnet
-    # assign service to this name at external-dns
-    external-dns.alpha.kubernetes.io/hostname: storage-broker.neon-stress.local
-  # service.type -- Service type
-  type: LoadBalancer
-  # service.port -- broker listen port
-  port: 50051
-
-ingress:
-  enabled: false
-
-metrics:
-  enabled: true
-  serviceMonitor:
-    enabled: true
-    selector:
-      release: kube-prometheus-stack
-
-extraManifests:
-  - apiVersion: operator.victoriametrics.com/v1beta1
-    kind: VMServiceScrape
-    metadata:
-      name: "{{ include \"neon-storage-broker.fullname\" . }}"
-      labels:
-        helm.sh/chart: neon-storage-broker-{{ .Chart.Version }}
-        app.kubernetes.io/name: neon-storage-broker
-        app.kubernetes.io/instance: neon-storage-broker
-        app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
-        app.kubernetes.io/managed-by: Helm
-      namespace: "{{ .Release.Namespace }}"
-    spec:
-      selector:
-        matchLabels:
-          app.kubernetes.io/name: "neon-storage-broker"
-      endpoints:
-        - port: broker
-          path: /metrics
-          interval: 10s
-          scrapeTimeout: 10s
-      namespaceSelector:
-        matchNames:
-          - "{{ .Release.Namespace }}"
-
-settings:
-  sentryEnvironment: "development"
diff --git a/.github/helm-values/neon-stress.proxy-scram.yaml b/.github/helm-values/neon-stress.proxy-scram.yaml
deleted file mode 100644
index ed580349fc..0000000000
--- a/.github/helm-values/neon-stress.proxy-scram.yaml
+++ /dev/null
@@ -1,52 +0,0 @@
-fullnameOverride: "neon-stress-proxy-scram"
-
-settings:
-  authBackend: "console"
-  authEndpoint: "http://neon-stress-console.local/management/api/v2"
-  domain: "*.stress.neon.tech"
-  sentryEnvironment: "development"
-
-podLabels:
-  zenith_service: proxy-scram
-  zenith_env: staging
-  zenith_region: eu-west-1
-  zenith_region_slug: ireland
-
-exposedService:
-  annotations:
-    service.beta.kubernetes.io/aws-load-balancer-type: external
-    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
-    service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
-    external-dns.alpha.kubernetes.io/hostname: '*.stress.neon.tech'
-
-metrics:
-  enabled: true
-  serviceMonitor:
-    enabled: true
-    selector:
-      release: kube-prometheus-stack
-
-extraManifests:
-  - apiVersion: operator.victoriametrics.com/v1beta1
-    kind: VMServiceScrape
-    metadata:
-      name: "{{ include \"neon-proxy.fullname\" . }}"
-      labels:
-        helm.sh/chart: neon-proxy-{{ .Chart.Version }}
-        app.kubernetes.io/name: neon-proxy
-        app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}"
-        app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
-        app.kubernetes.io/managed-by: Helm
-      namespace: "{{ .Release.Namespace }}"
-    spec:
-      selector:
-        matchLabels:
-          app.kubernetes.io/name: "neon-proxy"
-      endpoints:
-        - port: http
-          path: /metrics
-          interval: 10s
-          scrapeTimeout: 10s
-      namespaceSelector:
-        matchNames:
-          - "{{ .Release.Namespace }}"
diff --git a/.github/helm-values/neon-stress.proxy.yaml b/.github/helm-values/neon-stress.proxy.yaml
deleted file mode 100644
index 94270ced09..0000000000
--- a/.github/helm-values/neon-stress.proxy.yaml
+++ /dev/null
@@ -1,61 +0,0 @@
-fullnameOverride: "neon-stress-proxy"
-
-settings:
-  authBackend: "link"
-  authEndpoint: "https://console.dev.neon.tech/authenticate_proxy_request/"
-  uri: "https://console.dev.neon.tech/psql_session/"
-  sentryEnvironment: "development"
-
-# -- Additional labels for zenith-proxy pods
-podLabels:
-  zenith_service: proxy
-  zenith_env: staging
-  zenith_region: eu-west-1
-  zenith_region_slug: ireland
-
-service:
-  annotations:
-    service.beta.kubernetes.io/aws-load-balancer-type: external
-    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
-    service.beta.kubernetes.io/aws-load-balancer-scheme: internal
-    external-dns.alpha.kubernetes.io/hostname: neon-stress-proxy.local
-  type: LoadBalancer
-
-exposedService:
-  annotations:
-    service.beta.kubernetes.io/aws-load-balancer-type: external
-    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
-    service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
-    external-dns.alpha.kubernetes.io/hostname: connect.dev.neon.tech
-
-metrics:
-  enabled: true
-  serviceMonitor:
-    enabled: true
-    selector:
-      release: kube-prometheus-stack
-
-extraManifests:
-  - apiVersion: operator.victoriametrics.com/v1beta1
-    kind: VMServiceScrape
-    metadata:
-      name: "{{ include \"neon-proxy.fullname\" . }}"
-      labels:
-        helm.sh/chart: neon-proxy-{{ .Chart.Version }}
-        app.kubernetes.io/name: neon-proxy
-        app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}"
-        app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
-        app.kubernetes.io/managed-by: Helm
-      namespace: "{{ .Release.Namespace }}"
-    spec:
-      selector:
-        matchLabels:
-          app.kubernetes.io/name: "neon-proxy"
-      endpoints:
-        - port: http
-          path: /metrics
-          interval: 10s
-          scrapeTimeout: 10s
-      namespaceSelector:
-        matchNames:
-          - "{{ .Release.Namespace }}"

From 2c11f1fa95334b582372edb5a89aaeb1b779e6d7 Mon Sep 17 00:00:00 2001
From: Kirill Bulatov <kirill@neon.tech>
Date: Tue, 20 Dec 2022 13:06:21 +0200
Subject: [PATCH 036/132] Use separate broker per Python test (#3158)

And add its logs to Allure reports per test
---
 test_runner/fixtures/neon_fixtures.py | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index b3e4809f24..2eabc25ef6 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -286,24 +286,19 @@ def port_distributor(worker_base_port: int) -> PortDistributor:
     return PortDistributor(base_port=worker_base_port, port_number=WORKER_PORT_NUM)
 
 
-@pytest.fixture(scope="session")
+@pytest.fixture(scope="function")
 def default_broker(
-    request: FixtureRequest,
     port_distributor: PortDistributor,
-    top_output_dir: Path,
+    test_output_dir: Path,
     neon_binpath: Path,
 ) -> Iterator[NeonBroker]:
     # multiple pytest sessions could get launched in parallel, get them different ports/datadirs
     client_port = port_distributor.get_port()
-    broker_logfile = (
-        get_test_output_dir(request, top_output_dir) / f"storage_broker_{client_port}.log"
-    )
-    broker_logfile.parents[0].mkdir(exist_ok=True, parents=True)
+    broker_logfile = test_output_dir / "repo" / "storage_broker.log"
 
     broker = NeonBroker(logfile=broker_logfile, port=client_port, neon_binpath=neon_binpath)
     yield broker
     broker.stop()
-    allure_attach_from_dir(Path(broker_logfile))
 
 
 @pytest.fixture(scope="session")
@@ -1012,7 +1007,7 @@ def _shared_simple_env(
 
     if os.environ.get("TEST_SHARED_FIXTURES") is None:
         # Create the environment in the per-test output directory
-        repo_dir = get_test_output_dir(request, top_output_dir) / "repo"
+        repo_dir = get_test_repo_dir(request, top_output_dir)
     else:
         # We're running shared fixtures. Share a single directory.
         repo_dir = top_output_dir / "shared_repo"
@@ -2791,6 +2786,10 @@ def get_test_output_dir(request: FixtureRequest, top_output_dir: Path) -> Path:
     return test_dir
 
 
+def get_test_repo_dir(request: FixtureRequest, top_output_dir: Path) -> Path:
+    return get_test_output_dir(request, top_output_dir) / "repo"
+
+
 def pytest_addoption(parser: Parser):
     parser.addoption(
         "--preserve-database-files",

From eefb1d46f4837476aad2de872a2fcf873189a517 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Tue, 20 Dec 2022 01:19:56 +0200
Subject: [PATCH 037/132] Replace Timeline::checkpoint with
 Timeline::freeze_and_flush

The new Timeline::freeze_and_flush function is equivalent to calling
Timeline::checkpoint(CheckpointConfig::Flush). There was only one
non-test caller that used CheckpointConfig::Forced, so replace that
with a call to the new Timeline::freeze_and_flush, followed by an
explicit call to Timeline::compact.

That single caller handles the mgmt API's 'checkpoint' endpoint.
Perhaps we should split that into separate 'flush' and 'compact'
endpoints too, but I didn't go that far yet.
---
 pageserver/src/http/routes.rs     |  8 ++--
 pageserver/src/lib.rs             |  9 ----
 pageserver/src/page_service.rs    |  3 +-
 pageserver/src/tenant.rs          | 68 +++++++++++++++----------------
 pageserver/src/tenant/timeline.rs | 19 ++-------
 pageserver/src/tenant_mgr.rs      |  2 +-
 6 files changed, 44 insertions(+), 65 deletions(-)

diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 68a26b8098..937a6144b6 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -31,8 +31,6 @@ use utils::{
 // Imports only used for testing APIs
 #[cfg(feature = "testing")]
 use super::models::{ConfigureFailpointsRequest, TimelineGcRequest};
-#[cfg(feature = "testing")]
-use crate::CheckpointConfig;
 
 struct State {
     conf: &'static PageServerConf,
@@ -777,7 +775,11 @@ async fn timeline_checkpoint_handler(request: Request<Body>) -> Result<Response<
         .get_timeline(timeline_id, true)
         .map_err(ApiError::NotFound)?;
     timeline
-        .checkpoint(CheckpointConfig::Forced)
+        .freeze_and_flush()
+        .await
+        .map_err(ApiError::InternalServerError)?;
+    timeline
+        .compact()
         .await
         .map_err(ApiError::InternalServerError)?;
 
diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs
index eafcaa88d9..5c4804db36 100644
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -47,15 +47,6 @@ pub const DELTA_FILE_MAGIC: u16 = 0x5A61;
 
 static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]);
 
-/// Config for the Repository checkpointer
-#[derive(Debug, Clone, Copy)]
-pub enum CheckpointConfig {
-    // Flush all in-memory data
-    Flush,
-    // Flush all in-memory data and reconstruct all page images
-    Forced,
-}
-
 pub async fn shutdown_pageserver(exit_code: i32) {
     // Shut down the libpq endpoint task. This prevents new connections from
     // being accepted.
diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index 036fb14e9b..d9c19d04b7 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -51,7 +51,6 @@ use crate::task_mgr::TaskKind;
 use crate::tenant::{Tenant, Timeline};
 use crate::tenant_mgr;
 use crate::trace::Tracer;
-use crate::CheckpointConfig;
 
 use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
 use postgres_ffi::BLCKSZ;
@@ -466,7 +465,7 @@ impl PageServerHandler {
         // We only want to persist the data, and it doesn't matter if it's in the
         // shape of deltas or images.
         info!("flushing layers");
-        timeline.checkpoint(CheckpointConfig::Flush).await?;
+        timeline.freeze_and_flush().await?;
 
         info!("done");
         Ok(())
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index edd7a3cb07..03387d00fe 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -62,7 +62,7 @@ use crate::tenant_config::TenantConfOpt;
 use crate::virtual_file::VirtualFile;
 use crate::walredo::PostgresRedoManager;
 use crate::walredo::WalRedoManager;
-use crate::{CheckpointConfig, TEMP_FILE_SUFFIX};
+use crate::TEMP_FILE_SUFFIX;
 pub use pageserver_api::models::TenantState;
 
 use toml_edit;
@@ -125,7 +125,7 @@ pub struct Tenant {
     timelines: Mutex<HashMap<TimelineId, Arc<Timeline>>>,
     // This mutex prevents creation of new timelines during GC.
     // Adding yet another mutex (in addition to `timelines`) is needed because holding
-    // `timelines` mutex during all GC iteration (especially with enforced checkpoint)
+    // `timelines` mutex during all GC iteration
     // may block for a long time `get_timeline`, `get_timelines_state`,... and other operations
     // with timelines, which in turn may cause dropping replication connection, expiration of wait_for_lsn
     // timeout...
@@ -249,7 +249,7 @@ impl UninitializedTimeline<'_> {
                 .context("Failed to import basebackup")
         })?;
 
-        // Flush loop needs to be spawned in order for checkpoint to be able to flush.
+        // Flush loop needs to be spawned in order to be able to flush.
         // We want to run proper checkpoint before we mark timeline as available to outside world
         // Thus spawning flush loop manually and skipping flush_loop setup in initialize_with_lock
         raw_timeline.maybe_spawn_flush_loop();
@@ -259,9 +259,9 @@ impl UninitializedTimeline<'_> {
         });
 
         raw_timeline
-            .checkpoint(CheckpointConfig::Flush)
+            .freeze_and_flush()
             .await
-            .context("Failed to checkpoint after basebackup import")?;
+            .context("Failed to flush after basebackup import")?;
 
         let timeline = self.initialize()?;
 
@@ -371,7 +371,7 @@ impl Drop for TimelineUninitMark {
 
 // We should not blindly overwrite local metadata with remote one.
 // For example, consider the following case:
-//     Checkpoint comes, we update local metadata and start upload task but after that
+//     Image layer is flushed to disk as a new delta layer, we update local metadata and start upload task but after that
 //     pageserver crashes. During startup we'll load new metadata, and then reset it
 //     to the state of remote one. But current layermap will have layers from the old
 //     metadata which is inconsistent.
@@ -1225,24 +1225,21 @@ impl Tenant {
     ///
     /// Used at graceful shutdown.
     ///
-    pub async fn checkpoint(&self) -> anyhow::Result<()> {
+    pub async fn freeze_and_flush(&self) -> anyhow::Result<()> {
         // Scan through the hashmap and collect a list of all the timelines,
         // while holding the lock. Then drop the lock and actually perform the
-        // checkpoints. We don't want to block everything else while the
-        // checkpoint runs.
-        let timelines_to_checkpoint = {
+        // flushing. We don't want to block everything else while the
+        // flushing is performed.
+        let timelines_to_flush = {
             let timelines = self.timelines.lock().unwrap();
             timelines
                 .iter()
-                .map(|(id, timeline)| (*id, Arc::clone(timeline)))
+                .map(|(_id, timeline)| Arc::clone(timeline))
                 .collect::<Vec<_>>()
         };
 
-        for (id, timeline) in &timelines_to_checkpoint {
-            timeline
-                .checkpoint(CheckpointConfig::Flush)
-                .instrument(info_span!("checkpoint", timeline = %id, tenant = %self.tenant_id))
-                .await?;
+        for timeline in &timelines_to_flush {
+            timeline.freeze_and_flush().await?;
         }
 
         Ok(())
@@ -2095,8 +2092,13 @@ impl Tenant {
         });
 
         unfinished_timeline
-            .checkpoint(CheckpointConfig::Flush).await
-            .with_context(|| format!("Failed to checkpoint after pgdatadir import for timeline {tenant_id}/{timeline_id}"))?;
+            .freeze_and_flush()
+            .await
+            .with_context(|| {
+                format!(
+                    "Failed to flush after pgdatadir import for timeline {tenant_id}/{timeline_id}"
+                )
+            })?;
 
         let timeline = {
             let mut timelines = self.timelines.lock().unwrap();
@@ -2831,7 +2833,7 @@ mod tests {
             writer.finish_write(lsn);
             lsn += 0x10;
         }
-        tline.checkpoint(CheckpointConfig::Forced).await?;
+        tline.freeze_and_flush().await?;
         {
             let writer = tline.writer();
             writer.put(
@@ -2848,7 +2850,7 @@ mod tests {
             )?;
             writer.finish_write(lsn);
         }
-        tline.checkpoint(CheckpointConfig::Forced).await
+        tline.freeze_and_flush().await
     }
 
     #[tokio::test]
@@ -2863,7 +2865,7 @@ mod tests {
         make_some_layers(tline.as_ref(), Lsn(0x20)).await?;
 
         // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50
-        // FIXME: this doesn't actually remove any layer currently, given how the checkpointing
+        // FIXME: this doesn't actually remove any layer currently, given how the flushing
         // and compaction works. But it does set the 'cutoff' point so that the cross check
         // below should fail.
         tenant
@@ -3098,7 +3100,7 @@ mod tests {
         writer.finish_write(Lsn(0x10));
         drop(writer);
 
-        tline.checkpoint(CheckpointConfig::Forced).await?;
+        tline.freeze_and_flush().await?;
         tline.compact().await?;
 
         let writer = tline.writer();
@@ -3106,7 +3108,7 @@ mod tests {
         writer.finish_write(Lsn(0x20));
         drop(writer);
 
-        tline.checkpoint(CheckpointConfig::Forced).await?;
+        tline.freeze_and_flush().await?;
         tline.compact().await?;
 
         let writer = tline.writer();
@@ -3114,7 +3116,7 @@ mod tests {
         writer.finish_write(Lsn(0x30));
         drop(writer);
 
-        tline.checkpoint(CheckpointConfig::Forced).await?;
+        tline.freeze_and_flush().await?;
         tline.compact().await?;
 
         let writer = tline.writer();
@@ -3122,7 +3124,7 @@ mod tests {
         writer.finish_write(Lsn(0x40));
         drop(writer);
 
-        tline.checkpoint(CheckpointConfig::Forced).await?;
+        tline.freeze_and_flush().await?;
         tline.compact().await?;
 
         assert_eq!(tline.get(*TEST_KEY, Lsn(0x10))?, TEST_IMG("foo at 0x10"));
@@ -3135,8 +3137,8 @@ mod tests {
     }
 
     //
-    // Insert 1000 key-value pairs with increasing keys, checkpoint,
-    // repeat 50 times.
+    // Insert 1000 key-value pairs with increasing keys, flush, compact, GC.
+    // Repeat 50 times.
     //
     #[tokio::test]
     async fn test_bulk_insert() -> anyhow::Result<()> {
@@ -3172,7 +3174,7 @@ mod tests {
             let cutoff = tline.get_last_record_lsn();
 
             tline.update_gc_info(Vec::new(), cutoff, Duration::ZERO)?;
-            tline.checkpoint(CheckpointConfig::Forced).await?;
+            tline.freeze_and_flush().await?;
             tline.compact().await?;
             tline.gc().await?;
         }
@@ -3240,11 +3242,10 @@ mod tests {
                 );
             }
 
-            // Perform a cycle of checkpoint, compaction, and GC
-            println!("checkpointing {}", lsn);
+            // Perform a cycle of flush, compact, and GC
             let cutoff = tline.get_last_record_lsn();
             tline.update_gc_info(Vec::new(), cutoff, Duration::ZERO)?;
-            tline.checkpoint(CheckpointConfig::Forced).await?;
+            tline.freeze_and_flush().await?;
             tline.compact().await?;
             tline.gc().await?;
         }
@@ -3323,11 +3324,10 @@ mod tests {
                 );
             }
 
-            // Perform a cycle of checkpoint, compaction, and GC
-            println!("checkpointing {}", lsn);
+            // Perform a cycle of flush, compact, and GC
             let cutoff = tline.get_last_record_lsn();
             tline.update_gc_info(Vec::new(), cutoff, Duration::ZERO)?;
-            tline.checkpoint(CheckpointConfig::Forced).await?;
+            tline.freeze_and_flush().await?;
             tline.compact().await?;
             tline.gc().await?;
         }
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index b1f580c32f..0697ec4bd6 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -57,7 +57,6 @@ use crate::repository::{Key, Value};
 use crate::task_mgr::TaskKind;
 use crate::walreceiver::{is_broker_client_initialized, spawn_connection_manager_task};
 use crate::walredo::WalRedoManager;
-use crate::CheckpointConfig;
 use crate::METADATA_FILE_NAME;
 use crate::ZERO_PAGE;
 use crate::{is_temporary, task_mgr};
@@ -499,22 +498,10 @@ impl Timeline {
     }
 
     /// Flush to disk all data that was written with the put_* functions
-    ///
-    /// NOTE: This has nothing to do with checkpoint in PostgreSQL. We don't
-    /// know anything about them here in the repository.
     #[instrument(skip(self), fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id))]
-    pub async fn checkpoint(&self, cconf: CheckpointConfig) -> anyhow::Result<()> {
-        match cconf {
-            CheckpointConfig::Flush => {
-                self.freeze_inmem_layer(false);
-                self.flush_frozen_layers_and_wait().await
-            }
-            CheckpointConfig::Forced => {
-                self.freeze_inmem_layer(false);
-                self.flush_frozen_layers_and_wait().await?;
-                self.compact().await
-            }
-        }
+    pub async fn freeze_and_flush(&self) -> anyhow::Result<()> {
+        self.freeze_inmem_layer(false);
+        self.flush_frozen_layers_and_wait().await
     }
 
     pub async fn compact(&self) -> anyhow::Result<()> {
diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs
index 615dcce4a1..85be420cb8 100644
--- a/pageserver/src/tenant_mgr.rs
+++ b/pageserver/src/tenant_mgr.rs
@@ -196,7 +196,7 @@ pub async fn shutdown_all_tenants() {
         let tenant_id = tenant.tenant_id();
         debug!("shutdown tenant {tenant_id}");
 
-        if let Err(err) = tenant.checkpoint().await {
+        if let Err(err) = tenant.freeze_and_flush().await {
             error!("Could not checkpoint tenant {tenant_id} during shutdown: {err:?}");
         }
     }

From 4cda9919bf30baed0255eee2e61dd1797ffb2cb1 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Tue, 20 Dec 2022 13:34:18 +0200
Subject: [PATCH 038/132] Use Self to emphasize this is a constructor

---
 libs/remote_storage/src/lib.rs | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs
index 28858fcbab..568cb7224f 100644
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -164,18 +164,16 @@ impl Deref for GenericRemoteStorage {
 }
 
 impl GenericRemoteStorage {
-    pub fn from_config(
-        storage_config: &RemoteStorageConfig,
-    ) -> anyhow::Result<GenericRemoteStorage> {
+    pub fn from_config(storage_config: &RemoteStorageConfig) -> anyhow::Result<Self> {
         Ok(match &storage_config.storage {
             RemoteStorageKind::LocalFs(root) => {
                 info!("Using fs root '{}' as a remote storage", root.display());
-                GenericRemoteStorage::LocalFs(LocalFs::new(root.clone())?)
+                Self::LocalFs(LocalFs::new(root.clone())?)
             }
             RemoteStorageKind::AwsS3(s3_config) => {
                 info!("Using s3 bucket '{}' in region '{}' as a remote storage, prefix in bucket: '{:?}', bucket endpoint: '{:?}'",
                       s3_config.bucket_name, s3_config.bucket_region, s3_config.prefix_in_bucket, s3_config.endpoint);
-                GenericRemoteStorage::AwsS3(Arc::new(S3Bucket::new(s3_config)?))
+                Self::AwsS3(Arc::new(S3Bucket::new(s3_config)?))
             }
         })
     }

From 8e2edfcf39c21e41cd3c2e74524ec7a777714555 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Tue, 20 Dec 2022 13:34:21 +0200
Subject: [PATCH 039/132] Retry remote downloads.

Remote operations fail sometimes due to network failures or other
external reasons. Add retry logic to all the remote downloads, so that
a transient failure at pageserver startup or tenant attach doesn't
cause the whole tenant to be marked as Broken.

Like in the uploads retry logic, we print the failure to the log as a
WARNing after three retries, but keep retrying. We will retry up to 10
times now, before returning the error to the caller.

To test the retries, I created a new RemoteStorage wrapper that simulates
failures, by returning an error for the first N times that a remote
operation is performed. It can be enabled by setting a new
"test_remote_failures" option in the pageserver config file.

Fixes #3112
---
 libs/remote_storage/src/lib.rs                |  14 +-
 libs/remote_storage/src/local_fs.rs           |   8 +-
 libs/remote_storage/src/s3_bucket.rs          |  12 +-
 libs/remote_storage/src/simulate_failures.rs  | 129 +++++++++++
 pageserver/src/bin/pageserver.rs              |  10 +-
 pageserver/src/config.rs                      |  17 ++
 pageserver/src/storage_sync2.rs               |  57 ++++-
 pageserver/src/storage_sync2/download.rs      | 202 +++++++++++++-----
 test_runner/fixtures/neon_fixtures.py         |  22 ++
 test_runner/regress/test_remote_storage.py    |  24 ++-
 test_runner/regress/test_tenant_detach.py     |  52 +++++
 .../test_tenants_with_remote_storage.py       |  16 ++
 12 files changed, 480 insertions(+), 83 deletions(-)
 create mode 100644 libs/remote_storage/src/simulate_failures.rs

diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs
index 568cb7224f..1091a8bd5c 100644
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -7,6 +7,7 @@
 //!
 mod local_fs;
 mod s3_bucket;
+mod simulate_failures;
 
 use std::{
     collections::HashMap,
@@ -24,7 +25,7 @@ use tokio::io;
 use toml_edit::Item;
 use tracing::info;
 
-pub use self::{local_fs::LocalFs, s3_bucket::S3Bucket};
+pub use self::{local_fs::LocalFs, s3_bucket::S3Bucket, simulate_failures::UnreliableWrapper};
 
 /// How many different timelines can be processed simultaneously when synchronizing layers with the remote storage.
 /// During regular work, pageserver produces one layer file per timeline checkpoint, with bursts of concurrency
@@ -77,7 +78,10 @@ pub trait RemoteStorage: Send + Sync + 'static {
     /// Note: here we assume that if the prefix is passed it was obtained via remote_object_id
     /// which already takes into account any kind of global prefix (prefix_in_bucket for S3 or storage_root for LocalFS)
     /// so this method doesnt need to.
-    async fn list_prefixes(&self, prefix: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>>;
+    async fn list_prefixes(
+        &self,
+        prefix: Option<&RemotePath>,
+    ) -> Result<Vec<RemotePath>, DownloadError>;
 
     /// Streams the local file contents into remote into the remote storage entry.
     async fn upload(
@@ -150,6 +154,7 @@ impl std::error::Error for DownloadError {}
 pub enum GenericRemoteStorage {
     LocalFs(LocalFs),
     AwsS3(Arc<S3Bucket>),
+    Unreliable(Arc<UnreliableWrapper>),
 }
 
 impl Deref for GenericRemoteStorage {
@@ -159,6 +164,7 @@ impl Deref for GenericRemoteStorage {
         match self {
             GenericRemoteStorage::LocalFs(local_fs) => local_fs,
             GenericRemoteStorage::AwsS3(s3_bucket) => s3_bucket.as_ref(),
+            GenericRemoteStorage::Unreliable(s) => s.as_ref(),
         }
     }
 }
@@ -178,6 +184,10 @@ impl GenericRemoteStorage {
         })
     }
 
+    pub fn unreliable_wrapper(s: Self, fail_first: u64) -> Self {
+        Self::Unreliable(Arc::new(UnreliableWrapper::new(s, fail_first)))
+    }
+
     /// Takes storage object contents and its size and uploads to remote storage,
     /// mapping `from_path` to the corresponding remote object id in the storage.
     ///
diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs
index 50a84eb33f..f1289569ae 100644
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -92,13 +92,17 @@ impl RemoteStorage for LocalFs {
             .collect())
     }
 
-    async fn list_prefixes(&self, prefix: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
+    async fn list_prefixes(
+        &self,
+        prefix: Option<&RemotePath>,
+    ) -> Result<Vec<RemotePath>, DownloadError> {
         let path = match prefix {
             Some(prefix) => Cow::Owned(prefix.with_base(&self.storage_root)),
             None => Cow::Borrowed(&self.storage_root),
         };
         Ok(get_all_files(path.as_ref(), false)
-            .await?
+            .await
+            .map_err(DownloadError::Other)?
             .into_iter()
             .map(|path| {
                 path.strip_prefix(&self.storage_root)
diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs
index 740f3753d8..18a2c5dedd 100644
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -286,7 +286,10 @@ impl RemoteStorage for S3Bucket {
 
     /// See the doc for `RemoteStorage::list_prefixes`
     /// Note: it wont include empty "directories"
-    async fn list_prefixes(&self, prefix: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
+    async fn list_prefixes(
+        &self,
+        prefix: Option<&RemotePath>,
+    ) -> Result<Vec<RemotePath>, DownloadError> {
         // get the passed prefix or if it is not set use prefix_in_bucket value
         let list_prefix = prefix
             .map(|p| self.relative_path_to_s3_object(p))
@@ -308,7 +311,8 @@ impl RemoteStorage for S3Bucket {
                 .concurrency_limiter
                 .acquire()
                 .await
-                .context("Concurrency limiter semaphore got closed during S3 list")?;
+                .context("Concurrency limiter semaphore got closed during S3 list")
+                .map_err(DownloadError::Other)?;
 
             metrics::inc_list_objects();
 
@@ -324,7 +328,9 @@ impl RemoteStorage for S3Bucket {
                 .map_err(|e| {
                     metrics::inc_list_objects_fail();
                     e
-                })?;
+                })
+                .context("Failed to list S3 prefixes")
+                .map_err(DownloadError::Other)?;
 
             document_keys.extend(
                 fetch_response
diff --git a/libs/remote_storage/src/simulate_failures.rs b/libs/remote_storage/src/simulate_failures.rs
new file mode 100644
index 0000000000..643bb99dce
--- /dev/null
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -0,0 +1,129 @@
+//! This module provides a wrapper around a real RemoteStorage implementation that
+//! causes the first N attempts at each upload or download operation to fail. For
+//! testing purposes.
+use std::collections::hash_map::Entry;
+use std::collections::HashMap;
+use std::sync::Mutex;
+
+use crate::{Download, DownloadError, RemotePath, RemoteStorage, StorageMetadata};
+
+pub struct UnreliableWrapper {
+    inner: crate::GenericRemoteStorage,
+
+    // This many attempts of each operation will fail, then we let it succeed.
+    attempts_to_fail: u64,
+
+    // Tracks how many failed attempts of each operation have been made.
+    attempts: Mutex<HashMap<RemoteOp, u64>>,
+}
+
+/// Used to identify retries of different unique operations.
+#[derive(Debug, Hash, Eq, PartialEq)]
+enum RemoteOp {
+    List,
+    ListPrefixes(Option<RemotePath>),
+    Upload(RemotePath),
+    Download(RemotePath),
+    Delete(RemotePath),
+}
+
+impl UnreliableWrapper {
+    pub fn new(inner: crate::GenericRemoteStorage, attempts_to_fail: u64) -> Self {
+        assert!(attempts_to_fail > 0);
+        UnreliableWrapper {
+            inner,
+            attempts_to_fail,
+            attempts: Mutex::new(HashMap::new()),
+        }
+    }
+
+    ///
+    /// Common functionality for all operations.
+    ///
+    /// The first attempt always fails; later attempts fail until the attempt count reaches
+    /// 'attempts_to_fail'. Then the operation is let through and the counter is cleared.
+    ///
+    fn attempt(&self, op: RemoteOp) -> Result<u64, DownloadError> {
+        let mut attempts = self.attempts.lock().unwrap();
+
+        match attempts.entry(op) {
+            Entry::Occupied(mut e) => {
+                let attempts_before_this = {
+                    let p = e.get_mut();
+                    *p += 1;
+                    *p
+                };
+
+                if attempts_before_this >= self.attempts_to_fail {
+                    // let it succeed
+                    e.remove();
+                    Ok(attempts_before_this)
+                } else {
+                    let error =
+                        anyhow::anyhow!("simulated failure of remote operation {:?}", e.key());
+                    Err(DownloadError::Other(error))
+                }
+            }
+            Entry::Vacant(e) => {
+                let error = anyhow::anyhow!("simulated failure of remote operation {:?}", e.key());
+                e.insert(1);
+                Err(DownloadError::Other(error))
+            }
+        }
+    }
+}
+
+#[async_trait::async_trait]
+impl RemoteStorage for UnreliableWrapper {
+    /// Lists all items the storage has right now.
+    async fn list(&self) -> anyhow::Result<Vec<RemotePath>> {
+        self.attempt(RemoteOp::List)?;
+        self.inner.list().await
+    }
+
+    async fn list_prefixes(
+        &self,
+        prefix: Option<&RemotePath>,
+    ) -> Result<Vec<RemotePath>, DownloadError> {
+        self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))?;
+        self.inner.list_prefixes(prefix).await
+    }
+
+    async fn upload(
+        &self,
+        data: Box<(dyn tokio::io::AsyncRead + Unpin + Send + Sync + 'static)>,
+        // S3 PUT request requires the content length to be specified,
+        // otherwise it starts to fail with the concurrent connection count increasing.
+        data_size_bytes: usize,
+        to: &RemotePath,
+        metadata: Option<StorageMetadata>,
+    ) -> anyhow::Result<()> {
+        self.attempt(RemoteOp::Upload(to.clone()))?;
+        self.inner.upload(data, data_size_bytes, to, metadata).await
+    }
+
+    async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
+        self.attempt(RemoteOp::Download(from.clone()))?;
+        self.inner.download(from).await
+    }
+
+    async fn download_byte_range(
+        &self,
+        from: &RemotePath,
+        start_inclusive: u64,
+        end_exclusive: Option<u64>,
+    ) -> Result<Download, DownloadError> {
+        // Note: We treat any download_byte_range as an "attempt" of the same
+        // operation. We don't pay attention to the ranges. That's good enough
+        // for now.
+        self.attempt(RemoteOp::Download(from.clone()))?;
+        self.inner
+            .download_byte_range(from, start_inclusive, end_exclusive)
+            .await
+    }
+
+    async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
+        self.attempt(RemoteOp::Delete(path.clone()))?;
+        self.inner.delete(path).await
+    }
+}
diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs
index 47e9382e6d..86ce318d0a 100644
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -12,14 +12,13 @@ use tracing::*;
 use metrics::set_build_info_metric;
 use pageserver::{
     config::{defaults::*, PageServerConf},
-    http, page_cache, page_service, profiling, task_mgr,
+    http, page_cache, page_service, profiling, storage_sync2, task_mgr,
     task_mgr::TaskKind,
     task_mgr::{
         BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME, WALRECEIVER_RUNTIME,
     },
     tenant_mgr, virtual_file,
 };
-use remote_storage::GenericRemoteStorage;
 use utils::{
     auth::JwtAuth,
     logging,
@@ -281,12 +280,7 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
     };
 
     // Set up remote storage client
-    let remote_storage = conf
-        .remote_storage_config
-        .as_ref()
-        .map(GenericRemoteStorage::from_config)
-        .transpose()
-        .context("Failed to init generic remote storage")?;
+    let remote_storage = storage_sync2::create_remote_storage_client(conf)?;
 
     // Scan the local 'tenants/' directory and start loading the tenants
     BACKGROUND_RUNTIME.block_on(tenant_mgr::init_tenant_mgr(conf, remote_storage.clone()))?;
diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index 9971ddc0f7..93c221e622 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -143,6 +143,8 @@ pub struct PageServerConf {
 
     /// Number of concurrent [`Tenant::gather_size_inputs`] allowed.
     pub concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore,
+
+    pub test_remote_failures: u64,
 }
 
 /// We do not want to store this in a PageServerConf because the latter may be logged
@@ -221,6 +223,8 @@ struct PageServerConfigBuilder {
     log_format: BuilderValue<LogFormat>,
 
     concurrent_tenant_size_logical_size_queries: BuilderValue<ConfigurableSemaphore>,
+
+    test_remote_failures: BuilderValue<u64>,
 }
 
 impl Default for PageServerConfigBuilder {
@@ -256,6 +260,8 @@ impl Default for PageServerConfigBuilder {
             log_format: Set(LogFormat::from_str(DEFAULT_LOG_FORMAT).unwrap()),
 
             concurrent_tenant_size_logical_size_queries: Set(ConfigurableSemaphore::default()),
+
+            test_remote_failures: Set(0),
         }
     }
 }
@@ -336,6 +342,10 @@ impl PageServerConfigBuilder {
         self.concurrent_tenant_size_logical_size_queries = BuilderValue::Set(u);
     }
 
+    pub fn test_remote_failures(&mut self, fail_first: u64) {
+        self.test_remote_failures = BuilderValue::Set(fail_first);
+    }
+
     pub fn build(self) -> anyhow::Result<PageServerConf> {
         Ok(PageServerConf {
             listen_pg_addr: self
@@ -384,6 +394,9 @@ impl PageServerConfigBuilder {
                 .ok_or(anyhow!(
                     "missing concurrent_tenant_size_logical_size_queries"
                 ))?,
+            test_remote_failures: self
+                .test_remote_failures
+                .ok_or(anyhow!("missing test_remote_failuers"))?,
         })
     }
 }
@@ -555,6 +568,7 @@ impl PageServerConf {
                     let permits = NonZeroUsize::new(permits).context("initial semaphore permits out of range: 0, use other configuration to disable a feature")?;
                     ConfigurableSemaphore::new(permits)
                 }),
+                "test_remote_failures" => builder.test_remote_failures(parse_toml_u64(key, item)?),
                 _ => bail!("unrecognized pageserver option '{key}'"),
             }
         }
@@ -676,6 +690,7 @@ impl PageServerConf {
             broker_keepalive_interval: Duration::from_secs(5000),
             log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(),
             concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
+            test_remote_failures: 0,
         }
     }
 }
@@ -849,6 +864,7 @@ log_format = 'json'
                 )?,
                 log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(),
                 concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
+                test_remote_failures: 0,
             },
             "Correct defaults should be used when no config values are provided"
         );
@@ -893,6 +909,7 @@ log_format = 'json'
                 broker_keepalive_interval: Duration::from_secs(5),
                 log_format: LogFormat::Json,
                 concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
+                test_remote_failures: 0,
             },
             "Should be able to parse all basic config values correctly"
         );
diff --git a/pageserver/src/storage_sync2.rs b/pageserver/src/storage_sync2.rs
index 14763985ab..14ab332eba 100644
--- a/pageserver/src/storage_sync2.rs
+++ b/pageserver/src/storage_sync2.rs
@@ -227,6 +227,18 @@ use crate::{
 
 use utils::id::{TenantId, TimelineId};
 
+// Occasional network issues and such can cause remote operations to fail, and
+// that's expected. If a download fails, we log it at info-level, and retry.
+// But after FAILED_DOWNLOAD_WARN_THRESHOLD retries, we start to log it at WARN
+// level instead, as repeated failures can mean a more serious problem. If it
+// fails more than FAILED_DOWNLOAD_RETRIES times, we give up
+const FAILED_DOWNLOAD_WARN_THRESHOLD: u32 = 3;
+const FAILED_DOWNLOAD_RETRIES: u32 = 10;
+
+// Similarly log failed uploads and deletions at WARN level, after this many
+// retries. Uploads and deletions are retried forever, though.
+const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3;
+
 /// A client for accessing a timeline's data in remote storage.
 ///
 /// This takes care of managing the number of connections, and balancing them
@@ -977,12 +989,14 @@ impl RemoteTimelineClient {
                 Err(e) => {
                     let retries = task.retries.fetch_add(1, Ordering::SeqCst);
 
-                    // uploads may fail due to rate limts (IAM, S3) or spurious network and external errors
-                    // such issues are relatively regular, so don't use WARN or ERROR to avoid alerting
-                    // people and tests until the retries are definitely causing delays.
-                    if retries < 3 {
+                    // Uploads can fail due to rate limits (IAM, S3), spurious network problems,
+                    // or other external reasons. Such issues are relatively regular, so log them
+                    // at info level at first, and only WARN if the operation fails repeatedly.
+                    //
+                    // (See similar logic for downloads in `download::download_retry`)
+                    if retries < FAILED_UPLOAD_WARN_THRESHOLD {
                         info!(
-                            "failed to perform remote task {}, will retry (attempt {}): {:?}",
+                            "failed to perform remote task {}, will retry (attempt {}): {:#}",
                             task.op, retries, e
                         );
                     } else {
@@ -1148,6 +1162,39 @@ pub fn create_remote_timeline_client(
     })
 }
 
+/// Create a GenericRemoteStorage client from the pageserver config.
+/// Returns `Ok(None)` if no remote storage is configured, and fails if
+/// `test_remote_failures` is non-zero in a build without the 'testing' feature.
+pub fn create_remote_storage_client(
+    conf: &'static PageServerConf,
+) -> anyhow::Result<Option<GenericRemoteStorage>> {
+    let config = if let Some(config) = &conf.remote_storage_config {
+        config
+    } else {
+        // No remote storage configured.
+        return Ok(None);
+    };
+
+    // Create the client
+    let mut remote_storage = GenericRemoteStorage::from_config(config)?;
+
+    // If `test_remote_failures` is non-zero, wrap the client with a
+    // wrapper that simulates failures.
+    if conf.test_remote_failures > 0 {
+        if !cfg!(feature = "testing") {
+            anyhow::bail!("test_remote_failures option is not available because pageserver was compiled without the 'testing' feature");
+        }
+        info!(
+            "Simulating remote failures for first {} attempts of each op",
+            conf.test_remote_failures
+        );
+        remote_storage =
+            GenericRemoteStorage::unreliable_wrapper(remote_storage, conf.test_remote_failures);
+    }
+
+    Ok(Some(remote_storage))
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
diff --git a/pageserver/src/storage_sync2/download.rs b/pageserver/src/storage_sync2/download.rs
index 0d25d88a97..c81be05981 100644
--- a/pageserver/src/storage_sync2/download.rs
+++ b/pageserver/src/storage_sync2/download.rs
@@ -1,21 +1,28 @@
 //! Helper functions to download files from remote storage with a RemoteStorage
+//!
+//! The functions in this module retry failed operations automatically, according
+//! to the FAILED_DOWNLOAD_RETRIES constant.
+
 use std::collections::HashSet;
+use std::future::Future;
 use std::path::Path;
 
-use anyhow::{bail, Context};
+use anyhow::{anyhow, Context};
 use futures::stream::{FuturesUnordered, StreamExt};
 use tokio::fs;
 use tokio::io::AsyncWriteExt;
-use tracing::{debug, info_span, Instrument};
+use tracing::{debug, error, info, info_span, warn, Instrument};
 
 use crate::config::PageServerConf;
 use crate::storage_sync::index::LayerFileMetadata;
 use crate::tenant::filename::LayerFileName;
+use crate::{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS};
 use remote_storage::{DownloadError, GenericRemoteStorage};
 use utils::crashsafe::path_with_suffix_extension;
 use utils::id::{TenantId, TimelineId};
 
 use super::index::{IndexPart, IndexPartUnclean};
+use super::{FAILED_DOWNLOAD_RETRIES, FAILED_DOWNLOAD_WARN_THRESHOLD};
 
 async fn fsync_path(path: impl AsRef<std::path::Path>) -> Result<(), std::io::Error> {
     fs::File::open(path).await?.sync_all().await
@@ -33,12 +40,14 @@ pub async fn download_layer_file<'a>(
     timeline_id: TimelineId,
     layer_file_name: &'a LayerFileName,
     layer_metadata: &'a LayerFileMetadata,
-) -> anyhow::Result<u64> {
+) -> Result<u64, DownloadError> {
     let timeline_path = conf.timeline_path(&timeline_id, &tenant_id);
 
     let local_path = timeline_path.join(layer_file_name.file_name());
 
-    let remote_path = conf.remote_path(&local_path)?;
+    let remote_path = conf
+        .remote_path(&local_path)
+        .map_err(DownloadError::Other)?;
 
     // Perform a rename inspired by durable_rename from file_utils.c.
     // The sequence:
@@ -52,21 +61,30 @@ pub async fn download_layer_file<'a>(
     // If pageserver crashes the temp file will be deleted on startup and re-downloaded.
     let temp_file_path = path_with_suffix_extension(&local_path, TEMP_DOWNLOAD_EXTENSION);
 
-    // TODO: this doesn't use the cached fd for some reason?
-    let mut destination_file = fs::File::create(&temp_file_path).await.with_context(|| {
-        format!(
-            "Failed to create a destination file for layer '{}'",
-            temp_file_path.display()
-        )
-    })?;
-    let mut download = storage.download(&remote_path).await.with_context(|| {
-        format!(
-            "Failed to open a download stream for layer with remote storage path '{remote_path:?}'"
-        )
-    })?;
-    let bytes_amount = tokio::io::copy(&mut download.download_stream, &mut destination_file).await.with_context(|| {
-        format!("Failed to download layer with remote storage path '{remote_path:?}' into file {temp_file_path:?}")
-    })?;
+    let (mut destination_file, bytes_amount) = download_retry(
+        || async {
+            // TODO: this doesn't use the cached fd for some reason?
+            let mut destination_file = fs::File::create(&temp_file_path).await.with_context(|| {
+                format!(
+                    "Failed to create a destination file for layer '{}'",
+                    temp_file_path.display()
+                )
+            })
+            .map_err(DownloadError::Other)?;
+            let mut download = storage.download(&remote_path).await.with_context(|| {
+                format!(
+                    "Failed to open a download stream for layer with remote storage path '{remote_path:?}'"
+                )
+            })
+            .map_err(DownloadError::Other)?;
+            let bytes_amount = tokio::io::copy(&mut download.download_stream, &mut destination_file).await.with_context(|| {
+                format!("Failed to download layer with remote storage path '{remote_path:?}' into file {temp_file_path:?}")
+            })
+            .map_err(DownloadError::Other)?;
+            Ok((destination_file, bytes_amount))
+        },
+        &format!("download {remote_path:?}"),
+    ).await?;
 
     // Tokio doc here: https://docs.rs/tokio/1.17.0/tokio/fs/struct.File.html states that:
     // A file will not be closed immediately when it goes out of scope if there are any IO operations
@@ -76,19 +94,23 @@ pub async fn download_layer_file<'a>(
     // From the tokio code I see that it waits for pending operations to complete. There shouldt be any because
     // we assume that `destination_file` file is fully written. I e there is no pending .write(...).await operations.
     // But for additional safety lets check/wait for any pending operations.
-    destination_file.flush().await.with_context(|| {
-        format!(
-            "failed to flush source file at {}",
-            temp_file_path.display()
-        )
-    })?;
+    destination_file
+        .flush()
+        .await
+        .with_context(|| {
+            format!(
+                "failed to flush source file at {}",
+                temp_file_path.display()
+            )
+        })
+        .map_err(DownloadError::Other)?;
 
     match layer_metadata.file_size() {
         Some(expected) if expected != bytes_amount => {
-            anyhow::bail!(
-                "According to layer file metadata should had downloaded {expected} bytes but downloaded {bytes_amount} bytes into file '{}'",
+            return Err(DownloadError::Other(anyhow!(
+                "According to layer file metadata should have downloaded {expected} bytes but downloaded {bytes_amount} bytes into file '{}'",
                 temp_file_path.display()
-            );
+            )));
         }
         Some(_) | None => {
             // matches, or upgrading from an earlier IndexPart version
@@ -96,23 +118,38 @@ pub async fn download_layer_file<'a>(
     }
 
     // not using sync_data because it can lose file size update
-    destination_file.sync_all().await.with_context(|| {
-        format!(
-            "failed to fsync source file at {}",
-            temp_file_path.display()
-        )
-    })?;
+    destination_file
+        .sync_all()
+        .await
+        .with_context(|| {
+            format!(
+                "failed to fsync source file at {}",
+                temp_file_path.display()
+            )
+        })
+        .map_err(DownloadError::Other)?;
     drop(destination_file);
 
     fail::fail_point!("remote-storage-download-pre-rename", |_| {
-        bail!("remote-storage-download-pre-rename failpoint triggered")
+        Err(DownloadError::Other(anyhow!(
+            "remote-storage-download-pre-rename failpoint triggered"
+        )))
     });
 
-    fs::rename(&temp_file_path, &local_path).await?;
+    fs::rename(&temp_file_path, &local_path)
+        .await
+        .with_context(|| {
+            format!(
+                "Could not rename download layer file to {}",
+                local_path.display(),
+            )
+        })
+        .map_err(DownloadError::Other)?;
 
     fsync_path(&local_path)
         .await
-        .with_context(|| format!("Could not fsync layer file {}", local_path.display(),))?;
+        .with_context(|| format!("Could not fsync layer file {}", local_path.display(),))
+        .map_err(DownloadError::Other)?;
 
     tracing::info!("download complete: {}", local_path.display());
 
@@ -143,14 +180,11 @@ pub async fn list_remote_timelines<'a>(
     let tenant_path = conf.timelines_path(&tenant_id);
     let tenant_storage_path = conf.remote_path(&tenant_path)?;
 
-    let timelines = storage
-        .list_prefixes(Some(&tenant_storage_path))
-        .await
-        .with_context(|| {
-            format!(
-                "Failed to list tenant storage path {tenant_storage_path:?} to get remote timelines to download"
-            )
-        })?;
+    let timelines = download_retry(
+        || storage.list_prefixes(Some(&tenant_storage_path)),
+        &format!("list prefixes for {tenant_path:?}"),
+    )
+    .await?;
 
     if timelines.is_empty() {
         anyhow::bail!("no timelines found on the remote storage")
@@ -209,16 +243,25 @@ pub async fn download_index_part(
         .remote_path(&index_part_path)
         .map_err(DownloadError::BadInput)?;
 
-    let mut index_part_download = storage.download(&part_storage_path).await?;
+    let index_part_bytes = download_retry(
+        || async {
+            let mut index_part_download = storage.download(&part_storage_path).await?;
 
-    let mut index_part_bytes = Vec::new();
-    tokio::io::copy(
-        &mut index_part_download.download_stream,
-        &mut index_part_bytes,
+            let mut index_part_bytes = Vec::new();
+            tokio::io::copy(
+                &mut index_part_download.download_stream,
+                &mut index_part_bytes,
+            )
+            .await
+            .with_context(|| {
+                format!("Failed to download an index part into file {index_part_path:?}")
+            })
+            .map_err(DownloadError::Other)?;
+            Ok(index_part_bytes)
+        },
+        &format!("download {part_storage_path:?}"),
     )
-    .await
-    .with_context(|| format!("Failed to download an index part into file {index_part_path:?}"))
-    .map_err(DownloadError::Other)?;
+    .await?;
 
     let index_part: IndexPartUnclean = serde_json::from_slice(&index_part_bytes)
         .with_context(|| {
@@ -230,3 +273,56 @@ pub async fn download_index_part(
 
     Ok(index_part)
 }
+
+/// Helper function to handle retries for a download operation.
+///
+/// Remote operations can fail due to rate limits (IAM, S3), spurious network
+/// problems, or other external reasons. `DownloadError::Other` is retried up to
+/// FAILED_DOWNLOAD_RETRIES times with backoff; `BadInput` and `NotFound` are
+/// returned immediately without retrying.
+///
+/// (See similar logic for uploads in `perform_upload_task`)
+async fn download_retry<T, O, F>(mut op: O, description: &str) -> Result<T, DownloadError>
+where
+    O: FnMut() -> F,
+    F: Future<Output = Result<T, DownloadError>>,
+{
+    let mut attempts = 0;
+    loop {
+        let result = op().await;
+        match result {
+            Ok(_) => {
+                if attempts > 0 {
+                    info!("{description} succeeded after {attempts} retries");
+                }
+                return result;
+            }
+
+            // These are "permanent" errors that should not be retried.
+            Err(DownloadError::BadInput(_)) | Err(DownloadError::NotFound) => {
+                return result;
+            }
+            // Assume that any other failure might be transient, and the operation might
+            // succeed if we just keep trying.
+            Err(DownloadError::Other(err)) if attempts < FAILED_DOWNLOAD_WARN_THRESHOLD => {
+                info!("{description} failed, will retry (attempt {attempts}): {err:#}");
+            }
+            Err(DownloadError::Other(err)) if attempts < FAILED_DOWNLOAD_RETRIES => {
+                warn!("{description} failed, will retry (attempt {attempts}): {err:#}");
+            }
+            Err(DownloadError::Other(ref err)) => {
+                // Operation failed FAILED_DOWNLOAD_RETRIES times. Time to give up.
+                error!("{description} still failed after {attempts} retries, giving up: {err:?}");
+                return result;
+            }
+        }
+        // sleep and retry
+        exponential_backoff(
+            attempts,
+            DEFAULT_BASE_BACKOFF_SECONDS,
+            DEFAULT_MAX_BACKOFF_SECONDS,
+        )
+        .await;
+        attempts += 1;
+    }
+}
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 2eabc25ef6..287f157d97 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1904,6 +1904,28 @@ class NeonPageserver(PgProtocol):
 
         assert not errors
 
+    def log_contains(self, pattern: str) -> Optional[str]:
+        """Check that the pageserver log contains a line that matches the given regex"""
+        logfile = open(os.path.join(self.env.repo_dir, "pageserver.log"), "r")
+
+        contains_re = re.compile(pattern)
+
+        # XXX: Our rust logging machinery buffers the messages, so if you
+        # call this function immediately after it's been logged, there is
+        # no guarantee it is already present in the log file. This hasn't
+        # been a problem in practice, our python tests are not fast enough
+        # to hit that race condition.
+        while True:
+            line = logfile.readline()
+            if not line:
+                break
+
+            if contains_re.search(line):
+                # found it!
+                return line
+
+        return None
+
 
 def append_pageserver_param_overrides(
     params_to_update: List[str],
diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py
index d8f8298fa6..94e483cdb5 100644
--- a/test_runner/regress/test_remote_storage.py
+++ b/test_runner/regress/test_remote_storage.py
@@ -56,6 +56,11 @@ def test_remote_storage_backup_and_restore(
         test_name="test_remote_storage_backup_and_restore",
     )
 
+    # Exercise retry code path by making all uploads and downloads fail for the
+    # first time. The retries print INFO-messages to the log; we will check
+    # that they are present after the test.
+    neon_env_builder.pageserver_config_override = "test_remote_failures=1"
+
     data_id = 1
     data_secret = "very secret secret"
 
@@ -76,6 +81,7 @@ def test_remote_storage_backup_and_restore(
     env.pageserver.allowed_errors.append(
         ".*Cannot attach tenant .*?, local tenant directory already exists.*"
     )
+    env.pageserver.allowed_errors.append(".*simulated failure of remote operation.*")
 
     pageserver_http = env.pageserver.http_client()
     pg = env.postgres.create_start("main")
@@ -87,16 +93,6 @@ def test_remote_storage_backup_and_restore(
 
     checkpoint_numbers = range(1, 3)
 
-    # On the first iteration, exercise retry code path by making the uploads
-    # fail for the first 3 times
-    action = "3*return->off"
-    pageserver_http.configure_failpoints(
-        [
-            ("before-upload-layer", action),
-            ("before-upload-index", action),
-        ]
-    )
-
     for checkpoint_number in checkpoint_numbers:
         with pg.cursor() as cur:
             cur.execute(
@@ -118,6 +114,14 @@ def test_remote_storage_backup_and_restore(
         wait_for_upload(client, tenant_id, timeline_id, current_lsn)
         log.info(f"upload of checkpoint {checkpoint_number} is done")
 
+    # Check that we had to retry the uploads
+    assert env.pageserver.log_contains(
+        ".*failed to perform remote task UploadLayer.*, will retry.*"
+    )
+    assert env.pageserver.log_contains(
+        ".*failed to perform remote task UploadMetadata.*, will retry.*"
+    )
+
     ##### Stop the first pageserver instance, erase all its data
     env.postgres.stop_all()
     env.pageserver.stop()
diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py
index ce1e334bfa..8bf0fb7548 100644
--- a/test_runner/regress/test_tenant_detach.py
+++ b/test_runner/regress/test_tenant_detach.py
@@ -32,6 +32,58 @@ def do_gc_target(
         log.info("gc http thread returning")
 
 
+# Basic detach and re-attach test
+@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
+def test_tenant_reattach(
+    neon_env_builder: NeonEnvBuilder,
+    remote_storage_kind: RemoteStorageKind,
+):
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=remote_storage_kind,
+        test_name="test_tenant_reattach",
+    )
+
+    # Exercise retry code path by making all uploads and downloads fail for the
+    # first time. The retries print INFO-messages to the log; we will check
+    # that they are present after the test.
+    neon_env_builder.pageserver_config_override = "test_remote_failures=1"
+
+    env = neon_env_builder.init_start()
+    pageserver_http = env.pageserver.http_client()
+
+    # create new tenant
+    tenant_id, timeline_id = env.neon_cli.create_tenant()
+
+    pg = env.postgres.create_start("main", tenant_id=tenant_id)
+    with pg.cursor() as cur:
+        cur.execute("CREATE TABLE t(key int primary key, value text)")
+        cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'")
+        current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
+
+    # Wait for all the data to be processed by the pageserver and uploaded to remote storage
+    wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, current_lsn)
+    pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
+    wait_for_upload(pageserver_http, tenant_id, timeline_id, current_lsn)
+
+    # Check that we had to retry the uploads
+    assert env.pageserver.log_contains(
+        ".*failed to perform remote task UploadLayer.*, will retry.*"
+    )
+    assert env.pageserver.log_contains(
+        ".*failed to perform remote task UploadMetadata.*, will retry.*"
+    )
+
+    pageserver_http.tenant_detach(tenant_id)
+    pageserver_http.tenant_attach(tenant_id)
+
+    with pg.cursor() as cur:
+        assert query_scalar(cur, "SELECT count(*) FROM t") == 100000
+
+    # Check that we had to retry the downloads
+    assert env.pageserver.log_contains(".*list prefixes.*failed, will retry.*")
+    assert env.pageserver.log_contains(".*download.*failed, will retry.*")
+
+
 def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
     env = neon_env_builder.init_start()
     pageserver_http = env.pageserver.http_client()
diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py
index 57aaa70559..4cd74e17e9 100644
--- a/test_runner/regress/test_tenants_with_remote_storage.py
+++ b/test_runner/regress/test_tenants_with_remote_storage.py
@@ -121,6 +121,11 @@ def test_tenants_attached_after_download(
     data_id = 1
     data_secret = "very secret secret"
 
+    # Exercise retry code path by making all uploads and downloads fail for the
+    # first time. The retries print INFO-messages to the log; we will check
+    # that they are present after the test.
+    neon_env_builder.pageserver_config_override = "test_remote_failures=1"
+
     ##### First start, insert secret data and upload it to the remote storage
     env = neon_env_builder.init_start()
 
@@ -159,6 +164,14 @@ def test_tenants_attached_after_download(
         wait_for_upload(client, tenant_id, timeline_id, current_lsn)
         log.info(f"upload of checkpoint {checkpoint_number} is done")
 
+    # Check that we had to retry the uploads
+    assert env.pageserver.log_contains(
+        ".*failed to perform remote task UploadLayer.*, will retry.*"
+    )
+    assert env.pageserver.log_contains(
+        ".*failed to perform remote task UploadMetadata.*, will retry.*"
+    )
+
     ##### Stop the pageserver, erase its layer file to force it being downloaded from S3
     env.postgres.stop_all()
 
@@ -211,6 +224,9 @@ def test_tenants_attached_after_download(
     )
     assert detail_before["current_physical_size"] == detail_after["current_physical_size"]
 
+    # Check that we had to retry the downloads
+    assert env.pageserver.log_contains(".*download .* succeeded after 1 retries.*")
+
 
 @pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
 def test_tenant_upgrades_index_json_from_v0(

From 0c71dc627bf0727265ee8fec35cd38c234315842 Mon Sep 17 00:00:00 2001
From: Kirill Bulatov <kirill@neon.tech>
Date: Tue, 20 Dec 2022 15:54:02 +0200
Subject: [PATCH 040/132] Tidy up walreceiver logs (#3147)

Closes https://github.com/neondatabase/neon/issues/3114

Improves walreceiver logs and removes `clone()` calls.
---
 .../src/walreceiver/connection_manager.rs     | 72 +++++++++++--------
 .../src/walreceiver/walreceiver_connection.rs | 26 +++++--
 2 files changed, 63 insertions(+), 35 deletions(-)

diff --git a/pageserver/src/walreceiver/connection_manager.rs b/pageserver/src/walreceiver/connection_manager.rs
index 8048707480..a65703bca9 100644
--- a/pageserver/src/walreceiver/connection_manager.rs
+++ b/pageserver/src/walreceiver/connection_manager.rs
@@ -145,21 +145,17 @@ async fn connection_manager_loop_step(
                 let wal_connection = walreceiver_state.wal_connection.as_mut()
                     .expect("Should have a connection, as checked by the corresponding select! guard");
                 match wal_connection_update {
-                    TaskEvent::Update(c) => {
-                        match c {
-                            TaskStateUpdate::Init | TaskStateUpdate::Started => {},
-                            TaskStateUpdate::Progress(status) => {
-                                if status.has_processed_wal {
-                                    // We have advanced last_record_lsn by processing the WAL received
-                                    // from this safekeeper. This is good enough to clean unsuccessful
-                                    // retries history and allow reconnecting to this safekeeper without
-                                    // sleeping for a long time.
-                                    walreceiver_state.wal_connection_retries.remove(&wal_connection.sk_id);
-                                }
-                                wal_connection.status = status.to_owned();
-                            }
+                    TaskEvent::Update(TaskStateUpdate::Init | TaskStateUpdate::Started) => {},
+                    TaskEvent::Update(TaskStateUpdate::Progress(new_status)) => {
+                        if new_status.has_processed_wal {
+                            // We have advanced last_record_lsn by processing the WAL received
+                            // from this safekeeper. This is good enough to clean unsuccessful
+                            // retries history and allow reconnecting to this safekeeper without
+                            // sleeping for a long time.
+                            walreceiver_state.wal_connection_retries.remove(&wal_connection.sk_id);
                         }
-                    },
+                        wal_connection.status = new_status;
+                    }
                     TaskEvent::End(walreceiver_task_result) => {
                         match walreceiver_task_result {
                             Ok(()) => debug!("WAL receiving task finished"),
@@ -210,7 +206,18 @@ async fn connection_manager_loop_step(
                 }
             },
 
-            _ = async { tokio::time::sleep(time_until_next_retry.unwrap()).await }, if time_until_next_retry.is_some() => {}
+            Some(()) = async {
+                match time_until_next_retry {
+                    Some(sleep_time) => {
+                        tokio::time::sleep(sleep_time).await;
+                        Some(())
+                    },
+                    None => {
+                        debug!("No candidates to retry, waiting indefinitely for the broker events");
+                        None
+                    }
+                }
+            } => debug!("Waking up for the next retry after waiting for {time_until_next_retry:?}"),
         }
 
         if let Some(new_candidate) = walreceiver_state.next_connection_candidate() {
@@ -480,20 +487,25 @@ impl WalreceiverState {
             .values()
             .filter_map(|retry| retry.next_retry_at)
             .filter(|next_retry_at| next_retry_at > &now)
-            .min();
+            .min()?;
 
-        next_retry_at.and_then(|next_retry_at| (next_retry_at - now).to_std().ok())
+        (next_retry_at - now).to_std().ok()
     }
 
     /// Adds another broker timeline into the state, if its more recent than the one already added there for the same key.
     fn register_timeline_update(&mut self, timeline_update: SafekeeperTimelineInfo) {
-        self.wal_stream_candidates.insert(
-            NodeId(timeline_update.safekeeper_id),
+        let new_safekeeper_id = NodeId(timeline_update.safekeeper_id);
+        let old_entry = self.wal_stream_candidates.insert(
+            new_safekeeper_id,
             BrokerSkTimeline {
                 timeline: timeline_update,
                 latest_update: Utc::now().naive_utc(),
             },
         );
+
+        if old_entry.is_none() {
+            info!("New SK node was added: {new_safekeeper_id}");
+        }
     }
 
     /// Cleans up stale broker records and checks the rest for the new connection candidate.
@@ -720,12 +732,13 @@ impl WalreceiverState {
     /// Remove candidates which haven't sent broker updates for a while.
     fn cleanup_old_candidates(&mut self) {
         let mut node_ids_to_remove = Vec::with_capacity(self.wal_stream_candidates.len());
+        let lagging_wal_timeout = self.lagging_wal_timeout;
 
         self.wal_stream_candidates.retain(|node_id, broker_info| {
             if let Ok(time_since_latest_broker_update) =
                 (Utc::now().naive_utc() - broker_info.latest_update).to_std()
             {
-                let should_retain = time_since_latest_broker_update < self.lagging_wal_timeout;
+                let should_retain = time_since_latest_broker_update < lagging_wal_timeout;
                 if !should_retain {
                     node_ids_to_remove.push(*node_id);
                 }
@@ -735,8 +748,11 @@ impl WalreceiverState {
             }
         });
 
-        for node_id in node_ids_to_remove {
-            self.wal_connection_retries.remove(&node_id);
+        if !node_ids_to_remove.is_empty() {
+            for node_id in node_ids_to_remove {
+                info!("Safekeeper node {node_id} did not send events for over {lagging_wal_timeout:?}, not retrying the connections");
+                self.wal_connection_retries.remove(&node_id);
+            }
         }
     }
 
@@ -883,10 +899,10 @@ mod tests {
         state.wal_connection = Some(WalConnection {
             started_at: now,
             sk_id: connected_sk_id,
-            status: connection_status.clone(),
+            status: connection_status,
             connection_task: TaskHandle::spawn(move |sender, _| async move {
                 sender
-                    .send(TaskStateUpdate::Progress(connection_status.clone()))
+                    .send(TaskStateUpdate::Progress(connection_status))
                     .ok();
                 Ok(())
             }),
@@ -1045,10 +1061,10 @@ mod tests {
         state.wal_connection = Some(WalConnection {
             started_at: now,
             sk_id: connected_sk_id,
-            status: connection_status.clone(),
+            status: connection_status,
             connection_task: TaskHandle::spawn(move |sender, _| async move {
                 sender
-                    .send(TaskStateUpdate::Progress(connection_status.clone()))
+                    .send(TaskStateUpdate::Progress(connection_status))
                     .ok();
                 Ok(())
             }),
@@ -1110,10 +1126,10 @@ mod tests {
         state.wal_connection = Some(WalConnection {
             started_at: now,
             sk_id: NodeId(1),
-            status: connection_status.clone(),
+            status: connection_status,
             connection_task: TaskHandle::spawn(move |sender, _| async move {
                 sender
-                    .send(TaskStateUpdate::Progress(connection_status.clone()))
+                    .send(TaskStateUpdate::Progress(connection_status))
                     .ok();
                 Ok(())
             }),
diff --git a/pageserver/src/walreceiver/walreceiver_connection.rs b/pageserver/src/walreceiver/walreceiver_connection.rs
index cf2a99f1b5..5b7e60aa5e 100644
--- a/pageserver/src/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/walreceiver/walreceiver_connection.rs
@@ -35,7 +35,7 @@ use pq_proto::ReplicationFeedback;
 use utils::lsn::Lsn;
 
 /// Status of the connection.
-#[derive(Debug, Clone)]
+#[derive(Debug, Clone, Copy)]
 pub struct WalConnectionStatus {
     /// If we were able to initiate a postgres connection, this means that safekeeper process is at least running.
     pub is_connected: bool,
@@ -83,7 +83,7 @@ pub async fn handle_walreceiver_connection(
         streaming_lsn: None,
         commit_lsn: None,
     };
-    if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status.clone())) {
+    if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status)) {
         warn!("Wal connection event listener dropped right after connection init, aborting the connection: {e}");
         return Ok(());
     }
@@ -135,7 +135,7 @@ pub async fn handle_walreceiver_connection(
     connection_status.latest_connection_update = Utc::now().naive_utc();
     connection_status.latest_wal_update = Utc::now().naive_utc();
     connection_status.commit_lsn = Some(end_of_wal);
-    if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status.clone())) {
+    if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status)) {
         warn!("Wal connection event listener dropped after IDENTIFY_SYSTEM, aborting the connection: {e}");
         return Ok(());
     }
@@ -184,7 +184,20 @@ pub async fn handle_walreceiver_connection(
             replication_message = physical_stream.next() => replication_message,
         }
     } {
-        let replication_message = replication_message?;
+        let replication_message = match replication_message {
+            Ok(message) => message,
+            Err(replication_error) => {
+                if replication_error.is_closed() {
+                    info!("Replication stream got closed");
+                    return Ok(());
+                } else {
+                    return Err(
+                        anyhow::Error::new(replication_error).context("replication stream error")
+                    );
+                }
+            }
+        };
+
         let now = Utc::now().naive_utc();
         let last_rec_lsn_before_msg = last_rec_lsn;
 
@@ -207,7 +220,7 @@ pub async fn handle_walreceiver_connection(
             }
             &_ => {}
         };
-        if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status.clone())) {
+        if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status)) {
             warn!("Wal connection event listener dropped, aborting the connection: {e}");
             return Ok(());
         }
@@ -273,8 +286,7 @@ pub async fn handle_walreceiver_connection(
         if !connection_status.has_processed_wal && last_rec_lsn > last_rec_lsn_before_msg {
             // We have successfully processed at least one WAL record.
             connection_status.has_processed_wal = true;
-            if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status.clone()))
-            {
+            if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status)) {
                 warn!("Wal connection event listener dropped, aborting the connection: {e}");
                 return Ok(());
             }

From 9a049aa846aeef3e5d1e3be70b670c6697ba8e35 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Tue, 20 Dec 2022 01:42:54 +0200
Subject: [PATCH 041/132] Move code from tenant_mgr::delete_timeline to
 Tenant::delete_timeline.

It's better to request the tasks to shut down only after setting the
timeline state to Stopping. Otherwise, it's possible that a new task
spawns after we have waited for the existing tasks to shut down, but
before we have changed the state. We would fail to wait for them.

Feels nicer from a readability point of view too.
---
 pageserver/src/tenant.rs     | 23 ++++++++++++++++++++++-
 pageserver/src/tenant_mgr.rs | 21 ---------------------
 2 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 03387d00fe..af31fda06b 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -1274,8 +1274,29 @@ impl Tenant {
             timeline
         };
 
-        info!("waiting for layer_removal_cs.lock()");
+        // Now that the Timeline is in Stopping state, request all the related tasks to
+        // shut down.
+        //
+        // NB: If you call delete_timeline multiple times concurrently, they will
+        // all go through the motions here. Make sure the code here is idempotent,
+        // and don't error out if some of the shutdown tasks have already been
+        // completed!
+
+        // Stop the walreceiver first.
+        debug!("waiting for wal receiver to shutdown");
+        task_mgr::shutdown_tasks(
+            Some(TaskKind::WalReceiverManager),
+            Some(self.tenant_id),
+            Some(timeline_id),
+        )
+        .await;
+        debug!("wal receiver shutdown confirmed");
+
+        info!("waiting for timeline tasks to shutdown");
+        task_mgr::shutdown_tasks(None, Some(self.tenant_id), Some(timeline_id)).await;
+
         // No timeout here, GC & Compaction should be responsive to the `TimelineState::Stopping` change.
+        info!("waiting for layer_removal_cs.lock()");
         let layer_removal_guard = timeline.layer_removal_cs.lock().await;
         info!("got layer_removal_cs.lock(), deleting layer files");
 
diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs
index 85be420cb8..e4e9d0c6e8 100644
--- a/pageserver/src/tenant_mgr.rs
+++ b/pageserver/src/tenant_mgr.rs
@@ -262,27 +262,6 @@ pub async fn get_tenant(tenant_id: TenantId, active_only: bool) -> anyhow::Resul
 }
 
 pub async fn delete_timeline(tenant_id: TenantId, timeline_id: TimelineId) -> anyhow::Result<()> {
-    // Start with the shutdown of timeline tasks (this shuts down the walreceiver)
-    // It is important that we do not take locks here, and do not check whether the timeline exists
-    // because if we hold tenants_state::write_tenants() while awaiting for the tasks to join
-    // we cannot create new timelines and tenants, and that can take quite some time,
-    // it can even become stuck due to a bug making whole pageserver unavailable for some operations
-    // so this is the way how we deal with concurrent delete requests: shutdown everythig, wait for confirmation
-    // and then try to actually remove timeline from inmemory state and this is the point when concurrent requests
-    // will synchronize and either fail with the not found error or succeed
-
-    debug!("waiting for wal receiver to shutdown");
-    task_mgr::shutdown_tasks(
-        Some(TaskKind::WalReceiverManager),
-        Some(tenant_id),
-        Some(timeline_id),
-    )
-    .await;
-    debug!("wal receiver shutdown confirmed");
-
-    info!("waiting for timeline tasks to shutdown");
-    task_mgr::shutdown_tasks(None, Some(tenant_id), Some(timeline_id)).await;
-    info!("timeline task shutdown completed");
     match get_tenant(tenant_id, true).await {
         Ok(tenant) => {
             tenant.delete_timeline(timeline_id).await?;

From 43fd89eaa7e9100c5f74ce7082dc19e0a9f3135d Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Tue, 20 Dec 2022 01:52:50 +0200
Subject: [PATCH 042/132] Improve comments, formatting around layer_removal_cs
 lock.

---
 pageserver/src/tenant.rs | 49 ++++++++++++++++++++++++++--------------
 1 file changed, 32 insertions(+), 17 deletions(-)

diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index af31fda06b..7a03f52155 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -1295,26 +1295,41 @@ impl Tenant {
         info!("waiting for timeline tasks to shutdown");
         task_mgr::shutdown_tasks(None, Some(self.tenant_id), Some(timeline_id)).await;
 
-        // No timeout here, GC & Compaction should be responsive to the `TimelineState::Stopping` change.
-        info!("waiting for layer_removal_cs.lock()");
-        let layer_removal_guard = timeline.layer_removal_cs.lock().await;
-        info!("got layer_removal_cs.lock(), deleting layer files");
+        {
+            // Grab the layer_removal_cs lock, and actually perform the deletion.
+            //
+            // This lock prevents multiple concurrent delete_timeline calls from
+            // stepping on each other's toes, while deleting the files. It also
+            // prevents GC or compaction from running at the same time.
+            //
+            // Note that there are still other race conditions between
+            // GC, compaction and timeline deletion. GC task doesn't
+            // register itself properly with the timeline it's
+            // operating on. See
+            // https://github.com/neondatabase/neon/issues/2671
+            //
+            // No timeout here, GC & Compaction should be responsive to the
+            // `TimelineState::Stopping` change.
+            info!("waiting for layer_removal_cs.lock()");
+            let layer_removal_guard = timeline.layer_removal_cs.lock().await;
+            info!("got layer_removal_cs.lock(), deleting layer files");
 
-        // NB: storage_sync upload tasks that reference these layers have been cancelled
-        //     by the caller.
+            // NB: storage_sync upload tasks that reference these layers have been cancelled
+            //     by the caller.
 
-        let local_timeline_directory = self.conf.timeline_path(&timeline_id, &self.tenant_id);
-        // XXX make this atomic so that, if we crash-mid-way, the timeline won't be picked up
-        // with some layers missing.
-        std::fs::remove_dir_all(&local_timeline_directory).with_context(|| {
-            format!(
-                "Failed to remove local timeline directory '{}'",
-                local_timeline_directory.display()
-            )
-        })?;
-        info!("finished deleting layer files, releasing layer_removal_cs.lock()");
+            let local_timeline_directory = self.conf.timeline_path(&timeline_id, &self.tenant_id);
+            // XXX make this atomic so that, if we crash-mid-way, the timeline won't be picked up
+            // with some layers missing.
+            std::fs::remove_dir_all(&local_timeline_directory).with_context(|| {
+                format!(
+                    "Failed to remove local timeline directory '{}'",
+                    local_timeline_directory.display()
+                )
+            })?;
 
-        drop(layer_removal_guard);
+            info!("finished deleting layer files, releasing layer_removal_cs.lock()");
+            drop(layer_removal_guard);
+        }
 
         // Remove the timeline from the map.
         let mut timelines = self.timelines.lock().unwrap();

From 4235f97c6a3276d90a1c3630fa78d4b1495df260 Mon Sep 17 00:00:00 2001
From: Anastasia Lubennikova <anastasia@neon.tech>
Date: Tue, 29 Nov 2022 20:50:58 +0200
Subject: [PATCH 043/132] Implement consumption metrics collection.

Add new background job to collect billing metrics for each tenant and
send them to the HTTP endpoint.
Metrics are cached, so we don't send non-changed metrics.

Add metric collection config parameters:
metric_collection_endpoint (default None, i.e. disabled)
metric_collection_interval (default 60s)

Add test_metric_collection.py to test metric collection
and sending to the mocked HTTP endpoint.

Use port distributor in metric_collection test

review fixes: only update cache after metrics were sent successfully, simplify code

disable metric collection if metric_collection_endpoint is not provided in config
---
 Cargo.lock                                    |   3 +
 pageserver/Cargo.toml                         |   5 +-
 pageserver/src/billing_metrics.rs             | 283 ++++++++++++++++++
 pageserver/src/bin/pageserver.rs              |  20 ++
 pageserver/src/config.rs                      |  47 +++
 pageserver/src/lib.rs                         |   1 +
 pageserver/src/storage_sync2.rs               |   4 +
 pageserver/src/task_mgr.rs                    |   3 +
 pageserver/src/tenant.rs                      |  16 +
 poetry.lock                                   |  44 ++-
 pyproject.toml                                |   1 +
 test_runner/regress/test_metric_collection.py | 138 +++++++++
 workspace_hack/Cargo.toml                     |   4 +-
 13 files changed, 557 insertions(+), 12 deletions(-)
 create mode 100644 pageserver/src/billing_metrics.rs
 create mode 100644 test_runner/regress/test_metric_collection.py

diff --git a/Cargo.lock b/Cargo.lock
index 665000746d..2737a4d934 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2415,6 +2415,7 @@ dependencies = [
  "rand",
  "regex",
  "remote_storage",
+ "reqwest",
  "rstar",
  "scopeguard",
  "serde",
@@ -4753,6 +4754,7 @@ dependencies = [
  "ahash",
  "anyhow",
  "bytes",
+ "chrono",
  "clap 4.0.29",
  "crossbeam-utils",
  "either",
@@ -4776,6 +4778,7 @@ dependencies = [
  "reqwest",
  "scopeguard",
  "serde",
+ "serde_json",
  "socket2",
  "stable_deref_trait",
  "syn",
diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml
index 24642ca2f7..f5acfcbdc0 100644
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -18,7 +18,7 @@ async-stream = "0.3"
 async-trait = "0.1"
 byteorder = "1.4.3"
 bytes = "1.0.1"
-chrono = { version = "0.4.23", default-features = false, features = ["clock"] }
+chrono = { version = "0.4.23", default-features = false, features = ["clock", "serde"] }
 clap = { version = "4.0", features = ["string"] }
 close_fds = "0.3.2"
 const_format = "0.2.21"
@@ -45,7 +45,7 @@ regex = "1.4.5"
 rstar = "0.9.3"
 scopeguard = "1.1.0"
 serde = { version = "1.0", features = ["derive"] }
-serde_json = "1"
+serde_json = { version = "1.0", features = ["raw_value"] }
 serde_with = "2.0"
 signal-hook = "0.3.10"
 svg_fmt = "0.4.1"
@@ -69,6 +69,7 @@ storage_broker = { version = "0.1", path = "../storage_broker" }
 tenant_size_model = { path = "../libs/tenant_size_model" }
 utils = { path = "../libs/utils" }
 workspace_hack = { version = "0.1", path = "../workspace_hack" }
+reqwest = "0.11.13"
 
 [dev-dependencies]
 criterion = "0.4"
diff --git a/pageserver/src/billing_metrics.rs b/pageserver/src/billing_metrics.rs
new file mode 100644
index 0000000000..c5da54b8fc
--- /dev/null
+++ b/pageserver/src/billing_metrics.rs
@@ -0,0 +1,283 @@
+//!
+//! Periodically collect consumption metrics for all active tenants
+//! and push them to a HTTP endpoint.
+//! Cache metrics to send only the updated ones.
+//!
+
+use anyhow;
+use tracing::*;
+use utils::id::TimelineId;
+
+use crate::task_mgr;
+use crate::tenant_mgr;
+use pageserver_api::models::TenantState;
+use utils::id::TenantId;
+
+use serde::{Deserialize, Serialize};
+use std::collections::HashMap;
+use std::fmt;
+use std::str::FromStr;
+use std::time::Duration;
+
+use chrono::{DateTime, Utc};
+use reqwest::Url;
+
+/// BillingMetric struct that defines the format for one metric entry
+/// i.e.
+///
+/// ```json
+/// {
+/// "metric": "remote_storage_size",
+/// "type": "absolute",
+/// "tenant_id": "5d07d9ce9237c4cd845ea7918c0afa7d",
+/// "timeline_id": "00000000000000000000000000000000",
+/// "time": ...,
+/// "value": 12345454,
+/// }
+/// ```
+#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]
+pub struct BillingMetric {
+    pub metric: BillingMetricKind,
+    pub metric_type: &'static str,
+    pub tenant_id: TenantId,
+    pub timeline_id: Option<TimelineId>,
+    pub time: DateTime<Utc>,
+    pub value: u64,
+}
+
+impl BillingMetric {
+    pub fn new_absolute(
+        metric: BillingMetricKind,
+        tenant_id: TenantId,
+        timeline_id: Option<TimelineId>,
+        value: u64,
+    ) -> Self {
+        Self {
+            metric,
+            metric_type: "absolute",
+            tenant_id,
+            timeline_id,
+            time: Utc::now(),
+            value,
+        }
+    }
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Ord, PartialOrd, Serialize, Deserialize)]
+#[serde(rename_all = "snake_case")]
+pub enum BillingMetricKind {
+    /// Amount of WAL produced by a timeline, i.e. last_record_lsn
+    /// This is an absolute, per-timeline metric.
+    WrittenSize,
+    /// Size of all tenant branches including WAL
+    /// This is an absolute, per-tenant metric.
+    /// This is the same metric that tenant/tenant_id/size endpoint returns.
+    SyntheticStorageSize,
+    /// Size of all the files in the tenant's directory on disk on the pageserver.
+    /// This is an absolute, per-tenant metric.
+    /// See also prometheus metric CURRENT_PHYSICAL_SIZE.
+    PhysicalSize,
+    /// Size of the remote storage (S3) directory.
+    /// This is an absolute, per-tenant metric.
+    RemoteStorageSize,
+}
+
+impl FromStr for BillingMetricKind {
+    type Err = anyhow::Error;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        match s {
+            "written_size" => Ok(Self::WrittenSize),
+            "synthetic_storage_size" => Ok(Self::SyntheticStorageSize),
+            "physical_size" => Ok(Self::PhysicalSize),
+            "remote_storage_size" => Ok(Self::RemoteStorageSize),
+            _ => anyhow::bail!("invalid value \"{s}\" for metric type"),
+        }
+    }
+}
+
+impl fmt::Display for BillingMetricKind {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.write_str(match self {
+            BillingMetricKind::WrittenSize => "written_size",
+            BillingMetricKind::SyntheticStorageSize => "synthetic_storage_size",
+            BillingMetricKind::PhysicalSize => "physical_size",
+            BillingMetricKind::RemoteStorageSize => "remote_storage_size",
+        })
+    }
+}
+
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+pub struct BillingMetricsKey {
+    tenant_id: TenantId,
+    timeline_id: Option<TimelineId>,
+    metric: BillingMetricKind,
+}
+
+#[derive(serde::Serialize)]
+struct EventChunk<'a> {
+    events: &'a [BillingMetric],
+}
+
+/// Main task that serves metrics collection
+pub async fn collect_metrics(
+    metric_collection_endpoint: &Url,
+    metric_collection_interval: Duration,
+) -> anyhow::Result<()> {
+    let mut ticker = tokio::time::interval(metric_collection_interval);
+
+    info!("starting collect_metrics");
+
+    // define client here to reuse it for all requests
+    let client = reqwest::Client::new();
+    let mut cached_metrics: HashMap<BillingMetricsKey, u64> = HashMap::new();
+
+    loop {
+        tokio::select! {
+            _ = task_mgr::shutdown_watcher() => {
+                info!("collect_metrics received cancellation request");
+                return Ok(());
+            },
+            _ = ticker.tick() => {
+                collect_metrics_task(&client, &mut cached_metrics, metric_collection_endpoint).await?;
+            }
+        }
+    }
+}
+
+/// One iteration of metrics collection
+///
+/// Gather per-tenant and per-timeline metrics and send them to the `metric_collection_endpoint`.
+/// Cache metrics to avoid sending the same metrics multiple times.
+pub async fn collect_metrics_task(
+    client: &reqwest::Client,
+    cached_metrics: &mut HashMap<BillingMetricsKey, u64>,
+    metric_collection_endpoint: &reqwest::Url,
+) -> anyhow::Result<()> {
+    let mut current_metrics: Vec<(BillingMetricsKey, u64)> = Vec::new();
+    trace!(
+        "starting collect_metrics_task. metric_collection_endpoint: {}",
+        metric_collection_endpoint
+    );
+
+    // get list of tenants
+    let tenants = tenant_mgr::list_tenants().await;
+
+    // iterate through list of Active tenants and collect metrics
+    for (tenant_id, tenant_state) in tenants {
+        if tenant_state != TenantState::Active {
+            continue;
+        }
+
+        let tenant = tenant_mgr::get_tenant(tenant_id, true).await?;
+
+        let mut tenant_physical_size = 0;
+
+        // iterate through list of timelines in tenant
+        for timeline in tenant.list_timelines().iter() {
+            let timeline_written_size = u64::from(timeline.get_last_record_lsn());
+
+            current_metrics.push((
+                BillingMetricsKey {
+                    tenant_id,
+                    timeline_id: Some(timeline.timeline_id),
+                    metric: BillingMetricKind::WrittenSize,
+                },
+                timeline_written_size,
+            ));
+
+            let timeline_size = timeline.get_physical_size();
+            tenant_physical_size += timeline_size;
+
+            debug!(
+                "per-timeline current metrics for tenant: {}: timeline {} physical_size={} last_record_lsn {} (as bytes)",
+                tenant_id, timeline.timeline_id, timeline_size, timeline_written_size)
+        }
+
+        let tenant_remote_size = tenant.get_remote_size().await?;
+        debug!(
+            "collected current metrics for tenant: {}: state={:?} tenant_physical_size={} remote_size={}",
+            tenant_id, tenant_state, tenant_physical_size, tenant_remote_size
+        );
+
+        current_metrics.push((
+            BillingMetricsKey {
+                tenant_id,
+                timeline_id: None,
+                metric: BillingMetricKind::PhysicalSize,
+            },
+            tenant_physical_size,
+        ));
+
+        current_metrics.push((
+            BillingMetricsKey {
+                tenant_id,
+                timeline_id: None,
+                metric: BillingMetricKind::RemoteStorageSize,
+            },
+            tenant_remote_size,
+        ));
+
+        // TODO add SyntheticStorageSize metric
+    }
+
+    // Filter metrics
+    current_metrics.retain(|(curr_key, curr_val)| match cached_metrics.get(curr_key) {
+        Some(val) => val != curr_val,
+        None => true,
+    });
+
+    if current_metrics.is_empty() {
+        trace!("no new metrics to send");
+        return Ok(());
+    }
+
+    // Send metrics.
+    // Split into chunks of 1000 metrics to avoid exceeding the max request size
+    const CHUNK_SIZE: usize = 1000;
+    let chunks = current_metrics.chunks(CHUNK_SIZE);
+
+    let mut chunk_to_send: Vec<BillingMetric> = Vec::with_capacity(1000);
+
+    for chunk in chunks {
+        chunk_to_send.clear();
+        // enrich metrics with timestamp and metric_kind before sending
+        chunk_to_send.extend(chunk.iter().map(|(curr_key, curr_val)| {
+            BillingMetric::new_absolute(
+                curr_key.metric,
+                curr_key.tenant_id,
+                curr_key.timeline_id,
+                *curr_val,
+            )
+        }));
+
+        let chunk_json = serde_json::value::to_raw_value(&EventChunk {
+            events: &chunk_to_send,
+        })
+        .expect("BillingMetric should not fail serialization");
+
+        let res = client
+            .post(metric_collection_endpoint.clone())
+            .json(&chunk_json)
+            .send()
+            .await;
+
+        match res {
+            Ok(res) => {
+                if res.status().is_success() {
+                    // update cached metrics after they were sent successfully
+                    for (curr_key, curr_val) in chunk.iter() {
+                        cached_metrics.insert(curr_key.clone(), *curr_val);
+                    }
+                } else {
+                    error!("metrics endpoint refused the sent metrics: {:?}", res);
+                }
+            }
+            Err(err) => {
+                error!("failed to send metrics: {:?}", err);
+            }
+        }
+    }
+
+    Ok(())
+}
diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs
index 86ce318d0a..cc403ec2ea 100644
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -310,6 +310,26 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
                 Ok(())
             },
         );
+
+        if let Some(metric_collection_endpoint) = &conf.metric_collection_endpoint {
+            task_mgr::spawn(
+                MGMT_REQUEST_RUNTIME.handle(),
+                TaskKind::MetricsCollection,
+                None,
+                None,
+                "consumption metrics collection",
+                true,
+                async move {
+                    pageserver::billing_metrics::collect_metrics(
+                        metric_collection_endpoint,
+                        conf.metric_collection_interval,
+                    )
+                    .instrument(info_span!("metrics_collection"))
+                    .await?;
+                    Ok(())
+                },
+            );
+        }
     }
 
     // Spawn a task to listen for libpq connections. It will spawn further tasks
diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index 93c221e622..c6f417390f 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -12,6 +12,7 @@ use utils::crashsafe::path_with_suffix_extension;
 use utils::id::ConnectionId;
 
 use once_cell::sync::OnceCell;
+use reqwest::Url;
 use std::num::NonZeroUsize;
 use std::path::{Path, PathBuf};
 use std::str::FromStr;
@@ -55,6 +56,8 @@ pub mod defaults {
     pub const DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES: usize =
         super::ConfigurableSemaphore::DEFAULT_INITIAL.get();
 
+    pub const DEFAULT_METRIC_COLLECTION_INTERVAL: &str = "60 s";
+    pub const DEFAULT_METRIC_COLLECTION_ENDPOINT: Option<reqwest::Url> = None;
     ///
     /// Default built-in configuration file.
     ///
@@ -78,6 +81,8 @@ pub mod defaults {
 
 #concurrent_tenant_size_logical_size_queries = '{DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES}'
 
+#metric_collection_interval = '{DEFAULT_METRIC_COLLECTION_INTERVAL}'
+
 # [tenant_config]
 #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
 #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
@@ -144,6 +149,10 @@ pub struct PageServerConf {
     /// Number of concurrent [`Tenant::gather_size_inputs`] allowed.
     pub concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore,
 
+    // How often to collect metrics and send them to the metrics endpoint.
+    pub metric_collection_interval: Duration,
+    pub metric_collection_endpoint: Option<Url>,
+
     pub test_remote_failures: u64,
 }
 
@@ -224,6 +233,9 @@ struct PageServerConfigBuilder {
 
     concurrent_tenant_size_logical_size_queries: BuilderValue<ConfigurableSemaphore>,
 
+    metric_collection_interval: BuilderValue<Duration>,
+    metric_collection_endpoint: BuilderValue<Option<Url>>,
+
     test_remote_failures: BuilderValue<u64>,
 }
 
@@ -260,6 +272,11 @@ impl Default for PageServerConfigBuilder {
             log_format: Set(LogFormat::from_str(DEFAULT_LOG_FORMAT).unwrap()),
 
             concurrent_tenant_size_logical_size_queries: Set(ConfigurableSemaphore::default()),
+            metric_collection_interval: Set(humantime::parse_duration(
+                DEFAULT_METRIC_COLLECTION_INTERVAL,
+            )
+            .expect("cannot parse default metric collection interval")),
+            metric_collection_endpoint: Set(DEFAULT_METRIC_COLLECTION_ENDPOINT),
 
             test_remote_failures: Set(0),
         }
@@ -342,6 +359,14 @@ impl PageServerConfigBuilder {
         self.concurrent_tenant_size_logical_size_queries = BuilderValue::Set(u);
     }
 
+    pub fn metric_collection_interval(&mut self, metric_collection_interval: Duration) {
+        self.metric_collection_interval = BuilderValue::Set(metric_collection_interval)
+    }
+
+    pub fn metric_collection_endpoint(&mut self, metric_collection_endpoint: Option<Url>) {
+        self.metric_collection_endpoint = BuilderValue::Set(metric_collection_endpoint)
+    }
+
     pub fn test_remote_failures(&mut self, fail_first: u64) {
         self.test_remote_failures = BuilderValue::Set(fail_first);
     }
@@ -394,6 +419,12 @@ impl PageServerConfigBuilder {
                 .ok_or(anyhow!(
                     "missing concurrent_tenant_size_logical_size_queries"
                 ))?,
+            metric_collection_interval: self
+                .metric_collection_interval
+                .ok_or(anyhow!("missing metric_collection_interval"))?,
+            metric_collection_endpoint: self
+                .metric_collection_endpoint
+                .ok_or(anyhow!("missing metric_collection_endpoint"))?,
             test_remote_failures: self
                 .test_remote_failures
                 .ok_or(anyhow!("missing test_remote_failuers"))?,
@@ -568,6 +599,12 @@ impl PageServerConf {
                     let permits = NonZeroUsize::new(permits).context("initial semaphore permits out of range: 0, use other configuration to disable a feature")?;
                     ConfigurableSemaphore::new(permits)
                 }),
+                "metric_collection_interval" => builder.metric_collection_interval(parse_toml_duration(key, item)?),
+                "metric_collection_endpoint" => {
+                    let endpoint = parse_toml_string(key, item)?.parse().context("failed to parse metric_collection_endpoint")?;
+                    builder.metric_collection_endpoint(Some(endpoint));
+                },
+
                 "test_remote_failures" => builder.test_remote_failures(parse_toml_u64(key, item)?),
                 _ => bail!("unrecognized pageserver option '{key}'"),
             }
@@ -690,6 +727,8 @@ impl PageServerConf {
             broker_keepalive_interval: Duration::from_secs(5000),
             log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(),
             concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
+            metric_collection_interval: Duration::from_secs(60),
+            metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT,
             test_remote_failures: 0,
         }
     }
@@ -821,6 +860,8 @@ max_file_descriptors = 333
 initial_superuser_name = 'zzzz'
 id = 10
 
+metric_collection_interval = '222 s'
+metric_collection_endpoint = 'http://localhost:80/metrics'
 log_format = 'json'
 
 "#;
@@ -864,6 +905,10 @@ log_format = 'json'
                 )?,
                 log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(),
                 concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
+                metric_collection_interval: humantime::parse_duration(
+                    defaults::DEFAULT_METRIC_COLLECTION_INTERVAL
+                )?,
+                metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT,
                 test_remote_failures: 0,
             },
             "Correct defaults should be used when no config values are provided"
@@ -909,6 +954,8 @@ log_format = 'json'
                 broker_keepalive_interval: Duration::from_secs(5),
                 log_format: LogFormat::Json,
                 concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
+                metric_collection_interval: Duration::from_secs(222),
+                metric_collection_endpoint: Some(Url::parse("http://localhost:80/metrics")?),
                 test_remote_failures: 0,
             },
             "Should be able to parse all basic config values correctly"
diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs
index 5c4804db36..626d5e99e3 100644
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -1,5 +1,6 @@
 mod auth;
 pub mod basebackup;
+pub mod billing_metrics;
 pub mod config;
 pub mod http;
 pub mod import_datadir;
diff --git a/pageserver/src/storage_sync2.rs b/pageserver/src/storage_sync2.rs
index 14ab332eba..9253b250cd 100644
--- a/pageserver/src/storage_sync2.rs
+++ b/pageserver/src/storage_sync2.rs
@@ -517,6 +517,10 @@ impl RemoteTimelineClient {
         self.metrics.remote_physical_size_gauge().set(size);
     }
 
+    pub fn get_remote_physical_size(&self) -> u64 {
+        self.metrics.remote_physical_size_gauge().get()
+    }
+
     //
     // Download operations.
     //
diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs
index 91719fb3af..fe3ad1a57d 100644
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -203,6 +203,9 @@ pub enum TaskKind {
 
     // task that handles attaching a tenant
     Attach,
+
+    // task that handles metrics collection
+    MetricsCollection,
 }
 
 #[derive(Default)]
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 7a03f52155..0ff5089f66 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -700,6 +700,22 @@ impl Tenant {
         Ok(())
     }
 
+    /// get size of all remote timelines
+    ///
+    /// This function relies on the index_part instead of listing the remote storage
+    ///
+    pub async fn get_remote_size(&self) -> anyhow::Result<u64> {
+        let mut size = 0;
+
+        for timeline in self.list_timelines().iter() {
+            if let Some(remote_client) = &timeline.remote_client {
+                size += remote_client.get_remote_physical_size();
+            }
+        }
+
+        Ok(size)
+    }
+
     #[instrument(skip(self, index_part, remote_metadata, remote_storage), fields(timeline_id=%timeline_id))]
     async fn load_remote_timeline(
         &self,
diff --git a/poetry.lock b/poetry.lock
index 2fa7f03679..f5cbe24954 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -11,7 +11,7 @@ async-timeout = ">=3.0,<5.0"
 psycopg2-binary = ">=2.8.4"
 
 [package.extras]
-sa = ["sqlalchemy[postgresql-psycopg2binary] (>=1.3,<1.5)"]
+sa = ["sqlalchemy[postgresql_psycopg2binary] (>=1.3,<1.5)"]
 
 [[package]]
 name = "allure-pytest"
@@ -80,7 +80,7 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
 dev = ["cloudpickle", "coverage[toml] (>=5.0.2)", "furo", "hypothesis", "mypy", "pre-commit", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six", "sphinx", "sphinx-notfound-page", "zope.interface"]
 docs = ["furo", "sphinx", "sphinx-notfound-page", "zope.interface"]
 tests = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six", "zope.interface"]
-tests-no-zope = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six"]
+tests_no_zope = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six"]
 
 [[package]]
 name = "aws-sam-translator"
@@ -569,7 +569,7 @@ optional = false
 python-versions = ">=3.6.0"
 
 [package.extras]
-unicode-backport = ["unicodedata2"]
+unicode_backport = ["unicodedata2"]
 
 [[package]]
 name = "click"
@@ -747,9 +747,9 @@ python-versions = ">=3.6.1,<4.0"
 
 [package.extras]
 colors = ["colorama (>=0.4.3,<0.5.0)"]
-pipfile-deprecated-finder = ["pipreqs", "requirementslib"]
+pipfile_deprecated_finder = ["pipreqs", "requirementslib"]
 plugins = ["setuptools"]
-requirements-deprecated-finder = ["pip-api", "pipreqs"]
+requirements_deprecated_finder = ["pip-api", "pipreqs"]
 
 [[package]]
 name = "itsdangerous"
@@ -824,7 +824,7 @@ python-versions = ">=2.7"
 [package.extras]
 docs = ["jaraco.packaging (>=3.2)", "rst.linker (>=1.9)", "sphinx"]
 testing = ["ecdsa", "enum34", "feedparser", "jsonlib", "numpy", "pandas", "pymongo", "pytest (>=3.5,!=3.7.3)", "pytest-black-multipy", "pytest-checkdocs (>=1.2.3)", "pytest-cov", "pytest-flake8 (<1.1.0)", "pytest-flake8 (>=1.1.1)", "scikit-learn", "sqlalchemy"]
-testing-libs = ["simplejson", "ujson", "yajl"]
+"testing.libs" = ["simplejson", "ujson", "yajl"]
 
 [[package]]
 name = "jsonpointer"
@@ -850,7 +850,7 @@ six = ">=1.11.0"
 
 [package.extras]
 format = ["idna", "jsonpointer (>1.13)", "rfc3987", "strict-rfc3339", "webcolors"]
-format-nongpl = ["idna", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3986-validator (>0.1.0)", "webcolors"]
+format_nongpl = ["idna", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3986-validator (>0.1.0)", "webcolors"]
 
 [[package]]
 name = "junit-xml"
@@ -1227,6 +1227,17 @@ pytest = ">=6.1.0"
 [package.extras]
 testing = ["coverage (>=6.2)", "flaky (>=3.5.0)", "hypothesis (>=5.7.1)", "mypy (>=0.931)", "pytest-trio (>=0.7.0)"]
 
+[[package]]
+name = "pytest-httpserver"
+version = "1.0.6"
+description = "pytest-httpserver is a httpserver for pytest"
+category = "main"
+optional = false
+python-versions = ">=3.7,<4.0"
+
+[package.dependencies]
+Werkzeug = ">=2.0.0"
+
 [[package]]
 name = "pytest-lazy-fixture"
 version = "0.6.3"
@@ -1350,7 +1361,7 @@ urllib3 = ">=1.21.1,<1.27"
 
 [package.extras]
 socks = ["PySocks (>=1.5.6,!=1.5.7)"]
-use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"]
+use_chardet_on_py3 = ["chardet (>=3.0.2,<6)"]
 
 [[package]]
 name = "responses"
@@ -1583,7 +1594,7 @@ testing = ["func-timeout", "jaraco.itertools", "pytest (>=6)", "pytest-black (>=
 [metadata]
 lock-version = "1.1"
 python-versions = "^3.9"
-content-hash = "98d63eaa73253882440e0fc8cdb305bb536944768c5ba313c25d0ee65f546544"
+content-hash = "55aba66810d5b47d25372c740e4d466e1e791c4d0e665c57a611ab8665563689"
 
 [metadata.files]
 aiopg = [
@@ -2099,7 +2110,18 @@ py = [
     {file = "py-1.11.0.tar.gz", hash = "sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719"},
 ]
 pyasn1 = [
+    {file = "pyasn1-0.4.8-py2.4.egg", hash = "sha256:fec3e9d8e36808a28efb59b489e4528c10ad0f480e57dcc32b4de5c9d8c9fdf3"},
+    {file = "pyasn1-0.4.8-py2.5.egg", hash = "sha256:0458773cfe65b153891ac249bcf1b5f8f320b7c2ce462151f8fa74de8934becf"},
+    {file = "pyasn1-0.4.8-py2.6.egg", hash = "sha256:5c9414dcfede6e441f7e8f81b43b34e834731003427e5b09e4e00e3172a10f00"},
+    {file = "pyasn1-0.4.8-py2.7.egg", hash = "sha256:6e7545f1a61025a4e58bb336952c5061697da694db1cae97b116e9c46abcf7c8"},
     {file = "pyasn1-0.4.8-py2.py3-none-any.whl", hash = "sha256:39c7e2ec30515947ff4e87fb6f456dfc6e84857d34be479c9d4a4ba4bf46aa5d"},
+    {file = "pyasn1-0.4.8-py3.1.egg", hash = "sha256:78fa6da68ed2727915c4767bb386ab32cdba863caa7dbe473eaae45f9959da86"},
+    {file = "pyasn1-0.4.8-py3.2.egg", hash = "sha256:08c3c53b75eaa48d71cf8c710312316392ed40899cb34710d092e96745a358b7"},
+    {file = "pyasn1-0.4.8-py3.3.egg", hash = "sha256:03840c999ba71680a131cfaee6fab142e1ed9bbd9c693e285cc6aca0d555e576"},
+    {file = "pyasn1-0.4.8-py3.4.egg", hash = "sha256:7ab8a544af125fb704feadb008c99a88805126fb525280b2270bb25cc1d78a12"},
+    {file = "pyasn1-0.4.8-py3.5.egg", hash = "sha256:e89bf84b5437b532b0803ba5c9a5e054d21fec423a89952a74f87fa2c9b7bce2"},
+    {file = "pyasn1-0.4.8-py3.6.egg", hash = "sha256:014c0e9976956a08139dc0712ae195324a75e142284d5f87f1a87ee1b068a359"},
+    {file = "pyasn1-0.4.8-py3.7.egg", hash = "sha256:99fcc3c8d804d1bc6d9a099921e39d827026409a58f2a720dcdb89374ea0c776"},
     {file = "pyasn1-0.4.8.tar.gz", hash = "sha256:aef77c9fb94a3ac588e87841208bdec464471d9871bd5050a287cc9a475cd0ba"},
 ]
 pycodestyle = [
@@ -2157,6 +2179,10 @@ pytest-asyncio = [
     {file = "pytest-asyncio-0.19.0.tar.gz", hash = "sha256:ac4ebf3b6207259750bc32f4c1d8fcd7e79739edbc67ad0c58dd150b1d072fed"},
     {file = "pytest_asyncio-0.19.0-py3-none-any.whl", hash = "sha256:7a97e37cfe1ed296e2e84941384bdd37c376453912d397ed39293e0916f521fa"},
 ]
+pytest-httpserver = [
+    {file = "pytest_httpserver-1.0.6-py3-none-any.whl", hash = "sha256:ac2379acc91fe8bdbe2911c93af8dd130e33b5899fb9934d15669480739c6d32"},
+    {file = "pytest_httpserver-1.0.6.tar.gz", hash = "sha256:9040d07bf59ac45d8de3db1d4468fd2d1d607975e4da4c872ecc0402cdbf7b3e"},
+]
 pytest-lazy-fixture = [
     {file = "pytest-lazy-fixture-0.6.3.tar.gz", hash = "sha256:0e7d0c7f74ba33e6e80905e9bfd81f9d15ef9a790de97993e34213deb5ad10ac"},
     {file = "pytest_lazy_fixture-0.6.3-py3-none-any.whl", hash = "sha256:e0b379f38299ff27a653f03eaa69b08a6fd4484e46fd1c9907d984b9f9daeda6"},
diff --git a/pyproject.toml b/pyproject.toml
index b297f7f70b..4819ece4b0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -32,6 +32,7 @@ toml = "^0.10.2"
 psutil = "^5.9.4"
 types-psutil = "^5.9.5.4"
 types-toml = "^0.10.8"
+pytest-httpserver = "^1.0.6"
 
 [tool.poetry.dev-dependencies]
 flake8 = "^5.0.4"
diff --git a/test_runner/regress/test_metric_collection.py b/test_runner/regress/test_metric_collection.py
new file mode 100644
index 0000000000..7f86d92962
--- /dev/null
+++ b/test_runner/regress/test_metric_collection.py
@@ -0,0 +1,138 @@
+import pytest
+from fixtures.log_helper import log
+from fixtures.metrics import parse_metrics
+from fixtures.neon_fixtures import (
+    NeonEnvBuilder,
+    PortDistributor,
+    RemoteStorageKind,
+    wait_for_last_flush_lsn,
+)
+from fixtures.types import TenantId, TimelineId
+from fixtures.utils import query_scalar
+from pytest_httpserver import HTTPServer
+from werkzeug.wrappers.request import Request
+from werkzeug.wrappers.response import Response
+
+
+@pytest.fixture(scope="session")
+def httpserver_listen_address(port_distributor: PortDistributor):
+    port = port_distributor.get_port()
+    return ("localhost", port)
+
+
+num_metrics_received = 0
+remote_uploaded = 0
+
+
+#
+# verify that metrics look minimally sane
+#
+def metrics_handler(request: Request) -> Response:
+    if request.json is None:
+        return Response(status=400)
+
+    events = request.json["events"]
+    log.info("received events:")
+    log.info(events)
+
+    checks = {
+        "written_size": lambda value: value > 0,
+        "physical_size": lambda value: value >= 0,
+        # >= 0 check here is to avoid race condition when we receive metrics before
+        # remote_uploaded is updated
+        "remote_storage_size": lambda value: value > 0 if remote_uploaded > 0 else value >= 0,
+    }
+
+    for event in events:
+        assert checks.pop(event["metric"])(event["value"]), f"{event['metric']} isn't valid"
+
+    assert not checks, f"{' '.join(checks.keys())} wasn't/weren't received"
+
+    global num_metrics_received
+    num_metrics_received += 1
+    return Response(status=200)
+
+
+@pytest.mark.parametrize(
+    "remote_storage_kind", [RemoteStorageKind.NOOP, RemoteStorageKind.LOCAL_FS]
+)
+def test_metric_collection(
+    httpserver: HTTPServer,
+    neon_env_builder: NeonEnvBuilder,
+    httpserver_listen_address,
+    remote_storage_kind: RemoteStorageKind,
+):
+    (host, port) = httpserver_listen_address
+    metric_collection_endpoint = f"http://{host}:{port}/billing/api/v1/usage_events"
+
+    # Disable time-based pitr, we will use the manual GC calls
+    # to trigger remote storage operations in a controlled way
+    neon_env_builder.pageserver_config_override = (
+        f"""
+    metric_collection_endpoint="{metric_collection_endpoint}"
+    """
+        + "tenant_config={pitr_interval = '0 sec'}"
+    )
+
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=remote_storage_kind,
+        test_name="test_metric_collection",
+    )
+
+    log.info(f"test_metric_collection endpoint is {metric_collection_endpoint}")
+
+    # mock http server that returns OK for the metrics
+    httpserver.expect_request("/billing/api/v1/usage_events", method="POST").respond_with_handler(
+        metrics_handler
+    )
+
+    # spin up neon after the http server is ready
+    env = neon_env_builder.init_start()
+    env.neon_cli.create_branch("test_metric_collection")
+    pg = env.postgres.create_start("test_metric_collection")
+
+    pg_conn = pg.connect()
+    cur = pg_conn.cursor()
+
+    tenant_id = TenantId(query_scalar(cur, "SHOW neon.tenant_id"))
+    timeline_id = TimelineId(query_scalar(cur, "SHOW neon.timeline_id"))
+
+    cur.execute("CREATE TABLE foo (id int, counter int, t text)")
+    cur.execute(
+        """
+        INSERT INTO foo
+        SELECT g, 0, 'long string to consume some space' || g
+        FROM generate_series(1, 100000) g
+        """
+    )
+
+    # Helper function that gets the number of given kind of remote ops from the metrics
+    def get_num_remote_ops(file_kind: str, op_kind: str) -> int:
+        ps_metrics = parse_metrics(env.pageserver.http_client().get_metrics(), "pageserver")
+        total = 0.0
+        for sample in ps_metrics.query_all(
+            name="pageserver_remote_operation_seconds_count",
+            filter={
+                "tenant_id": str(tenant_id),
+                "timeline_id": str(timeline_id),
+                "file_kind": str(file_kind),
+                "op_kind": str(op_kind),
+            },
+        ):
+            total += sample[2]
+        return int(total)
+
+    # upload some data to remote storage
+    if remote_storage_kind == RemoteStorageKind.LOCAL_FS:
+        wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id)
+        pageserver_http = env.pageserver.http_client()
+        pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
+        pageserver_http.timeline_gc(tenant_id, timeline_id, 10000)
+        global remote_uploaded
+        remote_uploaded = get_num_remote_ops("index", "upload")
+        assert remote_uploaded > 0
+
+    # check that all requests are served
+    httpserver.check()
+    global num_metrics_received
+    assert num_metrics_received > 0, "no metrics were received"
diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml
index de9a26513d..6c81756fe1 100644
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -16,6 +16,7 @@ publish = false
 ahash = { version = "0.7", features = ["std"] }
 anyhow = { version = "1", features = ["backtrace", "std"] }
 bytes = { version = "1", features = ["serde", "std"] }
+chrono = { version = "0.4", default-features = false, features = ["clock", "iana-time-zone", "serde", "std", "winapi"] }
 clap = { version = "4", features = ["color", "derive", "error-context", "help", "std", "string", "suggestions", "usage"] }
 crossbeam-utils = { version = "0.8", features = ["once_cell", "std"] }
 either = { version = "1", features = ["use_std"] }
@@ -36,9 +37,10 @@ prost = { version = "0.11", features = ["prost-derive", "std"] }
 rand = { version = "0.8", features = ["alloc", "getrandom", "libc", "rand_chacha", "rand_hc", "small_rng", "std", "std_rng"] }
 regex = { version = "1", features = ["aho-corasick", "memchr", "perf", "perf-cache", "perf-dfa", "perf-inline", "perf-literal", "std", "unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] }
 regex-syntax = { version = "0.6", features = ["unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] }
-reqwest = { version = "0.11", default-features = false, features = ["__rustls", "__tls", "blocking", "default-tls", "hyper-rustls", "hyper-tls", "json", "native-tls-crate", "rustls", "rustls-pemfile", "rustls-tls", "rustls-tls-webpki-roots", "serde_json", "tokio-native-tls", "tokio-rustls", "webpki-roots"] }
+reqwest = { version = "0.11", features = ["__rustls", "__tls", "blocking", "default-tls", "hyper-rustls", "hyper-tls", "json", "native-tls-crate", "rustls", "rustls-pemfile", "rustls-tls", "rustls-tls-webpki-roots", "serde_json", "tokio-native-tls", "tokio-rustls", "webpki-roots"] }
 scopeguard = { version = "1", features = ["use_std"] }
 serde = { version = "1", features = ["alloc", "derive", "serde_derive", "std"] }
+serde_json = { version = "1", features = ["raw_value", "std"] }
 socket2 = { version = "0.4", default-features = false, features = ["all"] }
 stable_deref_trait = { version = "1", features = ["alloc", "std"] }
 tokio = { version = "1", features = ["bytes", "fs", "io-std", "io-util", "libc", "macros", "memchr", "mio", "net", "num_cpus", "once_cell", "process", "rt", "rt-multi-thread", "signal-hook-registry", "socket2", "sync", "time", "tokio-macros"] }

From 5d4774491f2e22c763dc045a35f799302d146007 Mon Sep 17 00:00:00 2001
From: Kirill Bulatov <kirill@neon.tech>
Date: Wed, 21 Dec 2022 00:52:07 +0200
Subject: [PATCH 044/132] Exclude macOs fork files from tar processing (#3165)

When running tenant relocation tests, we use
https://github.com/neondatabase/neon/blob/main/scripts/export_import_between_pageservers.py
script to export and import basebackup between pageservers.

When pageserver runs on macOs and reuses the `tar` library for creating
the basebackup archive, it gets the fork files

https://superuser.com/questions/61185/why-do-i-get-files-like-foo-in-my-tarball-on-os-x

We might be able to fix our code to fix the issue, but if we get such a
(valid) archive as an input, we
[fail](https://github.com/neondatabase/neon/pull/3013#issuecomment-1360093900).
This does not seem optimal, given that we can ignore such files.
---
 pageserver/src/import_datadir.rs | 31 ++++++++++++++-----------------
 1 file changed, 14 insertions(+), 17 deletions(-)

diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs
index 642e41765b..db83bdb3a1 100644
--- a/pageserver/src/import_datadir.rs
+++ b/pageserver/src/import_datadir.rs
@@ -440,16 +440,22 @@ fn import_file<Reader: Read>(
     reader: Reader,
     len: usize,
 ) -> Result<Option<ControlFileData>> {
+    let file_name = match file_path.file_name() {
+        Some(name) => name.to_string_lossy(),
+        None => return Ok(None),
+    };
+
+    if file_name.starts_with('.') {
+        // tar archives on macOs, created without COPYFILE_DISABLE=1 env var
+        // will contain "fork files", skip them.
+        return Ok(None);
+    }
+
     if file_path.starts_with("global") {
         let spcnode = postgres_ffi::pg_constants::GLOBALTABLESPACE_OID;
         let dbnode = 0;
 
-        match file_path
-            .file_name()
-            .expect("missing filename")
-            .to_string_lossy()
-            .as_ref()
-        {
+        match file_name.as_ref() {
             "pg_control" => {
                 let bytes = read_all_bytes(reader)?;
 
@@ -485,12 +491,7 @@ fn import_file<Reader: Read>(
             .to_string_lossy()
             .parse()?;
 
-        match file_path
-            .file_name()
-            .expect("missing base filename")
-            .to_string_lossy()
-            .as_ref()
-        {
+        match file_name.as_ref() {
             "pg_filenode.map" => {
                 let bytes = read_all_bytes(reader)?;
                 modification.put_relmap_file(spcnode, dbnode, bytes)?;
@@ -520,11 +521,7 @@ fn import_file<Reader: Read>(
         import_slru(modification, slru, file_path, reader, len)?;
         debug!("imported multixact members slru");
     } else if file_path.starts_with("pg_twophase") {
-        let file_name = &file_path
-            .file_name()
-            .expect("missing twophase filename")
-            .to_string_lossy();
-        let xid = u32::from_str_radix(file_name, 16)?;
+        let xid = u32::from_str_radix(file_name.as_ref(), 16)?;
 
         let bytes = read_all_bytes(reader)?;
         modification.put_twophase_file(xid, Bytes::copy_from_slice(&bytes[..]))?;

From 486a985629c9f8c908153522858469c667ce088d Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Wed, 21 Dec 2022 09:38:42 +0000
Subject: [PATCH 045/132] mypy: enable check_untyped_defs (#3142)

Enable `check_untyped_defs` and fix warnings.
---
 poetry.lock                                  | 87 ++++++++++----------
 pyproject.toml                               |  6 +-
 scripts/export_import_between_pageservers.py | 18 ++--
 test_runner/fixtures/compare_fixtures.py     |  2 +-
 test_runner/fixtures/neon_fixtures.py        | 18 ++--
 test_runner/performance/test_copy.py         |  3 +-
 test_runner/regress/test_compute_ctl.py      |  6 +-
 test_runner/regress/test_import.py           |  4 +-
 test_runner/regress/test_proxy.py            |  1 +
 test_runner/regress/test_wal_acceptor.py     |  3 +
 10 files changed, 78 insertions(+), 70 deletions(-)

diff --git a/poetry.lock b/poetry.lock
index f5cbe24954..1b04230cef 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -11,7 +11,7 @@ async-timeout = ">=3.0,<5.0"
 psycopg2-binary = ">=2.8.4"
 
 [package.extras]
-sa = ["sqlalchemy[postgresql_psycopg2binary] (>=1.3,<1.5)"]
+sa = ["sqlalchemy[postgresql-psycopg2binary] (>=1.3,<1.5)"]
 
 [[package]]
 name = "allure-pytest"
@@ -80,7 +80,7 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
 dev = ["cloudpickle", "coverage[toml] (>=5.0.2)", "furo", "hypothesis", "mypy", "pre-commit", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six", "sphinx", "sphinx-notfound-page", "zope.interface"]
 docs = ["furo", "sphinx", "sphinx-notfound-page", "zope.interface"]
 tests = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six", "zope.interface"]
-tests_no_zope = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six"]
+tests-no-zope = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six"]
 
 [[package]]
 name = "aws-sam-translator"
@@ -569,7 +569,7 @@ optional = false
 python-versions = ">=3.6.0"
 
 [package.extras]
-unicode_backport = ["unicodedata2"]
+unicode-backport = ["unicodedata2"]
 
 [[package]]
 name = "click"
@@ -747,9 +747,9 @@ python-versions = ">=3.6.1,<4.0"
 
 [package.extras]
 colors = ["colorama (>=0.4.3,<0.5.0)"]
-pipfile_deprecated_finder = ["pipreqs", "requirementslib"]
+pipfile-deprecated-finder = ["pipreqs", "requirementslib"]
 plugins = ["setuptools"]
-requirements_deprecated_finder = ["pip-api", "pipreqs"]
+requirements-deprecated-finder = ["pip-api", "pipreqs"]
 
 [[package]]
 name = "itsdangerous"
@@ -824,7 +824,7 @@ python-versions = ">=2.7"
 [package.extras]
 docs = ["jaraco.packaging (>=3.2)", "rst.linker (>=1.9)", "sphinx"]
 testing = ["ecdsa", "enum34", "feedparser", "jsonlib", "numpy", "pandas", "pymongo", "pytest (>=3.5,!=3.7.3)", "pytest-black-multipy", "pytest-checkdocs (>=1.2.3)", "pytest-cov", "pytest-flake8 (<1.1.0)", "pytest-flake8 (>=1.1.1)", "scikit-learn", "sqlalchemy"]
-"testing.libs" = ["simplejson", "ujson", "yajl"]
+testing-libs = ["simplejson", "ujson", "yajl"]
 
 [[package]]
 name = "jsonpointer"
@@ -850,7 +850,7 @@ six = ">=1.11.0"
 
 [package.extras]
 format = ["idna", "jsonpointer (>1.13)", "rfc3987", "strict-rfc3339", "webcolors"]
-format_nongpl = ["idna", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3986-validator (>0.1.0)", "webcolors"]
+format-nongpl = ["idna", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3986-validator (>0.1.0)", "webcolors"]
 
 [[package]]
 name = "junit-xml"
@@ -941,11 +941,11 @@ xray = ["aws-xray-sdk (>=0.93,!=0.96)", "setuptools"]
 
 [[package]]
 name = "mypy"
-version = "0.971"
+version = "0.991"
 description = "Optional static typing for Python"
 category = "dev"
 optional = false
-python-versions = ">=3.6"
+python-versions = ">=3.7"
 
 [package.dependencies]
 mypy-extensions = ">=0.4.3"
@@ -954,6 +954,7 @@ typing-extensions = ">=3.10"
 
 [package.extras]
 dmypy = ["psutil (>=4.0)"]
+install-types = ["pip"]
 python2 = ["typed-ast (>=1.4.0,<2)"]
 reports = ["lxml"]
 
@@ -1361,7 +1362,7 @@ urllib3 = ">=1.21.1,<1.27"
 
 [package.extras]
 socks = ["PySocks (>=1.5.6,!=1.5.7)"]
-use_chardet_on_py3 = ["chardet (>=3.0.2,<6)"]
+use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"]
 
 [[package]]
 name = "responses"
@@ -1594,7 +1595,7 @@ testing = ["func-timeout", "jaraco.itertools", "pytest (>=6)", "pytest-black (>=
 [metadata]
 lock-version = "1.1"
 python-versions = "^3.9"
-content-hash = "55aba66810d5b47d25372c740e4d466e1e791c4d0e665c57a611ab8665563689"
+content-hash = "af44b269c235a6fd59dacb4ff9e05cbc13a79b57254a8d5d4bde934bd5691a70"
 
 [metadata.files]
 aiopg = [
@@ -1960,29 +1961,36 @@ moto = [
     {file = "moto-3.1.18.tar.gz", hash = "sha256:1e05276a62aa5a4aa821b441647c2cbaa2ea175388980b10d5de88d41b327cf7"},
 ]
 mypy = [
-    {file = "mypy-0.971-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:f2899a3cbd394da157194f913a931edfd4be5f274a88041c9dc2d9cdcb1c315c"},
-    {file = "mypy-0.971-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:98e02d56ebe93981c41211c05adb630d1d26c14195d04d95e49cd97dbc046dc5"},
-    {file = "mypy-0.971-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:19830b7dba7d5356d3e26e2427a2ec91c994cd92d983142cbd025ebe81d69cf3"},
-    {file = "mypy-0.971-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:02ef476f6dcb86e6f502ae39a16b93285fef97e7f1ff22932b657d1ef1f28655"},
-    {file = "mypy-0.971-cp310-cp310-win_amd64.whl", hash = "sha256:25c5750ba5609a0c7550b73a33deb314ecfb559c350bb050b655505e8aed4103"},
-    {file = "mypy-0.971-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:d3348e7eb2eea2472db611486846742d5d52d1290576de99d59edeb7cd4a42ca"},
-    {file = "mypy-0.971-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:3fa7a477b9900be9b7dd4bab30a12759e5abe9586574ceb944bc29cddf8f0417"},
-    {file = "mypy-0.971-cp36-cp36m-win_amd64.whl", hash = "sha256:2ad53cf9c3adc43cf3bea0a7d01a2f2e86db9fe7596dfecb4496a5dda63cbb09"},
-    {file = "mypy-0.971-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:855048b6feb6dfe09d3353466004490b1872887150c5bb5caad7838b57328cc8"},
-    {file = "mypy-0.971-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:23488a14a83bca6e54402c2e6435467a4138785df93ec85aeff64c6170077fb0"},
-    {file = "mypy-0.971-cp37-cp37m-win_amd64.whl", hash = "sha256:4b21e5b1a70dfb972490035128f305c39bc4bc253f34e96a4adf9127cf943eb2"},
-    {file = "mypy-0.971-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:9796a2ba7b4b538649caa5cecd398d873f4022ed2333ffde58eaf604c4d2cb27"},
-    {file = "mypy-0.971-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:5a361d92635ad4ada1b1b2d3630fc2f53f2127d51cf2def9db83cba32e47c856"},
-    {file = "mypy-0.971-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:b793b899f7cf563b1e7044a5c97361196b938e92f0a4343a5d27966a53d2ec71"},
-    {file = "mypy-0.971-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:d1ea5d12c8e2d266b5fb8c7a5d2e9c0219fedfeb493b7ed60cd350322384ac27"},
-    {file = "mypy-0.971-cp38-cp38-win_amd64.whl", hash = "sha256:23c7ff43fff4b0df93a186581885c8512bc50fc4d4910e0f838e35d6bb6b5e58"},
-    {file = "mypy-0.971-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:1f7656b69974a6933e987ee8ffb951d836272d6c0f81d727f1d0e2696074d9e6"},
-    {file = "mypy-0.971-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:d2022bfadb7a5c2ef410d6a7c9763188afdb7f3533f22a0a32be10d571ee4bbe"},
-    {file = "mypy-0.971-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ef943c72a786b0f8d90fd76e9b39ce81fb7171172daf84bf43eaf937e9f220a9"},
-    {file = "mypy-0.971-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:d744f72eb39f69312bc6c2abf8ff6656973120e2eb3f3ec4f758ed47e414a4bf"},
-    {file = "mypy-0.971-cp39-cp39-win_amd64.whl", hash = "sha256:77a514ea15d3007d33a9e2157b0ba9c267496acf12a7f2b9b9f8446337aac5b0"},
-    {file = "mypy-0.971-py3-none-any.whl", hash = "sha256:0d054ef16b071149917085f51f89555a576e2618d5d9dd70bd6eea6410af3ac9"},
-    {file = "mypy-0.971.tar.gz", hash = "sha256:40b0f21484238269ae6a57200c807d80debc6459d444c0489a102d7c6a75fa56"},
+    {file = "mypy-0.991-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:7d17e0a9707d0772f4a7b878f04b4fd11f6f5bcb9b3813975a9b13c9332153ab"},
+    {file = "mypy-0.991-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0714258640194d75677e86c786e80ccf294972cc76885d3ebbb560f11db0003d"},
+    {file = "mypy-0.991-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:0c8f3be99e8a8bd403caa8c03be619544bc2c77a7093685dcf308c6b109426c6"},
+    {file = "mypy-0.991-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc9ec663ed6c8f15f4ae9d3c04c989b744436c16d26580eaa760ae9dd5d662eb"},
+    {file = "mypy-0.991-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:4307270436fd7694b41f913eb09210faff27ea4979ecbcd849e57d2da2f65305"},
+    {file = "mypy-0.991-cp310-cp310-win_amd64.whl", hash = "sha256:901c2c269c616e6cb0998b33d4adbb4a6af0ac4ce5cd078afd7bc95830e62c1c"},
+    {file = "mypy-0.991-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:d13674f3fb73805ba0c45eb6c0c3053d218aa1f7abead6e446d474529aafc372"},
+    {file = "mypy-0.991-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:1c8cd4fb70e8584ca1ed5805cbc7c017a3d1a29fb450621089ffed3e99d1857f"},
+    {file = "mypy-0.991-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:209ee89fbb0deed518605edddd234af80506aec932ad28d73c08f1400ef80a33"},
+    {file = "mypy-0.991-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:37bd02ebf9d10e05b00d71302d2c2e6ca333e6c2a8584a98c00e038db8121f05"},
+    {file = "mypy-0.991-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:26efb2fcc6b67e4d5a55561f39176821d2adf88f2745ddc72751b7890f3194ad"},
+    {file = "mypy-0.991-cp311-cp311-win_amd64.whl", hash = "sha256:3a700330b567114b673cf8ee7388e949f843b356a73b5ab22dd7cff4742a5297"},
+    {file = "mypy-0.991-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:1f7d1a520373e2272b10796c3ff721ea1a0712288cafaa95931e66aa15798813"},
+    {file = "mypy-0.991-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:641411733b127c3e0dab94c45af15fea99e4468f99ac88b39efb1ad677da5711"},
+    {file = "mypy-0.991-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:3d80e36b7d7a9259b740be6d8d906221789b0d836201af4234093cae89ced0cd"},
+    {file = "mypy-0.991-cp37-cp37m-win_amd64.whl", hash = "sha256:e62ebaad93be3ad1a828a11e90f0e76f15449371ffeecca4a0a0b9adc99abcef"},
+    {file = "mypy-0.991-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:b86ce2c1866a748c0f6faca5232059f881cda6dda2a893b9a8373353cfe3715a"},
+    {file = "mypy-0.991-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:ac6e503823143464538efda0e8e356d871557ef60ccd38f8824a4257acc18d93"},
+    {file = "mypy-0.991-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:0cca5adf694af539aeaa6ac633a7afe9bbd760df9d31be55ab780b77ab5ae8bf"},
+    {file = "mypy-0.991-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a12c56bf73cdab116df96e4ff39610b92a348cc99a1307e1da3c3768bbb5b135"},
+    {file = "mypy-0.991-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:652b651d42f155033a1967739788c436491b577b6a44e4c39fb340d0ee7f0d70"},
+    {file = "mypy-0.991-cp38-cp38-win_amd64.whl", hash = "sha256:4175593dc25d9da12f7de8de873a33f9b2b8bdb4e827a7cae952e5b1a342e243"},
+    {file = "mypy-0.991-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:98e781cd35c0acf33eb0295e8b9c55cdbef64fcb35f6d3aa2186f289bed6e80d"},
+    {file = "mypy-0.991-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6d7464bac72a85cb3491c7e92b5b62f3dcccb8af26826257760a552a5e244aa5"},
+    {file = "mypy-0.991-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:c9166b3f81a10cdf9b49f2d594b21b31adadb3d5e9db9b834866c3258b695be3"},
+    {file = "mypy-0.991-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b8472f736a5bfb159a5e36740847808f6f5b659960115ff29c7cecec1741c648"},
+    {file = "mypy-0.991-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:5e80e758243b97b618cdf22004beb09e8a2de1af481382e4d84bc52152d1c476"},
+    {file = "mypy-0.991-cp39-cp39-win_amd64.whl", hash = "sha256:74e259b5c19f70d35fcc1ad3d56499065c601dfe94ff67ae48b85596b9ec1461"},
+    {file = "mypy-0.991-py3-none-any.whl", hash = "sha256:de32edc9b0a7e67c2775e574cb061a537660e51210fbf6006b0b36ea695ae9bb"},
+    {file = "mypy-0.991.tar.gz", hash = "sha256:3c0165ba8f354a6d9881809ef29f1a9318a236a6d81c690094c5df32107bde06"},
 ]
 mypy-boto3-s3 = [
     {file = "mypy-boto3-s3-1.26.0.post1.tar.gz", hash = "sha256:6d7079f8c739dc993cbedad0736299c413b297814b73795a3855a79169ecc938"},
@@ -2110,18 +2118,7 @@ py = [
     {file = "py-1.11.0.tar.gz", hash = "sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719"},
 ]
 pyasn1 = [
-    {file = "pyasn1-0.4.8-py2.4.egg", hash = "sha256:fec3e9d8e36808a28efb59b489e4528c10ad0f480e57dcc32b4de5c9d8c9fdf3"},
-    {file = "pyasn1-0.4.8-py2.5.egg", hash = "sha256:0458773cfe65b153891ac249bcf1b5f8f320b7c2ce462151f8fa74de8934becf"},
-    {file = "pyasn1-0.4.8-py2.6.egg", hash = "sha256:5c9414dcfede6e441f7e8f81b43b34e834731003427e5b09e4e00e3172a10f00"},
-    {file = "pyasn1-0.4.8-py2.7.egg", hash = "sha256:6e7545f1a61025a4e58bb336952c5061697da694db1cae97b116e9c46abcf7c8"},
     {file = "pyasn1-0.4.8-py2.py3-none-any.whl", hash = "sha256:39c7e2ec30515947ff4e87fb6f456dfc6e84857d34be479c9d4a4ba4bf46aa5d"},
-    {file = "pyasn1-0.4.8-py3.1.egg", hash = "sha256:78fa6da68ed2727915c4767bb386ab32cdba863caa7dbe473eaae45f9959da86"},
-    {file = "pyasn1-0.4.8-py3.2.egg", hash = "sha256:08c3c53b75eaa48d71cf8c710312316392ed40899cb34710d092e96745a358b7"},
-    {file = "pyasn1-0.4.8-py3.3.egg", hash = "sha256:03840c999ba71680a131cfaee6fab142e1ed9bbd9c693e285cc6aca0d555e576"},
-    {file = "pyasn1-0.4.8-py3.4.egg", hash = "sha256:7ab8a544af125fb704feadb008c99a88805126fb525280b2270bb25cc1d78a12"},
-    {file = "pyasn1-0.4.8-py3.5.egg", hash = "sha256:e89bf84b5437b532b0803ba5c9a5e054d21fec423a89952a74f87fa2c9b7bce2"},
-    {file = "pyasn1-0.4.8-py3.6.egg", hash = "sha256:014c0e9976956a08139dc0712ae195324a75e142284d5f87f1a87ee1b068a359"},
-    {file = "pyasn1-0.4.8-py3.7.egg", hash = "sha256:99fcc3c8d804d1bc6d9a099921e39d827026409a58f2a720dcdb89374ea0c776"},
     {file = "pyasn1-0.4.8.tar.gz", hash = "sha256:aef77c9fb94a3ac588e87841208bdec464471d9871bd5050a287cc9a475cd0ba"},
 ]
 pycodestyle = [
diff --git a/pyproject.toml b/pyproject.toml
index 4819ece4b0..b4fb7a9e7d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -36,7 +36,7 @@ pytest-httpserver = "^1.0.6"
 
 [tool.poetry.dev-dependencies]
 flake8 = "^5.0.4"
-mypy = "==0.971"
+mypy = "==0.991"
 black = "^22.6.0"
 isort = "^5.10.1"
 
@@ -61,10 +61,8 @@ skip = [
 ]
 
 [tool.mypy]
-# mypy uses regex
 exclude = "^vendor/"
-# some tests don't typecheck when this flag is set
-check_untyped_defs = false
+check_untyped_defs = true
 # Help mypy find imports when running against list of individual files.
 # Without this line it would behave differently when executed on the entire project.
 mypy_path = "$MYPY_CONFIG_FILE_DIR:$MYPY_CONFIG_FILE_DIR/test_runner"
diff --git a/scripts/export_import_between_pageservers.py b/scripts/export_import_between_pageservers.py
index 1734038661..8ea3f13bf5 100755
--- a/scripts/export_import_between_pageservers.py
+++ b/scripts/export_import_between_pageservers.py
@@ -448,15 +448,15 @@ def add_missing_rels(base_tar, output_tar, log_dir, pg_bin, tmp_pg_port: int):
 
 
 def get_rlsn(pageserver_connstr, tenant_id, timeline_id):
-    conn = psycopg2.connect(pageserver_connstr)
-    conn.autocommit = True
-    with conn.cursor() as cur:
-        cmd = f"get_last_record_rlsn {tenant_id} {timeline_id}"
-        cur.execute(cmd)
-        res = cur.fetchone()
-        prev_lsn = res[0]
-        last_lsn = res[1]
-    conn.close()
+    with closing(psycopg2.connect(pageserver_connstr)) as conn:
+        conn.autocommit = True
+        with conn.cursor() as cur:
+            cmd = f"get_last_record_rlsn {tenant_id} {timeline_id}"
+            cur.execute(cmd)
+            res = cur.fetchone()
+            assert res is not None
+            prev_lsn = res[0]
+            last_lsn = res[1]
 
     return last_lsn, prev_lsn
 
diff --git a/test_runner/fixtures/compare_fixtures.py b/test_runner/fixtures/compare_fixtures.py
index fa488c4446..be1f146735 100644
--- a/test_runner/fixtures/compare_fixtures.py
+++ b/test_runner/fixtures/compare_fixtures.py
@@ -177,7 +177,7 @@ class VanillaCompare(PgCompare):
         self.cur = self.conn.cursor()
 
     @property
-    def pg(self) -> PgProtocol:
+    def pg(self) -> VanillaPostgres:
         return self._pg
 
     @property
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 287f157d97..d52ca38447 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -623,6 +623,7 @@ class NeonEnvBuilder:
         return self.env
 
     def start(self):
+        assert self.env is not None, "environment is not already initialized, call init() first"
         self.env.start()
 
     def init_start(self) -> NeonEnv:
@@ -751,6 +752,11 @@ class NeonEnvBuilder:
             log.info("no remote storage was set up, skipping cleanup")
             return
 
+        # Narrow the type for mypy: only `S3Storage` is allowed past this point.
+        # `self.remote_storage_prefix` is coupled with the `S3Storage` storage type,
+        # so this assertion is effectively a no-op.
+        assert isinstance(self.remote_storage, S3Storage)
+
         if self.keep_remote_storage_contents:
             log.info("keep_remote_storage_contents skipping remote storage cleanup")
             return
@@ -766,7 +772,8 @@ class NeonEnvBuilder:
             Prefix=self.remote_storage_prefix,
         )
 
-        objects_to_delete = {"Objects": []}
+        # Using Any because DeleteTypeDef (from boto3-stubs) doesn't fit our case
+        objects_to_delete: Any = {"Objects": []}
         cnt = 0
         for item in pages.search("Contents"):
             # weirdly when nothing is found it returns [None]
@@ -781,16 +788,17 @@ class NeonEnvBuilder:
                     Bucket=self.remote_storage.bucket_name,
                     Delete=objects_to_delete,
                 )
-                objects_to_delete = dict(Objects=[])
+                objects_to_delete = {"Objects": []}
                 cnt += 1
 
         # flush rest
         if len(objects_to_delete["Objects"]):
             self.remote_storage_client.delete_objects(
-                Bucket=self.remote_storage.bucket_name, Delete=objects_to_delete
+                Bucket=self.remote_storage.bucket_name,
+                Delete=objects_to_delete,
             )
 
-        log.info("deleted %s objects from remote storage", cnt)
+        log.info(f"deleted {cnt} objects from remote storage")
 
     def __enter__(self) -> "NeonEnvBuilder":
         return self
@@ -2772,7 +2780,7 @@ class NeonBroker:
         log.info(f'starting storage_broker to listen incoming connections at "{listen_addr}"')
         with open(self.logfile, "wb") as logfile:
             args = [
-                self.neon_binpath / "storage_broker",
+                str(self.neon_binpath / "storage_broker"),
                 f"--listen-addr={listen_addr}",
             ]
             self.handle = subprocess.Popen(args, stdout=logfile, stderr=logfile)
diff --git a/test_runner/performance/test_copy.py b/test_runner/performance/test_copy.py
index 01b2097112..a91c78e867 100644
--- a/test_runner/performance/test_copy.py
+++ b/test_runner/performance/test_copy.py
@@ -1,5 +1,6 @@
 from contextlib import closing
 from io import BufferedReader, RawIOBase
+from typing import Optional
 
 from fixtures.compare_fixtures import PgCompare
 
@@ -8,7 +9,7 @@ class CopyTestData(RawIOBase):
     def __init__(self, rows: int):
         self.rows = rows
         self.rownum = 0
-        self.linebuf = None
+        self.linebuf: Optional[bytes] = None
         self.ptr = 0
 
     def readable(self):
diff --git a/test_runner/regress/test_compute_ctl.py b/test_runner/regress/test_compute_ctl.py
index 74ee2a89d4..f973bd8e60 100644
--- a/test_runner/regress/test_compute_ctl.py
+++ b/test_runner/regress/test_compute_ctl.py
@@ -193,8 +193,8 @@ def test_sync_safekeepers_logs(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
             timeout=10,
         )
     except TimeoutExpired as exc:
-        ctl_logs = exc.stderr.decode("utf-8")
-        log.info("compute_ctl output:\n" + ctl_logs)
+        ctl_logs = (exc.stderr or b"").decode("utf-8")
+        log.info(f"compute_ctl output:\n{ctl_logs}")
 
     with ExternalProcessManager(Path(pgdata) / "postmaster.pid"):
         start = "starting safekeepers syncing"
@@ -240,7 +240,7 @@ class ExternalProcessManager:
         with self.pid_file:
             try:
                 os.kill(self.pid, signal.SIGTERM)
-            except os.OsError as e:
+            except OSError as e:
                 if not self.path.is_file():
                     return
                 log.info(f"Failed to kill {self.pid}, but the pidfile remains: {e}")
diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py
index fb1bc4839e..0388e24e98 100644
--- a/test_runner/regress/test_import.py
+++ b/test_runner/regress/test_import.py
@@ -53,10 +53,10 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build
     unpacked_base = os.path.join(basebackup_dir, "unpacked-base")
     corrupt_base_tar = os.path.join(unpacked_base, "corrupt-base.tar")
     os.mkdir(unpacked_base, 0o750)
-    subprocess_capture(str(test_output_dir), ["tar", "-xf", base_tar, "-C", unpacked_base])
+    subprocess_capture(test_output_dir, ["tar", "-xf", base_tar, "-C", unpacked_base])
     os.remove(os.path.join(unpacked_base, "global/pg_control"))
     subprocess_capture(
-        str(test_output_dir),
+        test_output_dir,
         ["tar", "-cf", "corrupt-base.tar"] + os.listdir(unpacked_base),
         cwd=unpacked_base,
     )
diff --git a/test_runner/regress/test_proxy.py b/test_runner/regress/test_proxy.py
index 4d2b63d360..bcea4d970c 100644
--- a/test_runner/regress/test_proxy.py
+++ b/test_runner/regress/test_proxy.py
@@ -71,6 +71,7 @@ async def test_psql_session_id(vanilla_pg: VanillaPostgres, link_proxy: NeonProx
 
         log.info("sending session activation message")
         psql = await PSQL(host=link_proxy.host, port=link_proxy.mgmt_port).run(db_info)
+        assert psql.stdout is not None
         out = (await psql.stdout.read()).decode("utf-8").strip()
         assert out == "ok"
 
diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py
index 3b72aba422..d88ed319b5 100644
--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -883,9 +883,12 @@ class SafekeeperEnv:
             raise Exception(f"Failed to start safekepeer as {cmd}, reason: {e}")
 
     def get_safekeeper_connstrs(self):
+        assert self.safekeepers is not None, "safekeepers are not initialized"
         return ",".join([sk_proc.args[2] for sk_proc in self.safekeepers])
 
     def create_postgres(self):
+        assert self.tenant_id is not None, "tenant_id is not initialized"
+        assert self.timeline_id is not None, "timeline_id is not initialized"
         pgdata_dir = os.path.join(self.repo_dir, "proposer_pgdata")
         pg = ProposerPostgres(
             pgdata_dir,

From a3f0111726861aa7a758ead2861a66f052bd38b2 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Mon, 19 Dec 2022 19:43:06 +0100
Subject: [PATCH 046/132] LayerMap::search is actually infallible

Found this while investigating failure modes of on-demand download.

I think it's a nice cleanup.
---
 pageserver/benches/bench_layer_map.rs |  6 +++---
 pageserver/src/tenant/layer_map.rs    | 16 ++++++++--------
 pageserver/src/tenant/timeline.rs     |  2 +-
 3 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/pageserver/benches/bench_layer_map.rs b/pageserver/benches/bench_layer_map.rs
index 6001377811..a0c38e1e3a 100644
--- a/pageserver/benches/bench_layer_map.rs
+++ b/pageserver/benches/bench_layer_map.rs
@@ -163,7 +163,7 @@ fn bench_from_captest_env(c: &mut Criterion) {
     c.bench_function("captest_uniform_queries", |b| {
         b.iter(|| {
             for q in queries.clone().into_iter() {
-                layer_map.search(q.0, q.1).unwrap();
+                layer_map.search(q.0, q.1);
             }
         });
     });
@@ -192,7 +192,7 @@ fn bench_from_real_project(c: &mut Criterion) {
     c.bench_function("real_map_uniform_queries", |b| {
         b.iter(|| {
             for q in queries.clone().into_iter() {
-                layer_map.search(q.0, q.1).unwrap();
+                layer_map.search(q.0, q.1);
             }
         });
     });
@@ -238,7 +238,7 @@ fn bench_sequential(c: &mut Criterion) {
         // Run the search queries
         b.iter(|| {
             for q in queries.clone().into_iter() {
-                layer_map.search(q.0, q.1).unwrap();
+                layer_map.search(q.0, q.1);
             }
         });
     });
diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs
index 19252ecf6e..0202ccfa6a 100644
--- a/pageserver/src/tenant/layer_map.rs
+++ b/pageserver/src/tenant/layer_map.rs
@@ -261,7 +261,7 @@ where
     /// contain the version, even if it's missing from the returned
     /// layer.
     ///
-    pub fn search(&self, key: Key, end_lsn: Lsn) -> Result<Option<SearchResult<L>>> {
+    pub fn search(&self, key: Key, end_lsn: Lsn) -> Option<SearchResult<L>> {
         // linear search
         // Find the latest image layer that covers the given key
         let mut latest_img: Option<Arc<L>> = None;
@@ -286,10 +286,10 @@ where
             assert!(img_lsn < end_lsn);
             if Lsn(img_lsn.0 + 1) == end_lsn {
                 // found exact match
-                return Ok(Some(SearchResult {
+                return Some(SearchResult {
                     layer: Arc::clone(l),
                     lsn_floor: img_lsn,
-                }));
+                });
             }
             if img_lsn > latest_img_lsn.unwrap_or(Lsn(0)) {
                 latest_img = Some(Arc::clone(l));
@@ -346,19 +346,19 @@ where
                 Lsn(latest_img_lsn.unwrap_or(Lsn(0)).0 + 1),
                 l.get_lsn_range().start,
             );
-            Ok(Some(SearchResult {
+            Some(SearchResult {
                 lsn_floor,
                 layer: l,
-            }))
+            })
         } else if let Some(l) = latest_img {
             trace!("found img layer and no deltas for request on {key} at {end_lsn}");
-            Ok(Some(SearchResult {
+            Some(SearchResult {
                 lsn_floor: latest_img_lsn.unwrap(),
                 layer: l,
-            }))
+            })
         } else {
             trace!("no layer found for request on {key} at {end_lsn}");
-            Ok(None)
+            None
         }
     }
 
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 0697ec4bd6..4a54c91d25 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -1587,7 +1587,7 @@ impl Timeline {
                 }
             }
 
-            if let Some(SearchResult { lsn_floor, layer }) = layers.search(key, cont_lsn)? {
+            if let Some(SearchResult { lsn_floor, layer }) = layers.search(key, cont_lsn) {
                 //info!("CHECKING for {} at {} on historic layer {}", key, cont_lsn, layer.filename().display());
 
                 let lsn_floor = max(cached_lsn + 1, lsn_floor);

From f637f6e77e035215517603bd8f6f8e74bcb9f675 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Wed, 21 Dec 2022 12:20:53 +0100
Subject: [PATCH 047/132] stop exposing non-incremental sizes in API spec

Console doesn't use them, so, don't expose them.

refs https://github.com/neondatabase/cloud/pull/3358
refs https://github.com/neondatabase/cloud/pull/3366
---
 pageserver/src/http/openapi_spec.yml | 25 -------------------------
 1 file changed, 25 deletions(-)

diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml
index 67cf4ea326..f9b8a81dad 100644
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -77,16 +77,6 @@ paths:
         schema:
           type: string
           format: hex
-      - name: include-non-incremental-logical-size
-        in: query
-        schema:
-          type: string
-          description: Controls calculation of current_logical_size_non_incremental
-      - name: include-non-incremental-physical-size
-        in: query
-        schema:
-          type: string
-          description: Controls calculation of current_physical_size_non_incremental
     get:
       description: Get timelines for tenant
       responses:
@@ -139,17 +129,6 @@ paths:
           format: hex
     get:
       description: Get info about the timeline
-      parameters:
-        - name: include-non-incremental-logical-size
-          in: query
-          schema:
-            type: string
-          description: Controls calculation of current_logical_size_non_incremental
-        - name: include-non-incremental-physical-size
-          in: query
-          schema:
-            type: string
-            description: Controls calculation of current_physical_size_non_incremental
       responses:
         "200":
           description: TimelineInfo
@@ -779,10 +758,6 @@ components:
           type: integer
         current_physical_size:
           type: integer
-        current_logical_size_non_incremental:
-          type: integer
-        current_physical_size_non_incremental:
-          type: integer
         wal_source_connstr:
           type: string
         last_received_msg_lsn:

From 91e89371121698e1e0522cc8374d393b94480e65 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Mon, 5 Dec 2022 06:38:13 -0500
Subject: [PATCH 048/132] no-op: add Timeline::myself member

---
 pageserver/src/tenant.rs          |  10 +--
 pageserver/src/tenant/timeline.rs | 107 ++++++++++++++++--------------
 2 files changed, 61 insertions(+), 56 deletions(-)

diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 0ff5089f66..ce05d8f085 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -480,7 +480,7 @@ impl Tenant {
             let timeline = UninitializedTimeline {
                 owning_tenant: self,
                 timeline_id,
-                raw_timeline: Some((Arc::new(dummy_timeline), TimelineUninitMark::dummy())),
+                raw_timeline: Some((dummy_timeline, TimelineUninitMark::dummy())),
             };
             // Do not start walreceiver here. We do need loaded layer map for reconcile_with_remote
             // But we shouldnt start walreceiver before we have all the data locally, because working walreceiver
@@ -510,7 +510,7 @@ impl Tenant {
                         )
                         })?;
                     broken_timeline.set_state(TimelineState::Broken);
-                    timelines_accessor.insert(timeline_id, Arc::new(broken_timeline));
+                    timelines_accessor.insert(timeline_id, broken_timeline);
                     Err(e)
                 }
             }
@@ -1647,7 +1647,7 @@ impl Tenant {
         new_metadata: TimelineMetadata,
         ancestor: Option<Arc<Timeline>>,
         remote_client: Option<RemoteTimelineClient>,
-    ) -> anyhow::Result<Timeline> {
+    ) -> anyhow::Result<Arc<Timeline>> {
         if let Some(ancestor_timeline_id) = new_metadata.ancestor_timeline() {
             anyhow::ensure!(
                 ancestor.is_some(),
@@ -2209,7 +2209,7 @@ impl Tenant {
                 Ok(UninitializedTimeline {
                     owning_tenant: self,
                     timeline_id: new_timeline_id,
-                    raw_timeline: Some((Arc::new(new_timeline), uninit_mark)),
+                    raw_timeline: Some((new_timeline, uninit_mark)),
                 })
             }
             Err(e) => {
@@ -2227,7 +2227,7 @@ impl Tenant {
         new_metadata: TimelineMetadata,
         ancestor: Option<Arc<Timeline>>,
         remote_client: Option<RemoteTimelineClient>,
-    ) -> anyhow::Result<Timeline> {
+    ) -> anyhow::Result<Arc<Timeline>> {
         let timeline_data = self
             .create_timeline_data(
                 new_timeline_id,
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 4a54c91d25..e891caa6f8 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -17,7 +17,7 @@ use std::fs;
 use std::ops::{Deref, Range};
 use std::path::{Path, PathBuf};
 use std::sync::atomic::{AtomicI64, Ordering as AtomicOrdering};
-use std::sync::{Arc, Mutex, MutexGuard, RwLock};
+use std::sync::{Arc, Mutex, MutexGuard, RwLock, Weak};
 use std::time::{Duration, Instant, SystemTime};
 
 use crate::storage_sync::index::IndexPart;
@@ -76,6 +76,8 @@ pub struct Timeline {
     conf: &'static PageServerConf,
     tenant_conf: Arc<RwLock<TenantConfOpt>>,
 
+    _myself: Weak<Self>,
+
     pub tenant_id: TenantId,
     pub timeline_id: TimelineId,
 
@@ -748,75 +750,78 @@ impl Timeline {
         walredo_mgr: Arc<dyn WalRedoManager + Send + Sync>,
         remote_client: Option<RemoteTimelineClient>,
         pg_version: u32,
-    ) -> Self {
+    ) -> Arc<Self> {
         let disk_consistent_lsn = metadata.disk_consistent_lsn();
         let (state, _) = watch::channel(TimelineState::Suspended);
 
         let (layer_flush_start_tx, _) = tokio::sync::watch::channel(0);
         let (layer_flush_done_tx, _) = tokio::sync::watch::channel((0, Ok(())));
 
-        let mut result = Timeline {
-            conf,
-            tenant_conf,
-            timeline_id,
-            tenant_id,
-            pg_version,
-            layers: RwLock::new(LayerMap::default()),
+        Arc::new_cyclic(|myself| {
+            let mut result = Timeline {
+                conf,
+                tenant_conf,
+                _myself: myself.clone(),
+                timeline_id,
+                tenant_id,
+                pg_version,
+                layers: RwLock::new(LayerMap::default()),
 
-            walredo_mgr,
+                walredo_mgr,
 
-            remote_client: remote_client.map(Arc::new),
+                remote_client: remote_client.map(Arc::new),
 
-            // initialize in-memory 'last_record_lsn' from 'disk_consistent_lsn'.
-            last_record_lsn: SeqWait::new(RecordLsn {
-                last: disk_consistent_lsn,
-                prev: metadata.prev_record_lsn().unwrap_or(Lsn(0)),
-            }),
-            disk_consistent_lsn: AtomicLsn::new(disk_consistent_lsn.0),
+                // initialize in-memory 'last_record_lsn' from 'disk_consistent_lsn'.
+                last_record_lsn: SeqWait::new(RecordLsn {
+                    last: disk_consistent_lsn,
+                    prev: metadata.prev_record_lsn().unwrap_or(Lsn(0)),
+                }),
+                disk_consistent_lsn: AtomicLsn::new(disk_consistent_lsn.0),
 
-            last_freeze_at: AtomicLsn::new(disk_consistent_lsn.0),
-            last_freeze_ts: RwLock::new(Instant::now()),
+                last_freeze_at: AtomicLsn::new(disk_consistent_lsn.0),
+                last_freeze_ts: RwLock::new(Instant::now()),
 
-            ancestor_timeline: ancestor,
-            ancestor_lsn: metadata.ancestor_lsn(),
+                ancestor_timeline: ancestor,
+                ancestor_lsn: metadata.ancestor_lsn(),
 
-            metrics: TimelineMetrics::new(&tenant_id, &timeline_id),
+                metrics: TimelineMetrics::new(&tenant_id, &timeline_id),
 
-            flush_loop_state: Mutex::new(FlushLoopState::NotStarted),
+                flush_loop_state: Mutex::new(FlushLoopState::NotStarted),
 
-            layer_flush_start_tx,
-            layer_flush_done_tx,
+                layer_flush_start_tx,
+                layer_flush_done_tx,
 
-            write_lock: Mutex::new(()),
-            layer_removal_cs: Default::default(),
+                write_lock: Mutex::new(()),
+                layer_removal_cs: Default::default(),
 
-            gc_info: RwLock::new(GcInfo {
-                retain_lsns: Vec::new(),
-                horizon_cutoff: Lsn(0),
-                pitr_cutoff: Lsn(0),
-            }),
+                gc_info: RwLock::new(GcInfo {
+                    retain_lsns: Vec::new(),
+                    horizon_cutoff: Lsn(0),
+                    pitr_cutoff: Lsn(0),
+                }),
 
-            latest_gc_cutoff_lsn: Rcu::new(metadata.latest_gc_cutoff_lsn()),
-            initdb_lsn: metadata.initdb_lsn(),
+                latest_gc_cutoff_lsn: Rcu::new(metadata.latest_gc_cutoff_lsn()),
+                initdb_lsn: metadata.initdb_lsn(),
 
-            current_logical_size: if disk_consistent_lsn.is_valid() {
-                // we're creating timeline data with some layer files existing locally,
-                // need to recalculate timeline's logical size based on data in the layers.
-                LogicalSize::deferred_initial(disk_consistent_lsn)
-            } else {
-                // we're creating timeline data without any layers existing locally,
-                // initial logical size is 0.
-                LogicalSize::empty_initial()
-            },
-            partitioning: Mutex::new((KeyPartitioning::new(), Lsn(0))),
-            repartition_threshold: 0,
+                current_logical_size: if disk_consistent_lsn.is_valid() {
+                    // we're creating timeline data with some layer files existing locally,
+                    // need to recalculate timeline's logical size based on data in the layers.
+                    LogicalSize::deferred_initial(disk_consistent_lsn)
+                } else {
+                    // we're creating timeline data without any layers existing locally,
+                    // initial logical size is 0.
+                    LogicalSize::empty_initial()
+                },
+                partitioning: Mutex::new((KeyPartitioning::new(), Lsn(0))),
+                repartition_threshold: 0,
 
-            last_received_wal: Mutex::new(None),
-            rel_size_cache: RwLock::new(HashMap::new()),
-            state,
-        };
-        result.repartition_threshold = result.get_checkpoint_distance() / 10;
-        result
+                last_received_wal: Mutex::new(None),
+                rel_size_cache: RwLock::new(HashMap::new()),
+                state,
+            };
+            result.repartition_threshold = result.get_checkpoint_distance() / 10;
+            result
+        })
     }
 
     pub(super) fn maybe_spawn_flush_loop(self: &Arc<Self>) {

From f5b424b96cee32b80cbee43020628d5feb2e57df Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Wed, 7 Dec 2022 12:52:25 -0500
Subject: [PATCH 049/132] no-op: type aliases for Layer::iter and
 Layer::key_iter return types

Not needed by anything right now, but the next commit adds a `Result<>`
around iter() and key_iter()'s return types, and that makes clippy
complain.
---
 pageserver/src/tenant/delta_layer.rs   |  6 +++---
 pageserver/src/tenant/image_layer.rs   |  6 +++---
 pageserver/src/tenant/storage_layer.rs | 10 ++++++++--
 3 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/pageserver/src/tenant/delta_layer.rs b/pageserver/src/tenant/delta_layer.rs
index d8aaa3e8b9..cff819d878 100644
--- a/pageserver/src/tenant/delta_layer.rs
+++ b/pageserver/src/tenant/delta_layer.rs
@@ -55,7 +55,7 @@ use utils::{
 };
 
 use super::filename::LayerFileName;
-use super::storage_layer::Layer;
+use super::storage_layer::{Layer, LayerIter, LayerKeyIter};
 
 ///
 /// Header stored in the beginning of the file
@@ -391,7 +391,7 @@ impl PersistentLayer for DeltaLayer {
         self.path()
     }
 
-    fn iter<'a>(&'a self) -> Box<dyn Iterator<Item = anyhow::Result<(Key, Lsn, Value)>> + 'a> {
+    fn iter(&self) -> LayerIter<'_> {
         let inner = match self.load() {
             Ok(inner) => inner,
             Err(e) => panic!("Failed to load a delta layer: {e:?}"),
@@ -403,7 +403,7 @@ impl PersistentLayer for DeltaLayer {
         }
     }
 
-    fn key_iter<'a>(&'a self) -> Box<dyn Iterator<Item = (Key, Lsn, u64)> + 'a> {
+    fn key_iter(&self) -> LayerKeyIter<'_> {
         let inner = match self.load() {
             Ok(inner) => inner,
             Err(e) => panic!("Failed to load a delta layer: {e:?}"),
diff --git a/pageserver/src/tenant/image_layer.rs b/pageserver/src/tenant/image_layer.rs
index e08e938a4f..fe9de855e7 100644
--- a/pageserver/src/tenant/image_layer.rs
+++ b/pageserver/src/tenant/image_layer.rs
@@ -21,7 +21,7 @@
 //! actual page images are stored in the "values" part.
 use crate::config::PageServerConf;
 use crate::page_cache::PAGE_SZ;
-use crate::repository::{Key, Value, KEY_SIZE};
+use crate::repository::{Key, KEY_SIZE};
 use crate::tenant::blob_io::{BlobCursor, BlobWriter, WriteBlobWriter};
 use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader};
 use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
@@ -51,7 +51,7 @@ use utils::{
 };
 
 use super::filename::LayerFileName;
-use super::storage_layer::Layer;
+use super::storage_layer::{Layer, LayerIter};
 
 ///
 /// Header stored in the beginning of the file
@@ -219,7 +219,7 @@ impl PersistentLayer for ImageLayer {
     fn get_timeline_id(&self) -> TimelineId {
         self.timeline_id
     }
-    fn iter(&self) -> Box<dyn Iterator<Item = Result<(Key, Lsn, Value)>>> {
+    fn iter(&self) -> LayerIter<'_> {
         unimplemented!();
     }
 
diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs
index 3ad62587d3..82c25c063b 100644
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -116,6 +116,12 @@ pub trait Layer: Send + Sync {
     fn dump(&self, verbose: bool) -> Result<()>;
 }
 
+/// Returned by [`PersistentLayer::iter`]
+pub type LayerIter<'i> = Box<dyn Iterator<Item = Result<(Key, Lsn, Value)>> + 'i>;
+
+/// Returned by [`PersistentLayer::key_iter`]
+pub type LayerKeyIter<'i> = Box<dyn Iterator<Item = (Key, Lsn, u64)> + 'i>;
+
 /// A Layer contains all data in a "rectangle" consisting of a range of keys and
 /// range of LSNs.
 ///
@@ -144,11 +150,11 @@ pub trait PersistentLayer: Layer {
     fn local_path(&self) -> PathBuf;
 
     /// Iterate through all keys and values stored in the layer
-    fn iter(&self) -> Box<dyn Iterator<Item = Result<(Key, Lsn, Value)>> + '_>;
+    fn iter(&self) -> LayerIter<'_>;
 
     /// Iterate through all keys stored in the layer. Returns key, lsn and value size
     /// It is used only for compaction and so is currently implemented only for DeltaLayer
-    fn key_iter(&self) -> Box<dyn Iterator<Item = (Key, Lsn, u64)> + '_> {
+    fn key_iter(&self) -> LayerKeyIter<'_> {
         panic!("Not implemented")
     }
 

From e94b4514301501ae43330c38e7467b88901c7dd8 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Mon, 5 Dec 2022 06:43:26 -0500
Subject: [PATCH 050/132] no-op: storage_layer::Iter::{iter, key_iter}: make
 them fallible

---
 pageserver/src/tenant/delta_layer.rs    | 27 ++++--------
 pageserver/src/tenant/image_layer.rs    |  2 +-
 pageserver/src/tenant/inmemory_layer.rs |  1 +
 pageserver/src/tenant/storage_layer.rs  |  4 +-
 pageserver/src/tenant/timeline.rs       | 56 +++++++++++++------------
 5 files changed, 42 insertions(+), 48 deletions(-)

diff --git a/pageserver/src/tenant/delta_layer.rs b/pageserver/src/tenant/delta_layer.rs
index cff819d878..a252abf2a0 100644
--- a/pageserver/src/tenant/delta_layer.rs
+++ b/pageserver/src/tenant/delta_layer.rs
@@ -391,28 +391,19 @@ impl PersistentLayer for DeltaLayer {
         self.path()
     }
 
-    fn iter(&self) -> LayerIter<'_> {
-        let inner = match self.load() {
-            Ok(inner) => inner,
-            Err(e) => panic!("Failed to load a delta layer: {e:?}"),
-        };
-
-        match DeltaValueIter::new(inner) {
+    fn iter(&self) -> Result<LayerIter<'_>> {
+        let inner = self.load().context("load delta layer")?;
+        Ok(match DeltaValueIter::new(inner) {
             Ok(iter) => Box::new(iter),
             Err(err) => Box::new(std::iter::once(Err(err))),
-        }
+        })
     }
 
-    fn key_iter(&self) -> LayerKeyIter<'_> {
-        let inner = match self.load() {
-            Ok(inner) => inner,
-            Err(e) => panic!("Failed to load a delta layer: {e:?}"),
-        };
-
-        match DeltaKeyIter::new(inner) {
-            Ok(iter) => Box::new(iter),
-            Err(e) => panic!("Layer index is corrupted: {e:?}"),
-        }
+    fn key_iter(&self) -> Result<LayerKeyIter<'_>> {
+        let inner = self.load()?;
+        Ok(Box::new(
+            DeltaKeyIter::new(inner).context("Layer index is corrupted")?,
+        ))
     }
 
     fn delete(&self) -> Result<()> {
diff --git a/pageserver/src/tenant/image_layer.rs b/pageserver/src/tenant/image_layer.rs
index fe9de855e7..c907d21af5 100644
--- a/pageserver/src/tenant/image_layer.rs
+++ b/pageserver/src/tenant/image_layer.rs
@@ -219,7 +219,7 @@ impl PersistentLayer for ImageLayer {
     fn get_timeline_id(&self) -> TimelineId {
         self.timeline_id
     }
-    fn iter(&self) -> LayerIter<'_> {
+    fn iter(&self) -> Result<LayerIter<'_>> {
         unimplemented!();
     }
 
diff --git a/pageserver/src/tenant/inmemory_layer.rs b/pageserver/src/tenant/inmemory_layer.rs
index 8f64281cb1..35b0e98591 100644
--- a/pageserver/src/tenant/inmemory_layer.rs
+++ b/pageserver/src/tenant/inmemory_layer.rs
@@ -97,6 +97,7 @@ impl Layer for InMemoryLayer {
         };
         self.start_lsn..end_lsn
     }
+
     fn is_incremental(&self) -> bool {
         // in-memory layer is always considered incremental.
         true
diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs
index 82c25c063b..ba0311574d 100644
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -150,11 +150,11 @@ pub trait PersistentLayer: Layer {
     fn local_path(&self) -> PathBuf;
 
     /// Iterate through all keys and values stored in the layer
-    fn iter(&self) -> LayerIter<'_>;
+    fn iter(&self) -> Result<LayerIter<'_>>;
 
     /// Iterate through all keys stored in the layer. Returns key, lsn and value size
     /// It is used only for compaction and so is currently implemented only for DeltaLayer
-    fn key_iter(&self) -> LayerKeyIter<'_> {
+    fn key_iter(&self) -> Result<LayerKeyIter<'_>> {
         panic!("Not implemented")
     }
 
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index e891caa6f8..34cb01cdd8 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -2195,38 +2195,40 @@ impl Timeline {
 
         // This iterator walks through all key-value pairs from all the layers
         // we're compacting, in key, LSN order.
-        let all_values_iter = deltas_to_compact
-            .iter()
-            .map(|l| l.iter())
-            .kmerge_by(|a, b| {
-                if let Ok((a_key, a_lsn, _)) = a {
-                    if let Ok((b_key, b_lsn, _)) = b {
-                        match a_key.cmp(b_key) {
-                            Ordering::Less => true,
-                            Ordering::Equal => a_lsn <= b_lsn,
-                            Ordering::Greater => false,
+        let all_values_iter =
+            itertools::process_results(deltas_to_compact.iter().map(|l| l.iter()), |iter_iter| {
+                iter_iter.kmerge_by(|a, b| {
+                    if let Ok((a_key, a_lsn, _)) = a {
+                        if let Ok((b_key, b_lsn, _)) = b {
+                            match a_key.cmp(b_key) {
+                                Ordering::Less => true,
+                                Ordering::Equal => a_lsn <= b_lsn,
+                                Ordering::Greater => false,
+                            }
+                        } else {
+                            false
                         }
                     } else {
-                        false
+                        true
                     }
-                } else {
-                    true
-                }
-            });
+                })
+            })?;
 
         // This iterator walks through all keys and is needed to calculate size used by each key
-        let mut all_keys_iter = deltas_to_compact
-            .iter()
-            .map(|l| l.key_iter())
-            .kmerge_by(|a, b| {
-                let (a_key, a_lsn, _) = a;
-                let (b_key, b_lsn, _) = b;
-                match a_key.cmp(b_key) {
-                    Ordering::Less => true,
-                    Ordering::Equal => a_lsn <= b_lsn,
-                    Ordering::Greater => false,
-                }
-            });
+        let mut all_keys_iter = itertools::process_results(
+            deltas_to_compact.iter().map(|l| l.key_iter()),
+            |iter_iter| {
+                iter_iter.kmerge_by(|a, b| {
+                    let (a_key, a_lsn, _) = a;
+                    let (b_key, b_lsn, _) = b;
+                    match a_key.cmp(b_key) {
+                        Ordering::Less => true,
+                        Ordering::Equal => a_lsn <= b_lsn,
+                        Ordering::Greater => false,
+                    }
+                })
+            },
+        )?;
 
         // Merge the contents of all the input delta layers into a new set
         // of delta layers, based on the current partitioning.

From 749a2f00d71750a9055b21e8b565a3e9b51eecd5 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Mon, 5 Dec 2022 06:45:10 -0500
Subject: [PATCH 051/132] no-op: distinguished error types for
 Timeline::get_reconstruct_data

---
 pageserver/src/tenant/timeline.rs | 26 +++++++++++++++++++++++---
 1 file changed, 23 insertions(+), 3 deletions(-)

diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 34cb01cdd8..0b31c9bdc4 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -1483,7 +1483,7 @@ impl Timeline {
         key: Key,
         request_lsn: Lsn,
         reconstruct_state: &mut ValueReconstructState,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), PageReconstructError> {
         // Start from the current timeline.
         let mut timeline_owned;
         let mut timeline = self;
@@ -2828,12 +2828,31 @@ impl Timeline {
     }
 }
 
+/// An error happened in a get() operation.
+#[derive(thiserror::Error)]
+pub enum PageReconstructError {
+    #[error(transparent)]
+    Other(#[from] anyhow::Error), // source and Display delegate to anyhow::Error
+
+    #[error(transparent)]
+    WalRedo(#[from] crate::walredo::WalRedoError),
+}
+
+impl std::fmt::Debug for PageReconstructError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::result::Result<(), std::fmt::Error> {
+        match self {
+            PageReconstructError::Other(err) => err.fmt(f),
+            PageReconstructError::WalRedo(err) => err.fmt(f),
+        }
+    }
+}
+
 /// Helper function for get_reconstruct_data() to add the path of layers traversed
 /// to an error, as anyhow context information.
 fn layer_traversal_error(
     msg: String,
     path: Vec<(ValueReconstructResult, Lsn, TraversalId)>,
-) -> anyhow::Result<()> {
+) -> Result<(), PageReconstructError> {
     // We want the original 'msg' to be the outermost context. The outermost context
     // is the most high-level information, which also gets propagated to the client.
     let mut msg_iter = path
@@ -2849,7 +2868,8 @@ fn layer_traversal_error(
     let err = anyhow!(msg_iter.next().unwrap());
 
     // Append all subsequent traversals, and the error message 'msg', as contexts.
-    Err(msg_iter.fold(err, |err, msg| err.context(msg)))
+    let msg = msg_iter.fold(err, |err, msg| err.context(msg));
+    Err(PageReconstructError::Other(msg))
 }
 
 /// Various functions to mutate the timeline.

From 24609873287823a7d208e14cd227041c3c0bcc12 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Mon, 5 Dec 2022 09:38:41 -0500
Subject: [PATCH 052/132] no-op: pgdatadir_mapping: qualified use of
 anyhow::Result

---
 pageserver/src/pgdatadir_mapping.rs | 107 ++++++++++++++++------------
 1 file changed, 63 insertions(+), 44 deletions(-)

diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs
index 797ee9f436..7b4b05ed18 100644
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -10,7 +10,7 @@ use crate::keyspace::{KeySpace, KeySpaceAccum};
 use crate::repository::*;
 use crate::tenant::Timeline;
 use crate::walrecord::NeonWalRecord;
-use anyhow::{bail, ensure, Context, Result};
+use anyhow::{self, bail, ensure, Context};
 use bytes::{Buf, Bytes};
 use pageserver_api::reltag::{RelTag, SlruKind};
 use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
@@ -97,7 +97,7 @@ impl Timeline {
         blknum: BlockNumber,
         lsn: Lsn,
         latest: bool,
-    ) -> Result<Bytes> {
+    ) -> anyhow::Result<Bytes> {
         ensure!(tag.relnode != 0, "invalid relnode");
 
         let nblocks = self.get_rel_size(tag, lsn, latest)?;
@@ -114,7 +114,13 @@ impl Timeline {
     }
 
     // Get size of a database in blocks
-    pub fn get_db_size(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn, latest: bool) -> Result<usize> {
+    pub fn get_db_size(
+        &self,
+        spcnode: Oid,
+        dbnode: Oid,
+        lsn: Lsn,
+        latest: bool,
+    ) -> anyhow::Result<usize> {
         let mut total_blocks = 0;
 
         let rels = self.list_rels(spcnode, dbnode, lsn)?;
@@ -127,7 +133,7 @@ impl Timeline {
     }
 
     /// Get size of a relation file
-    pub fn get_rel_size(&self, tag: RelTag, lsn: Lsn, latest: bool) -> Result<BlockNumber> {
+    pub fn get_rel_size(&self, tag: RelTag, lsn: Lsn, latest: bool) -> anyhow::Result<BlockNumber> {
         ensure!(tag.relnode != 0, "invalid relnode");
 
         if let Some(nblocks) = self.get_cached_rel_size(&tag, lsn) {
@@ -162,7 +168,7 @@ impl Timeline {
     }
 
     /// Does relation exist?
-    pub fn get_rel_exists(&self, tag: RelTag, lsn: Lsn, _latest: bool) -> Result<bool> {
+    pub fn get_rel_exists(&self, tag: RelTag, lsn: Lsn, _latest: bool) -> anyhow::Result<bool> {
         ensure!(tag.relnode != 0, "invalid relnode");
 
         // first try to lookup relation in cache
@@ -180,7 +186,12 @@ impl Timeline {
     }
 
     /// Get a list of all existing relations in given tablespace and database.
-    pub fn list_rels(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> Result<HashSet<RelTag>> {
+    pub fn list_rels(
+        &self,
+        spcnode: Oid,
+        dbnode: Oid,
+        lsn: Lsn,
+    ) -> anyhow::Result<HashSet<RelTag>> {
         // fetch directory listing
         let key = rel_dir_to_key(spcnode, dbnode);
         let buf = self.get(key, lsn)?;
@@ -204,7 +215,7 @@ impl Timeline {
         segno: u32,
         blknum: BlockNumber,
         lsn: Lsn,
-    ) -> Result<Bytes> {
+    ) -> anyhow::Result<Bytes> {
         let key = slru_block_to_key(kind, segno, blknum);
         self.get(key, lsn)
     }
@@ -215,14 +226,19 @@ impl Timeline {
         kind: SlruKind,
         segno: u32,
         lsn: Lsn,
-    ) -> Result<BlockNumber> {
+    ) -> anyhow::Result<BlockNumber> {
         let key = slru_segment_size_to_key(kind, segno);
         let mut buf = self.get(key, lsn)?;
         Ok(buf.get_u32_le())
     }
 
     /// Get size of an SLRU segment
-    pub fn get_slru_segment_exists(&self, kind: SlruKind, segno: u32, lsn: Lsn) -> Result<bool> {
+    pub fn get_slru_segment_exists(
+        &self,
+        kind: SlruKind,
+        segno: u32,
+        lsn: Lsn,
+    ) -> anyhow::Result<bool> {
         // fetch directory listing
         let key = slru_dir_to_key(kind);
         let buf = self.get(key, lsn)?;
@@ -239,7 +255,10 @@ impl Timeline {
     /// so it's not well defined which LSN you get if there were multiple commits
     /// "in flight" at that point in time.
     ///
-    pub fn find_lsn_for_timestamp(&self, search_timestamp: TimestampTz) -> Result<LsnForTimestamp> {
+    pub fn find_lsn_for_timestamp(
+        &self,
+        search_timestamp: TimestampTz,
+    ) -> anyhow::Result<LsnForTimestamp> {
         let gc_cutoff_lsn_guard = self.get_latest_gc_cutoff_lsn();
         let min_lsn = *gc_cutoff_lsn_guard;
         let max_lsn = self.get_last_record_lsn();
@@ -308,7 +327,7 @@ impl Timeline {
         probe_lsn: Lsn,
         found_smaller: &mut bool,
         found_larger: &mut bool,
-    ) -> Result<bool> {
+    ) -> anyhow::Result<bool> {
         for segno in self.list_slru_segments(SlruKind::Clog, probe_lsn)? {
             let nblocks = self.get_slru_segment_size(SlruKind::Clog, segno, probe_lsn)?;
             for blknum in (0..nblocks).rev() {
@@ -333,7 +352,7 @@ impl Timeline {
     }
 
     /// Get a list of SLRU segments
-    pub fn list_slru_segments(&self, kind: SlruKind, lsn: Lsn) -> Result<HashSet<u32>> {
+    pub fn list_slru_segments(&self, kind: SlruKind, lsn: Lsn) -> anyhow::Result<HashSet<u32>> {
         // fetch directory entry
         let key = slru_dir_to_key(kind);
 
@@ -343,14 +362,14 @@ impl Timeline {
         Ok(dir.segments)
     }
 
-    pub fn get_relmap_file(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> Result<Bytes> {
+    pub fn get_relmap_file(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> anyhow::Result<Bytes> {
         let key = relmap_file_key(spcnode, dbnode);
 
         let buf = self.get(key, lsn)?;
         Ok(buf)
     }
 
-    pub fn list_dbdirs(&self, lsn: Lsn) -> Result<HashMap<(Oid, Oid), bool>> {
+    pub fn list_dbdirs(&self, lsn: Lsn) -> anyhow::Result<HashMap<(Oid, Oid), bool>> {
         // fetch directory entry
         let buf = self.get(DBDIR_KEY, lsn)?;
         let dir = DbDirectory::des(&buf)?;
@@ -358,13 +377,13 @@ impl Timeline {
         Ok(dir.dbdirs)
     }
 
-    pub fn get_twophase_file(&self, xid: TransactionId, lsn: Lsn) -> Result<Bytes> {
+    pub fn get_twophase_file(&self, xid: TransactionId, lsn: Lsn) -> anyhow::Result<Bytes> {
         let key = twophase_file_key(xid);
         let buf = self.get(key, lsn)?;
         Ok(buf)
     }
 
-    pub fn list_twophase_files(&self, lsn: Lsn) -> Result<HashSet<TransactionId>> {
+    pub fn list_twophase_files(&self, lsn: Lsn) -> anyhow::Result<HashSet<TransactionId>> {
         // fetch directory entry
         let buf = self.get(TWOPHASEDIR_KEY, lsn)?;
         let dir = TwoPhaseDirectory::des(&buf)?;
@@ -372,11 +391,11 @@ impl Timeline {
         Ok(dir.xids)
     }
 
-    pub fn get_control_file(&self, lsn: Lsn) -> Result<Bytes> {
+    pub fn get_control_file(&self, lsn: Lsn) -> anyhow::Result<Bytes> {
         self.get(CONTROLFILE_KEY, lsn)
     }
 
-    pub fn get_checkpoint(&self, lsn: Lsn) -> Result<Bytes> {
+    pub fn get_checkpoint(&self, lsn: Lsn) -> anyhow::Result<Bytes> {
         self.get(CHECKPOINT_KEY, lsn)
     }
 
@@ -414,7 +433,7 @@ impl Timeline {
     /// Get a KeySpace that covers all the Keys that are in use at the given LSN.
     /// Anything that's not listed maybe removed from the underlying storage (from
     /// that LSN forwards).
-    pub fn collect_keyspace(&self, lsn: Lsn) -> Result<KeySpace> {
+    pub fn collect_keyspace(&self, lsn: Lsn) -> anyhow::Result<KeySpace> {
         // Iterate through key ranges, greedily packing them into partitions
         let mut result = KeySpaceAccum::new();
 
@@ -553,7 +572,7 @@ impl<'a> DatadirModification<'a> {
     ///
     /// This inserts the directory metadata entries that are assumed to
     /// always exist.
-    pub fn init_empty(&mut self) -> Result<()> {
+    pub fn init_empty(&mut self) -> anyhow::Result<()> {
         let buf = DbDirectory::ser(&DbDirectory {
             dbdirs: HashMap::new(),
         })?;
@@ -586,7 +605,7 @@ impl<'a> DatadirModification<'a> {
         rel: RelTag,
         blknum: BlockNumber,
         rec: NeonWalRecord,
-    ) -> Result<()> {
+    ) -> anyhow::Result<()> {
         ensure!(rel.relnode != 0, "invalid relnode");
         self.put(rel_block_to_key(rel, blknum), Value::WalRecord(rec));
         Ok(())
@@ -599,7 +618,7 @@ impl<'a> DatadirModification<'a> {
         segno: u32,
         blknum: BlockNumber,
         rec: NeonWalRecord,
-    ) -> Result<()> {
+    ) -> anyhow::Result<()> {
         self.put(
             slru_block_to_key(kind, segno, blknum),
             Value::WalRecord(rec),
@@ -613,7 +632,7 @@ impl<'a> DatadirModification<'a> {
         rel: RelTag,
         blknum: BlockNumber,
         img: Bytes,
-    ) -> Result<()> {
+    ) -> anyhow::Result<()> {
         ensure!(rel.relnode != 0, "invalid relnode");
         self.put(rel_block_to_key(rel, blknum), Value::Image(img));
         Ok(())
@@ -625,13 +644,13 @@ impl<'a> DatadirModification<'a> {
         segno: u32,
         blknum: BlockNumber,
         img: Bytes,
-    ) -> Result<()> {
+    ) -> anyhow::Result<()> {
         self.put(slru_block_to_key(kind, segno, blknum), Value::Image(img));
         Ok(())
     }
 
     /// Store a relmapper file (pg_filenode.map) in the repository
-    pub fn put_relmap_file(&mut self, spcnode: Oid, dbnode: Oid, img: Bytes) -> Result<()> {
+    pub fn put_relmap_file(&mut self, spcnode: Oid, dbnode: Oid, img: Bytes) -> anyhow::Result<()> {
         // Add it to the directory (if it doesn't exist already)
         let buf = self.get(DBDIR_KEY)?;
         let mut dbdir = DbDirectory::des(&buf)?;
@@ -659,7 +678,7 @@ impl<'a> DatadirModification<'a> {
         Ok(())
     }
 
-    pub fn put_twophase_file(&mut self, xid: TransactionId, img: Bytes) -> Result<()> {
+    pub fn put_twophase_file(&mut self, xid: TransactionId, img: Bytes) -> anyhow::Result<()> {
         // Add it to the directory entry
         let buf = self.get(TWOPHASEDIR_KEY)?;
         let mut dir = TwoPhaseDirectory::des(&buf)?;
@@ -675,17 +694,17 @@ impl<'a> DatadirModification<'a> {
         Ok(())
     }
 
-    pub fn put_control_file(&mut self, img: Bytes) -> Result<()> {
+    pub fn put_control_file(&mut self, img: Bytes) -> anyhow::Result<()> {
         self.put(CONTROLFILE_KEY, Value::Image(img));
         Ok(())
     }
 
-    pub fn put_checkpoint(&mut self, img: Bytes) -> Result<()> {
+    pub fn put_checkpoint(&mut self, img: Bytes) -> anyhow::Result<()> {
         self.put(CHECKPOINT_KEY, Value::Image(img));
         Ok(())
     }
 
-    pub fn drop_dbdir(&mut self, spcnode: Oid, dbnode: Oid) -> Result<()> {
+    pub fn drop_dbdir(&mut self, spcnode: Oid, dbnode: Oid) -> anyhow::Result<()> {
         let req_lsn = self.tline.get_last_record_lsn();
 
         let total_blocks = self.tline.get_db_size(spcnode, dbnode, req_lsn, true)?;
@@ -714,7 +733,7 @@ impl<'a> DatadirModification<'a> {
     /// Create a relation fork.
     ///
     /// 'nblocks' is the initial size.
-    pub fn put_rel_creation(&mut self, rel: RelTag, nblocks: BlockNumber) -> Result<()> {
+    pub fn put_rel_creation(&mut self, rel: RelTag, nblocks: BlockNumber) -> anyhow::Result<()> {
         ensure!(rel.relnode != 0, "invalid relnode");
         // It's possible that this is the first rel for this db in this
         // tablespace.  Create the reldir entry for it if so.
@@ -758,7 +777,7 @@ impl<'a> DatadirModification<'a> {
     }
 
     /// Truncate relation
-    pub fn put_rel_truncation(&mut self, rel: RelTag, nblocks: BlockNumber) -> Result<()> {
+    pub fn put_rel_truncation(&mut self, rel: RelTag, nblocks: BlockNumber) -> anyhow::Result<()> {
         ensure!(rel.relnode != 0, "invalid relnode");
         let last_lsn = self.tline.get_last_record_lsn();
         if self.tline.get_rel_exists(rel, last_lsn, true)? {
@@ -784,7 +803,7 @@ impl<'a> DatadirModification<'a> {
 
     /// Extend relation
     /// If new size is smaller, do nothing.
-    pub fn put_rel_extend(&mut self, rel: RelTag, nblocks: BlockNumber) -> Result<()> {
+    pub fn put_rel_extend(&mut self, rel: RelTag, nblocks: BlockNumber) -> anyhow::Result<()> {
         ensure!(rel.relnode != 0, "invalid relnode");
 
         // Put size
@@ -805,7 +824,7 @@ impl<'a> DatadirModification<'a> {
     }
 
     /// Drop a relation.
-    pub fn put_rel_drop(&mut self, rel: RelTag) -> Result<()> {
+    pub fn put_rel_drop(&mut self, rel: RelTag) -> anyhow::Result<()> {
         ensure!(rel.relnode != 0, "invalid relnode");
 
         // Remove it from the directory entry
@@ -838,7 +857,7 @@ impl<'a> DatadirModification<'a> {
         kind: SlruKind,
         segno: u32,
         nblocks: BlockNumber,
-    ) -> Result<()> {
+    ) -> anyhow::Result<()> {
         // Add it to the directory entry
         let dir_key = slru_dir_to_key(kind);
         let buf = self.get(dir_key)?;
@@ -868,7 +887,7 @@ impl<'a> DatadirModification<'a> {
         kind: SlruKind,
         segno: u32,
         nblocks: BlockNumber,
-    ) -> Result<()> {
+    ) -> anyhow::Result<()> {
         // Put size
         let size_key = slru_segment_size_to_key(kind, segno);
         let buf = nblocks.to_le_bytes();
@@ -877,7 +896,7 @@ impl<'a> DatadirModification<'a> {
     }
 
     /// This method is used for marking truncated SLRU files
-    pub fn drop_slru_segment(&mut self, kind: SlruKind, segno: u32) -> Result<()> {
+    pub fn drop_slru_segment(&mut self, kind: SlruKind, segno: u32) -> anyhow::Result<()> {
         // Remove it from the directory entry
         let dir_key = slru_dir_to_key(kind);
         let buf = self.get(dir_key)?;
@@ -898,13 +917,13 @@ impl<'a> DatadirModification<'a> {
     }
 
     /// Drop a relmapper file (pg_filenode.map)
-    pub fn drop_relmap_file(&mut self, _spcnode: Oid, _dbnode: Oid) -> Result<()> {
+    pub fn drop_relmap_file(&mut self, _spcnode: Oid, _dbnode: Oid) -> anyhow::Result<()> {
         // TODO
         Ok(())
     }
 
     /// This method is used for marking truncated SLRU files
-    pub fn drop_twophase_file(&mut self, xid: TransactionId) -> Result<()> {
+    pub fn drop_twophase_file(&mut self, xid: TransactionId) -> anyhow::Result<()> {
         // Remove it from the directory entry
         let buf = self.get(TWOPHASEDIR_KEY)?;
         let mut dir = TwoPhaseDirectory::des(&buf)?;
@@ -941,7 +960,7 @@ impl<'a> DatadirModification<'a> {
     /// retains all the metadata, but data pages are flushed. That's again OK
     /// for bulk import, where you are just loading data pages and won't try to
     /// modify the same pages twice.
-    pub fn flush(&mut self) -> Result<()> {
+    pub fn flush(&mut self) -> anyhow::Result<()> {
         // Unless we have accumulated a decent amount of changes, it's not worth it
         // to scan through the pending_updates list.
         let pending_nblocks = self.pending_nblocks;
@@ -952,7 +971,7 @@ impl<'a> DatadirModification<'a> {
         let writer = self.tline.writer();
 
         // Flush relation and  SLRU data blocks, keep metadata.
-        let mut result: Result<()> = Ok(());
+        let mut result: anyhow::Result<()> = Ok(());
         self.pending_updates.retain(|&key, value| {
             if result.is_ok() && (is_rel_block_key(key) || is_slru_block_key(key)) {
                 result = writer.put(key, self.lsn, value);
@@ -1000,7 +1019,7 @@ impl<'a> DatadirModification<'a> {
 
     // Internal helper functions to batch the modifications
 
-    fn get(&self, key: Key) -> Result<Bytes> {
+    fn get(&self, key: Key) -> anyhow::Result<Bytes> {
         // Have we already updated the same key? Read the pending updated
         // version in that case.
         //
@@ -1370,7 +1389,7 @@ const CHECKPOINT_KEY: Key = Key {
 // Reverse mappings for a few Keys.
 // These are needed by WAL redo manager.
 
-pub fn key_to_rel_block(key: Key) -> Result<(RelTag, BlockNumber)> {
+pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> {
     Ok(match key.field1 {
         0x00 => (
             RelTag {
@@ -1400,7 +1419,7 @@ pub fn is_rel_vm_block_key(key: Key) -> bool {
         && key.field6 != 0xffffffff
 }
 
-pub fn key_to_slru_block(key: Key) -> Result<(SlruKind, u32, BlockNumber)> {
+pub fn key_to_slru_block(key: Key) -> anyhow::Result<(SlruKind, u32, BlockNumber)> {
     Ok(match key.field1 {
         0x01 => {
             let kind = match key.field2 {
@@ -1429,7 +1448,7 @@ pub fn create_test_timeline(
     tenant: &crate::tenant::Tenant,
     timeline_id: utils::id::TimelineId,
     pg_version: u32,
-) -> Result<std::sync::Arc<Timeline>> {
+) -> anyhow::Result<std::sync::Arc<Timeline>> {
     let tline = tenant
         .create_empty_timeline(timeline_id, Lsn(8), pg_version)?
         .initialize()?;

From 1da03141a74a649f35dc7f9992e050bb9f4d9db3 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Mon, 12 Dec 2022 15:16:08 -0500
Subject: [PATCH 053/132] refactor: make Layer::local_path return
 Option<PathBuf> instead of PathBuf

This is in preparation for RemoteLayer, which by definition doesn't have
a local path.
---
 pageserver/src/tenant/delta_layer.rs   |  4 +-
 pageserver/src/tenant/image_layer.rs   |  4 +-
 pageserver/src/tenant/storage_layer.rs |  3 +-
 pageserver/src/tenant/timeline.rs      | 54 +++++++++++++++++---------
 4 files changed, 41 insertions(+), 24 deletions(-)

diff --git a/pageserver/src/tenant/delta_layer.rs b/pageserver/src/tenant/delta_layer.rs
index a252abf2a0..e1006dfe00 100644
--- a/pageserver/src/tenant/delta_layer.rs
+++ b/pageserver/src/tenant/delta_layer.rs
@@ -387,8 +387,8 @@ impl PersistentLayer for DeltaLayer {
         self.layer_name().into()
     }
 
-    fn local_path(&self) -> PathBuf {
-        self.path()
+    fn local_path(&self) -> Option<PathBuf> {
+        Some(self.path())
     }
 
     fn iter(&self) -> Result<LayerIter<'_>> {
diff --git a/pageserver/src/tenant/image_layer.rs b/pageserver/src/tenant/image_layer.rs
index c907d21af5..b1dbbfb683 100644
--- a/pageserver/src/tenant/image_layer.rs
+++ b/pageserver/src/tenant/image_layer.rs
@@ -208,8 +208,8 @@ impl PersistentLayer for ImageLayer {
         self.layer_name().into()
     }
 
-    fn local_path(&self) -> PathBuf {
-        self.path()
+    fn local_path(&self) -> Option<PathBuf> {
+        Some(self.path())
     }
 
     fn get_tenant_id(&self) -> TenantId {
diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs
index ba0311574d..79eaa96591 100644
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -147,7 +147,8 @@ pub trait PersistentLayer: Layer {
     fn filename(&self) -> LayerFileName;
 
     // Path to the layer file in the local filesystem.
-    fn local_path(&self) -> PathBuf;
+    // `None` for `RemoteLayer`.
+    fn local_path(&self) -> Option<PathBuf>;
 
     /// Iterate through all keys and values stored in the layer
     fn iter(&self) -> Result<LayerIter<'_>>;
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 0b31c9bdc4..59d3486644 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -1031,11 +1031,13 @@ impl Timeline {
                 .unwrap_or(LayerFileMetadata::MISSING);
 
             // Is the local layer's size different from the size stored in the
-            // remote index file? If so, rename_to_backup those files & remove
-            // local_layer form the layer map.
-            // We'll download a fresh copy of the layer file below.
+            // remote index file?
+            // If so, rename_to_backup those files & replace their local layer with
+            // a RemoteLayer in the layer map so that we re-download them on-demand.
             if let Some(local_layer) = local_layer {
-                let local_layer_path = local_layer.local_path();
+                let local_layer_path = local_layer
+                    .local_path()
+                    .expect("caller must ensure that local_layers only contains local layers");
                 ensure!(
                     local_layer_path.exists(),
                     "every layer from local_layers must exist on disk: {}",
@@ -1210,7 +1212,10 @@ impl Timeline {
 
         // Are there local files that don't exist remotely? Schedule uploads for them
         for (layer_name, layer) in &local_only_layers {
-            let layer_path = layer.local_path();
+            // XXX solve this in the type system
+            let layer_path = layer
+                .local_path()
+                .expect("local_only_layers only contains local layers");
             let layer_size = layer_path
                 .metadata()
                 .with_context(|| format!("failed to get file {layer_path:?} metadata"))?
@@ -1450,12 +1455,21 @@ trait TraversalLayerExt {
 
 impl TraversalLayerExt for Arc<dyn PersistentLayer> {
     fn traversal_id(&self) -> String {
-        debug_assert!(
-            self.local_path().to_str().unwrap()
-                .contains(&format!("{}", self.get_timeline_id())),
-            "need timeline ID to uniquely identify the layer when tranversal crosses ancestor boundary",
-        );
-        format!("{}", self.local_path().display())
+        match self.local_path() {
+            Some(local_path) => {
+                debug_assert!(local_path.to_str().unwrap().contains(&format!("{}", self.get_timeline_id())),
+                    "need timeline ID to uniquely identify the layer when tranversal crosses ancestor boundary",
+                );
+                format!("{}", local_path.display())
+            }
+            None => {
+                format!(
+                    "remote {}/{}",
+                    self.get_timeline_id(),
+                    self.filename().file_name()
+                )
+            }
+        }
     }
 }
 
@@ -2440,10 +2454,11 @@ impl Timeline {
         // delete the old ones
         let mut layer_names_to_delete = Vec::with_capacity(deltas_to_compact.len());
         for l in deltas_to_compact {
-            let path = l.local_path();
-            self.metrics
-                .current_physical_size_gauge
-                .sub(path.metadata()?.len());
+            if let Some(path) = l.local_path() {
+                self.metrics
+                    .current_physical_size_gauge
+                    .sub(path.metadata()?.len());
+            }
             layer_names_to_delete.push(l.filename());
             l.delete()?;
             layers.remove_historic(l);
@@ -2726,10 +2741,11 @@ impl Timeline {
             // while iterating it. BTreeMap::retain() would be another option)
             let mut layer_names_to_delete = Vec::with_capacity(layers_to_remove.len());
             for doomed_layer in layers_to_remove {
-                let path = doomed_layer.local_path();
-                self.metrics
-                    .current_physical_size_gauge
-                    .sub(path.metadata()?.len());
+                if let Some(path) = doomed_layer.local_path() {
+                    self.metrics
+                        .current_physical_size_gauge
+                        .sub(path.metadata()?.len());
+                }
                 layer_names_to_delete.push(doomed_layer.filename());
                 doomed_layer.delete()?;
                 layers.remove_historic(doomed_layer);

From 31543c4acc330060712036ae22651995c2b29a28 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Fri, 9 Dec 2022 11:20:04 -0500
Subject: [PATCH 054/132] refactor: make update_gc_info and transitive callers
 async

This is so that in the next commit, we can add a retry_get to
find_lsn_for_timestamp.
---
 pageserver/src/tenant.rs          | 134 +++++++++++++++++++-----------
 pageserver/src/tenant/size.rs     |   1 +
 pageserver/src/tenant/timeline.rs |   6 +-
 3 files changed, 88 insertions(+), 53 deletions(-)

diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index ce05d8f085..799a34fb3b 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -129,7 +129,7 @@ pub struct Tenant {
     // may block for a long time `get_timeline`, `get_timelines_state`,... and other operations
     // with timelines, which in turn may cause dropping replication connection, expiration of wait_for_lsn
     // timeout...
-    gc_cs: Mutex<()>,
+    gc_cs: tokio::sync::Mutex<()>,
     walredo_mgr: Arc<dyn WalRedoManager + Send + Sync>,
 
     // provides access to timeline data sitting in the remote storage
@@ -1158,7 +1158,8 @@ impl Tenant {
                     ancestor_timeline.wait_lsn(*lsn).await?;
                 }
 
-                self.branch_timeline(ancestor_timeline_id, new_timeline_id, ancestor_start_lsn)?
+                self.branch_timeline(ancestor_timeline_id, new_timeline_id, ancestor_start_lsn)
+                    .await?
             }
             None => self.bootstrap_timeline(new_timeline_id, pg_version).await?,
         };
@@ -1683,7 +1684,7 @@ impl Tenant {
             conf,
             tenant_conf: Arc::new(RwLock::new(tenant_conf)),
             timelines: Mutex::new(HashMap::new()),
-            gc_cs: Mutex::new(()),
+            gc_cs: tokio::sync::Mutex::new(()),
             walredo_mgr,
             remote_storage,
             state,
@@ -1834,7 +1835,9 @@ impl Tenant {
         let mut totals: GcResult = Default::default();
         let now = Instant::now();
 
-        let gc_timelines = self.refresh_gc_info_internal(target_timeline_id, horizon, pitr)?;
+        let gc_timelines = self
+            .refresh_gc_info_internal(target_timeline_id, horizon, pitr)
+            .await?;
 
         utils::failpoint_sleep_millis_async!("gc_iteration_internal_after_getting_gc_timelines");
 
@@ -1869,7 +1872,7 @@ impl Tenant {
     /// [`Tenant::get_gc_horizon`].
     ///
     /// This is usually executed as part of periodic gc, but can now be triggered more often.
-    pub fn refresh_gc_info(&self) -> anyhow::Result<Vec<Arc<Timeline>>> {
+    pub async fn refresh_gc_info(&self) -> anyhow::Result<Vec<Arc<Timeline>>> {
         // since this method can now be called at different rates than the configured gc loop, it
         // might be that these configuration values get applied faster than what it was previously,
         // since these were only read from the gc task.
@@ -1880,54 +1883,60 @@ impl Tenant {
         let target_timeline_id = None;
 
         self.refresh_gc_info_internal(target_timeline_id, horizon, pitr)
+            .await
     }
 
-    fn refresh_gc_info_internal(
+    async fn refresh_gc_info_internal(
         &self,
         target_timeline_id: Option<TimelineId>,
         horizon: u64,
         pitr: Duration,
     ) -> anyhow::Result<Vec<Arc<Timeline>>> {
         // grab mutex to prevent new timelines from being created here.
-        let gc_cs = self.gc_cs.lock().unwrap();
-
-        let timelines = self.timelines.lock().unwrap();
+        let gc_cs = self.gc_cs.lock().await;
 
         // Scan all timelines. For each timeline, remember the timeline ID and
         // the branch point where it was created.
-        let mut all_branchpoints: BTreeSet<(TimelineId, Lsn)> = BTreeSet::new();
-        let timeline_ids = {
-            if let Some(target_timeline_id) = target_timeline_id.as_ref() {
-                if timelines.get(target_timeline_id).is_none() {
-                    bail!("gc target timeline does not exist")
-                }
-            };
+        let (all_branchpoints, timeline_ids): (BTreeSet<(TimelineId, Lsn)>, _) = {
+            let timelines = self.timelines.lock().unwrap();
+            let mut all_branchpoints = BTreeSet::new();
+            let timeline_ids = {
+                if let Some(target_timeline_id) = target_timeline_id.as_ref() {
+                    if timelines.get(target_timeline_id).is_none() {
+                        bail!("gc target timeline does not exist")
+                    }
+                };
 
-            timelines
-                .iter()
-                .map(|(timeline_id, timeline_entry)| {
-                    if let Some(ancestor_timeline_id) = &timeline_entry.get_ancestor_timeline_id() {
-                        // If target_timeline is specified, we only need to know branchpoints of its children
-                        if let Some(timeline_id) = target_timeline_id {
-                            if ancestor_timeline_id == &timeline_id {
+                timelines
+                    .iter()
+                    .map(|(timeline_id, timeline_entry)| {
+                        if let Some(ancestor_timeline_id) =
+                            &timeline_entry.get_ancestor_timeline_id()
+                        {
+                            // If target_timeline is specified, we only need to know branchpoints of its children
+                            if let Some(timeline_id) = target_timeline_id {
+                                if ancestor_timeline_id == &timeline_id {
+                                    all_branchpoints.insert((
+                                        *ancestor_timeline_id,
+                                        timeline_entry.get_ancestor_lsn(),
+                                    ));
+                                }
+                            }
+                            // Collect branchpoints for all timelines
+                            else {
                                 all_branchpoints.insert((
                                     *ancestor_timeline_id,
                                     timeline_entry.get_ancestor_lsn(),
                                 ));
                             }
                         }
-                        // Collect branchpoints for all timelines
-                        else {
-                            all_branchpoints
-                                .insert((*ancestor_timeline_id, timeline_entry.get_ancestor_lsn()));
-                        }
-                    }
 
-                    *timeline_id
-                })
-                .collect::<Vec<_>>()
+                        *timeline_id
+                    })
+                    .collect::<Vec<_>>()
+            };
+            (all_branchpoints, timeline_ids)
         };
-        drop(timelines);
 
         // Ok, we now know all the branch points.
         // Update the GC information for each timeline.
@@ -1953,7 +1962,7 @@ impl Tenant {
                     ))
                     .map(|&x| x.1)
                     .collect();
-                timeline.update_gc_info(branchpoints, cutoff, pitr)?;
+                timeline.update_gc_info(branchpoints, cutoff, pitr).await?;
 
                 gc_timelines.push(timeline);
             }
@@ -1963,7 +1972,7 @@ impl Tenant {
     }
 
     /// Branch an existing timeline
-    fn branch_timeline(
+    async fn branch_timeline(
         &self,
         src: TimelineId,
         dst: TimelineId,
@@ -1972,10 +1981,11 @@ impl Tenant {
         // We need to hold this lock to prevent GC from starting at the same time. GC scans the directory to learn
         // about timelines, so otherwise a race condition is possible, where we create new timeline and GC
         // concurrently removes data that is needed by the new timeline.
-        let _gc_cs = self.gc_cs.lock().unwrap();
-        let timelines = self.timelines.lock().unwrap();
-        let timeline_uninit_mark = self.create_timeline_uninit_mark(dst, &timelines)?;
-        drop(timelines);
+        let _gc_cs = self.gc_cs.lock().await;
+        let timeline_uninit_mark = {
+            let timelines = self.timelines.lock().unwrap();
+            self.create_timeline_uninit_mark(dst, &timelines)?
+        };
 
         // In order for the branch creation task to not wait for GC/compaction,
         // we need to make sure that the starting LSN of the child branch is not out of scope midway by
@@ -2837,7 +2847,9 @@ mod tests {
         //assert_current_logical_size(&tline, Lsn(0x40));
 
         // Branch the history, modify relation differently on the new timeline
-        tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x30)))?;
+        tenant
+            .branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x30)))
+            .await?;
         let newtline = tenant
             .get_timeline(NEW_TIMELINE_ID, true)
             .expect("Should have a local timeline");
@@ -2925,7 +2937,10 @@ mod tests {
             .await?;
 
         // try to branch at lsn 25, should fail because we already garbage collected the data
-        match tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25))) {
+        match tenant
+            .branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25)))
+            .await
+        {
             Ok(_) => panic!("branching should have failed"),
             Err(err) => {
                 assert!(err.to_string().contains("invalid branch start lsn"));
@@ -2950,7 +2965,10 @@ mod tests {
             .create_empty_timeline(TIMELINE_ID, Lsn(0x50), DEFAULT_PG_VERSION)?
             .initialize()?;
         // try to branch at lsn 0x25, should fail because initdb lsn is 0x50
-        match tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25))) {
+        match tenant
+            .branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25)))
+            .await
+        {
             Ok(_) => panic!("branching should have failed"),
             Err(err) => {
                 assert!(&err.to_string().contains("invalid branch start lsn"));
@@ -2998,7 +3016,9 @@ mod tests {
             .initialize()?;
         make_some_layers(tline.as_ref(), Lsn(0x20)).await?;
 
-        tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?;
+        tenant
+            .branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))
+            .await?;
         let newtline = tenant
             .get_timeline(NEW_TIMELINE_ID, true)
             .expect("Should have a local timeline");
@@ -3020,7 +3040,9 @@ mod tests {
             .initialize()?;
         make_some_layers(tline.as_ref(), Lsn(0x20)).await?;
 
-        tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?;
+        tenant
+            .branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))
+            .await?;
         let newtline = tenant
             .get_timeline(NEW_TIMELINE_ID, true)
             .expect("Should have a local timeline");
@@ -3074,7 +3096,9 @@ mod tests {
 
             make_some_layers(tline.as_ref(), Lsn(0x20)).await?;
 
-            tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?;
+            tenant
+                .branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))
+                .await?;
 
             let newtline = tenant
                 .get_timeline(NEW_TIMELINE_ID, true)
@@ -3225,7 +3249,9 @@ mod tests {
 
             let cutoff = tline.get_last_record_lsn();
 
-            tline.update_gc_info(Vec::new(), cutoff, Duration::ZERO)?;
+            tline
+                .update_gc_info(Vec::new(), cutoff, Duration::ZERO)
+                .await?;
             tline.freeze_and_flush().await?;
             tline.compact().await?;
             tline.gc().await?;
@@ -3296,7 +3322,9 @@ mod tests {
 
             // Perform a cycle of flush, compact, and GC
             let cutoff = tline.get_last_record_lsn();
-            tline.update_gc_info(Vec::new(), cutoff, Duration::ZERO)?;
+            tline
+                .update_gc_info(Vec::new(), cutoff, Duration::ZERO)
+                .await?;
             tline.freeze_and_flush().await?;
             tline.compact().await?;
             tline.gc().await?;
@@ -3345,7 +3373,9 @@ mod tests {
         let mut tline_id = TIMELINE_ID;
         for _ in 0..50 {
             let new_tline_id = TimelineId::generate();
-            tenant.branch_timeline(tline_id, new_tline_id, Some(lsn))?;
+            tenant
+                .branch_timeline(tline_id, new_tline_id, Some(lsn))
+                .await?;
             tline = tenant
                 .get_timeline(new_tline_id, true)
                 .expect("Should have the branched timeline");
@@ -3378,7 +3408,9 @@ mod tests {
 
             // Perform a cycle of flush, compact, and GC
             let cutoff = tline.get_last_record_lsn();
-            tline.update_gc_info(Vec::new(), cutoff, Duration::ZERO)?;
+            tline
+                .update_gc_info(Vec::new(), cutoff, Duration::ZERO)
+                .await?;
             tline.freeze_and_flush().await?;
             tline.compact().await?;
             tline.gc().await?;
@@ -3409,7 +3441,9 @@ mod tests {
         #[allow(clippy::needless_range_loop)]
         for idx in 0..NUM_TLINES {
             let new_tline_id = TimelineId::generate();
-            tenant.branch_timeline(tline_id, new_tline_id, Some(lsn))?;
+            tenant
+                .branch_timeline(tline_id, new_tline_id, Some(lsn))
+                .await?;
             tline = tenant
                 .get_timeline(new_tline_id, true)
                 .expect("Should have the branched timeline");
diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs
index 597461ce29..5ce0837562 100644
--- a/pageserver/src/tenant/size.rs
+++ b/pageserver/src/tenant/size.rs
@@ -70,6 +70,7 @@ pub(super) async fn gather_inputs(
 
     let timelines = tenant
         .refresh_gc_info()
+        .await
         .context("Failed to refresh gc_info before gathering inputs")?;
 
     if timelines.is_empty() {
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 59d3486644..61d619a17b 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -160,7 +160,7 @@ pub struct Timeline {
 
     // List of child timelines and their branch points. This is needed to avoid
     // garbage collecting data that is still needed by the child timelines.
-    pub gc_info: RwLock<GcInfo>,
+    pub gc_info: std::sync::RwLock<GcInfo>,
 
     // It may change across major versions so for simplicity
     // keep it after running initdb for a timeline.
@@ -794,7 +794,7 @@ impl Timeline {
                 write_lock: Mutex::new(()),
                 layer_removal_cs: Default::default(),
 
-                gc_info: RwLock::new(GcInfo {
+                gc_info: std::sync::RwLock::new(GcInfo {
                     retain_lsns: Vec::new(),
                     horizon_cutoff: Lsn(0),
                     pitr_cutoff: Lsn(0),
@@ -2499,7 +2499,7 @@ impl Timeline {
     ///
     /// The 'pitr' duration is used to calculate a 'pitr_cutoff', which can be used to determine
     /// whether a record is needed for PITR.
-    pub(super) fn update_gc_info(
+    pub(super) async fn update_gc_info(
         &self,
         retain_lsns: Vec<Lsn>,
         cutoff_horizon: Lsn,

From 7ff591ffbfc8f084e0b1b5cdbed8bd69e008d4c0 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Mon, 5 Dec 2022 10:20:24 -0500
Subject: [PATCH 055/132] On-Demand Download
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The code in this change was extracted from #2595 (Heikki’s on-demand
download draft PR).

High-Level Changes

- New RemoteLayer Type
- On-Demand Download As An Effect Of Page Reconstruction
- Breaking Semantics For Physical Size Metrics

There are several follow-up work items planned.
Refer to the Epic issue on GitHub: https://github.com/neondatabase/neon/issues/2029

closes https://github.com/neondatabase/neon/pull/3013

Co-authored-by: Kirill Bulatov <kirill@neon.tech>
Co-authored-by: Christian Schwarz <christian@neon.tech>

New RemoteLayer Type
====================

Instead of downloading all layers during tenant attach, we create
RemoteLayer instances for each of them and add them to the layer map.

On-Demand Download As An Effect Of Page Reconstruction
======================================================

At the heart of pageserver is Timeline::get_reconstruct_data(). It
traverses the layer map until it has collected all the data it needs to
produce the page image. Most code in the code base uses it, through many
layers of indirection.

Before this patch, the function would use synchronous filesystem IO to
load data from disk-resident layer files if the data was not cached.

That is not possible with RemoteLayer, because the layer file has not
been downloaded yet. So, we do the download when get_reconstruct_data
gets there, i.e., “on demand”.

The mechanics of how the download is done are rather involved, because
of the infamous async-sync-async sandwich problem that plagues the async
Rust world. We use the new PageReconstructResult type to work around
this. Its introduction is the cause for a good amount of code churn in
this patch. Refer to the block comment on `with_ondemand_download()`
for details.

Breaking Semantics For Physical Size Metrics
============================================

We rename the prometheus metric pageserver_current_physical_size to
pageserver_resident_physical_size to reflect what this metric actually
represents with on-demand download.
This intentionally BREAKS the existing grafana dashboard and the cost model data
pipeline. Breaking is desirable because the meaning of this metric has changed
with on-demand download. See
 https://docs.google.com/document/d/12AFpvKY-7FZdR5a4CaD6Ir_rI3QokdCLSPJ6upHxJBo/edit#
for how we will handle this breakage.

Likewise, we rename the new billing_metrics’s PhysicalSize => ResidentSize.
This is not yet used anywhere, so, this is not a breaking change.

There is still a field called TimelineInfo::current_physical_size. It
is now the sum of the layer sizes in the layer map, regardless of whether
the layers are local or remote. To compute that sum, we added a new trait
method PersistentLayer::file_size().

When updating the Python tests, we got rid of
current_physical_size_non_incremental. An earlier commit removed it from
the OpenAPI spec already, so this is not a breaking change.

test_timeline_size.py has grown additional assertions on the
resident_physical_size metric.
---
 libs/pageserver_api/src/models.rs             |  23 +-
 pageserver/src/basebackup.rs                  |  65 +-
 pageserver/src/billing_metrics.rs             |  28 +-
 pageserver/src/http/routes.rs                 | 114 ++-
 pageserver/src/import_datadir.rs              |  14 +-
 pageserver/src/lib.rs                         |   2 +-
 pageserver/src/metrics.rs                     |  22 +-
 pageserver/src/page_service.rs                |  30 +-
 pageserver/src/pgdatadir_mapping.rs           | 311 ++++---
 pageserver/src/storage_sync2.rs               |  46 +-
 pageserver/src/storage_sync2/download.rs      |   4 +
 pageserver/src/task_mgr.rs                    |  18 +
 pageserver/src/tenant.rs                      |  62 +-
 pageserver/src/tenant/delta_layer.rs          |  25 +-
 pageserver/src/tenant/image_layer.rs          |  25 +-
 pageserver/src/tenant/remote_layer.rs         | 212 +++++
 pageserver/src/tenant/size.rs                 |   2 -
 pageserver/src/tenant/storage_layer.rs        |  27 +
 pageserver/src/tenant/timeline.rs             | 832 ++++++++++++++----
 pageserver/src/virtual_file.rs                |   6 +-
 pageserver/src/walingest.rs                   | 470 +++++++---
 .../src/walreceiver/connection_manager.rs     |   2 +-
 .../src/walreceiver/walreceiver_connection.rs |  17 +-
 pageserver/src/walrecord.rs                   |   1 +
 scripts/export_import_between_pageservers.py  |  10 +-
 test_runner/fixtures/metrics.py               |   2 +-
 test_runner/fixtures/neon_fixtures.py         | 172 +++-
 test_runner/regress/test_broken_timeline.py   |  11 +-
 test_runner/regress/test_metric_collection.py |   2 +-
 test_runner/regress/test_ondemand_download.py | 437 +++++++++
 test_runner/regress/test_remote_storage.py    |  69 +-
 test_runner/regress/test_tenant_relocation.py |  25 +-
 test_runner/regress/test_tenant_tasks.py      |  34 +-
 .../test_tenants_with_remote_storage.py       |  34 +-
 test_runner/regress/test_timeline_size.py     |  89 +-
 test_runner/regress/test_wal_acceptor.py      |  24 +-
 36 files changed, 2556 insertions(+), 711 deletions(-)
 create mode 100644 pageserver/src/tenant/remote_layer.rs
 create mode 100644 test_runner/regress/test_ondemand_download.py

diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index 586ce2a73a..88603d9539 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -163,6 +163,8 @@ pub struct TenantInfo {
     #[serde_as(as = "DisplayFromStr")]
     pub id: TenantId,
     pub state: TenantState,
+    /// Sum of the size of all layer files.
+    /// If a layer is present in both local FS and S3, it counts only once.
     pub current_physical_size: Option<u64>, // physical size is only included in `tenant_status` endpoint
     pub has_in_progress_downloads: Option<bool>,
 }
@@ -191,9 +193,12 @@ pub struct TimelineInfo {
     #[serde_as(as = "DisplayFromStr")]
     pub remote_consistent_lsn: Lsn,
     pub current_logical_size: Option<u64>, // is None when timeline is Unloaded
+    /// Sum of the size of all layer files.
+    /// If a layer is present in both local FS and S3, it counts only once.
     pub current_physical_size: Option<u64>, // is None when timeline is Unloaded
     pub current_logical_size_non_incremental: Option<u64>,
-    pub current_physical_size_non_incremental: Option<u64>,
+
+    pub timeline_dir_layer_file_size_sum: Option<u64>,
 
     pub wal_source_connstr: Option<String>,
     #[serde_as(as = "Option<DisplayFromStr>")]
@@ -205,6 +210,22 @@ pub struct TimelineInfo {
     pub state: TimelineState,
 }
 
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct DownloadRemoteLayersTaskInfo {
+    pub task_id: String,
+    pub state: DownloadRemoteLayersTaskState,
+    pub total_layer_count: u64,         // stable once `completed`
+    pub successful_download_count: u64, // stable once `completed`
+    pub failed_download_count: u64,     // stable once `completed`
+}
+
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub enum DownloadRemoteLayersTaskState {
+    Running,
+    Completed,
+    ShutDown,
+}
+
 pub type ConfigureFailpointsRequest = Vec<FailpointConfig>;
 
 /// Information for configuring a single fail point
diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs
index 973c3cd3a6..aa87865a8a 100644
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -22,7 +22,8 @@ use std::time::SystemTime;
 use tar::{Builder, EntryType, Header};
 use tracing::*;
 
-use crate::tenant::Timeline;
+use crate::task_mgr;
+use crate::tenant::{with_ondemand_download, PageReconstructResult, Timeline};
 use pageserver_api::reltag::{RelTag, SlruKind};
 
 use postgres_ffi::pg_constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID};
@@ -152,23 +153,29 @@ where
             SlruKind::MultiXactOffsets,
             SlruKind::MultiXactMembers,
         ] {
-            for segno in self.timeline.list_slru_segments(kind, self.lsn)? {
+            for segno in
+                with_ondemand_download_sync(|| self.timeline.list_slru_segments(kind, self.lsn))?
+            {
                 self.add_slru_segment(kind, segno)?;
             }
         }
 
         // Create tablespace directories
-        for ((spcnode, dbnode), has_relmap_file) in self.timeline.list_dbdirs(self.lsn)? {
+        for ((spcnode, dbnode), has_relmap_file) in
+            with_ondemand_download_sync(|| self.timeline.list_dbdirs(self.lsn))?
+        {
             self.add_dbdir(spcnode, dbnode, has_relmap_file)?;
 
             // Gather and send relational files in each database if full backup is requested.
             if self.full_backup {
-                for rel in self.timeline.list_rels(spcnode, dbnode, self.lsn)? {
+                for rel in with_ondemand_download_sync(|| {
+                    self.timeline.list_rels(spcnode, dbnode, self.lsn)
+                })? {
                     self.add_rel(rel)?;
                 }
             }
         }
-        for xid in self.timeline.list_twophase_files(self.lsn)? {
+        for xid in with_ondemand_download_sync(|| self.timeline.list_twophase_files(self.lsn))? {
             self.add_twophase_file(xid)?;
         }
 
@@ -185,7 +192,8 @@ where
     }
 
     fn add_rel(&mut self, tag: RelTag) -> anyhow::Result<()> {
-        let nblocks = self.timeline.get_rel_size(tag, self.lsn, false)?;
+        let nblocks =
+            with_ondemand_download_sync(|| self.timeline.get_rel_size(tag, self.lsn, false))?;
 
         // Function that adds relation segment data to archive
         let mut add_file = |segment_index, data: &Vec<u8>| -> anyhow::Result<()> {
@@ -208,7 +216,8 @@ where
             for blknum in blocks {
                 let img = self
                     .timeline
-                    .get_rel_page_at_lsn(tag, blknum, self.lsn, false)?;
+                    .get_rel_page_at_lsn(tag, blknum, self.lsn, false)
+                    .no_ondemand_download()?;
                 segment_data.extend_from_slice(&img[..]);
             }
 
@@ -222,13 +231,16 @@ where
     // Generate SLRU segment files from repository.
     //
     fn add_slru_segment(&mut self, slru: SlruKind, segno: u32) -> anyhow::Result<()> {
-        let nblocks = self.timeline.get_slru_segment_size(slru, segno, self.lsn)?;
+        let nblocks = with_ondemand_download_sync(|| {
+            self.timeline.get_slru_segment_size(slru, segno, self.lsn)
+        })?;
 
         let mut slru_buf: Vec<u8> = Vec::with_capacity(nblocks as usize * BLCKSZ as usize);
         for blknum in 0..nblocks {
-            let img = self
-                .timeline
-                .get_slru_page_at_lsn(slru, segno, blknum, self.lsn)?;
+            let img = with_ondemand_download_sync(|| {
+                self.timeline
+                    .get_slru_page_at_lsn(slru, segno, blknum, self.lsn)
+            })?;
 
             if slru == SlruKind::Clog {
                 ensure!(img.len() == BLCKSZ as usize || img.len() == BLCKSZ as usize + 8);
@@ -260,7 +272,9 @@ where
         has_relmap_file: bool,
     ) -> anyhow::Result<()> {
         let relmap_img = if has_relmap_file {
-            let img = self.timeline.get_relmap_file(spcnode, dbnode, self.lsn)?;
+            let img = with_ondemand_download_sync(|| {
+                self.timeline.get_relmap_file(spcnode, dbnode, self.lsn)
+            })?;
             ensure!(img.len() == 512);
             Some(img)
         } else {
@@ -295,7 +309,8 @@ where
             if !has_relmap_file
                 && self
                     .timeline
-                    .list_rels(spcnode, dbnode, self.lsn)?
+                    .list_rels(spcnode, dbnode, self.lsn)
+                    .no_ondemand_download()?
                     .is_empty()
             {
                 return Ok(());
@@ -327,7 +342,7 @@ where
     // Extract twophase state files
     //
     fn add_twophase_file(&mut self, xid: TransactionId) -> anyhow::Result<()> {
-        let img = self.timeline.get_twophase_file(xid, self.lsn)?;
+        let img = with_ondemand_download_sync(|| self.timeline.get_twophase_file(xid, self.lsn))?;
 
         let mut buf = BytesMut::new();
         buf.extend_from_slice(&img[..]);
@@ -361,14 +376,12 @@ where
             zenith_signal.as_bytes(),
         )?;
 
-        let checkpoint_bytes = self
-            .timeline
-            .get_checkpoint(self.lsn)
-            .context("failed to get checkpoint bytes")?;
-        let pg_control_bytes = self
-            .timeline
-            .get_control_file(self.lsn)
-            .context("failed get control bytes")?;
+        let checkpoint_bytes =
+            with_ondemand_download_sync(|| self.timeline.get_checkpoint(self.lsn))
+                .context("failed to get checkpoint bytes")?;
+        let pg_control_bytes =
+            with_ondemand_download_sync(|| self.timeline.get_control_file(self.lsn))
+                .context("failed get control bytes")?;
 
         let (pg_control_bytes, system_identifier) = postgres_ffi::generate_pg_control(
             &pg_control_bytes,
@@ -490,3 +503,11 @@ where
         }
     }
 }
+
+fn with_ondemand_download_sync<F, T>(f: F) -> anyhow::Result<T>
+where
+    F: Send + Fn() -> PageReconstructResult<T>,
+    T: Send,
+{
+    task_mgr::COMPUTE_REQUEST_RUNTIME.block_on(with_ondemand_download(f))
+}
diff --git a/pageserver/src/billing_metrics.rs b/pageserver/src/billing_metrics.rs
index c5da54b8fc..f9d3e8553f 100644
--- a/pageserver/src/billing_metrics.rs
+++ b/pageserver/src/billing_metrics.rs
@@ -73,10 +73,10 @@ pub enum BillingMetricKind {
     /// This is an absolute, per-tenant metric.
     /// This is the same metric that tenant/tenant_id/size endpoint returns.
     SyntheticStorageSize,
-    /// Size of all the files in the tenant's directory on disk on the pageserver.
+    /// Size of all the layer files in the tenant's directory on disk on the pageserver.
     /// This is an absolute, per-tenant metric.
-    /// See also prometheus metric CURRENT_PHYSICAL_SIZE.
-    PhysicalSize,
+    /// See also prometheus metric RESIDENT_PHYSICAL_SIZE.
+    ResidentSize,
     /// Size of the remote storage (S3) directory.
     /// This is an absolute, per-tenant metric.
     RemoteStorageSize,
@@ -89,7 +89,7 @@ impl FromStr for BillingMetricKind {
         match s {
             "written_size" => Ok(Self::WrittenSize),
             "synthetic_storage_size" => Ok(Self::SyntheticStorageSize),
-            "physical_size" => Ok(Self::PhysicalSize),
+            "resident_size" => Ok(Self::ResidentSize),
             "remote_storage_size" => Ok(Self::RemoteStorageSize),
             _ => anyhow::bail!("invalid value \"{s}\" for metric type"),
         }
@@ -101,7 +101,7 @@ impl fmt::Display for BillingMetricKind {
         f.write_str(match self {
             BillingMetricKind::WrittenSize => "written_size",
             BillingMetricKind::SyntheticStorageSize => "synthetic_storage_size",
-            BillingMetricKind::PhysicalSize => "physical_size",
+            BillingMetricKind::ResidentSize => "resident_size",
             BillingMetricKind::RemoteStorageSize => "remote_storage_size",
         })
     }
@@ -171,7 +171,7 @@ pub async fn collect_metrics_task(
 
         let tenant = tenant_mgr::get_tenant(tenant_id, true).await?;
 
-        let mut tenant_physical_size = 0;
+        let mut tenant_resident_size = 0;
 
         // iterate through list of timelines in tenant
         for timeline in tenant.list_timelines().iter() {
@@ -186,27 +186,27 @@ pub async fn collect_metrics_task(
                 timeline_written_size,
             ));
 
-            let timeline_size = timeline.get_physical_size();
-            tenant_physical_size += timeline_size;
+            let timeline_resident_size = timeline.get_resident_physical_size();
+            tenant_resident_size += timeline_resident_size;
 
             debug!(
-                "per-timeline current metrics for tenant: {}: timeline {} physical_size={} last_record_lsn {} (as bytes)",
-                tenant_id, timeline.timeline_id, timeline_size, timeline_written_size)
+                "per-timeline current metrics for tenant: {}: timeline {} resident_size={} last_record_lsn {} (as bytes)",
+                tenant_id, timeline.timeline_id, timeline_resident_size, timeline_written_size)
         }
 
         let tenant_remote_size = tenant.get_remote_size().await?;
         debug!(
-            "collected current metrics for tenant: {}: state={:?} tenant_physical_size={} remote_size={}",
-            tenant_id, tenant_state, tenant_physical_size, tenant_remote_size
+            "collected current metrics for tenant: {}: state={:?} resident_size={} remote_size={}",
+            tenant_id, tenant_state, tenant_resident_size, tenant_remote_size
         );
 
         current_metrics.push((
             BillingMetricsKey {
                 tenant_id,
                 timeline_id: None,
-                metric: BillingMetricKind::PhysicalSize,
+                metric: BillingMetricKind::ResidentSize,
             },
-            tenant_physical_size,
+            tenant_resident_size,
         ));
 
         current_metrics.push((
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 937a6144b6..6d97f3206e 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -12,7 +12,7 @@ use super::models::{
     TimelineCreateRequest, TimelineInfo,
 };
 use crate::pgdatadir_mapping::LsnForTimestamp;
-use crate::tenant::Timeline;
+use crate::tenant::{with_ondemand_download, Timeline};
 use crate::tenant_config::TenantConfOpt;
 use crate::{config::PageServerConf, tenant_mgr};
 use utils::{
@@ -78,25 +78,23 @@ fn check_permission(request: &Request<Body>, tenant_id: Option<TenantId>) -> Res
 }
 
 // Helper function to construct a TimelineInfo struct for a timeline
-fn build_timeline_info(
+async fn build_timeline_info(
     timeline: &Arc<Timeline>,
     include_non_incremental_logical_size: bool,
-    include_non_incremental_physical_size: bool,
 ) -> anyhow::Result<TimelineInfo> {
     let mut info = build_timeline_info_common(timeline)?;
     if include_non_incremental_logical_size {
         // XXX we should be using spawn_ondemand_logical_size_calculation here.
         // Otherwise, if someone deletes the timeline / detaches the tenant while
         // we're executing this function, we will outlive the timeline on-disk state.
-        info.current_logical_size_non_incremental =
-            Some(timeline.get_current_logical_size_non_incremental(
-                info.last_record_lsn,
-                CancellationToken::new(),
-            )?);
-    }
-    if include_non_incremental_physical_size {
-        info.current_physical_size_non_incremental =
-            Some(timeline.get_physical_size_non_incremental()?)
+        info.current_logical_size_non_incremental = Some(
+            timeline
+                .get_current_logical_size_non_incremental(
+                    info.last_record_lsn,
+                    CancellationToken::new(),
+                )
+                .await?,
+        );
     }
     Ok(info)
 }
@@ -128,7 +126,7 @@ fn build_timeline_info_common(timeline: &Arc<Timeline>) -> anyhow::Result<Timeli
             None
         }
     };
-    let current_physical_size = Some(timeline.get_physical_size());
+    let current_physical_size = Some(timeline.layer_size_sum().approximate_is_ok());
     let state = timeline.current_state();
     let remote_consistent_lsn = timeline.get_remote_consistent_lsn().unwrap_or(Lsn(0));
 
@@ -145,7 +143,7 @@ fn build_timeline_info_common(timeline: &Arc<Timeline>) -> anyhow::Result<Timeli
         current_logical_size,
         current_physical_size,
         current_logical_size_non_incremental: None,
-        current_physical_size_non_incremental: None,
+        timeline_dir_layer_file_size_sum: None,
         wal_source_connstr,
         last_received_msg_lsn,
         last_received_msg_ts,
@@ -198,8 +196,6 @@ async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>,
     let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
     let include_non_incremental_logical_size =
         query_param_present(&request, "include-non-incremental-logical-size");
-    let include_non_incremental_physical_size =
-        query_param_present(&request, "include-non-incremental-physical-size");
     check_permission(&request, Some(tenant_id))?;
 
     let response_data = async {
@@ -210,17 +206,16 @@ async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>,
 
         let mut response_data = Vec::with_capacity(timelines.len());
         for timeline in timelines {
-            let timeline_info = build_timeline_info(
-                &timeline,
-                include_non_incremental_logical_size,
-                include_non_incremental_physical_size,
-            )
-            .context("Failed to convert tenant timeline {timeline_id} into the local one: {e:?}")
-            .map_err(ApiError::InternalServerError)?;
+            let timeline_info =
+                build_timeline_info(&timeline, include_non_incremental_logical_size)
+                    .await
+                    .context(
+                        "Failed to convert tenant timeline {timeline_id} into the local one: {e:?}",
+                    )
+                    .map_err(ApiError::InternalServerError)?;
 
             response_data.push(timeline_info);
         }
-
         Ok(response_data)
     }
     .instrument(info_span!("timeline_list", tenant = %tenant_id))
@@ -264,8 +259,6 @@ async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body
     let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
     let include_non_incremental_logical_size =
         query_param_present(&request, "include-non-incremental-logical-size");
-    let include_non_incremental_physical_size =
-        query_param_present(&request, "include-non-incremental-physical-size");
     check_permission(&request, Some(tenant_id))?;
 
     let timeline_info = async {
@@ -277,13 +270,10 @@ async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body
             .get_timeline(timeline_id, false)
             .map_err(ApiError::NotFound)?;
 
-        let timeline_info = build_timeline_info(
-            &timeline,
-            include_non_incremental_logical_size,
-            include_non_incremental_physical_size,
-        )
-        .context("Failed to get local timeline info: {e:#}")
-        .map_err(ApiError::InternalServerError)?;
+        let timeline_info = build_timeline_info(&timeline, include_non_incremental_logical_size)
+            .await
+            .context("Failed to get local timeline info: {e:#}")
+            .map_err(ApiError::InternalServerError)?;
 
         Ok::<_, ApiError>(timeline_info)
     }
@@ -308,10 +298,11 @@ async fn get_lsn_by_timestamp_handler(request: Request<Body>) -> Result<Response
         .await
         .and_then(|tenant| tenant.get_timeline(timeline_id, true))
         .map_err(ApiError::NotFound)?;
-    let result = match timeline
-        .find_lsn_for_timestamp(timestamp_pg)
-        .map_err(ApiError::InternalServerError)?
-    {
+    let result = with_ondemand_download(|| timeline.find_lsn_for_timestamp(timestamp_pg))
+        .await
+        .map_err(ApiError::InternalServerError)?;
+
+    let result = match result {
         LsnForTimestamp::Present(lsn) => format!("{lsn}"),
         LsnForTimestamp::Future(_lsn) => "future".into(),
         LsnForTimestamp::Past(_lsn) => "past".into(),
@@ -433,7 +424,7 @@ async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiErro
         // Calculate total physical size of all timelines
         let mut current_physical_size = 0;
         for timeline in tenant.list_timelines().iter() {
-            current_physical_size += timeline.get_physical_size();
+            current_physical_size += timeline.layer_size_sum().approximate_is_ok();
         }
 
         let state = tenant.current_state();
@@ -786,6 +777,45 @@ async fn timeline_checkpoint_handler(request: Request<Body>) -> Result<Response<
     json_response(StatusCode::OK, ())
 }
 
+async fn timeline_download_remote_layers_handler_post(
+    request: Request<Body>,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
+    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
+    check_permission(&request, Some(tenant_id))?;
+
+    let tenant = tenant_mgr::get_tenant(tenant_id, true)
+        .await
+        .map_err(ApiError::NotFound)?;
+    let timeline = tenant
+        .get_timeline(timeline_id, true)
+        .map_err(ApiError::NotFound)?;
+    match timeline.spawn_download_all_remote_layers().await {
+        Ok(st) => json_response(StatusCode::ACCEPTED, st),
+        Err(st) => json_response(StatusCode::CONFLICT, st),
+    }
+}
+
+async fn timeline_download_remote_layers_handler_get(
+    request: Request<Body>,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
+    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
+    check_permission(&request, Some(tenant_id))?;
+
+    let tenant = tenant_mgr::get_tenant(tenant_id, true)
+        .await
+        .map_err(ApiError::NotFound)?;
+    let timeline = tenant
+        .get_timeline(timeline_id, true)
+        .map_err(ApiError::NotFound)?;
+    let info = timeline
+        .get_download_all_remote_layers_task_info()
+        .context("task never started since last pageserver process start")
+        .map_err(ApiError::NotFound)?;
+    json_response(StatusCode::OK, info)
+}
+
 async fn handler_404(_: Request<Body>) -> Result<Response<Body>, ApiError> {
     json_response(
         StatusCode::NOT_FOUND,
@@ -870,6 +900,14 @@ pub fn make_router(
             "/v1/tenant/:tenant_id/timeline/:timeline_id/checkpoint",
             testing_api!("run timeline checkpoint", timeline_checkpoint_handler),
         )
+        .post(
+            "/v1/tenant/:tenant_id/timeline/:timeline_id/download_remote_layers",
+            timeline_download_remote_layers_handler_post,
+        )
+        .get(
+            "/v1/tenant/:tenant_id/timeline/:timeline_id/download_remote_layers",
+            timeline_download_remote_layers_handler_get,
+        )
         .delete(
             "/v1/tenant/:tenant_id/timeline/:timeline_id",
             timeline_delete_handler,
diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs
index db83bdb3a1..1684ca3c64 100644
--- a/pageserver/src/import_datadir.rs
+++ b/pageserver/src/import_datadir.rs
@@ -187,13 +187,13 @@ fn import_slru<Reader: Read>(
     path: &Path,
     mut reader: Reader,
     len: usize,
-) -> Result<()> {
-    trace!("importing slru file {}", path.display());
+) -> anyhow::Result<()> {
+    info!("importing slru file {path:?}");
 
     let mut buf: [u8; 8192] = [0u8; 8192];
     let filename = &path
         .file_name()
-        .expect("missing slru filename")
+        .with_context(|| format!("missing slru filename for path {path:?}"))?
         .to_string_lossy();
     let segno = u32::from_str_radix(filename, 16)?;
 
@@ -279,7 +279,9 @@ fn import_wal(walpath: &Path, tline: &Timeline, startpoint: Lsn, endpoint: Lsn)
         let mut decoded = DecodedWALRecord::default();
         while last_lsn <= endpoint {
             if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
-                walingest.ingest_record(recdata, lsn, &mut modification, &mut decoded)?;
+                walingest
+                    .ingest_record(recdata, lsn, &mut modification, &mut decoded)
+                    .no_ondemand_download()?;
                 last_lsn = lsn;
 
                 nrecords += 1;
@@ -405,7 +407,9 @@ pub fn import_wal_from_tar<Reader: Read>(
         let mut decoded = DecodedWALRecord::default();
         while last_lsn <= end_lsn {
             if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
-                walingest.ingest_record(recdata, lsn, &mut modification, &mut decoded)?;
+                walingest
+                    .ingest_record(recdata, lsn, &mut modification, &mut decoded)
+                    .no_ondemand_download()?;
                 last_lsn = lsn;
 
                 debug!("imported record at {} (end {})", lsn, end_lsn);
diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs
index 626d5e99e3..e01eb12b7b 100644
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -91,7 +91,7 @@ async fn exponential_backoff(n: u32, base_increment: f64, max_seconds: f64) {
     }
 }
 
-fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_seconds: f64) -> f64 {
+pub fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_seconds: f64) -> f64 {
     if n == 0 {
         0.0
     } else {
diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index 308f9cd4eb..205ee0ffad 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -84,13 +84,10 @@ static LAST_RECORD_LSN: Lazy<IntGaugeVec> = Lazy::new(|| {
     .expect("failed to define a metric")
 });
 
-// Metrics for determining timeline's physical size.
-// A layered timeline's physical is defined as the total size of
-// (delta/image) layer files on disk.
-static CURRENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
+static RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
     register_uint_gauge_vec!(
-        "pageserver_current_physical_size",
-        "Current physical size grouped by timeline",
+        "pageserver_resident_physical_size",
+        "The size of the layer files present in the pageserver's filesystem.",
         &["tenant_id", "timeline_id"]
     )
     .expect("failed to define a metric")
@@ -146,8 +143,9 @@ const STORAGE_IO_TIME_BUCKETS: &[f64] = &[
     1.0,      // 1 sec
 ];
 
-const STORAGE_IO_TIME_OPERATIONS: &[&str] =
-    &["open", "close", "read", "write", "seek", "fsync", "gc"];
+const STORAGE_IO_TIME_OPERATIONS: &[&str] = &[
+    "open", "close", "read", "write", "seek", "fsync", "gc", "metadata",
+];
 
 const STORAGE_IO_SIZE_OPERATIONS: &[&str] = &["read", "write"];
 
@@ -375,7 +373,7 @@ pub struct TimelineMetrics {
     pub load_layer_map_histo: Histogram,
     pub last_record_gauge: IntGauge,
     pub wait_lsn_time_histo: Histogram,
-    pub current_physical_size_gauge: UIntGauge,
+    pub resident_physical_size_gauge: UIntGauge,
     /// copy of LayeredTimeline.current_logical_size
     pub current_logical_size_gauge: UIntGauge,
     pub num_persistent_files_created: IntCounter,
@@ -416,7 +414,7 @@ impl TimelineMetrics {
         let wait_lsn_time_histo = WAIT_LSN_TIME
             .get_metric_with_label_values(&[&tenant_id, &timeline_id])
             .unwrap();
-        let current_physical_size_gauge = CURRENT_PHYSICAL_SIZE
+        let resident_physical_size_gauge = RESIDENT_PHYSICAL_SIZE
             .get_metric_with_label_values(&[&tenant_id, &timeline_id])
             .unwrap();
         let current_logical_size_gauge = CURRENT_LOGICAL_SIZE
@@ -442,7 +440,7 @@ impl TimelineMetrics {
             load_layer_map_histo,
             last_record_gauge,
             wait_lsn_time_histo,
-            current_physical_size_gauge,
+            resident_physical_size_gauge,
             current_logical_size_gauge,
             num_persistent_files_created,
             persistent_bytes_written,
@@ -458,7 +456,7 @@ impl Drop for TimelineMetrics {
         let _ = MATERIALIZED_PAGE_CACHE_HIT.remove_label_values(&[tenant_id, timeline_id]);
         let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, timeline_id]);
         let _ = WAIT_LSN_TIME.remove_label_values(&[tenant_id, timeline_id]);
-        let _ = CURRENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
+        let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
         let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
         let _ = NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, timeline_id]);
         let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, timeline_id]);
diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index d9c19d04b7..fd4353a421 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -541,7 +541,10 @@ impl PageServerHandler {
         let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)
             .await?;
 
-        let exists = timeline.get_rel_exists(req.rel, lsn, req.latest)?;
+        let exists = crate::tenant::with_ondemand_download(|| {
+            timeline.get_rel_exists(req.rel, lsn, req.latest)
+        })
+        .await?;
 
         Ok(PagestreamBeMessage::Exists(PagestreamExistsResponse {
             exists,
@@ -558,7 +561,10 @@ impl PageServerHandler {
         let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)
             .await?;
 
-        let n_blocks = timeline.get_rel_size(req.rel, lsn, req.latest)?;
+        let n_blocks = crate::tenant::with_ondemand_download(|| {
+            timeline.get_rel_size(req.rel, lsn, req.latest)
+        })
+        .await?;
 
         Ok(PagestreamBeMessage::Nblocks(PagestreamNblocksResponse {
             n_blocks,
@@ -575,9 +581,10 @@ impl PageServerHandler {
         let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)
             .await?;
 
-        let total_blocks =
-            timeline.get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, lsn, req.latest)?;
-
+        let total_blocks = crate::tenant::with_ondemand_download(|| {
+            timeline.get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, lsn, req.latest)
+        })
+        .await?;
         let db_size = total_blocks as i64 * BLCKSZ as i64;
 
         Ok(PagestreamBeMessage::DbSize(PagestreamDbSizeResponse {
@@ -603,11 +610,14 @@ impl PageServerHandler {
         }
         */
 
-        // FIXME: this profiling now happens at different place than it used to. The
-        // current profiling is based on a thread-local variable, so it doesn't work
-        // across awaits
-        let _profiling_guard = profpoint_start(self.conf, ProfilingConfig::PageRequests);
-        let page = timeline.get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest)?;
+        let page = crate::tenant::with_ondemand_download(|| {
+            // FIXME: this profiling now happens at different place than it used to. The
+            // current profiling is based on a thread-local variable, so it doesn't work
+            // across awaits
+            let _profiling_guard = profpoint_start(self.conf, ProfilingConfig::PageRequests);
+            timeline.get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest)
+        })
+        .await?;
 
         Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse {
             page,
diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs
index 7b4b05ed18..77910bceda 100644
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -6,11 +6,12 @@
 //! walingest.rs handles a few things like implicit relation creation and extension.
 //! Clarify that)
 //!
+use super::tenant::PageReconstructResult;
 use crate::keyspace::{KeySpace, KeySpaceAccum};
-use crate::repository::*;
 use crate::tenant::Timeline;
 use crate::walrecord::NeonWalRecord;
-use anyhow::{self, bail, ensure, Context};
+use crate::{repository::*, try_no_ondemand_download};
+use anyhow::Context;
 use bytes::{Buf, Bytes};
 use pageserver_api::reltag::{RelTag, SlruKind};
 use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
@@ -97,16 +98,18 @@ impl Timeline {
         blknum: BlockNumber,
         lsn: Lsn,
         latest: bool,
-    ) -> anyhow::Result<Bytes> {
-        ensure!(tag.relnode != 0, "invalid relnode");
+    ) -> PageReconstructResult<Bytes> {
+        if tag.relnode == 0 {
+            return PageReconstructResult::from(anyhow::anyhow!("invalid relnode"));
+        }
 
-        let nblocks = self.get_rel_size(tag, lsn, latest)?;
+        let nblocks = try_no_ondemand_download!(self.get_rel_size(tag, lsn, latest));
         if blknum >= nblocks {
             debug!(
                 "read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page",
                 tag, blknum, lsn, nblocks
             );
-            return Ok(ZERO_PAGE.clone());
+            return PageReconstructResult::Success(ZERO_PAGE.clone());
         }
 
         let key = rel_block_to_key(tag, blknum);
@@ -120,38 +123,45 @@ impl Timeline {
         dbnode: Oid,
         lsn: Lsn,
         latest: bool,
-    ) -> anyhow::Result<usize> {
+    ) -> PageReconstructResult<usize> {
         let mut total_blocks = 0;
 
-        let rels = self.list_rels(spcnode, dbnode, lsn)?;
+        let rels = try_no_ondemand_download!(self.list_rels(spcnode, dbnode, lsn));
 
         for rel in rels {
-            let n_blocks = self.get_rel_size(rel, lsn, latest)?;
+            let n_blocks = try_no_ondemand_download!(self.get_rel_size(rel, lsn, latest));
             total_blocks += n_blocks as usize;
         }
-        Ok(total_blocks)
+        PageReconstructResult::Success(total_blocks)
     }
 
     /// Get size of a relation file
-    pub fn get_rel_size(&self, tag: RelTag, lsn: Lsn, latest: bool) -> anyhow::Result<BlockNumber> {
-        ensure!(tag.relnode != 0, "invalid relnode");
+    pub fn get_rel_size(
+        &self,
+        tag: RelTag,
+        lsn: Lsn,
+        latest: bool,
+    ) -> PageReconstructResult<BlockNumber> {
+        if tag.relnode == 0 {
+            return PageReconstructResult::from(anyhow::anyhow!("invalid relnode"));
+        }
 
         if let Some(nblocks) = self.get_cached_rel_size(&tag, lsn) {
-            return Ok(nblocks);
+            return PageReconstructResult::Success(nblocks);
         }
 
         if (tag.forknum == FSM_FORKNUM || tag.forknum == VISIBILITYMAP_FORKNUM)
-            && !self.get_rel_exists(tag, lsn, latest)?
+            && !try_no_ondemand_download!(self.get_rel_exists(tag, lsn, latest))
         {
             // FIXME: Postgres sometimes calls smgrcreate() to create
             // FSM, and smgrnblocks() on it immediately afterwards,
             // without extending it.  Tolerate that by claiming that
             // any non-existent FSM fork has size 0.
-            return Ok(0);
+            return PageReconstructResult::Success(0);
         }
 
         let key = rel_size_to_key(tag);
-        let mut buf = self.get(key, lsn)?;
+        let mut buf = try_no_ondemand_download!(self.get(key, lsn));
         let nblocks = buf.get_u32_le();
 
         if latest {
@@ -164,25 +174,35 @@ impl Timeline {
             // associated with most recent value of LSN.
             self.update_cached_rel_size(tag, lsn, nblocks);
         }
-        Ok(nblocks)
+        PageReconstructResult::Success(nblocks)
     }
 
     /// Does relation exist?
-    pub fn get_rel_exists(&self, tag: RelTag, lsn: Lsn, _latest: bool) -> anyhow::Result<bool> {
-        ensure!(tag.relnode != 0, "invalid relnode");
+    pub fn get_rel_exists(
+        &self,
+        tag: RelTag,
+        lsn: Lsn,
+        _latest: bool,
+    ) -> PageReconstructResult<bool> {
+        if tag.relnode == 0 {
+            return PageReconstructResult::from(anyhow::anyhow!("invalid relnode"));
+        }
 
         // first try to lookup relation in cache
         if let Some(_nblocks) = self.get_cached_rel_size(&tag, lsn) {
-            return Ok(true);
+            return PageReconstructResult::Success(true);
         }
         // fetch directory listing
         let key = rel_dir_to_key(tag.spcnode, tag.dbnode);
-        let buf = self.get(key, lsn)?;
-        let dir = RelDirectory::des(&buf)?;
+        let buf = try_no_ondemand_download!(self.get(key, lsn));
 
-        let exists = dir.rels.get(&(tag.relnode, tag.forknum)).is_some();
-
-        Ok(exists)
+        match RelDirectory::des(&buf).context("deserialization failure") {
+            Ok(dir) => {
+                let exists = dir.rels.get(&(tag.relnode, tag.forknum)).is_some();
+                PageReconstructResult::Success(exists)
+            }
+            Err(e) => PageReconstructResult::from(e),
+        }
     }
 
     /// Get a list of all existing relations in given tablespace and database.
@@ -191,21 +211,25 @@ impl Timeline {
         spcnode: Oid,
         dbnode: Oid,
         lsn: Lsn,
-    ) -> anyhow::Result<HashSet<RelTag>> {
+    ) -> PageReconstructResult<HashSet<RelTag>> {
         // fetch directory listing
         let key = rel_dir_to_key(spcnode, dbnode);
-        let buf = self.get(key, lsn)?;
-        let dir = RelDirectory::des(&buf)?;
+        let buf = try_no_ondemand_download!(self.get(key, lsn));
 
-        let rels: HashSet<RelTag> =
-            HashSet::from_iter(dir.rels.iter().map(|(relnode, forknum)| RelTag {
-                spcnode,
-                dbnode,
-                relnode: *relnode,
-                forknum: *forknum,
-            }));
+        match RelDirectory::des(&buf).context("deserialization failure") {
+            Ok(dir) => {
+                let rels: HashSet<RelTag> =
+                    HashSet::from_iter(dir.rels.iter().map(|(relnode, forknum)| RelTag {
+                        spcnode,
+                        dbnode,
+                        relnode: *relnode,
+                        forknum: *forknum,
+                    }));
 
-        Ok(rels)
+                PageReconstructResult::Success(rels)
+            }
+            Err(e) => PageReconstructResult::from(e),
+        }
     }
 
     /// Look up given SLRU page version.
@@ -215,7 +239,7 @@ impl Timeline {
         segno: u32,
         blknum: BlockNumber,
         lsn: Lsn,
-    ) -> anyhow::Result<Bytes> {
+    ) -> PageReconstructResult<Bytes> {
         let key = slru_block_to_key(kind, segno, blknum);
         self.get(key, lsn)
     }
@@ -226,10 +250,10 @@ impl Timeline {
         kind: SlruKind,
         segno: u32,
         lsn: Lsn,
-    ) -> anyhow::Result<BlockNumber> {
+    ) -> PageReconstructResult<BlockNumber> {
         let key = slru_segment_size_to_key(kind, segno);
-        let mut buf = self.get(key, lsn)?;
-        Ok(buf.get_u32_le())
+        let mut buf = try_no_ondemand_download!(self.get(key, lsn));
+        PageReconstructResult::Success(buf.get_u32_le())
     }
 
     /// Get size of an SLRU segment
@@ -238,14 +262,18 @@ impl Timeline {
         kind: SlruKind,
         segno: u32,
         lsn: Lsn,
-    ) -> anyhow::Result<bool> {
+    ) -> PageReconstructResult<bool> {
         // fetch directory listing
         let key = slru_dir_to_key(kind);
-        let buf = self.get(key, lsn)?;
-        let dir = SlruSegmentDirectory::des(&buf)?;
+        let buf = try_no_ondemand_download!(self.get(key, lsn));
 
-        let exists = dir.segments.get(&segno).is_some();
-        Ok(exists)
+        match SlruSegmentDirectory::des(&buf).context("deserialization failure") {
+            Ok(dir) => {
+                let exists = dir.segments.get(&segno).is_some();
+                PageReconstructResult::Success(exists)
+            }
+            Err(e) => PageReconstructResult::from(e),
+        }
     }
 
     /// Locate LSN, such that all transactions that committed before
@@ -258,7 +286,7 @@ impl Timeline {
     pub fn find_lsn_for_timestamp(
         &self,
         search_timestamp: TimestampTz,
-    ) -> anyhow::Result<LsnForTimestamp> {
+    ) -> PageReconstructResult<LsnForTimestamp> {
         let gc_cutoff_lsn_guard = self.get_latest_gc_cutoff_lsn();
         let min_lsn = *gc_cutoff_lsn_guard;
         let max_lsn = self.get_last_record_lsn();
@@ -274,12 +302,12 @@ impl Timeline {
             // cannot overflow, high and low are both smaller than u64::MAX / 2
             let mid = (high + low) / 2;
 
-            let cmp = self.is_latest_commit_timestamp_ge_than(
+            let cmp = try_no_ondemand_download!(self.is_latest_commit_timestamp_ge_than(
                 search_timestamp,
                 Lsn(mid * 8),
                 &mut found_smaller,
                 &mut found_larger,
-            )?;
+            ));
 
             if cmp {
                 high = mid;
@@ -291,15 +319,15 @@ impl Timeline {
             (false, false) => {
                 // This can happen if no commit records have been processed yet, e.g.
                 // just after importing a cluster.
-                Ok(LsnForTimestamp::NoData(max_lsn))
+                PageReconstructResult::Success(LsnForTimestamp::NoData(max_lsn))
             }
             (true, false) => {
                 // Didn't find any commit timestamps larger than the request
-                Ok(LsnForTimestamp::Future(max_lsn))
+                PageReconstructResult::Success(LsnForTimestamp::Future(max_lsn))
             }
             (false, true) => {
                 // Didn't find any commit timestamps smaller than the request
-                Ok(LsnForTimestamp::Past(max_lsn))
+                PageReconstructResult::Success(LsnForTimestamp::Past(max_lsn))
             }
             (true, true) => {
                 // low is the LSN of the first commit record *after* the search_timestamp,
@@ -309,7 +337,7 @@ impl Timeline {
                 // Otherwise, if you restore to the returned LSN, the database will
                 // include physical changes from later commits that will be marked
                 // as aborted, and will need to be vacuumed away.
-                Ok(LsnForTimestamp::Present(Lsn((low - 1) * 8)))
+                PageReconstructResult::Success(LsnForTimestamp::Present(Lsn((low - 1) * 8)))
             }
         }
     }
@@ -327,12 +355,20 @@ impl Timeline {
         probe_lsn: Lsn,
         found_smaller: &mut bool,
         found_larger: &mut bool,
-    ) -> anyhow::Result<bool> {
-        for segno in self.list_slru_segments(SlruKind::Clog, probe_lsn)? {
-            let nblocks = self.get_slru_segment_size(SlruKind::Clog, segno, probe_lsn)?;
+    ) -> PageReconstructResult<bool> {
+        for segno in try_no_ondemand_download!(self.list_slru_segments(SlruKind::Clog, probe_lsn)) {
+            let nblocks = try_no_ondemand_download!(self.get_slru_segment_size(
+                SlruKind::Clog,
+                segno,
+                probe_lsn
+            ));
             for blknum in (0..nblocks).rev() {
-                let clog_page =
-                    self.get_slru_page_at_lsn(SlruKind::Clog, segno, blknum, probe_lsn)?;
+                let clog_page = try_no_ondemand_download!(self.get_slru_page_at_lsn(
+                    SlruKind::Clog,
+                    segno,
+                    blknum,
+                    probe_lsn
+                ));
 
                 if clog_page.len() == BLCKSZ as usize + 8 {
                     let mut timestamp_bytes = [0u8; 8];
@@ -341,61 +377,75 @@ impl Timeline {
 
                     if timestamp >= search_timestamp {
                         *found_larger = true;
-                        return Ok(true);
+                        return PageReconstructResult::Success(true);
                     } else {
                         *found_smaller = true;
                     }
                 }
             }
         }
-        Ok(false)
+        PageReconstructResult::Success(false)
     }
 
     /// Get a list of SLRU segments
-    pub fn list_slru_segments(&self, kind: SlruKind, lsn: Lsn) -> anyhow::Result<HashSet<u32>> {
+    pub fn list_slru_segments(
+        &self,
+        kind: SlruKind,
+        lsn: Lsn,
+    ) -> PageReconstructResult<HashSet<u32>> {
         // fetch directory entry
         let key = slru_dir_to_key(kind);
 
-        let buf = self.get(key, lsn)?;
-        let dir = SlruSegmentDirectory::des(&buf)?;
-
-        Ok(dir.segments)
+        let buf = try_no_ondemand_download!(self.get(key, lsn));
+        match SlruSegmentDirectory::des(&buf).context("deserialization failure") {
+            Ok(dir) => PageReconstructResult::Success(dir.segments),
+            Err(e) => PageReconstructResult::from(e),
+        }
     }
 
-    pub fn get_relmap_file(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> anyhow::Result<Bytes> {
+    pub fn get_relmap_file(
+        &self,
+        spcnode: Oid,
+        dbnode: Oid,
+        lsn: Lsn,
+    ) -> PageReconstructResult<Bytes> {
         let key = relmap_file_key(spcnode, dbnode);
 
-        let buf = self.get(key, lsn)?;
-        Ok(buf)
+        let buf = try_no_ondemand_download!(self.get(key, lsn));
+        PageReconstructResult::Success(buf)
     }
 
-    pub fn list_dbdirs(&self, lsn: Lsn) -> anyhow::Result<HashMap<(Oid, Oid), bool>> {
+    pub fn list_dbdirs(&self, lsn: Lsn) -> PageReconstructResult<HashMap<(Oid, Oid), bool>> {
         // fetch directory entry
-        let buf = self.get(DBDIR_KEY, lsn)?;
-        let dir = DbDirectory::des(&buf)?;
+        let buf = try_no_ondemand_download!(self.get(DBDIR_KEY, lsn));
 
-        Ok(dir.dbdirs)
+        match DbDirectory::des(&buf).context("deserialization failure") {
+            Ok(dir) => PageReconstructResult::Success(dir.dbdirs),
+            Err(e) => PageReconstructResult::from(e),
+        }
     }
 
-    pub fn get_twophase_file(&self, xid: TransactionId, lsn: Lsn) -> anyhow::Result<Bytes> {
+    pub fn get_twophase_file(&self, xid: TransactionId, lsn: Lsn) -> PageReconstructResult<Bytes> {
         let key = twophase_file_key(xid);
-        let buf = self.get(key, lsn)?;
-        Ok(buf)
+        let buf = try_no_ondemand_download!(self.get(key, lsn));
+        PageReconstructResult::Success(buf)
     }
 
-    pub fn list_twophase_files(&self, lsn: Lsn) -> anyhow::Result<HashSet<TransactionId>> {
+    pub fn list_twophase_files(&self, lsn: Lsn) -> PageReconstructResult<HashSet<TransactionId>> {
         // fetch directory entry
-        let buf = self.get(TWOPHASEDIR_KEY, lsn)?;
-        let dir = TwoPhaseDirectory::des(&buf)?;
+        let buf = try_no_ondemand_download!(self.get(TWOPHASEDIR_KEY, lsn));
 
-        Ok(dir.xids)
+        match TwoPhaseDirectory::des(&buf).context("deserialization failure") {
+            Ok(dir) => PageReconstructResult::Success(dir.xids),
+            Err(e) => PageReconstructResult::from(e),
+        }
     }
 
-    pub fn get_control_file(&self, lsn: Lsn) -> anyhow::Result<Bytes> {
+    pub fn get_control_file(&self, lsn: Lsn) -> PageReconstructResult<Bytes> {
         self.get(CONTROLFILE_KEY, lsn)
     }
 
-    pub fn get_checkpoint(&self, lsn: Lsn) -> anyhow::Result<Bytes> {
+    pub fn get_checkpoint(&self, lsn: Lsn) -> PageReconstructResult<Bytes> {
         self.get(CHECKPOINT_KEY, lsn)
     }
 
@@ -404,23 +454,26 @@ impl Timeline {
     ///
     /// Only relation blocks are counted currently. That excludes metadata,
     /// SLRUs, twophase files etc.
-    pub fn get_current_logical_size_non_incremental(
+    pub async fn get_current_logical_size_non_incremental(
         &self,
         lsn: Lsn,
         cancel: CancellationToken,
-    ) -> std::result::Result<u64, CalculateLogicalSizeError> {
+    ) -> Result<u64, CalculateLogicalSizeError> {
         // Fetch list of database dirs and iterate them
-        let buf = self.get(DBDIR_KEY, lsn)?;
+        let buf = self.get_download(DBDIR_KEY, lsn).await?;
         let dbdir = DbDirectory::des(&buf).context("deserialize db directory")?;
 
         let mut total_size: u64 = 0;
         for (spcnode, dbnode) in dbdir.dbdirs.keys() {
-            for rel in self.list_rels(*spcnode, *dbnode, lsn)? {
+            for rel in
+                crate::tenant::with_ondemand_download(|| self.list_rels(*spcnode, *dbnode, lsn))
+                    .await?
+            {
                 if cancel.is_cancelled() {
                     return Err(CalculateLogicalSizeError::Cancelled);
                 }
                 let relsize_key = rel_size_to_key(rel);
-                let mut buf = self.get(relsize_key, lsn)?;
+                let mut buf = self.get_download(relsize_key, lsn).await?;
                 let relsize = buf.get_u32_le();
 
                 total_size += relsize as u64;
@@ -433,7 +486,7 @@ impl Timeline {
     /// Get a KeySpace that covers all the Keys that are in use at the given LSN.
     /// Anything that's not listed maybe removed from the underlying storage (from
     /// that LSN forwards).
-    pub fn collect_keyspace(&self, lsn: Lsn) -> anyhow::Result<KeySpace> {
+    pub async fn collect_keyspace(&self, lsn: Lsn) -> anyhow::Result<KeySpace> {
         // Iterate through key ranges, greedily packing them into partitions
         let mut result = KeySpaceAccum::new();
 
@@ -441,8 +494,8 @@ impl Timeline {
         result.add_key(DBDIR_KEY);
 
         // Fetch list of database dirs and iterate them
-        let buf = self.get(DBDIR_KEY, lsn)?;
-        let dbdir = DbDirectory::des(&buf)?;
+        let buf = self.get_download(DBDIR_KEY, lsn).await?;
+        let dbdir = DbDirectory::des(&buf).context("deserialization failure")?;
 
         let mut dbs: Vec<(Oid, Oid)> = dbdir.dbdirs.keys().cloned().collect();
         dbs.sort_unstable();
@@ -451,14 +504,15 @@ impl Timeline {
             result.add_key(rel_dir_to_key(spcnode, dbnode));
 
             let mut rels: Vec<RelTag> = self
-                .list_rels(spcnode, dbnode, lsn)?
+                .list_rels(spcnode, dbnode, lsn)
+                .no_ondemand_download()?
                 .iter()
                 .cloned()
                 .collect();
             rels.sort_unstable();
             for rel in rels {
                 let relsize_key = rel_size_to_key(rel);
-                let mut buf = self.get(relsize_key, lsn)?;
+                let mut buf = self.get_download(relsize_key, lsn).await?;
                 let relsize = buf.get_u32_le();
 
                 result.add_range(rel_block_to_key(rel, 0)..rel_block_to_key(rel, relsize));
@@ -474,13 +528,13 @@ impl Timeline {
         ] {
             let slrudir_key = slru_dir_to_key(kind);
             result.add_key(slrudir_key);
-            let buf = self.get(slrudir_key, lsn)?;
-            let dir = SlruSegmentDirectory::des(&buf)?;
+            let buf = self.get_download(slrudir_key, lsn).await?;
+            let dir = SlruSegmentDirectory::des(&buf).context("deserialization failure")?;
             let mut segments: Vec<u32> = dir.segments.iter().cloned().collect();
             segments.sort_unstable();
             for segno in segments {
                 let segsize_key = slru_segment_size_to_key(kind, segno);
-                let mut buf = self.get(segsize_key, lsn)?;
+                let mut buf = self.get_download(segsize_key, lsn).await?;
                 let segsize = buf.get_u32_le();
 
                 result.add_range(
@@ -492,8 +546,8 @@ impl Timeline {
 
         // Then pg_twophase
         result.add_key(TWOPHASEDIR_KEY);
-        let buf = self.get(TWOPHASEDIR_KEY, lsn)?;
-        let twophase_dir = TwoPhaseDirectory::des(&buf)?;
+        let buf = self.get_download(TWOPHASEDIR_KEY, lsn).await?;
+        let twophase_dir = TwoPhaseDirectory::des(&buf).context("deserialization failure")?;
         let mut xids: Vec<TransactionId> = twophase_dir.xids.iter().cloned().collect();
         xids.sort_unstable();
         for xid in xids {
@@ -606,7 +660,7 @@ impl<'a> DatadirModification<'a> {
         blknum: BlockNumber,
         rec: NeonWalRecord,
     ) -> anyhow::Result<()> {
-        ensure!(rel.relnode != 0, "invalid relnode");
+        anyhow::ensure!(rel.relnode != 0, "invalid relnode");
         self.put(rel_block_to_key(rel, blknum), Value::WalRecord(rec));
         Ok(())
     }
@@ -633,7 +687,7 @@ impl<'a> DatadirModification<'a> {
         blknum: BlockNumber,
         img: Bytes,
     ) -> anyhow::Result<()> {
-        ensure!(rel.relnode != 0, "invalid relnode");
+        anyhow::ensure!(rel.relnode != 0, "invalid relnode");
         self.put(rel_block_to_key(rel, blknum), Value::Image(img));
         Ok(())
     }
@@ -652,7 +706,7 @@ impl<'a> DatadirModification<'a> {
     /// Store a relmapper file (pg_filenode.map) in the repository
     pub fn put_relmap_file(&mut self, spcnode: Oid, dbnode: Oid, img: Bytes) -> anyhow::Result<()> {
         // Add it to the directory (if it doesn't exist already)
-        let buf = self.get(DBDIR_KEY)?;
+        let buf = self.get(DBDIR_KEY).no_ondemand_download()?;
         let mut dbdir = DbDirectory::des(&buf)?;
 
         let r = dbdir.dbdirs.insert((spcnode, dbnode), true);
@@ -680,10 +734,10 @@ impl<'a> DatadirModification<'a> {
 
     pub fn put_twophase_file(&mut self, xid: TransactionId, img: Bytes) -> anyhow::Result<()> {
         // Add it to the directory entry
-        let buf = self.get(TWOPHASEDIR_KEY)?;
+        let buf = self.get(TWOPHASEDIR_KEY).no_ondemand_download()?;
         let mut dir = TwoPhaseDirectory::des(&buf)?;
         if !dir.xids.insert(xid) {
-            bail!("twophase file for xid {} already exists", xid);
+            anyhow::bail!("twophase file for xid {} already exists", xid);
         }
         self.put(
             TWOPHASEDIR_KEY,
@@ -707,10 +761,13 @@ impl<'a> DatadirModification<'a> {
     pub fn drop_dbdir(&mut self, spcnode: Oid, dbnode: Oid) -> anyhow::Result<()> {
         let req_lsn = self.tline.get_last_record_lsn();
 
-        let total_blocks = self.tline.get_db_size(spcnode, dbnode, req_lsn, true)?;
+        let total_blocks = self
+            .tline
+            .get_db_size(spcnode, dbnode, req_lsn, true)
+            .no_ondemand_download()?;
 
         // Remove entry from dbdir
-        let buf = self.get(DBDIR_KEY)?;
+        let buf = self.get(DBDIR_KEY).no_ondemand_download()?;
         let mut dir = DbDirectory::des(&buf)?;
         if dir.dbdirs.remove(&(spcnode, dbnode)).is_some() {
             let buf = DbDirectory::ser(&dir)?;
@@ -734,10 +791,10 @@ impl<'a> DatadirModification<'a> {
     ///
     /// 'nblocks' is the initial size.
     pub fn put_rel_creation(&mut self, rel: RelTag, nblocks: BlockNumber) -> anyhow::Result<()> {
-        ensure!(rel.relnode != 0, "invalid relnode");
+        anyhow::ensure!(rel.relnode != 0, "invalid relnode");
         // It's possible that this is the first rel for this db in this
         // tablespace.  Create the reldir entry for it if so.
-        let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY)?)?;
+        let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY).no_ondemand_download()?)?;
         let rel_dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode);
         let mut rel_dir = if dbdir.dbdirs.get(&(rel.spcnode, rel.dbnode)).is_none() {
             // Didn't exist. Update dbdir
@@ -749,12 +806,12 @@ impl<'a> DatadirModification<'a> {
             RelDirectory::default()
         } else {
             // reldir already exists, fetch it
-            RelDirectory::des(&self.get(rel_dir_key)?)?
+            RelDirectory::des(&self.get(rel_dir_key).no_ondemand_download()?)?
         };
 
         // Add the new relation to the rel directory entry, and write it back
         if !rel_dir.rels.insert((rel.relnode, rel.forknum)) {
-            bail!("rel {} already exists", rel);
+            anyhow::bail!("rel {rel} already exists");
         }
         self.put(
             rel_dir_key,
@@ -778,12 +835,16 @@ impl<'a> DatadirModification<'a> {
 
     /// Truncate relation
     pub fn put_rel_truncation(&mut self, rel: RelTag, nblocks: BlockNumber) -> anyhow::Result<()> {
-        ensure!(rel.relnode != 0, "invalid relnode");
+        anyhow::ensure!(rel.relnode != 0, "invalid relnode");
         let last_lsn = self.tline.get_last_record_lsn();
-        if self.tline.get_rel_exists(rel, last_lsn, true)? {
+        if self
+            .tline
+            .get_rel_exists(rel, last_lsn, true)
+            .no_ondemand_download()?
+        {
             let size_key = rel_size_to_key(rel);
             // Fetch the old size first
-            let old_size = self.get(size_key)?.get_u32_le();
+            let old_size = self.get(size_key).no_ondemand_download()?.get_u32_le();
 
             // Update the entry with the new size.
             let buf = nblocks.to_le_bytes();
@@ -804,11 +865,11 @@ impl<'a> DatadirModification<'a> {
     /// Extend relation
     /// If new size is smaller, do nothing.
     pub fn put_rel_extend(&mut self, rel: RelTag, nblocks: BlockNumber) -> anyhow::Result<()> {
-        ensure!(rel.relnode != 0, "invalid relnode");
+        anyhow::ensure!(rel.relnode != 0, "invalid relnode");
 
         // Put size
         let size_key = rel_size_to_key(rel);
-        let old_size = self.get(size_key)?.get_u32_le();
+        let old_size = self.get(size_key).no_ondemand_download()?.get_u32_le();
 
         // only extend relation here. never decrease the size
         if nblocks > old_size {
@@ -825,11 +886,11 @@ impl<'a> DatadirModification<'a> {
 
     /// Drop a relation.
     pub fn put_rel_drop(&mut self, rel: RelTag) -> anyhow::Result<()> {
-        ensure!(rel.relnode != 0, "invalid relnode");
+        anyhow::ensure!(rel.relnode != 0, "invalid relnode");
 
         // Remove it from the directory entry
         let dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode);
-        let buf = self.get(dir_key)?;
+        let buf = self.get(dir_key).no_ondemand_download()?;
         let mut dir = RelDirectory::des(&buf)?;
 
         if dir.rels.remove(&(rel.relnode, rel.forknum)) {
@@ -840,7 +901,7 @@ impl<'a> DatadirModification<'a> {
 
         // update logical size
         let size_key = rel_size_to_key(rel);
-        let old_size = self.get(size_key)?.get_u32_le();
+        let old_size = self.get(size_key).no_ondemand_download()?.get_u32_le();
         self.pending_nblocks -= old_size as i64;
 
         // Remove enty from relation size cache
@@ -860,11 +921,11 @@ impl<'a> DatadirModification<'a> {
     ) -> anyhow::Result<()> {
         // Add it to the directory entry
         let dir_key = slru_dir_to_key(kind);
-        let buf = self.get(dir_key)?;
+        let buf = self.get(dir_key).no_ondemand_download()?;
         let mut dir = SlruSegmentDirectory::des(&buf)?;
 
         if !dir.segments.insert(segno) {
-            bail!("slru segment {:?}/{} already exists", kind, segno);
+            anyhow::bail!("slru segment {kind:?}/{segno} already exists");
         }
         self.put(
             dir_key,
@@ -899,7 +960,7 @@ impl<'a> DatadirModification<'a> {
     pub fn drop_slru_segment(&mut self, kind: SlruKind, segno: u32) -> anyhow::Result<()> {
         // Remove it from the directory entry
         let dir_key = slru_dir_to_key(kind);
-        let buf = self.get(dir_key)?;
+        let buf = self.get(dir_key).no_ondemand_download()?;
         let mut dir = SlruSegmentDirectory::des(&buf)?;
 
         if !dir.segments.remove(&segno) {
@@ -925,7 +986,7 @@ impl<'a> DatadirModification<'a> {
     /// This method is used for marking truncated SLRU files
     pub fn drop_twophase_file(&mut self, xid: TransactionId) -> anyhow::Result<()> {
         // Remove it from the directory entry
-        let buf = self.get(TWOPHASEDIR_KEY)?;
+        let buf = self.get(TWOPHASEDIR_KEY).no_ondemand_download()?;
         let mut dir = TwoPhaseDirectory::des(&buf)?;
 
         if !dir.xids.remove(&xid) {
@@ -1019,7 +1080,7 @@ impl<'a> DatadirModification<'a> {
 
     // Internal helper functions to batch the modifications
 
-    fn get(&self, key: Key) -> anyhow::Result<Bytes> {
+    fn get(&self, key: Key) -> PageReconstructResult<Bytes> {
         // Have we already updated the same key? Read the pending updated
         // version in that case.
         //
@@ -1027,14 +1088,16 @@ impl<'a> DatadirModification<'a> {
         // value that has been removed, deletion only avoids leaking storage.
         if let Some(value) = self.pending_updates.get(&key) {
             if let Value::Image(img) = value {
-                Ok(img.clone())
+                PageReconstructResult::Success(img.clone())
             } else {
                 // Currently, we never need to read back a WAL record that we
                 // inserted in the same "transaction". All the metadata updates
                 // work directly with Images, and we never need to read actual
                 // data pages. We could handle this if we had to, by calling
                 // the walredo manager, but let's keep it simple for now.
-                bail!("unexpected pending WAL record");
+                return PageReconstructResult::from(anyhow::anyhow!(
+                    "unexpected pending WAL record"
+                ));
             }
         } else {
             let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn);
@@ -1400,7 +1463,7 @@ pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> {
             },
             key.field6,
         ),
-        _ => bail!("unexpected value kind 0x{:02x}", key.field1),
+        _ => anyhow::bail!("unexpected value kind 0x{:02x}", key.field1),
     })
 }
 
@@ -1426,14 +1489,14 @@ pub fn key_to_slru_block(key: Key) -> anyhow::Result<(SlruKind, u32, BlockNumber
                 0x00 => SlruKind::Clog,
                 0x01 => SlruKind::MultiXactMembers,
                 0x02 => SlruKind::MultiXactOffsets,
-                _ => bail!("unrecognized slru kind 0x{:02x}", key.field2),
+                _ => anyhow::bail!("unrecognized slru kind 0x{:02x}", key.field2),
             };
             let segno = key.field4;
             let blknum = key.field6;
 
             (kind, segno, blknum)
         }
-        _ => bail!("unexpected value kind 0x{:02x}", key.field1),
+        _ => anyhow::bail!("unexpected value kind 0x{:02x}", key.field1),
     })
 }
 
diff --git a/pageserver/src/storage_sync2.rs b/pageserver/src/storage_sync2.rs
index 9253b250cd..a2337e8fd6 100644
--- a/pageserver/src/storage_sync2.rs
+++ b/pageserver/src/storage_sync2.rs
@@ -148,31 +148,43 @@
 //! following two cases:
 //! - (1) We had the file locally, deleted it locally, scheduled a remote delete,
 //!   but crashed before it finished remotely.
-//! - (2) We never had the file locally because we were still in tenant attach
-//!   when we crashed. (Similar case for on-demand download in the future.)
+//! - (2) We never had the file locally because we haven't on-demand downloaded
+//!   it yet.
 //!
-//! # Downloads (= Tenant Attach)
+//! # Downloads
 //!
 //! In addition to the upload queue, [`RemoteTimelineClient`] has functions for
-//! downloading files from the remote storage. Downloads are performed immediately,
-//! independently of the uploads.
+//! downloading files from the remote storage. Downloads are performed immediately
+//! against the `RemoteStorage`, independently of the upload queue.
 //!
 //! When we attach a tenant, we perform the following steps:
 //! - create `Tenant` object in `TenantState::Attaching` state
-//! - List timelines that are present in remote storage, and download their remote [`IndexPart`]s
-//! - For each timeline, create `Timeline` struct and a `RemoteTimelineClient`, and initialize the client's upload queue with its `IndexPart`
-//! - eagerly download all the remote layers using the client's download APIs
-//! - transition tenant from `TenantState::Attaching` to `TenantState::Active` state.
+//! - List timelines that are present in remote storage, and for each:
+//!   - download their remote [`IndexPart`]s
+//!   - create `Timeline` struct and a `RemoteTimelineClient`
+//!   - initialize the client's upload queue with its `IndexPart`
+//!   - create [`RemoteLayer`] instances for layers that are referenced by `IndexPart`
+//!     but not present locally
+//!   - schedule uploads for layers that are only present locally.
+//!   - if the remote `IndexPart`'s metadata was newer than the metadata in
+//!     the local filesystem, write the remote metadata to the local filesystem
+//! - After the above is done for each timeline, open the tenant for business by
+//!   transitioning it from `TenantState::Attaching` to `TenantState::Active` state.
+//!   This starts the timelines' WAL-receivers and the tenant's GC & Compaction loops.
 //!
-//! Most of the above happens in [`Timeline::reconcile_with_remote`].
+//! Most of the above steps happen in [`Timeline::reconcile_with_remote`] or its callers.
 //! We keep track of the fact that a client is in `Attaching` state in a marker
-//! file on the local disk.
-//! However, the distinction is moot for storage sync since we call
-//! `reconcile_with_remote` for tenants both with and without the marker file.
-//!
-//! In the future, downloading will be done on-demand and `reconcile_with_remote`
-//! will only be responsible for re-scheduling upload ops after a crash of an
-//! `Active` tenant.
+//! file on the local disk. This is critical because, when we restart the pageserver,
+//! we do not want to do the `List timelines` step for each tenant that has already
+//! been successfully attached (for performance & cost reasons).
+//! Instead, for a tenant without the attach marker file, we assume that the
+//! local state is in sync with or ahead of the remote state. This includes the list
+//! of all of the tenant's timelines, which is particularly critical to be up-to-date:
+//! if there's a timeline on the remote that the pageserver doesn't know about,
+//! the GC will not consider its branch point, leading to data loss.
+//! So, for a tenant with the attach marker file, we know that we have not yet
+//! persisted all of the remote timelines' metadata files locally. To exclude the
+//! risk above, we re-run the procedure for such tenants.
 //!
 //! # Operating Without Remote Storage
 //!
diff --git a/pageserver/src/storage_sync2/download.rs b/pageserver/src/storage_sync2/download.rs
index c81be05981..4256767020 100644
--- a/pageserver/src/storage_sync2/download.rs
+++ b/pageserver/src/storage_sync2/download.rs
@@ -180,6 +180,10 @@ pub async fn list_remote_timelines<'a>(
     let tenant_path = conf.timelines_path(&tenant_id);
     let tenant_storage_path = conf.remote_path(&tenant_path)?;
 
+    fail::fail_point!("storage-sync-list-remote-timelines", |_| {
+        anyhow::bail!("storage-sync-list-remote-timelines");
+    });
+
     let timelines = download_retry(
         || storage.list_prefixes(Some(&tenant_storage_path)),
         &format!("list prefixes for {tenant_path:?}"),
diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs
index fe3ad1a57d..a1b3ad26b0 100644
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -35,6 +35,7 @@
 #![allow(clippy::declare_interior_mutable_const)]
 
 use std::collections::HashMap;
+use std::fmt;
 use std::future::Future;
 use std::panic::AssertUnwindSafe;
 use std::sync::atomic::{AtomicU64, Ordering};
@@ -134,8 +135,15 @@ pub static BACKGROUND_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
         .expect("Failed to create background op runtime")
 });
 
+#[derive(Debug, Clone, Copy)]
 pub struct PageserverTaskId(u64);
 
+impl fmt::Display for PageserverTaskId {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        self.0.fmt(f)
+    }
+}
+
 /// Each task that we track is associated with a "task ID". It's just an
 /// increasing number that we assign. Note that it is different from tokio::task::Id.
 static NEXT_TASK_ID: AtomicU64 = AtomicU64::new(1);
@@ -198,6 +206,9 @@ pub enum TaskKind {
     // Task that uploads a file to remote storage
     RemoteUploadTask,
 
+    // Task that downloads a file from remote storage
+    RemoteDownloadTask,
+
     // task that handles the initial downloading of all tenants
     InitialLoad,
 
@@ -206,6 +217,9 @@ pub enum TaskKind {
 
     // task that handhes metrics collection
     MetricsCollection,
+
+    // task that drives downloading layers
+    DownloadAllRemoteLayers,
 }
 
 #[derive(Default)]
@@ -437,6 +451,10 @@ pub fn current_task_kind() -> Option<TaskKind> {
     CURRENT_TASK.try_with(|ct| ct.kind).ok()
 }
 
+pub fn current_task_id() -> Option<PageserverTaskId> {
+    CURRENT_TASK.try_with(|ct| ct.task_id).ok()
+}
+
 /// A Future that can be used to check if the current task has been requested to
 /// shut down.
 pub async fn shutdown_watcher() {
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 799a34fb3b..1240a3b4fb 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -81,6 +81,7 @@ pub mod filename;
 mod image_layer;
 mod inmemory_layer;
 pub mod layer_map;
+mod remote_layer;
 
 pub mod metadata;
 mod par_fsync;
@@ -90,7 +91,7 @@ mod timeline;
 
 pub mod size;
 
-pub use timeline::Timeline;
+pub use timeline::{with_ondemand_download, PageReconstructError, PageReconstructResult, Timeline};
 
 // re-export this function so that page_cache.rs can use it.
 pub use crate::tenant::ephemeral_file::writeback as writeback_ephemeral_file;
@@ -2780,9 +2781,18 @@ mod tests {
         writer.finish_write(Lsn(0x20));
         drop(writer);
 
-        assert_eq!(tline.get(*TEST_KEY, Lsn(0x10))?, TEST_IMG("foo at 0x10"));
-        assert_eq!(tline.get(*TEST_KEY, Lsn(0x1f))?, TEST_IMG("foo at 0x10"));
-        assert_eq!(tline.get(*TEST_KEY, Lsn(0x20))?, TEST_IMG("foo at 0x20"));
+        assert_eq!(
+            tline.get(*TEST_KEY, Lsn(0x10)).no_ondemand_download()?,
+            TEST_IMG("foo at 0x10")
+        );
+        assert_eq!(
+            tline.get(*TEST_KEY, Lsn(0x1f)).no_ondemand_download()?,
+            TEST_IMG("foo at 0x10")
+        );
+        assert_eq!(
+            tline.get(*TEST_KEY, Lsn(0x20)).no_ondemand_download()?,
+            TEST_IMG("foo at 0x20")
+        );
 
         Ok(())
     }
@@ -2859,15 +2869,15 @@ mod tests {
 
         // Check page contents on both branches
         assert_eq!(
-            from_utf8(&tline.get(TEST_KEY_A, Lsn(0x40))?)?,
+            from_utf8(&tline.get(TEST_KEY_A, Lsn(0x40)).no_ondemand_download()?)?,
             "foo at 0x40"
         );
         assert_eq!(
-            from_utf8(&newtline.get(TEST_KEY_A, Lsn(0x40))?)?,
+            from_utf8(&newtline.get(TEST_KEY_A, Lsn(0x40)).no_ondemand_download()?)?,
             "bar at 0x40"
         );
         assert_eq!(
-            from_utf8(&newtline.get(TEST_KEY_B, Lsn(0x40))?)?,
+            from_utf8(&newtline.get(TEST_KEY_B, Lsn(0x40)).no_ondemand_download()?)?,
             "foobar at 0x20"
         );
 
@@ -3026,7 +3036,10 @@ mod tests {
         tenant
             .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO)
             .await?;
-        assert!(newtline.get(*TEST_KEY, Lsn(0x25)).is_ok());
+        assert!(newtline
+            .get(*TEST_KEY, Lsn(0x25))
+            .no_ondemand_download()
+            .is_ok());
 
         Ok(())
     }
@@ -3056,7 +3069,7 @@ mod tests {
 
         // Check that the data is still accessible on the branch.
         assert_eq!(
-            newtline.get(*TEST_KEY, Lsn(0x50))?,
+            newtline.get(*TEST_KEY, Lsn(0x50)).no_ondemand_download()?,
             TEST_IMG(&format!("foo at {}", Lsn(0x40)))
         );
 
@@ -3203,11 +3216,26 @@ mod tests {
         tline.freeze_and_flush().await?;
         tline.compact().await?;
 
-        assert_eq!(tline.get(*TEST_KEY, Lsn(0x10))?, TEST_IMG("foo at 0x10"));
-        assert_eq!(tline.get(*TEST_KEY, Lsn(0x1f))?, TEST_IMG("foo at 0x10"));
-        assert_eq!(tline.get(*TEST_KEY, Lsn(0x20))?, TEST_IMG("foo at 0x20"));
-        assert_eq!(tline.get(*TEST_KEY, Lsn(0x30))?, TEST_IMG("foo at 0x30"));
-        assert_eq!(tline.get(*TEST_KEY, Lsn(0x40))?, TEST_IMG("foo at 0x40"));
+        assert_eq!(
+            tline.get(*TEST_KEY, Lsn(0x10)).no_ondemand_download()?,
+            TEST_IMG("foo at 0x10")
+        );
+        assert_eq!(
+            tline.get(*TEST_KEY, Lsn(0x1f)).no_ondemand_download()?,
+            TEST_IMG("foo at 0x10")
+        );
+        assert_eq!(
+            tline.get(*TEST_KEY, Lsn(0x20)).no_ondemand_download()?,
+            TEST_IMG("foo at 0x20")
+        );
+        assert_eq!(
+            tline.get(*TEST_KEY, Lsn(0x30)).no_ondemand_download()?,
+            TEST_IMG("foo at 0x30")
+        );
+        assert_eq!(
+            tline.get(*TEST_KEY, Lsn(0x40)).no_ondemand_download()?,
+            TEST_IMG("foo at 0x40")
+        );
 
         Ok(())
     }
@@ -3315,7 +3343,7 @@ mod tests {
             for (blknum, last_lsn) in updated.iter().enumerate() {
                 test_key.field6 = blknum as u32;
                 assert_eq!(
-                    tline.get(test_key, lsn)?,
+                    tline.get(test_key, lsn).no_ondemand_download()?,
                     TEST_IMG(&format!("{} at {}", blknum, last_lsn))
                 );
             }
@@ -3401,7 +3429,7 @@ mod tests {
             for (blknum, last_lsn) in updated.iter().enumerate() {
                 test_key.field6 = blknum as u32;
                 assert_eq!(
-                    tline.get(test_key, lsn)?,
+                    tline.get(test_key, lsn).no_ondemand_download()?,
                     TEST_IMG(&format!("{} at {}", blknum, last_lsn))
                 );
             }
@@ -3476,7 +3504,7 @@ mod tests {
                 println!("checking [{idx}][{blknum}] at {lsn}");
                 test_key.field6 = blknum as u32;
                 assert_eq!(
-                    tline.get(test_key, *lsn)?,
+                    tline.get(test_key, *lsn).no_ondemand_download()?,
                     TEST_IMG(&format!("{idx} {blknum} at {lsn}"))
                 );
             }
diff --git a/pageserver/src/tenant/delta_layer.rs b/pageserver/src/tenant/delta_layer.rs
index e1006dfe00..5b724b6263 100644
--- a/pageserver/src/tenant/delta_layer.rs
+++ b/pageserver/src/tenant/delta_layer.rs
@@ -39,7 +39,7 @@ use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
 use anyhow::{bail, ensure, Context, Result};
 use rand::{distributions::Alphanumeric, Rng};
 use serde::{Deserialize, Serialize};
-use std::fs;
+use std::fs::{self, File};
 use std::io::{BufWriter, Write};
 use std::io::{Seek, SeekFrom};
 use std::ops::Range;
@@ -183,6 +183,8 @@ pub struct DeltaLayer {
     pub key_range: Range<Key>,
     pub lsn_range: Range<Lsn>,
 
+    pub file_size: u64,
+
     inner: RwLock<DeltaLayerInner>,
 }
 
@@ -411,6 +413,10 @@ impl PersistentLayer for DeltaLayer {
         fs::remove_file(self.path())?;
         Ok(())
     }
+
+    fn file_size(&self) -> Option<u64> {
+        Some(self.file_size)
+    }
 }
 
 impl DeltaLayer {
@@ -535,6 +541,7 @@ impl DeltaLayer {
         timeline_id: TimelineId,
         tenant_id: TenantId,
         filename: &DeltaFileName,
+        file_size: u64,
     ) -> DeltaLayer {
         DeltaLayer {
             path_or_conf: PathOrConf::Conf(conf),
@@ -542,6 +549,7 @@ impl DeltaLayer {
             tenant_id,
             key_range: filename.key_range.clone(),
             lsn_range: filename.lsn_range.clone(),
+            file_size,
             inner: RwLock::new(DeltaLayerInner {
                 loaded: false,
                 file: None,
@@ -554,21 +562,23 @@ impl DeltaLayer {
     /// Create a DeltaLayer struct representing an existing file on disk.
     ///
     /// This variant is only used for debugging purposes, by the 'pageserver_binutils' binary.
-    pub fn new_for_path<F>(path: &Path, file: F) -> Result<Self>
-    where
-        F: FileExt,
-    {
+    pub fn new_for_path(path: &Path, file: File) -> Result<Self> {
         let mut summary_buf = Vec::new();
         summary_buf.resize(PAGE_SZ, 0);
         file.read_exact_at(&mut summary_buf, 0)?;
         let summary = Summary::des_prefix(&summary_buf)?;
 
+        let metadata = file
+            .metadata()
+            .context("get file metadata to determine size")?;
+
         Ok(DeltaLayer {
             path_or_conf: PathOrConf::Path(path.to_path_buf()),
             timeline_id: summary.timeline_id,
             tenant_id: summary.tenant_id,
             key_range: summary.key_range,
             lsn_range: summary.lsn_range,
+            file_size: metadata.len(),
             inner: RwLock::new(DeltaLayerInner {
                 loaded: false,
                 file: None,
@@ -725,6 +735,10 @@ impl DeltaLayerWriterInner {
         file.seek(SeekFrom::Start(0))?;
         Summary::ser_into(&summary, &mut file)?;
 
+        let metadata = file
+            .metadata()
+            .context("get file metadata to determine size")?;
+
         // Note: Because we opened the file in write-only mode, we cannot
         // reuse the same VirtualFile for reading later. That's why we don't
         // set inner.file here. The first read will have to re-open it.
@@ -734,6 +748,7 @@ impl DeltaLayerWriterInner {
             timeline_id: self.timeline_id,
             key_range: self.key_start..key_end,
             lsn_range: self.lsn_range.clone(),
+            file_size: metadata.len(),
             inner: RwLock::new(DeltaLayerInner {
                 loaded: false,
                 file: None,
diff --git a/pageserver/src/tenant/image_layer.rs b/pageserver/src/tenant/image_layer.rs
index b1dbbfb683..1e129fc01d 100644
--- a/pageserver/src/tenant/image_layer.rs
+++ b/pageserver/src/tenant/image_layer.rs
@@ -36,10 +36,11 @@ use bytes::Bytes;
 use hex;
 use rand::{distributions::Alphanumeric, Rng};
 use serde::{Deserialize, Serialize};
-use std::fs;
+use std::fs::{self, File};
 use std::io::Write;
 use std::io::{Seek, SeekFrom};
 use std::ops::Range;
+use std::os::unix::prelude::FileExt;
 use std::path::{Path, PathBuf};
 use std::sync::{RwLock, RwLockReadGuard};
 use tracing::*;
@@ -105,6 +106,7 @@ pub struct ImageLayer {
     pub tenant_id: TenantId,
     pub timeline_id: TimelineId,
     pub key_range: Range<Key>,
+    pub file_size: u64,
 
     // This entry contains an image of all pages as of this LSN
     pub lsn: Lsn,
@@ -228,6 +230,10 @@ impl PersistentLayer for ImageLayer {
         fs::remove_file(self.path())?;
         Ok(())
     }
+
+    fn file_size(&self) -> Option<u64> {
+        Some(self.file_size)
+    }
 }
 
 impl ImageLayer {
@@ -344,6 +350,7 @@ impl ImageLayer {
         timeline_id: TimelineId,
         tenant_id: TenantId,
         filename: &ImageFileName,
+        file_size: u64,
     ) -> ImageLayer {
         ImageLayer {
             path_or_conf: PathOrConf::Conf(conf),
@@ -351,6 +358,7 @@ impl ImageLayer {
             tenant_id,
             key_range: filename.key_range.clone(),
             lsn: filename.lsn,
+            file_size,
             inner: RwLock::new(ImageLayerInner {
                 loaded: false,
                 file: None,
@@ -363,21 +371,21 @@ impl ImageLayer {
     /// Create an ImageLayer struct representing an existing file on disk.
     ///
     /// This variant is only used for debugging purposes, by the 'pageserver_binutils' binary.
-    pub fn new_for_path<F>(path: &Path, file: F) -> Result<ImageLayer>
-    where
-        F: std::os::unix::prelude::FileExt,
-    {
+    pub fn new_for_path(path: &Path, file: File) -> Result<ImageLayer> {
         let mut summary_buf = Vec::new();
         summary_buf.resize(PAGE_SZ, 0);
         file.read_exact_at(&mut summary_buf, 0)?;
         let summary = Summary::des_prefix(&summary_buf)?;
-
+        let metadata = file
+            .metadata()
+            .context("get file metadata to determine size")?;
         Ok(ImageLayer {
             path_or_conf: PathOrConf::Path(path.to_path_buf()),
             timeline_id: summary.timeline_id,
             tenant_id: summary.tenant_id,
             key_range: summary.key_range,
             lsn: summary.lsn,
+            file_size: metadata.len(),
             inner: RwLock::new(ImageLayerInner {
                 file: None,
                 loaded: false,
@@ -523,6 +531,10 @@ impl ImageLayerWriterInner {
         file.seek(SeekFrom::Start(0))?;
         Summary::ser_into(&summary, &mut file)?;
 
+        let metadata = file
+            .metadata()
+            .context("get metadata to determine file size")?;
+
         // Note: Because we open the file in write-only mode, we cannot
         // reuse the same VirtualFile for reading later. That's why we don't
         // set inner.file here. The first read will have to re-open it.
@@ -532,6 +544,7 @@ impl ImageLayerWriterInner {
             tenant_id: self.tenant_id,
             key_range: self.key_range.clone(),
             lsn: self.lsn,
+            file_size: metadata.len(),
             inner: RwLock::new(ImageLayerInner {
                 loaded: false,
                 file: None,
diff --git a/pageserver/src/tenant/remote_layer.rs b/pageserver/src/tenant/remote_layer.rs
new file mode 100644
index 0000000000..affe8ca0a8
--- /dev/null
+++ b/pageserver/src/tenant/remote_layer.rs
@@ -0,0 +1,212 @@
+//! A RemoteLayer is an in-memory placeholder for a layer file that exists
+//! in remote storage.
+//!
+use crate::config::PageServerConf;
+use crate::repository::Key;
+use crate::storage_sync::index::LayerFileMetadata;
+use crate::tenant::delta_layer::DeltaLayer;
+use crate::tenant::filename::{DeltaFileName, ImageFileName};
+use crate::tenant::image_layer::ImageLayer;
+use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
+use anyhow::{bail, Result};
+use std::ops::Range;
+use std::path::PathBuf;
+use std::sync::Arc;
+
+use utils::{
+    id::{TenantId, TimelineId},
+    lsn::Lsn,
+};
+
+use super::filename::LayerFileName;
+use super::storage_layer::{LayerIter, LayerKeyIter, PersistentLayer};
+
+#[derive(Debug)]
+pub struct RemoteLayer {
+    tenantid: TenantId,
+    timelineid: TimelineId,
+    key_range: Range<Key>,
+    lsn_range: Range<Lsn>,
+
+    pub file_name: LayerFileName,
+
+    pub layer_metadata: LayerFileMetadata,
+
+    is_delta: bool,
+
+    is_incremental: bool,
+
+    pub(crate) ongoing_download: Arc<tokio::sync::Semaphore>,
+}
+
+impl Layer for RemoteLayer {
+    fn get_key_range(&self) -> Range<Key> {
+        self.key_range.clone()
+    }
+
+    fn get_lsn_range(&self) -> Range<Lsn> {
+        self.lsn_range.clone()
+    }
+
+    fn get_value_reconstruct_data(
+        &self,
+        _key: Key,
+        _lsn_range: Range<Lsn>,
+        _reconstruct_state: &mut ValueReconstructState,
+    ) -> Result<ValueReconstructResult> {
+        bail!(
+            "layer {} needs to be downloaded",
+            self.filename().file_name()
+        );
+    }
+
+    fn is_incremental(&self) -> bool {
+        self.is_incremental
+    }
+
+    /// debugging function to print out the contents of the layer
+    fn dump(&self, _verbose: bool) -> Result<()> {
+        println!(
+            "----- remote layer for ten {} tli {} keys {}-{} lsn {}-{} ----",
+            self.tenantid,
+            self.timelineid,
+            self.key_range.start,
+            self.key_range.end,
+            self.lsn_range.start,
+            self.lsn_range.end
+        );
+
+        Ok(())
+    }
+
+    fn short_id(&self) -> String {
+        self.filename().file_name()
+    }
+}
+
+impl PersistentLayer for RemoteLayer {
+    fn get_tenant_id(&self) -> TenantId {
+        self.tenantid
+    }
+
+    fn get_timeline_id(&self) -> TimelineId {
+        self.timelineid
+    }
+
+    fn filename(&self) -> LayerFileName {
+        if self.is_delta {
+            DeltaFileName {
+                key_range: self.key_range.clone(),
+                lsn_range: self.lsn_range.clone(),
+            }
+            .into()
+        } else {
+            ImageFileName {
+                key_range: self.key_range.clone(),
+                lsn: self.lsn_range.start,
+            }
+            .into()
+        }
+    }
+
+    fn local_path(&self) -> Option<PathBuf> {
+        None
+    }
+
+    fn iter(&self) -> Result<LayerIter<'_>> {
+        bail!("cannot iterate a remote layer");
+    }
+
+    fn key_iter(&self) -> Result<LayerKeyIter<'_>> {
+        bail!("cannot iterate a remote layer");
+    }
+
+    fn delete(&self) -> Result<()> {
+        Ok(())
+    }
+
+    fn downcast_remote_layer<'a>(self: Arc<Self>) -> Option<std::sync::Arc<RemoteLayer>> {
+        Some(self)
+    }
+
+    fn is_remote_layer(&self) -> bool {
+        true
+    }
+
+    fn file_size(&self) -> Option<u64> {
+        self.layer_metadata.file_size()
+    }
+}
+
+impl RemoteLayer {
+    pub fn new_img(
+        tenantid: TenantId,
+        timelineid: TimelineId,
+        fname: &ImageFileName,
+        layer_metadata: &LayerFileMetadata,
+    ) -> RemoteLayer {
+        RemoteLayer {
+            tenantid,
+            timelineid,
+            key_range: fname.key_range.clone(),
+            lsn_range: fname.lsn..(fname.lsn + 1),
+            is_delta: false,
+            is_incremental: false,
+            file_name: fname.to_owned().into(),
+            layer_metadata: layer_metadata.clone(),
+            ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)),
+        }
+    }
+
+    pub fn new_delta(
+        tenantid: TenantId,
+        timelineid: TimelineId,
+        fname: &DeltaFileName,
+        layer_metadata: &LayerFileMetadata,
+    ) -> RemoteLayer {
+        RemoteLayer {
+            tenantid,
+            timelineid,
+            key_range: fname.key_range.clone(),
+            lsn_range: fname.lsn_range.clone(),
+            is_delta: true,
+            is_incremental: true,
+            file_name: fname.to_owned().into(),
+            layer_metadata: layer_metadata.clone(),
+            ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)),
+        }
+    }
+
+    /// Create a Layer struct representing this layer, after it has been downloaded.
+    pub fn create_downloaded_layer(
+        &self,
+        conf: &'static PageServerConf,
+        file_size: u64,
+    ) -> Arc<dyn PersistentLayer> {
+        if self.is_delta {
+            let fname = DeltaFileName {
+                key_range: self.key_range.clone(),
+                lsn_range: self.lsn_range.clone(),
+            };
+            Arc::new(DeltaLayer::new(
+                conf,
+                self.timelineid,
+                self.tenantid,
+                &fname,
+                file_size,
+            ))
+        } else {
+            let fname = ImageFileName {
+                key_range: self.key_range.clone(),
+                lsn: self.lsn_range.start,
+            };
+            Arc::new(ImageLayer::new(
+                conf,
+                self.timelineid,
+                self.tenantid,
+                &fname,
+                file_size,
+            ))
+        }
+    }
+}
diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs
index 5ce0837562..aa11985cbe 100644
--- a/pageserver/src/tenant/size.rs
+++ b/pageserver/src/tenant/size.rs
@@ -97,8 +97,6 @@ pub(super) async fn gather_inputs(
     // used to determine the `retention_period` for the size model
     let mut max_cutoff_distance = None;
 
-    // this will probably conflict with on-demand downloaded layers, or at least force them all
-    // to be downloaded
     for timeline in timelines {
         let last_record_lsn = timeline.get_last_record_lsn();
 
diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs
index 79eaa96591..8bfac5df8e 100644
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -8,6 +8,7 @@ use anyhow::Result;
 use bytes::Bytes;
 use std::ops::Range;
 use std::path::PathBuf;
+use std::sync::Arc;
 
 use utils::{
     id::{TenantId, TimelineId},
@@ -15,6 +16,8 @@ use utils::{
 };
 
 use super::filename::LayerFileName;
+use super::remote_layer::RemoteLayer;
+
 pub fn range_overlaps<T>(a: &Range<T>, b: &Range<T>) -> bool
 where
     T: PartialOrd<T>,
@@ -161,4 +164,28 @@ pub trait PersistentLayer: Layer {
 
     /// Permanently remove this layer from disk.
     fn delete(&self) -> Result<()>;
+
+    fn downcast_remote_layer(self: Arc<Self>) -> Option<std::sync::Arc<RemoteLayer>> {
+        None
+    }
+
+    fn is_remote_layer(&self) -> bool {
+        false
+    }
+
+    /// Returns None if the layer file size is not known.
+    ///
+    /// Should not change over the lifetime of the layer object because
+    /// current_physical_size is computed as the sum of this value.
+    fn file_size(&self) -> Option<u64>;
+}
+
+pub fn downcast_remote_layer(
+    layer: &Arc<dyn PersistentLayer>,
+) -> Option<std::sync::Arc<RemoteLayer>> {
+    if layer.is_remote_layer() {
+        Arc::clone(layer).downcast_remote_layer()
+    } else {
+        None
+    }
 }
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 61d619a17b..f4288fea36 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -3,11 +3,14 @@
 use anyhow::{anyhow, bail, ensure, Context};
 use bytes::Bytes;
 use fail::fail_point;
+use futures::stream::FuturesUnordered;
+use futures::StreamExt;
 use itertools::Itertools;
 use once_cell::sync::OnceCell;
-use pageserver_api::models::TimelineState;
+use pageserver_api::models::{
+    DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskState, TimelineState,
+};
 use tokio::sync::{oneshot, watch, Semaphore, TryAcquireError};
-use tokio::task::spawn_blocking;
 use tokio_util::sync::CancellationToken;
 use tracing::*;
 
@@ -22,6 +25,7 @@ use std::time::{Duration, Instant, SystemTime};
 
 use crate::storage_sync::index::IndexPart;
 use crate::storage_sync::RemoteTimelineClient;
+use crate::tenant::remote_layer::RemoteLayer;
 use crate::tenant::{
     delta_layer::{DeltaLayer, DeltaLayerWriter},
     ephemeral_file::is_ephemeral_file,
@@ -76,7 +80,7 @@ pub struct Timeline {
     conf: &'static PageServerConf,
     tenant_conf: Arc<RwLock<TenantConfOpt>>,
 
-    _myself: Weak<Self>,
+    myself: Weak<Self>,
 
     pub tenant_id: TenantId,
     pub timeline_id: TimelineId,
@@ -93,10 +97,7 @@ pub struct Timeline {
     walredo_mgr: Arc<dyn WalRedoManager + Sync + Send>,
 
     /// Remote storage client.
-    ///
-    /// If Some, use it to upload all newly created layers to the remote storage,
-    /// and keep remote metadata file in sync. In the future, also use it to download
-    /// layer files on-demand.
+    /// See [`storage_sync2`] module comment for details.
     pub remote_client: Option<Arc<RemoteTimelineClient>>,
 
     // What page versions do we hold in the repository? If we get a
@@ -187,6 +188,8 @@ pub struct Timeline {
     /// Relation size cache
     pub rel_size_cache: RwLock<HashMap<RelTag, (Lsn, BlockNumber)>>,
 
+    download_all_remote_layers_task_info: RwLock<Option<DownloadRemoteLayersTaskInfo>>,
+
     state: watch::Sender<TimelineState>,
 }
 
@@ -308,12 +311,68 @@ impl LogicalSize {
     }
 }
 
+/// Returned by [`Timeline::layer_size_sum`]
+pub enum LayerSizeSum {
+    /// The result is accurate.
+    Accurate(u64),
+    // We don't know the layer file size of one or more layers.
+    // They contribute to the sum with a value of 0.
+    // Hence, the sum is a lower bound for the actual layer file size sum.
+    ApproximateLowerBound(u64),
+}
+
+impl LayerSizeSum {
+    pub fn approximate_is_ok(self) -> u64 {
+        match self {
+            LayerSizeSum::Accurate(v) => v,
+            LayerSizeSum::ApproximateLowerBound(v) => v,
+        }
+    }
+}
+
 pub struct WalReceiverInfo {
     pub wal_source_connconf: PgConnectionConfig,
     pub last_received_msg_lsn: Lsn,
     pub last_received_msg_ts: u128,
 }
 
+/// Like `?`, but for [`PageReconstructResult`].
+/// Use it to bubble up the `NeedsDownload` and `Error` to the caller.
+///
+/// Once `std::ops::Try` is stabilized, we should use it instead of this macro.
+#[macro_export]
+macro_rules! try_no_ondemand_download {
+    ($result:expr) => {{
+        let result = $result;
+        match result {
+            PageReconstructResult::Success(value) => value,
+            PageReconstructResult::NeedsDownload(timeline, layer) => {
+                return PageReconstructResult::NeedsDownload(timeline, layer);
+            }
+            PageReconstructResult::Error(e) => return PageReconstructResult::Error(e),
+        }
+    }};
+}
+
+/// Replacement for `?` in functions that return [`PageReconstructResult`].
+///
+/// Given an `expr: Result<T, E>`, use `try_page_reconstruct_result!(expr)`
+/// instead of `(expr)?`.
+/// If `expr` is `Ok(v)`, the macro evaluates to `v`.
+/// If `expr` is `Err(e)`, the macro returns `PageReconstructResult::Error(e.into())`.
+///
+/// Once `std::ops::Try` is stabilized, we should use it instead of this macro.
+#[macro_export]
+macro_rules! try_page_reconstruct_result {
+    ($result:expr) => {{
+        let result = $result;
+        match result {
+            Ok(v) => v,
+            Err(e) => return PageReconstructResult::from(e),
+        }
+    }};
+}
+
 ///
 /// Information about how much history needs to be retained, needed by
 /// Garbage Collection.
@@ -343,6 +402,77 @@ pub struct GcInfo {
     pub pitr_cutoff: Lsn,
 }
 
+pub enum PageReconstructResult<T> {
+    Success(T),
+    /// The given RemoteLayer needs to be downloaded and replaced in the timeline's layer map
+    /// for the operation to succeed. Use [`Timeline::download_remote_layer`] to do it, then
+    /// retry the operation that returned this error.
+    NeedsDownload(Weak<Timeline>, Weak<RemoteLayer>),
+    Error(PageReconstructError),
+}
+
+/// An error happened in a get() operation.
+#[derive(thiserror::Error)]
+pub enum PageReconstructError {
+    #[error(transparent)]
+    Other(#[from] anyhow::Error), // source and Display delegate to anyhow::Error
+
+    #[error(transparent)]
+    WalRedo(#[from] crate::walredo::WalRedoError),
+}
+
+impl std::fmt::Debug for PageReconstructError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
+        match self {
+            Self::Other(err) => err.fmt(f),
+            Self::WalRedo(err) => err.fmt(f),
+        }
+    }
+}
+
+/// This impl converts any error type that is `Into<PageReconstructError>`
+/// (e.g. `anyhow::Error`, through the `#[from]` on the `Other` variant)
+/// into `PageReconstructResult::Error` via `PageReconstructResult::from(e)`.
+/// The `try_page_reconstruct_result!` macro relies on it as the replacement
+/// for `?` in functions whose return type is `PageReconstructResult<T>`.
+impl<E, T> From<E> for PageReconstructResult<T>
+where
+    E: Into<PageReconstructError>,
+{
+    fn from(e: E) -> Self {
+        Self::Error(e.into())
+    }
+}
+
+impl<T> PageReconstructResult<T> {
+    /// Treat the need for on-demand download as an error.
+    ///
+    /// **Avoid this function in new code** if you can help it,
+    /// as on-demand download will become the norm in the future,
+    /// especially once we implement layer file eviction.
+    ///
+    /// If you are in an async function, use [`with_ondemand_download`]
+    /// to do the download right here.
+    ///
+    /// If you are in a sync function, change its return type from
+    /// `Result<T, E>` to `PageReconstructResult<T>` and bubble up
+    /// the non-success cases of `PageReconstructResult<T>` to the caller.
+    /// This gives them a chance to do the download and retry.
+    /// Consider using [`try_no_ondemand_download`] for convenience.
+    ///
+    /// For more background, read the comment on [`with_ondemand_download`].
+    pub fn no_ondemand_download(self) -> anyhow::Result<T> {
+        match self {
+            PageReconstructResult::Success(value) => Ok(value),
+            // TODO print more info about the timeline
+            PageReconstructResult::NeedsDownload(_, _) => anyhow::bail!("Layer needs downloading"),
+            PageReconstructResult::Error(e) => {
+                Err(anyhow::Error::new(e).context("Failed to reconstruct the page"))
+            }
+        }
+    }
+}
+
 /// Public interface functions
 impl Timeline {
     /// Get the LSN where this branch was created
@@ -370,8 +500,10 @@ impl Timeline {
     /// the Repository implementation may incorrectly return a value from an ancestor
     /// branch, for example, or waste a lot of cycles chasing the non-existing key.
     ///
-    pub fn get(&self, key: Key, lsn: Lsn) -> anyhow::Result<Bytes> {
-        anyhow::ensure!(lsn.is_valid(), "Invalid LSN");
+    pub fn get(&self, key: Key, lsn: Lsn) -> PageReconstructResult<Bytes> {
+        if !lsn.is_valid() {
+            return PageReconstructResult::from(anyhow!("Invalid LSN"));
+        }
 
         // Check the page cache. We will get back the most recent page with lsn <= `lsn`.
         // The cached image can be returned directly if there is no WAL between the cached image
@@ -381,7 +513,7 @@ impl Timeline {
             Some((cached_lsn, cached_img)) => {
                 match cached_lsn.cmp(&lsn) {
                     Ordering::Less => {} // there might be WAL between cached_lsn and lsn, we need to check
-                    Ordering::Equal => return Ok(cached_img), // exact LSN match, return the image
+                    Ordering::Equal => return PageReconstructResult::Success(cached_img), // exact LSN match, return the image
                     Ordering::Greater => {
                         unreachable!("the returned lsn should never be after the requested lsn")
                     }
@@ -396,13 +528,18 @@ impl Timeline {
             img: cached_page_img,
         };
 
-        self.get_reconstruct_data(key, lsn, &mut reconstruct_state)?;
+        try_no_ondemand_download!(self.get_reconstruct_data(key, lsn, &mut reconstruct_state));
 
         self.metrics
             .reconstruct_time_histo
             .observe_closure_duration(|| self.reconstruct_value(key, lsn, reconstruct_state))
     }
 
+    // Like get(), but if a remote layer file is needed, it is downloaded as part of this call.
+    pub async fn get_download(&self, key: Key, lsn: Lsn) -> anyhow::Result<Bytes> {
+        with_ondemand_download(|| self.get(key, lsn)).await
+    }
+
     /// Get last or prev record separately. Same as get_last_record_rlsn().last/prev.
     pub fn get_last_record_lsn(&self) -> Lsn {
         self.last_record_lsn.load().last
@@ -429,30 +566,27 @@ impl Timeline {
         }
     }
 
-    /// Get the physical size of the timeline at the latest LSN
-    pub fn get_physical_size(&self) -> u64 {
-        self.metrics.current_physical_size_gauge.get()
+    /// The sum of the file size of all historic layers in the layer map.
+    /// This method makes no distinction between local and remote layers.
+    /// Hence, the result **does not represent local filesystem usage**.
+    pub fn layer_size_sum(&self) -> LayerSizeSum {
+        let layer_map = self.layers.read().unwrap();
+        let mut size = 0;
+        let mut no_size_cnt = 0;
+        for l in layer_map.iter_historic_layers() {
+            let (l_size, l_no_size) = l.file_size().map(|s| (s, 0)).unwrap_or((0, 1));
+            size += l_size;
+            no_size_cnt += l_no_size;
+        }
+        if no_size_cnt == 0 {
+            LayerSizeSum::Accurate(size)
+        } else {
+            LayerSizeSum::ApproximateLowerBound(size)
+        }
     }
 
-    /// Get the physical size of the timeline at the latest LSN non incrementally
-    pub fn get_physical_size_non_incremental(&self) -> anyhow::Result<u64> {
-        let timeline_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id);
-        // total size of layer files in the current timeline directory
-        let mut total_physical_size = 0;
-
-        for direntry in fs::read_dir(timeline_path)? {
-            let direntry = direntry?;
-            let fname = direntry.file_name();
-            let fname = fname.to_string_lossy();
-
-            if ImageFileName::parse_str(&fname).is_some()
-                || DeltaFileName::parse_str(&fname).is_some()
-            {
-                total_physical_size += direntry.metadata()?.len();
-            }
-        }
-
-        Ok(total_physical_size)
+    pub fn get_resident_physical_size(&self) -> u64 {
+        self.metrics.resident_physical_size_gauge.get()
     }
 
     ///
@@ -560,14 +694,18 @@ impl Timeline {
 
         // Define partitioning schema if needed
 
-        match self.repartition(
-            self.get_last_record_lsn(),
-            self.get_compaction_target_size(),
-        ) {
+        match self
+            .repartition(
+                self.get_last_record_lsn(),
+                self.get_compaction_target_size(),
+            )
+            .await
+        {
             Ok((partitioning, lsn)) => {
                 // 2. Create new image layers for partitions that have been modified
                 // "enough".
-                let layer_paths_to_upload = self.create_image_layers(&partitioning, lsn, false)?;
+                let layer_paths_to_upload =
+                    self.create_image_layers(&partitioning, lsn, false).await?;
                 if let Some(remote_client) = &self.remote_client {
                     for (path, layer_metadata) in layer_paths_to_upload {
                         remote_client.schedule_layer_file_upload(&path, &layer_metadata)?;
@@ -761,7 +899,7 @@ impl Timeline {
             let mut result = Timeline {
                 conf,
                 tenant_conf,
-                _myself: myself.clone(),
+                myself: myself.clone(),
                 timeline_id,
                 tenant_id,
                 pg_version,
@@ -817,6 +955,9 @@ impl Timeline {
 
                 last_received_wal: Mutex::new(None),
                 rel_size_cache: RwLock::new(HashMap::new()),
+
+                download_all_remote_layers_task_info: RwLock::new(None),
+
                 state,
             };
             result.repartition_threshold = result.get_checkpoint_distance() / 10;
@@ -935,11 +1076,18 @@ impl Timeline {
                     continue;
                 }
 
-                let layer =
-                    ImageLayer::new(self.conf, self.timeline_id, self.tenant_id, &imgfilename);
+                let file_size = direntry_path.metadata()?.len();
+
+                let layer = ImageLayer::new(
+                    self.conf,
+                    self.timeline_id,
+                    self.tenant_id,
+                    &imgfilename,
+                    file_size,
+                );
 
                 trace!("found layer {}", layer.path().display());
-                total_physical_size += layer.path().metadata()?.len();
+                total_physical_size += file_size;
                 layers.insert_historic(Arc::new(layer));
                 num_layers += 1;
             } else if let Some(deltafilename) = DeltaFileName::parse_str(&fname) {
@@ -959,11 +1107,18 @@ impl Timeline {
                     continue;
                 }
 
-                let layer =
-                    DeltaLayer::new(self.conf, self.timeline_id, self.tenant_id, &deltafilename);
+                let file_size = direntry_path.metadata()?.len();
+
+                let layer = DeltaLayer::new(
+                    self.conf,
+                    self.timeline_id,
+                    self.tenant_id,
+                    &deltafilename,
+                    file_size,
+                );
 
                 trace!("found layer {}", layer.path().display());
-                total_physical_size += layer.path().metadata()?.len();
+                total_physical_size += file_size;
                 layers.insert_historic(Arc::new(layer));
                 num_layers += 1;
             } else if fname == METADATA_FILE_NAME || fname.ends_with(".old") {
@@ -997,7 +1152,7 @@ impl Timeline {
             num_layers, disk_consistent_lsn, total_physical_size
         );
         self.metrics
-            .current_physical_size_gauge
+            .resident_physical_size_gauge
             .set(total_physical_size);
 
         timer.stop_and_record();
@@ -1005,21 +1160,14 @@ impl Timeline {
         Ok(())
     }
 
-    async fn download_missing(
+    async fn create_remote_layers(
         &self,
         index_part: &IndexPart,
-        remote_client: &RemoteTimelineClient,
         local_layers: HashMap<LayerFileName, Arc<dyn PersistentLayer>>,
         up_to_date_disk_consistent_lsn: Lsn,
     ) -> anyhow::Result<HashMap<LayerFileName, Arc<dyn PersistentLayer>>> {
         // Are we missing some files that are present in remote storage?
-        // Download them now.
-        // TODO Downloading many files this way is not efficient.
-        //     Better to use FuturesUnordered. Maybe keep as is because:
-        //    a) inplace download is a throw-away code, on-demand patch doesnt need that
-        //    b) typical case now is that there is nothing to sync, this downloads a lot
-        //       1) if there was another pageserver that came and generated new files
-        //       2) during attach of a timeline with big history which we currently do not do
+        // Create RemoteLayer instances for them.
         let mut local_only_layers = local_layers;
         for remote_layer_name in &index_part.timeline_layers {
             let local_layer = local_only_layers.remove(remote_layer_name);
@@ -1033,7 +1181,7 @@ impl Timeline {
             // Is the local layer's size different from the size stored in the
             // remote index file?
             // If so, rename_to_backup those files & replace their local layer with
-            // a RemoteLayer in the laye rmap so that we re-download them on-demand.
+            // a RemoteLayer in the layer map so that we re-download them on-demand.
             if let Some(local_layer) = local_layer {
                 let local_layer_path = local_layer
                     .local_path()
@@ -1058,7 +1206,7 @@ impl Timeline {
                             assert!(local_layer_path.exists(), "we would leave the local_layer without a file if this does not hold: {}", local_layer_path.display());
                             anyhow::bail!("could not rename file {local_layer_path:?}: {err:?}");
                         } else {
-                            self.metrics.current_physical_size_gauge.sub(local_size);
+                            self.metrics.resident_physical_size_gauge.sub(local_size);
                             self.layers.write().unwrap().remove_historic(local_layer);
                             // fall-through to adding the remote layer
                         }
@@ -1079,7 +1227,7 @@ impl Timeline {
             }
 
             info!(
-                "remote layer does not exist locally, downloading it now: {}",
+                "remote layer does not exist locally, creating remote layer: {}",
                 remote_layer_name.file_name()
             );
 
@@ -1093,28 +1241,18 @@ impl Timeline {
                         continue;
                     }
 
-                    trace!("downloading image file: {remote_layer_name:?}");
-                    let downloaded_size = remote_client
-                        .download_layer_file(remote_layer_name, &remote_layer_metadata)
-                        .await
-                        .with_context(|| {
-                            format!("failed to download image layer {remote_layer_name:?}")
-                        })?;
-                    trace!("done");
+                    let remote_layer = RemoteLayer::new_img(
+                        self.tenant_id,
+                        self.timeline_id,
+                        imgfilename,
+                        &remote_layer_metadata,
+                    );
+                    let remote_layer = Arc::new(remote_layer);
 
-                    let image_layer =
-                        ImageLayer::new(self.conf, self.timeline_id, self.tenant_id, imgfilename);
-
-                    self.layers
-                        .write()
-                        .unwrap()
-                        .insert_historic(Arc::new(image_layer));
-                    self.metrics
-                        .current_physical_size_gauge
-                        .add(downloaded_size);
+                    self.layers.write().unwrap().insert_historic(remote_layer);
                 }
                 LayerFileName::Delta(deltafilename) => {
-                    // Create a DeltaLayer struct for each delta file.
+                    // Create a RemoteLayer for the delta file.
                     // The end-LSN is exclusive, while disk_consistent_lsn is
                     // inclusive. For example, if disk_consistent_lsn is 100, it is
                     // OK for a delta layer to have end LSN 101, but if the end LSN
@@ -1122,29 +1260,19 @@ impl Timeline {
                     // before crash.
                     if deltafilename.lsn_range.end > up_to_date_disk_consistent_lsn + 1 {
                         warn!(
-                        "found future delta layer {} on timeline {} remote_consistent_lsn is {}",
-                        deltafilename, self.timeline_id, up_to_date_disk_consistent_lsn
-                    );
+                            "found future delta layer {} on timeline {} remote_consistent_lsn is {}",
+                            deltafilename, self.timeline_id, up_to_date_disk_consistent_lsn
+                        );
                         continue;
                     }
-
-                    trace!("downloading delta file: {remote_layer_name:?}");
-                    let sz = remote_client
-                        .download_layer_file(remote_layer_name, &remote_layer_metadata)
-                        .await
-                        .with_context(|| {
-                            format!("failed to download delta layer {remote_layer_name:?}")
-                        })?;
-                    trace!("done");
-
-                    let delta_layer =
-                        DeltaLayer::new(self.conf, self.timeline_id, self.tenant_id, deltafilename);
-
-                    self.layers
-                        .write()
-                        .unwrap()
-                        .insert_historic(Arc::new(delta_layer));
-                    self.metrics.current_physical_size_gauge.add(sz);
+                    let remote_layer = RemoteLayer::new_delta(
+                        self.tenant_id,
+                        self.timeline_id,
+                        deltafilename,
+                        &remote_layer_metadata,
+                    );
+                    let remote_layer = Arc::new(remote_layer);
+                    self.layers.write().unwrap().insert_historic(remote_layer);
                 }
                 #[cfg(test)]
                 LayerFileName::Test(_) => unreachable!(),
@@ -1154,22 +1282,22 @@ impl Timeline {
         Ok(local_only_layers)
     }
 
+    /// This function will synchronize local state with what we have in remote storage.
     ///
-    /// This function will synchronize local data with what we have in remote storage.
-    /// 1. It will download missing layer files.
-    /// 2. It will update local metadata if remote one has greater `disk_consistent_lsn`.
-    /// 3. It will upload files that are missing on the remote
-    /// 4. It will update index file on the remote accordingly
-    /// TODO may be a bit cleaner to do things based on populated remote client,
-    ///     and then do things based on its upload_queue.latest_files
+    /// Steps taken:
+    /// 1. Initialize upload queue based on `index_part`.
+    /// 2. Create `RemoteLayer` instances for layers that exist only on the remote.
+    ///    The list of layers on the remote comes from `index_part`.
+    ///    The list of local layers is given by the layer map's `iter_historic_layers()`.
+    ///    So, the layer map must have been loaded already.
+    /// 3. Schedule upload of local-only layer files (which will then also update the remote
+    ///    IndexPart to include the new layer files).
     ///
-    /// This is used during tenant attach. The layer map must have been loaded
-    /// with local filesystem contents already.
-    ///
-    /// The caller should provide IndexPart if it exists on the remote storage. If it's None,
-    /// we assume that it is missing on the remote storage, which means that we initialized
-    /// a timeline and then restarted before successful upload was performed
+    /// Refer to the `storage_sync2` module comment for more context.
     ///
+    /// # TODO
+    /// May be a bit cleaner to do things based on populated remote client,
+    /// and then do things based on its upload_queue.latest_files.
     #[instrument(skip(self, index_part, up_to_date_metadata))]
     pub async fn reconcile_with_remote(
         &self,
@@ -1199,9 +1327,10 @@ impl Timeline {
                     index_part.timeline_layers.len()
                 );
                 remote_client.init_upload_queue(index_part)?;
-
-                self.download_missing(index_part, remote_client, local_layers, disk_consistent_lsn)
-                    .await?
+                let local_only_filenames = self
+                    .create_remote_layers(index_part, local_layers, disk_consistent_lsn)
+                    .await?;
+                local_only_filenames
             }
             None => {
                 info!("initializing upload queue as empty");
@@ -1323,9 +1452,15 @@ impl Timeline {
 
         let calculation = async {
             let cancel = cancel.child_token();
-            spawn_blocking(move || self_calculation.calculate_logical_size(init_lsn, cancel))
-                .await
-                .context("Failed to spawn calculation result task")?
+            tokio::task::spawn_blocking(move || {
+                // Run in a separate thread since this can do a lot of
+                // synchronous file IO without .await in between
+                // if there are no RemoteLayers that would require downloading.
+                let h = tokio::runtime::Handle::current();
+                h.block_on(self_calculation.calculate_logical_size(init_lsn, cancel))
+            })
+            .await
+            .context("Failed to spawn calculation result task")?
         };
         let timeline_state_cancellation = async {
             loop {
@@ -1376,7 +1511,7 @@ impl Timeline {
     /// Calculate the logical size of the database at the latest LSN.
     ///
     /// NOTE: counted incrementally, includes ancestors, this can be a slow operation.
-    pub fn calculate_logical_size(
+    async fn calculate_logical_size(
         &self,
         up_to_lsn: Lsn,
         cancel: CancellationToken,
@@ -1421,7 +1556,9 @@ impl Timeline {
         } else {
             self.metrics.logical_size_histo.start_timer()
         };
-        let logical_size = self.get_current_logical_size_non_incremental(up_to_lsn, cancel)?;
+        let logical_size = self
+            .get_current_logical_size_non_incremental(up_to_lsn, cancel)
+            .await?;
         debug!("calculated logical size: {logical_size}");
         timer.stop_and_record();
         Ok(logical_size)
@@ -1458,7 +1595,7 @@ impl TraversalLayerExt for Arc<dyn PersistentLayer> {
         match self.local_path() {
             Some(local_path) => {
                 debug_assert!(local_path.to_str().unwrap().contains(&format!("{}", self.get_timeline_id())),
-                    "need timeline ID to uniquely identify the layer when tranversal crosses ancestor boundary",
+                    "need timeline ID to uniquely identify the layer when traversal crosses ancestor boundary",
                 );
                 format!("{}", local_path.display())
             }
@@ -1497,7 +1634,7 @@ impl Timeline {
         key: Key,
         request_lsn: Lsn,
         reconstruct_state: &mut ValueReconstructState,
-    ) -> Result<(), PageReconstructError> {
+    ) -> PageReconstructResult<()> {
         // Start from the current timeline.
         let mut timeline_owned;
         let mut timeline = self;
@@ -1524,12 +1661,12 @@ impl Timeline {
             // The function should have updated 'state'
             //info!("CALLED for {} at {}: {:?} with {} records, cached {}", key, cont_lsn, result, reconstruct_state.records.len(), cached_lsn);
             match result {
-                ValueReconstructResult::Complete => return Ok(()),
+                ValueReconstructResult::Complete => return PageReconstructResult::Success(()),
                 ValueReconstructResult::Continue => {
                     // If we reached an earlier cached page image, we're done.
                     if cont_lsn == cached_lsn + 1 {
                         self.metrics.materialized_page_cache_hit_counter.inc_by(1);
-                        return Ok(());
+                        return PageReconstructResult::Success(());
                     }
                     if prev_lsn <= cont_lsn {
                         // Didn't make any progress in last iteration. Error out to avoid
@@ -1562,7 +1699,10 @@ impl Timeline {
                     timeline.ancestor_lsn,
                     cont_lsn
                 );
-                let ancestor = timeline.get_ancestor_timeline()?;
+                let ancestor = match timeline.get_ancestor_timeline() {
+                    Ok(timeline) => timeline,
+                    Err(e) => return PageReconstructResult::from(e),
+                };
                 timeline_owned = ancestor;
                 timeline = &*timeline_owned;
                 prev_lsn = Lsn(u64::MAX);
@@ -1580,11 +1720,14 @@ impl Timeline {
                     // Get all the data needed to reconstruct the page version from this layer.
                     // But if we have an older cached page image, no need to go past that.
                     let lsn_floor = max(cached_lsn + 1, start_lsn);
-                    result = open_layer.get_value_reconstruct_data(
+                    result = match open_layer.get_value_reconstruct_data(
                         key,
                         lsn_floor..cont_lsn,
                         reconstruct_state,
-                    )?;
+                    ) {
+                        Ok(result) => result,
+                        Err(e) => return PageReconstructResult::from(e),
+                    };
                     cont_lsn = lsn_floor;
                     traversal_path.push((result, cont_lsn, open_layer.traversal_id()));
                     continue;
@@ -1595,11 +1738,14 @@ impl Timeline {
                 if cont_lsn > start_lsn {
                     //info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.filename().display());
                     let lsn_floor = max(cached_lsn + 1, start_lsn);
-                    result = frozen_layer.get_value_reconstruct_data(
+                    result = match frozen_layer.get_value_reconstruct_data(
                         key,
                         lsn_floor..cont_lsn,
                         reconstruct_state,
-                    )?;
+                    ) {
+                        Ok(result) => result,
+                        Err(e) => return PageReconstructResult::from(e),
+                    };
                     cont_lsn = lsn_floor;
                     traversal_path.push((result, cont_lsn, frozen_layer.traversal_id()));
                     continue 'outer;
@@ -1609,12 +1755,24 @@ impl Timeline {
             if let Some(SearchResult { lsn_floor, layer }) = layers.search(key, cont_lsn) {
                 //info!("CHECKING for {} at {} on historic layer {}", key, cont_lsn, layer.filename().display());
 
+                // If it's a remote layer, the caller can do the download and retry.
+                if let Some(remote_layer) = super::storage_layer::downcast_remote_layer(&layer) {
+                    info!("need remote layer {}", layer.traversal_id());
+                    return PageReconstructResult::NeedsDownload(
+                        Weak::clone(&timeline.myself),
+                        Arc::downgrade(&remote_layer),
+                    );
+                }
+
                 let lsn_floor = max(cached_lsn + 1, lsn_floor);
-                result = layer.get_value_reconstruct_data(
+                result = match layer.get_value_reconstruct_data(
                     key,
                     lsn_floor..cont_lsn,
                     reconstruct_state,
-                )?;
+                ) {
+                    Ok(result) => result,
+                    Err(e) => return PageReconstructResult::from(e),
+                };
                 cont_lsn = lsn_floor;
                 traversal_path.push((result, cont_lsn, layer.traversal_id()));
             } else if timeline.ancestor_timeline.is_some() {
@@ -1840,9 +1998,11 @@ impl Timeline {
         let lsn_range = frozen_layer.get_lsn_range();
         let layer_paths_to_upload =
             if lsn_range.start == self.initdb_lsn && lsn_range.end == Lsn(self.initdb_lsn.0 + 1) {
-                let (partitioning, _lsn) =
-                    self.repartition(self.initdb_lsn, self.get_compaction_target_size())?;
-                self.create_image_layers(&partitioning, self.initdb_lsn, true)?
+                let (partitioning, _lsn) = self
+                    .repartition(self.initdb_lsn, self.get_compaction_target_size())
+                    .await?;
+                self.create_image_layers(&partitioning, self.initdb_lsn, true)
+                    .await?
             } else {
                 // normal case, write out a L0 delta layer file.
                 let (delta_path, metadata) = self.create_delta_layer(&frozen_layer)?;
@@ -1979,7 +2139,7 @@ impl Timeline {
         // update the timeline's physical size
         let sz = new_delta_path.metadata()?.len();
 
-        self.metrics.current_physical_size_gauge.add(sz);
+        self.metrics.resident_physical_size_gauge.add(sz);
         // update metrics
         self.metrics.num_persistent_files_created.inc_by(1);
         self.metrics.persistent_bytes_written.inc_by(sz);
@@ -1987,15 +2147,28 @@ impl Timeline {
         Ok((new_delta_filename, LayerFileMetadata::new(sz)))
     }
 
-    fn repartition(&self, lsn: Lsn, partition_size: u64) -> anyhow::Result<(KeyPartitioning, Lsn)> {
-        let mut partitioning_guard = self.partitioning.lock().unwrap();
-        if partitioning_guard.1 == Lsn(0)
-            || lsn.0 - partitioning_guard.1 .0 > self.repartition_threshold
+    async fn repartition(
+        &self,
+        lsn: Lsn,
+        partition_size: u64,
+    ) -> anyhow::Result<(KeyPartitioning, Lsn)> {
         {
-            let keyspace = self.collect_keyspace(lsn)?;
-            let partitioning = keyspace.partition(partition_size);
+            let partitioning_guard = self.partitioning.lock().unwrap();
+            if partitioning_guard.1 != Lsn(0)
+                && lsn.0 - partitioning_guard.1 .0 <= self.repartition_threshold
+            {
+                // no repartitioning needed
+                return Ok((partitioning_guard.0.clone(), partitioning_guard.1));
+            }
+        }
+        let keyspace = self.collect_keyspace(lsn).await?;
+        let partitioning = keyspace.partition(partition_size);
+
+        let mut partitioning_guard = self.partitioning.lock().unwrap();
+        if lsn > partitioning_guard.1 {
             *partitioning_guard = (partitioning, lsn);
-            return Ok((partitioning_guard.0.clone(), lsn));
+        } else {
+            warn!("Concurrent repartitioning of keyspace. This unexpected, but probably harmless");
         }
         Ok((partitioning_guard.0.clone(), partitioning_guard.1))
     }
@@ -2041,7 +2214,7 @@ impl Timeline {
         Ok(false)
     }
 
-    fn create_image_layers(
+    async fn create_image_layers(
         &self,
         partitioning: &KeyPartitioning,
         lsn: Lsn,
@@ -2068,7 +2241,7 @@ impl Timeline {
                 for range in &partition.ranges {
                     let mut key = range.start;
                     while key < range.end {
-                        let img = match self.get(key, lsn) {
+                        let img = match self.get_download(key, lsn).await {
                             Ok(img) => img,
                             Err(err) => {
                                 // If we fail to reconstruct a VM or FSM page, we can zero the
@@ -2131,7 +2304,9 @@ impl Timeline {
 
             layer_paths_to_upload.insert(path, LayerFileMetadata::new(metadata.len()));
 
-            self.metrics.current_physical_size_gauge.add(metadata.len());
+            self.metrics
+                .resident_physical_size_gauge
+                .add(metadata.len());
             layers.insert_historic(Arc::new(l));
         }
         drop(layers);
@@ -2443,7 +2618,9 @@ impl Timeline {
             }
 
             // update the timeline's physical size
-            self.metrics.current_physical_size_gauge.add(metadata.len());
+            self.metrics
+                .resident_physical_size_gauge
+                .add(metadata.len());
 
             new_layer_paths.insert(new_delta_path, LayerFileMetadata::new(metadata.len()));
             let x: Arc<dyn PersistentLayer + 'static> = Arc::new(l);
@@ -2456,7 +2633,7 @@ impl Timeline {
         for l in deltas_to_compact {
             if let Some(path) = l.local_path() {
                 self.metrics
-                    .current_physical_size_gauge
+                    .resident_physical_size_gauge
                     .sub(path.metadata()?.len());
             }
             layer_names_to_delete.push(l.filename());
@@ -2526,7 +2703,10 @@ impl Timeline {
             if let Some(pitr_cutoff_timestamp) = now.checked_sub(pitr) {
                 let pitr_timestamp = to_pg_timestamp(pitr_cutoff_timestamp);
 
-                match self.find_lsn_for_timestamp(pitr_timestamp)? {
+                match self
+                    .find_lsn_for_timestamp(pitr_timestamp)
+                    .no_ondemand_download()?
+                {
                     LsnForTimestamp::Present(lsn) => pitr_cutoff_lsn = lsn,
                     LsnForTimestamp::Future(lsn) => {
                         debug!("future({})", lsn);
@@ -2743,11 +2923,11 @@ impl Timeline {
             for doomed_layer in layers_to_remove {
                 if let Some(path) = doomed_layer.local_path() {
                     self.metrics
-                        .current_physical_size_gauge
+                        .resident_physical_size_gauge
                         .sub(path.metadata()?.len());
                 }
                 layer_names_to_delete.push(doomed_layer.filename());
-                doomed_layer.delete()?;
+                doomed_layer.delete()?; // FIXME: schedule succeeded deletions before returning?
                 layers.remove_historic(doomed_layer);
                 result.layers_removed += 1;
             }
@@ -2778,7 +2958,7 @@ impl Timeline {
         key: Key,
         request_lsn: Lsn,
         mut data: ValueReconstructState,
-    ) -> anyhow::Result<Bytes> {
+    ) -> PageReconstructResult<Bytes> {
         // Perform WAL redo if needed
         data.records.reverse();
 
@@ -2790,9 +2970,11 @@ impl Timeline {
                     key,
                     img_lsn
                 );
-                Ok(img.clone())
+                PageReconstructResult::Success(img.clone())
             } else {
-                bail!("base image for {} at {} not found", key, request_lsn);
+                PageReconstructResult::from(anyhow!(
+                    "base image for {key} at {request_lsn} not found"
+                ))
             }
         } else {
             // We need to do WAL redo.
@@ -2800,12 +2982,12 @@ impl Timeline {
             // If we don't have a base image, then the oldest WAL record better initialize
             // the page
             if data.img.is_none() && !data.records.first().unwrap().1.will_init() {
-                bail!(
+                PageReconstructResult::from(anyhow!(
                     "Base image for {} at {} not found, but got {} WAL records",
                     key,
                     request_lsn,
                     data.records.len()
-                );
+                ))
             } else {
                 if data.img.is_some() {
                     trace!(
@@ -2820,14 +3002,18 @@ impl Timeline {
 
                 let last_rec_lsn = data.records.last().unwrap().0;
 
-                let img = self
+                let img = match self
                     .walredo_mgr
                     .request_redo(key, request_lsn, data.img, data.records, self.pg_version)
-                    .context("Failed to reconstruct a page image:")?;
+                    .context("Failed to reconstruct a page image:")
+                {
+                    Ok(img) => img,
+                    Err(e) => return PageReconstructResult::from(e),
+                };
 
                 if img.len() == page_cache::PAGE_SZ {
                     let cache = page_cache::get();
-                    cache
+                    if let Err(e) = cache
                         .memorize_materialized_page(
                             self.tenant_id,
                             self.timeline_id,
@@ -2835,30 +3021,324 @@ impl Timeline {
                             last_rec_lsn,
                             &img,
                         )
-                        .context("Materialized page memoization failed")?;
+                        .context("Materialized page memoization failed")
+                    {
+                        return PageReconstructResult::from(e);
+                    }
                 }
 
-                Ok(img)
+                PageReconstructResult::Success(img)
             }
         }
     }
+
+    /// Download a layer file from remote storage and insert it into the layer map.
+    ///
+    /// It's safe to call this function for the same layer concurrently. In that case:
+    /// - If the layer has already been downloaded, `Ok(...)` is returned.
+    /// - If the layer is currently being downloaded, we wait until that download succeeded / failed.
+    ///     - If it succeeded, we return `Ok(...)`.
+    ///     - If it failed, we or another concurrent caller will initiate a new download attempt.
+    ///
+    /// Download errors are classified and retried if appropriate by the underlying RemoteTimelineClient function.
+    /// It has an internal limit for the maximum number of retries and prints appropriate log messages.
+    /// If we exceed the limit, it returns an error, and this function passes it through.
+    /// The caller _could_ retry further by themselves by calling this function again, but _should not_ do it.
+    /// The reason is that they cannot distinguish permanent errors from temporary ones, whereas
+    /// the underlying RemoteTimelineClient can.
+    ///
+    /// There is no internal timeout or slowness detection.
+    /// If the caller has a deadline or needs a timeout, they can simply stop polling:
+    /// we're **cancellation-safe** because the download happens in a separate task_mgr task.
+    /// So, the current download attempt will run to completion even if we stop polling.
+    #[instrument(skip_all, fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id, layer=%remote_layer.short_id()))]
+    pub async fn download_remote_layer(
+        self: Arc<Self>,
+        remote_layer: Arc<RemoteLayer>,
+    ) -> anyhow::Result<()> {
+        let permit = match Arc::clone(&remote_layer.ongoing_download)
+            .acquire_owned()
+            .await
+        {
+            Ok(permit) => permit,
+            Err(_closed) => {
+                info!("download of layer has already finished");
+                return Ok(());
+            }
+        };
+
+        let (sender, receiver) = tokio::sync::oneshot::channel();
+        // Spawn a task so that download does not outlive timeline when we detach tenant / delete timeline.
+        task_mgr::spawn(
+            &tokio::runtime::Handle::current(),
+            TaskKind::RemoteDownloadTask,
+            Some(self.tenant_id),
+            Some(self.timeline_id),
+            &format!("download layer {}", remote_layer.short_id()),
+            false,
+            async move {
+                let remote_client = self.remote_client.as_ref().unwrap();
+
+                // Does retries + exponential back-off internally.
+                // When this fails, don't layer further retry attempts here.
+                let result = remote_client
+                    .download_layer_file(&remote_layer.file_name, &remote_layer.layer_metadata)
+                    .await;
+
+                if let Ok(size) = &result {
+                    // XXX the temp file is still around in Err() case
+                    // and consumes space until we clean up upon pageserver restart.
+                    self.metrics.resident_physical_size_gauge.add(*size);
+
+                    // Download complete. Replace the RemoteLayer with the corresponding
+                    // Delta- or ImageLayer in the layer map.
+                    let new_layer = remote_layer.create_downloaded_layer(self.conf, *size);
+                    let mut layers = self.layers.write().unwrap();
+                    {
+                        let l: Arc<dyn PersistentLayer> = remote_layer.clone();
+                        layers.remove_historic(l);
+                    }
+                    layers.insert_historic(new_layer);
+                    drop(layers);
+
+                    // Now that we've inserted the download into the layer map,
+                    // close the semaphore. This will make other waiters for
+                    // this download return Ok(()).
+                    assert!(!remote_layer.ongoing_download.is_closed());
+                    remote_layer.ongoing_download.close();
+                } else {
+                    // Keep semaphore open. We'll drop the permit at the end of the function.
+                }
+
+                // Don't treat it as an error if the task that triggered the download
+                // is no longer interested in the result.
+                sender.send(result.map(|_sz| ())).ok();
+
+                // In case we failed and there are other waiters, this will make one
+                // of them retry the download in a new task.
+                // XXX: This resets the exponential backoff because it's a new call to
+                // download_layer_file.
+                drop(permit);
+
+                Ok(())
+            },
+        );
+
+        receiver.await.context("download task cancelled")?
+    }
+
+    pub async fn spawn_download_all_remote_layers(
+        self: Arc<Self>,
+    ) -> Result<DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskInfo> {
+        let mut status_guard = self.download_all_remote_layers_task_info.write().unwrap();
+        if let Some(st) = &*status_guard {
+            match &st.state {
+                DownloadRemoteLayersTaskState::Running => {
+                    return Err(st.clone());
+                }
+                DownloadRemoteLayersTaskState::ShutDown
+                | DownloadRemoteLayersTaskState::Completed => {
+                    *status_guard = None;
+                }
+            }
+        }
+
+        let self_clone = Arc::clone(&self);
+        let task_id = task_mgr::spawn(
+            task_mgr::BACKGROUND_RUNTIME.handle(),
+            task_mgr::TaskKind::DownloadAllRemoteLayers,
+            Some(self.tenant_id),
+            Some(self.timeline_id),
+            "download all remote layers task",
+            false,
+            async move {
+                self_clone.download_all_remote_layers().await;
+                let mut status_guard = self_clone.download_all_remote_layers_task_info.write().unwrap();
+                 match &mut *status_guard {
+                    None => {
+                        warn!("tasks status is supposed to be Some(), since we are running");
+                    }
+                    Some(st) => {
+                        let exp_task_id = format!("{}", task_mgr::current_task_id().unwrap());
+                        if st.task_id != exp_task_id {
+                            warn!("task id changed while we were still running, expecting {} but have {}", exp_task_id, st.task_id);
+                        } else {
+                            st.state = DownloadRemoteLayersTaskState::Completed;
+                        }
+                    }
+                };
+                Ok(())
+            }
+            .instrument(info_span!(parent: None, "download_all_remote_layers", tenant = %self.tenant_id, timeline = %self.timeline_id))
+        );
+
+        let initial_info = DownloadRemoteLayersTaskInfo {
+            task_id: format!("{task_id}"),
+            state: DownloadRemoteLayersTaskState::Running,
+            total_layer_count: 0,
+            successful_download_count: 0,
+            failed_download_count: 0,
+        };
+        *status_guard = Some(initial_info.clone());
+
+        Ok(initial_info)
+    }
+
+    async fn download_all_remote_layers(self: &Arc<Self>) {
+        let mut downloads: FuturesUnordered<_> = {
+            let layers = self.layers.read().unwrap();
+            layers
+                .iter_historic_layers()
+                .filter_map(|l| l.downcast_remote_layer())
+                .map({
+                    |l| {
+                        let self_clone = Arc::clone(self);
+                        self_clone.download_remote_layer(l)
+                    }
+                })
+                .collect()
+        };
+
+        macro_rules! lock_status {
+            ($st:ident) => {
+                let mut st = self.download_all_remote_layers_task_info.write().unwrap();
+                let st = st
+                    .as_mut()
+                    .expect("this function is only called after the task has been spawned");
+                assert_eq!(
+                    st.task_id,
+                    format!(
+                        "{}",
+                        task_mgr::current_task_id().expect("we run inside a task_mgr task")
+                    )
+                );
+                let $st = st;
+            };
+        }
+
+        {
+            lock_status!(st);
+            st.total_layer_count = downloads.len().try_into().unwrap();
+        }
+        loop {
+            tokio::select! {
+                dl = downloads.next() => {
+                    lock_status!(st);
+                    match dl {
+                        None => break,
+                        Some(Ok(())) => {
+                            st.successful_download_count += 1;
+                        },
+                        Some(Err(e)) => {
+                            error!(error = %e, "layer download failed");
+                            st.failed_download_count += 1;
+                        }
+                    }
+                }
+                _ = task_mgr::shutdown_watcher() => {
+                    // Kind of pointless to watch for shutdowns here,
+                    // as download_remote_layer spawns other task_mgr tasks internally.
+                    lock_status!(st);
+                    st.state = DownloadRemoteLayersTaskState::ShutDown;
+                }
+            }
+        }
+        {
+            lock_status!(st);
+            st.state = DownloadRemoteLayersTaskState::Completed;
+        }
+    }
+
+    pub fn get_download_all_remote_layers_task_info(&self) -> Option<DownloadRemoteLayersTaskInfo> {
+        self.download_all_remote_layers_task_info
+            .read()
+            .unwrap()
+            .clone()
+    }
 }
 
-/// An error happened in a get() operation.
-#[derive(thiserror::Error)]
-pub enum PageReconstructError {
-    #[error(transparent)]
-    Other(#[from] anyhow::Error), // source and Display delegate to anyhow::Error
-
-    #[error(transparent)]
-    WalRedo(#[from] crate::walredo::WalRedoError),
-}
-
-impl std::fmt::Debug for PageReconstructError {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::result::Result<(), std::fmt::Error> {
-        match self {
-            PageReconstructError::Other(err) => err.fmt(f),
-            PageReconstructError::WalRedo(err) => err.fmt(f),
+/// Helper function to deal with [`PageReconstructResult`].
+///
+/// Takes a sync closure that returns a [`PageReconstructResult`].
+/// If it is [`PageReconstructResult::NeedsDownload`],
+/// do the download and retry the closure.
+///
+/// ### Background
+///
+/// This is a crutch to make on-demand downloads efficient in
+/// our async-sync-async sandwich codebase. Some context:
+///
+/// - The code that does the downloads uses async Rust.
+/// - The code that initiates download is many levels of sync Rust.
+/// - The sync code must wait for the download to finish to
+///   make further progress.
+/// - The sync code is invoked directly from async functions upstack.
+///
+/// Example (there are also much worse ones where the sandwich is taller)
+///
+///   async handle_get_page_at_lsn_request        page_service.rs
+///     sync get_rel_page_at_lsn                  timeline.rs
+///       sync timeline.get                       timeline.rs
+///         sync get_reconstruct_data             timeline.rs
+///           async download_remote_layer         timeline.rs
+///
+/// It is not possible to call `Timeline::download_remote_layer().await` within
+/// get_reconstruct_data, so instead, we return [`PageReconstructResult::NeedsDownload`]
+/// which contains references to the [`Timeline`] and [`RemoteLayer`].
+/// We bubble that error upstack to the async code, which can then call
+/// `Timeline::download_remote_layer().await`.
+/// That is _efficient_ because tokio can use the same OS thread to do
+/// other work while we're waiting for the download.
+///
+/// It is a deliberate decision to use a new result type to communicate
+/// the need for download instead of adding another variant to [`PageReconstructError`].
+/// The reason is that with the latter approach, any place that does
+/// `?` on a `Result<T, PageReconstructError>` will implicitly ignore the
+/// need for download. We want that to be explicit, so that
+/// - the code base becomes greppable for places that don't do a download
+/// - future code changes will need to explicitly address on-demand download
+///
+/// Alternatives to consider in the future:
+///
+/// - Inside `get_reconstruct_data`, we can std::thread::spawn a thread
+///   and use it to block_on the download_remote_layer future.
+///   That is obviously inefficient as it creates one thread per download.
+/// - Convert everything to async. The problem here is that the sync
+///   functions are used by many other sync functions. So, the scope
+///   creep of such a conversion is tremendous.
+/// - Compromise between the two: implement async functions for each sync
+///   function. Switch over the hot code paths (GetPage()) to use the
+///   async path, so that the hot path doesn't spawn threads. Other code
+///   paths would remain sync initially, and get converted to async over time.
+///
+pub async fn with_ondemand_download<F, T>(mut f: F) -> Result<T, anyhow::Error>
+where
+    F: Send + FnMut() -> PageReconstructResult<T>,
+    T: Send,
+{
+    loop {
+        let closure_result = f();
+        match closure_result {
+            PageReconstructResult::NeedsDownload(weak_timeline, weak_remote_layer) => {
+                // if the timeline is gone, it has likely been deleted / tenant detached
+                let tl = weak_timeline.upgrade().context("timeline is gone")?;
+                // if the remote layer got removed, retry the function, it might succeed now
+                let remote_layer = match weak_remote_layer.upgrade() {
+                    None => {
+                        info!("remote layer is gone, retrying closure");
+                        continue;
+                    }
+                    Some(l) => l,
+                };
+                // Does retries internally
+                tl.download_remote_layer(remote_layer).await?;
+                // Download successful, retry the closure
+                continue;
+            }
+            PageReconstructResult::Success(closure_value) => return Ok(closure_value),
+            PageReconstructResult::Error(e) => {
+                return Err(anyhow::Error::new(e).context("Failed to reconstruct the page"))
+            }
         }
     }
 }
@@ -2868,7 +3348,7 @@ impl std::fmt::Debug for PageReconstructError {
 fn layer_traversal_error(
     msg: String,
     path: Vec<(ValueReconstructResult, Lsn, TraversalId)>,
-) -> Result<(), PageReconstructError> {
+) -> PageReconstructResult<()> {
     // We want the original 'msg' to be the outermost context. The outermost context
     // is the most high-level information, which also gets propagated to the client.
     let mut msg_iter = path
@@ -2885,7 +3365,7 @@ fn layer_traversal_error(
 
     // Append all subsequent traversals, and the error message 'msg', as contexts.
     let msg = msg_iter.fold(err, |err, msg| err.context(msg));
-    Err(PageReconstructError::Other(msg))
+    PageReconstructResult::from(msg)
 }
 
 /// Various functions to mutate the timeline.
diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs
index 46e4acd50c..fb216123c1 100644
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -12,7 +12,7 @@
 //!
 use crate::metrics::{STORAGE_IO_SIZE, STORAGE_IO_TIME};
 use once_cell::sync::OnceCell;
-use std::fs::{File, OpenOptions};
+use std::fs::{self, File, OpenOptions};
 use std::io::{Error, ErrorKind, Read, Seek, SeekFrom, Write};
 use std::os::unix::fs::FileExt;
 use std::path::{Path, PathBuf};
@@ -240,6 +240,10 @@ impl VirtualFile {
         self.with_file("fsync", |file| file.sync_all())?
     }
 
+    pub fn metadata(&self) -> Result<fs::Metadata, Error> { // metadata of the physical file
+        self.with_file("metadata", |file| file.metadata())? // `?` surfaces with_file's own error
+    }
+
     /// Helper function that looks up the underlying File for this VirtualFile,
     /// opening it and evicting some other File if necessary. It calls 'func'
     /// with the physical File.
diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs
index e8a2e99f06..e3453dfe06 100644
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -31,7 +31,10 @@ use bytes::{Buf, Bytes, BytesMut};
 use tracing::*;
 
 use crate::pgdatadir_mapping::*;
+use crate::tenant::PageReconstructResult;
 use crate::tenant::Timeline;
+use crate::try_no_ondemand_download;
+use crate::try_page_reconstruct_result as try_prr;
 use crate::walrecord::*;
 use crate::ZERO_PAGE;
 use pageserver_api::reltag::{RelTag, SlruKind};
@@ -52,10 +55,10 @@ pub struct WalIngest<'a> {
 }
 
 impl<'a> WalIngest<'a> {
-    pub fn new(timeline: &Timeline, startpoint: Lsn) -> Result<WalIngest> {
+    pub fn new(timeline: &Timeline, startpoint: Lsn) -> anyhow::Result<WalIngest> {
         // Fetch the latest checkpoint into memory, so that we can compare with it
         // quickly in `ingest_record` and update it when it changes.
-        let checkpoint_bytes = timeline.get_checkpoint(startpoint)?;
+        let checkpoint_bytes = timeline.get_checkpoint(startpoint).no_ondemand_download()?;
         let checkpoint = CheckPoint::decode(&checkpoint_bytes)?;
         trace!("CheckPoint.nextXid = {}", checkpoint.nextXid.value);
 
@@ -80,10 +83,12 @@ impl<'a> WalIngest<'a> {
         lsn: Lsn,
         modification: &mut DatadirModification,
         decoded: &mut DecodedWALRecord,
-    ) -> Result<()> {
+    ) -> PageReconstructResult<()> {
         modification.lsn = lsn;
-        decode_wal_record(recdata, decoded, self.timeline.pg_version)
-            .context("failed decoding wal record")?;
+        try_prr!(
+            decode_wal_record(recdata, decoded, self.timeline.pg_version)
+                .context("failed decoding wal record")
+        );
 
         let mut buf = decoded.record.clone();
         buf.advance(decoded.main_data_offset);
@@ -98,7 +103,7 @@ impl<'a> WalIngest<'a> {
         if decoded.xl_rmid == pg_constants::RM_HEAP_ID
             || decoded.xl_rmid == pg_constants::RM_HEAP2_ID
         {
-            self.ingest_heapam_record(&mut buf, modification, decoded)?;
+            try_prr!(self.ingest_heapam_record(&mut buf, modification, decoded));
         }
         // Handle other special record types
         if decoded.xl_rmid == pg_constants::RM_SMGR_ID
@@ -106,13 +111,13 @@ impl<'a> WalIngest<'a> {
                 == pg_constants::XLOG_SMGR_CREATE
         {
             let create = XlSmgrCreate::decode(&mut buf);
-            self.ingest_xlog_smgr_create(modification, &create)?;
+            try_prr!(self.ingest_xlog_smgr_create(modification, &create));
         } else if decoded.xl_rmid == pg_constants::RM_SMGR_ID
             && (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
                 == pg_constants::XLOG_SMGR_TRUNCATE
         {
             let truncate = XlSmgrTruncate::decode(&mut buf);
-            self.ingest_xlog_smgr_truncate(modification, &truncate)?;
+            try_prr!(self.ingest_xlog_smgr_truncate(modification, &truncate));
         } else if decoded.xl_rmid == pg_constants::RM_DBASE_ID {
             debug!(
                 "handle RM_DBASE_ID for Postgres version {:?}",
@@ -125,14 +130,14 @@ impl<'a> WalIngest<'a> {
                     let createdb = XlCreateDatabase::decode(&mut buf);
                     debug!("XLOG_DBASE_CREATE v14");
 
-                    self.ingest_xlog_dbase_create(modification, &createdb)?;
+                    try_prr!(self.ingest_xlog_dbase_create(modification, &createdb));
                 } else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
                     == postgres_ffi::v14::bindings::XLOG_DBASE_DROP
                 {
                     let dropdb = XlDropDatabase::decode(&mut buf);
                     for tablespace_id in dropdb.tablespace_ids {
                         trace!("Drop db {}, {}", tablespace_id, dropdb.db_id);
-                        modification.drop_dbdir(tablespace_id, dropdb.db_id)?;
+                        try_prr!(modification.drop_dbdir(tablespace_id, dropdb.db_id));
                     }
                 }
             } else if self.timeline.pg_version == 15 {
@@ -148,14 +153,14 @@ impl<'a> WalIngest<'a> {
                     // So we can reuse XlCreateDatabase here.
                     debug!("XLOG_DBASE_CREATE_FILE_COPY");
                     let createdb = XlCreateDatabase::decode(&mut buf);
-                    self.ingest_xlog_dbase_create(modification, &createdb)?;
+                    try_prr!(self.ingest_xlog_dbase_create(modification, &createdb));
                 } else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
                     == postgres_ffi::v15::bindings::XLOG_DBASE_DROP
                 {
                     let dropdb = XlDropDatabase::decode(&mut buf);
                     for tablespace_id in dropdb.tablespace_ids {
                         trace!("Drop db {}, {}", tablespace_id, dropdb.db_id);
-                        modification.drop_dbdir(tablespace_id, dropdb.db_id)?;
+                        try_prr!(modification.drop_dbdir(tablespace_id, dropdb.db_id));
                     }
                 }
             }
@@ -167,38 +172,38 @@ impl<'a> WalIngest<'a> {
                 let pageno = buf.get_u32_le();
                 let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
                 let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
-                self.put_slru_page_image(
+                try_prr!(self.put_slru_page_image(
                     modification,
                     SlruKind::Clog,
                     segno,
                     rpageno,
                     ZERO_PAGE.clone(),
-                )?;
+                ));
             } else {
                 assert!(info == pg_constants::CLOG_TRUNCATE);
                 let xlrec = XlClogTruncate::decode(&mut buf);
-                self.ingest_clog_truncate_record(modification, &xlrec)?;
+                try_prr!(self.ingest_clog_truncate_record(modification, &xlrec));
             }
         } else if decoded.xl_rmid == pg_constants::RM_XACT_ID {
             let info = decoded.xl_info & pg_constants::XLOG_XACT_OPMASK;
             if info == pg_constants::XLOG_XACT_COMMIT || info == pg_constants::XLOG_XACT_ABORT {
                 let parsed_xact =
                     XlXactParsedRecord::decode(&mut buf, decoded.xl_xid, decoded.xl_info);
-                self.ingest_xact_record(
+                try_prr!(self.ingest_xact_record(
                     modification,
                     &parsed_xact,
                     info == pg_constants::XLOG_XACT_COMMIT,
-                )?;
+                ));
             } else if info == pg_constants::XLOG_XACT_COMMIT_PREPARED
                 || info == pg_constants::XLOG_XACT_ABORT_PREPARED
             {
                 let parsed_xact =
                     XlXactParsedRecord::decode(&mut buf, decoded.xl_xid, decoded.xl_info);
-                self.ingest_xact_record(
+                try_prr!(self.ingest_xact_record(
                     modification,
                     &parsed_xact,
                     info == pg_constants::XLOG_XACT_COMMIT_PREPARED,
-                )?;
+                ));
                 // Remove twophase file. see RemoveTwoPhaseFile() in postgres code
                 trace!(
                     "Drop twophaseFile for xid {} parsed_xact.xid {} here at {}",
@@ -206,9 +211,10 @@ impl<'a> WalIngest<'a> {
                     parsed_xact.xid,
                     lsn,
                 );
-                modification.drop_twophase_file(parsed_xact.xid)?;
+                try_prr!(modification.drop_twophase_file(parsed_xact.xid));
             } else if info == pg_constants::XLOG_XACT_PREPARE {
-                modification.put_twophase_file(decoded.xl_xid, Bytes::copy_from_slice(&buf[..]))?;
+                try_prr!(modification
+                    .put_twophase_file(decoded.xl_xid, Bytes::copy_from_slice(&buf[..])));
             }
         } else if decoded.xl_rmid == pg_constants::RM_MULTIXACT_ID {
             let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
@@ -217,34 +223,34 @@ impl<'a> WalIngest<'a> {
                 let pageno = buf.get_u32_le();
                 let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
                 let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
-                self.put_slru_page_image(
+                try_prr!(self.put_slru_page_image(
                     modification,
                     SlruKind::MultiXactOffsets,
                     segno,
                     rpageno,
                     ZERO_PAGE.clone(),
-                )?;
+                ));
             } else if info == pg_constants::XLOG_MULTIXACT_ZERO_MEM_PAGE {
                 let pageno = buf.get_u32_le();
                 let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
                 let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
-                self.put_slru_page_image(
+                try_prr!(self.put_slru_page_image(
                     modification,
                     SlruKind::MultiXactMembers,
                     segno,
                     rpageno,
                     ZERO_PAGE.clone(),
-                )?;
+                ));
             } else if info == pg_constants::XLOG_MULTIXACT_CREATE_ID {
                 let xlrec = XlMultiXactCreate::decode(&mut buf);
-                self.ingest_multixact_create_record(modification, &xlrec)?;
+                try_prr!(self.ingest_multixact_create_record(modification, &xlrec));
             } else if info == pg_constants::XLOG_MULTIXACT_TRUNCATE_ID {
                 let xlrec = XlMultiXactTruncate::decode(&mut buf);
-                self.ingest_multixact_truncate_record(modification, &xlrec)?;
+                try_prr!(self.ingest_multixact_truncate_record(modification, &xlrec));
             }
         } else if decoded.xl_rmid == pg_constants::RM_RELMAP_ID {
             let xlrec = XlRelmapUpdate::decode(&mut buf);
-            self.ingest_relmap_page(modification, &xlrec, decoded)?;
+            try_prr!(self.ingest_relmap_page(modification, &xlrec, decoded));
         } else if decoded.xl_rmid == pg_constants::RM_XLOG_ID {
             let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
             if info == pg_constants::XLOG_NEXTOID {
@@ -258,7 +264,9 @@ impl<'a> WalIngest<'a> {
             {
                 let mut checkpoint_bytes = [0u8; SIZEOF_CHECKPOINT];
                 buf.copy_to_slice(&mut checkpoint_bytes);
-                let xlog_checkpoint = CheckPoint::decode(&checkpoint_bytes)?;
+                let xlog_checkpoint = try_prr!(
+                    CheckPoint::decode(&checkpoint_bytes).context("deserialize CheckPoint")
+                );
                 trace!(
                     "xlog_checkpoint.oldestXid={}, checkpoint.oldestXid={}",
                     xlog_checkpoint.oldestXid,
@@ -279,22 +287,23 @@ impl<'a> WalIngest<'a> {
         // Iterate through all the blocks that the record modifies, and
         // "put" a separate copy of the record for each block.
         for blk in decoded.blocks.iter() {
-            self.ingest_decoded_block(modification, lsn, decoded, blk)?;
+            try_no_ondemand_download!(self.ingest_decoded_block(modification, lsn, decoded, blk));
         }
 
         // If checkpoint data was updated, store the new version in the repository
         if self.checkpoint_modified {
-            let new_checkpoint_bytes = self.checkpoint.encode()?;
+            let new_checkpoint_bytes =
+                try_prr!(self.checkpoint.encode().context("encode checkpoint"));
 
-            modification.put_checkpoint(new_checkpoint_bytes)?;
+            try_prr!(modification.put_checkpoint(new_checkpoint_bytes));
             self.checkpoint_modified = false;
         }
 
         // Now that this record has been fully handled, including updating the
         // checkpoint data, let the repository know that it is up-to-date to this LSN
-        modification.commit()?;
+        try_prr!(modification.commit());
 
-        Ok(())
+        PageReconstructResult::Success(())
     }
 
     fn ingest_decoded_block(
@@ -303,7 +312,7 @@ impl<'a> WalIngest<'a> {
         lsn: Lsn,
         decoded: &DecodedWALRecord,
         blk: &DecodedBkpBlock,
-    ) -> Result<()> {
+    ) -> PageReconstructResult<()> {
         let rel = RelTag {
             spcnode: blk.rnode_spcnode,
             dbnode: blk.rnode_dbnode,
@@ -323,7 +332,7 @@ impl<'a> WalIngest<'a> {
             && (decoded.xl_info == pg_constants::XLOG_FPI
                 || decoded.xl_info == pg_constants::XLOG_FPI_FOR_HINT)
         // compression of WAL is not yet supported: fall back to storing the original WAL record
-            && !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, self.timeline.pg_version)?
+            && !try_prr!(postgres_ffi::bkpimage_is_compressed(blk.bimg_info, self.timeline.pg_version))
         {
             // Extract page image from FPI record
             let img_len = blk.bimg_len as usize;
@@ -345,15 +354,20 @@ impl<'a> WalIngest<'a> {
                 page_set_lsn(&mut image, lsn)
             }
             assert_eq!(image.len(), BLCKSZ as usize);
-            self.put_rel_page_image(modification, rel, blk.blkno, image.freeze())?;
+            try_no_ondemand_download!(self.put_rel_page_image(
+                modification,
+                rel,
+                blk.blkno,
+                image.freeze()
+            ));
         } else {
             let rec = NeonWalRecord::Postgres {
                 will_init: blk.will_init || blk.apply_image,
                 rec: decoded.record.clone(),
             };
-            self.put_rel_wal_record(modification, rel, blk.blkno, rec)?;
+            try_prr!(self.put_rel_wal_record(modification, rel, blk.blkno, rec));
         }
-        Ok(())
+        PageReconstructResult::Success(())
     }
 
     fn ingest_heapam_record(
@@ -505,7 +519,7 @@ impl<'a> WalIngest<'a> {
         &mut self,
         modification: &mut DatadirModification,
         rec: &XlCreateDatabase,
-    ) -> Result<()> {
+    ) -> anyhow::Result<()> {
         let db_id = rec.db_id;
         let tablespace_id = rec.tablespace_id;
         let src_db_id = rec.src_db_id;
@@ -520,14 +534,16 @@ impl<'a> WalIngest<'a> {
 
         let rels = modification
             .tline
-            .list_rels(src_tablespace_id, src_db_id, req_lsn)?;
+            .list_rels(src_tablespace_id, src_db_id, req_lsn)
+            .no_ondemand_download()?;
 
         debug!("ingest_xlog_dbase_create: {} rels", rels.len());
 
         // Copy relfilemap
         let filemap = modification
             .tline
-            .get_relmap_file(src_tablespace_id, src_db_id, req_lsn)?;
+            .get_relmap_file(src_tablespace_id, src_db_id, req_lsn)
+            .no_ondemand_download()?;
         modification.put_relmap_file(tablespace_id, db_id, filemap)?;
 
         let mut num_rels_copied = 0;
@@ -536,7 +552,10 @@ impl<'a> WalIngest<'a> {
             assert_eq!(src_rel.spcnode, src_tablespace_id);
             assert_eq!(src_rel.dbnode, src_db_id);
 
-            let nblocks = modification.tline.get_rel_size(src_rel, req_lsn, true)?;
+            let nblocks = modification
+                .tline
+                .get_rel_size(src_rel, req_lsn, true)
+                .no_ondemand_download()?;
             let dst_rel = RelTag {
                 spcnode: tablespace_id,
                 dbnode: db_id,
@@ -553,7 +572,8 @@ impl<'a> WalIngest<'a> {
 
                 let content = modification
                     .tline
-                    .get_rel_page_at_lsn(src_rel, blknum, req_lsn, true)?;
+                    .get_rel_page_at_lsn(src_rel, blknum, req_lsn, true)
+                    .no_ondemand_download()?;
                 modification.put_rel_page_image(dst_rel, blknum, content)?;
                 num_blocks_copied += 1;
             }
@@ -657,7 +677,7 @@ impl<'a> WalIngest<'a> {
         modification: &mut DatadirModification,
         parsed: &XlXactParsedRecord,
         is_commit: bool,
-    ) -> Result<()> {
+    ) -> anyhow::Result<()> {
         // Record update of CLOG pages
         let mut pageno = parsed.xid / pg_constants::CLOG_XACTS_PER_PAGE;
         let mut segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
@@ -713,7 +733,11 @@ impl<'a> WalIngest<'a> {
                     relnode: xnode.relnode,
                 };
                 let last_lsn = self.timeline.get_last_record_lsn();
-                if modification.tline.get_rel_exists(rel, last_lsn, true)? {
+                if modification
+                    .tline
+                    .get_rel_exists(rel, last_lsn, true)
+                    .no_ondemand_download()?
+                {
                     self.put_rel_drop(modification, rel)?;
                 }
             }
@@ -725,7 +749,7 @@ impl<'a> WalIngest<'a> {
         &mut self,
         modification: &mut DatadirModification,
         xlrec: &XlClogTruncate,
-    ) -> Result<()> {
+    ) -> anyhow::Result<()> {
         info!(
             "RM_CLOG_ID truncate pageno {} oldestXid {} oldestXidDB {}",
             xlrec.pageno, xlrec.oldest_xid, xlrec.oldest_xid_db
@@ -767,7 +791,8 @@ impl<'a> WalIngest<'a> {
         let req_lsn = modification.tline.get_last_record_lsn();
         for segno in modification
             .tline
-            .list_slru_segments(SlruKind::Clog, req_lsn)?
+            .list_slru_segments(SlruKind::Clog, req_lsn)
+            .no_ondemand_download()?
         {
             let segpage = segno * pg_constants::SLRU_PAGES_PER_SEGMENT;
             if slru_may_delete_clogsegment(segpage, xlrec.pageno) {
@@ -923,10 +948,10 @@ impl<'a> WalIngest<'a> {
         rel: RelTag,
         blknum: BlockNumber,
         img: Bytes,
-    ) -> Result<()> {
-        self.handle_rel_extend(modification, rel, blknum)?;
-        modification.put_rel_page_image(rel, blknum, img)?;
-        Ok(())
+    ) -> PageReconstructResult<()> {
+        try_no_ondemand_download!(self.handle_rel_extend(modification, rel, blknum));
+        try_prr!(modification.put_rel_page_image(rel, blknum, img));
+        PageReconstructResult::Success(())
     }
 
     fn put_rel_wal_record(
@@ -936,7 +961,8 @@ impl<'a> WalIngest<'a> {
         blknum: BlockNumber,
         rec: NeonWalRecord,
     ) -> Result<()> {
-        self.handle_rel_extend(modification, rel, blknum)?;
+        self.handle_rel_extend(modification, rel, blknum)
+            .no_ondemand_download()?;
         modification.put_rel_wal_record(rel, blknum, rec)?;
         Ok(())
     }
@@ -946,7 +972,7 @@ impl<'a> WalIngest<'a> {
         modification: &mut DatadirModification,
         rel: RelTag,
         nblocks: BlockNumber,
-    ) -> Result<()> {
+    ) -> anyhow::Result<()> {
         modification.put_rel_truncation(rel, nblocks)?;
         Ok(())
     }
@@ -956,11 +982,17 @@ impl<'a> WalIngest<'a> {
         Ok(())
     }
 
-    fn get_relsize(&mut self, rel: RelTag, lsn: Lsn) -> Result<BlockNumber> {
-        let nblocks = if !self.timeline.get_rel_exists(rel, lsn, true)? {
+    fn get_relsize(&mut self, rel: RelTag, lsn: Lsn) -> anyhow::Result<BlockNumber> { // 0 if absent
+        let nblocks = if !self
+            .timeline
+            .get_rel_exists(rel, lsn, true)
+            .no_ondemand_download()? // NOTE(review): presumably errors rather than downloading - confirm
+        {
             0
         } else {
-            self.timeline.get_rel_size(rel, lsn, true)?
+            self.timeline
+                .get_rel_size(rel, lsn, true)
+                .no_ondemand_download()?
         };
         Ok(nblocks)
     }
@@ -970,30 +1002,31 @@ impl<'a> WalIngest<'a> {
         modification: &mut DatadirModification,
         rel: RelTag,
         blknum: BlockNumber,
-    ) -> Result<()> {
+    ) -> PageReconstructResult<()> {
         let new_nblocks = blknum + 1;
         // Check if the relation exists. We implicitly create relations on first
         // record.
         // TODO: would be nice if to be more explicit about it
         let last_lsn = modification.lsn;
-        let old_nblocks = if !self.timeline.get_rel_exists(rel, last_lsn, true)? {
-            // create it with 0 size initially, the logic below will extend it
-            modification.put_rel_creation(rel, 0)?;
-            0
-        } else {
-            self.timeline.get_rel_size(rel, last_lsn, true)?
-        };
+        let old_nblocks =
+            if !try_no_ondemand_download!(self.timeline.get_rel_exists(rel, last_lsn, true)) {
+                // create it with 0 size initially, the logic below will extend it
+                try_prr!(modification.put_rel_creation(rel, 0));
+                0
+            } else {
+                try_no_ondemand_download!(self.timeline.get_rel_size(rel, last_lsn, true))
+            };
 
         if new_nblocks > old_nblocks {
             //info!("extending {} {} to {}", rel, old_nblocks, new_nblocks);
-            modification.put_rel_extend(rel, new_nblocks)?;
+            try_prr!(modification.put_rel_extend(rel, new_nblocks));
 
             // fill the gap with zeros
             for gap_blknum in old_nblocks..blknum {
-                modification.put_rel_page_image(rel, gap_blknum, ZERO_PAGE.clone())?;
+                try_prr!(modification.put_rel_page_image(rel, gap_blknum, ZERO_PAGE.clone()));
             }
         }
-        Ok(())
+        PageReconstructResult::Success(())
     }
 
     fn put_slru_page_image(
@@ -1015,7 +1048,7 @@ impl<'a> WalIngest<'a> {
         kind: SlruKind,
         segno: u32,
         blknum: BlockNumber,
-    ) -> Result<()> {
+    ) -> anyhow::Result<()> {
         // we don't use a cache for this like we do for relations. SLRUS are explcitly
         // extended with ZEROPAGE records, not with commit records, so it happens
         // a lot less frequently.
@@ -1027,13 +1060,16 @@ impl<'a> WalIngest<'a> {
         let last_lsn = self.timeline.get_last_record_lsn();
         let old_nblocks = if !self
             .timeline
-            .get_slru_segment_exists(kind, segno, last_lsn)?
+            .get_slru_segment_exists(kind, segno, last_lsn)
+            .no_ondemand_download()?
         {
             // create it with 0 size initially, the logic below will extend it
             modification.put_slru_segment_creation(kind, segno, 0)?;
             0
         } else {
-            self.timeline.get_slru_segment_size(kind, segno, last_lsn)?
+            self.timeline
+                .get_slru_segment_size(kind, segno, last_lsn)
+                .no_ondemand_download()?
         };
 
         if new_nblocks > old_nblocks {
@@ -1099,58 +1135,103 @@ mod tests {
 
         let mut m = tline.begin_modification(Lsn(0x20));
         walingest.put_rel_creation(&mut m, TESTREL_A)?;
-        walingest.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"))?;
+        walingest
+            .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"))
+            .no_ondemand_download()?;
         m.commit()?;
         let mut m = tline.begin_modification(Lsn(0x30));
-        walingest.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 3"))?;
+        walingest
+            .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 3"))
+            .no_ondemand_download()?;
         m.commit()?;
         let mut m = tline.begin_modification(Lsn(0x40));
-        walingest.put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1 at 4"))?;
+        walingest
+            .put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1 at 4"))
+            .no_ondemand_download()?;
         m.commit()?;
         let mut m = tline.begin_modification(Lsn(0x50));
-        walingest.put_rel_page_image(&mut m, TESTREL_A, 2, TEST_IMG("foo blk 2 at 5"))?;
+        walingest
+            .put_rel_page_image(&mut m, TESTREL_A, 2, TEST_IMG("foo blk 2 at 5"))
+            .no_ondemand_download()?;
         m.commit()?;
 
         assert_current_logical_size(&*tline, Lsn(0x50));
 
         // The relation was created at LSN 2, not visible at LSN 1 yet.
-        assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x10), false)?, false);
-        assert!(tline.get_rel_size(TESTREL_A, Lsn(0x10), false).is_err());
+        assert_eq!(
+            tline
+                .get_rel_exists(TESTREL_A, Lsn(0x10), false)
+                .no_ondemand_download()?,
+            false
+        );
+        assert!(tline
+            .get_rel_size(TESTREL_A, Lsn(0x10), false)
+            .no_ondemand_download()
+            .is_err());
 
-        assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x20), false)?, true);
-        assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x20), false)?, 1);
-        assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x50), false)?, 3);
+        assert_eq!(
+            tline
+                .get_rel_exists(TESTREL_A, Lsn(0x20), false)
+                .no_ondemand_download()?,
+            true
+        );
+        assert_eq!(
+            tline
+                .get_rel_size(TESTREL_A, Lsn(0x20), false)
+                .no_ondemand_download()?,
+            1
+        );
+        assert_eq!(
+            tline
+                .get_rel_size(TESTREL_A, Lsn(0x50), false)
+                .no_ondemand_download()?,
+            3
+        );
 
         // Check page contents at each LSN
         assert_eq!(
-            tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x20), false)?,
+            tline
+                .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x20), false)
+                .no_ondemand_download()?,
             TEST_IMG("foo blk 0 at 2")
         );
 
         assert_eq!(
-            tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x30), false)?,
+            tline
+                .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x30), false)
+                .no_ondemand_download()?,
             TEST_IMG("foo blk 0 at 3")
         );
 
         assert_eq!(
-            tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x40), false)?,
+            tline
+                .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x40), false)
+                .no_ondemand_download()?,
             TEST_IMG("foo blk 0 at 3")
         );
         assert_eq!(
-            tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x40), false)?,
+            tline
+                .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x40), false)
+                .no_ondemand_download()?,
             TEST_IMG("foo blk 1 at 4")
         );
 
         assert_eq!(
-            tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x50), false)?,
+            tline
+                .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x50), false)
+                .no_ondemand_download()?,
             TEST_IMG("foo blk 0 at 3")
         );
         assert_eq!(
-            tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x50), false)?,
+            tline
+                .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x50), false)
+                .no_ondemand_download()?,
             TEST_IMG("foo blk 1 at 4")
         );
         assert_eq!(
-            tline.get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50), false)?,
+            tline
+                .get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50), false)
+                .no_ondemand_download()?,
             TEST_IMG("foo blk 2 at 5")
         );
 
@@ -1161,20 +1242,36 @@ mod tests {
         assert_current_logical_size(&*tline, Lsn(0x60));
 
         // Check reported size and contents after truncation
-        assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x60), false)?, 2);
         assert_eq!(
-            tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x60), false)?,
+            tline
+                .get_rel_size(TESTREL_A, Lsn(0x60), false)
+                .no_ondemand_download()?,
+            2
+        );
+        assert_eq!(
+            tline
+                .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x60), false)
+                .no_ondemand_download()?,
             TEST_IMG("foo blk 0 at 3")
         );
         assert_eq!(
-            tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x60), false)?,
+            tline
+                .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x60), false)
+                .no_ondemand_download()?,
             TEST_IMG("foo blk 1 at 4")
         );
 
         // should still see the truncated block with older LSN
-        assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x50), false)?, 3);
         assert_eq!(
-            tline.get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50), false)?,
+            tline
+                .get_rel_size(TESTREL_A, Lsn(0x50), false)
+                .no_ondemand_download()?,
+            3
+        );
+        assert_eq!(
+            tline
+                .get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50), false)
+                .no_ondemand_download()?,
             TEST_IMG("foo blk 2 at 5")
         );
 
@@ -1182,35 +1279,62 @@ mod tests {
         let mut m = tline.begin_modification(Lsn(0x68));
         walingest.put_rel_truncation(&mut m, TESTREL_A, 0)?;
         m.commit()?;
-        assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x68), false)?, 0);
+        assert_eq!(
+            tline
+                .get_rel_size(TESTREL_A, Lsn(0x68), false)
+                .no_ondemand_download()?,
+            0
+        );
 
         // Extend from 0 to 2 blocks, leaving a gap
         let mut m = tline.begin_modification(Lsn(0x70));
-        walingest.put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1"))?;
+        walingest
+            .put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1"))
+            .no_ondemand_download()?;
         m.commit()?;
-        assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x70), false)?, 2);
         assert_eq!(
-            tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x70), false)?,
+            tline
+                .get_rel_size(TESTREL_A, Lsn(0x70), false)
+                .no_ondemand_download()?,
+            2
+        );
+        assert_eq!(
+            tline
+                .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x70), false)
+                .no_ondemand_download()?,
             ZERO_PAGE
         );
         assert_eq!(
-            tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x70), false)?,
+            tline
+                .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x70), false)
+                .no_ondemand_download()?,
             TEST_IMG("foo blk 1")
         );
 
         // Extend a lot more, leaving a big gap that spans across segments
         let mut m = tline.begin_modification(Lsn(0x80));
-        walingest.put_rel_page_image(&mut m, TESTREL_A, 1500, TEST_IMG("foo blk 1500"))?;
+        walingest
+            .put_rel_page_image(&mut m, TESTREL_A, 1500, TEST_IMG("foo blk 1500"))
+            .no_ondemand_download()?;
         m.commit()?;
-        assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x80), false)?, 1501);
+        assert_eq!(
+            tline
+                .get_rel_size(TESTREL_A, Lsn(0x80), false)
+                .no_ondemand_download()?,
+            1501
+        );
         for blk in 2..1500 {
             assert_eq!(
-                tline.get_rel_page_at_lsn(TESTREL_A, blk, Lsn(0x80), false)?,
+                tline
+                    .get_rel_page_at_lsn(TESTREL_A, blk, Lsn(0x80), false)
+                    .no_ondemand_download()?,
                 ZERO_PAGE
             );
         }
         assert_eq!(
-            tline.get_rel_page_at_lsn(TESTREL_A, 1500, Lsn(0x80), false)?,
+            tline
+                .get_rel_page_at_lsn(TESTREL_A, 1500, Lsn(0x80), false)
+                .no_ondemand_download()?,
             TEST_IMG("foo blk 1500")
         );
 
@@ -1226,12 +1350,24 @@ mod tests {
         let mut walingest = init_walingest_test(&*tline)?;
 
         let mut m = tline.begin_modification(Lsn(0x20));
-        walingest.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"))?;
+        walingest
+            .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"))
+            .no_ondemand_download()?;
         m.commit()?;
 
         // Check that rel exists and size is correct
-        assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x20), false)?, true);
-        assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x20), false)?, 1);
+        assert_eq!(
+            tline
+                .get_rel_exists(TESTREL_A, Lsn(0x20), false)
+                .no_ondemand_download()?,
+            true
+        );
+        assert_eq!(
+            tline
+                .get_rel_size(TESTREL_A, Lsn(0x20), false)
+                .no_ondemand_download()?,
+            1
+        );
 
         // Drop rel
         let mut m = tline.begin_modification(Lsn(0x30));
@@ -1239,19 +1375,36 @@ mod tests {
         m.commit()?;
 
         // Check that rel is not visible anymore
-        assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x30), false)?, false);
+        assert_eq!(
+            tline
+                .get_rel_exists(TESTREL_A, Lsn(0x30), false)
+                .no_ondemand_download()?,
+            false
+        );
 
         // FIXME: should fail
         //assert!(tline.get_rel_size(TESTREL_A, Lsn(0x30), false)?.is_none());
 
         // Re-create it
         let mut m = tline.begin_modification(Lsn(0x40));
-        walingest.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 4"))?;
+        walingest
+            .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 4"))
+            .no_ondemand_download()?;
         m.commit()?;
 
         // Check that rel exists and size is correct
-        assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x40), false)?, true);
-        assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x40), false)?, 1);
+        assert_eq!(
+            tline
+                .get_rel_exists(TESTREL_A, Lsn(0x40), false)
+                .no_ondemand_download()?,
+            true
+        );
+        assert_eq!(
+            tline
+                .get_rel_size(TESTREL_A, Lsn(0x40), false)
+                .no_ondemand_download()?,
+            1
+        );
 
         Ok(())
     }
@@ -1270,23 +1423,45 @@ mod tests {
         let mut m = tline.begin_modification(Lsn(0x20));
         for blkno in 0..relsize {
             let data = format!("foo blk {} at {}", blkno, Lsn(0x20));
-            walingest.put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data))?;
+            walingest
+                .put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data))
+                .no_ondemand_download()?;
         }
         m.commit()?;
 
         // The relation was created at LSN 20, not visible at LSN 1 yet.
-        assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x10), false)?, false);
-        assert!(tline.get_rel_size(TESTREL_A, Lsn(0x10), false).is_err());
+        assert_eq!(
+            tline
+                .get_rel_exists(TESTREL_A, Lsn(0x10), false)
+                .no_ondemand_download()?,
+            false
+        );
+        assert!(tline
+            .get_rel_size(TESTREL_A, Lsn(0x10), false)
+            .no_ondemand_download()
+            .is_err());
 
-        assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x20), false)?, true);
-        assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x20), false)?, relsize);
+        assert_eq!(
+            tline
+                .get_rel_exists(TESTREL_A, Lsn(0x20), false)
+                .no_ondemand_download()?,
+            true
+        );
+        assert_eq!(
+            tline
+                .get_rel_size(TESTREL_A, Lsn(0x20), false)
+                .no_ondemand_download()?,
+            relsize
+        );
 
         // Check relation content
         for blkno in 0..relsize {
             let lsn = Lsn(0x20);
             let data = format!("foo blk {} at {}", blkno, lsn);
             assert_eq!(
-                tline.get_rel_page_at_lsn(TESTREL_A, blkno, lsn, false)?,
+                tline
+                    .get_rel_page_at_lsn(TESTREL_A, blkno, lsn, false)
+                    .no_ondemand_download()?,
                 TEST_IMG(&data)
             );
         }
@@ -1298,24 +1473,38 @@ mod tests {
         m.commit()?;
 
         // Check reported size and contents after truncation
-        assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x60), false)?, 1);
+        assert_eq!(
+            tline
+                .get_rel_size(TESTREL_A, Lsn(0x60), false)
+                .no_ondemand_download()?,
+            1
+        );
 
         for blkno in 0..1 {
             let lsn = Lsn(0x20);
             let data = format!("foo blk {} at {}", blkno, lsn);
             assert_eq!(
-                tline.get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x60), false)?,
+                tline
+                    .get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x60), false)
+                    .no_ondemand_download()?,
                 TEST_IMG(&data)
             );
         }
 
         // should still see all blocks with older LSN
-        assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x50), false)?, relsize);
+        assert_eq!(
+            tline
+                .get_rel_size(TESTREL_A, Lsn(0x50), false)
+                .no_ondemand_download()?,
+            relsize
+        );
         for blkno in 0..relsize {
             let lsn = Lsn(0x20);
             let data = format!("foo blk {} at {}", blkno, lsn);
             assert_eq!(
-                tline.get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x50), false)?,
+                tline
+                    .get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x50), false)
+                    .no_ondemand_download()?,
                 TEST_IMG(&data)
             );
         }
@@ -1326,18 +1515,32 @@ mod tests {
         let mut m = tline.begin_modification(lsn);
         for blkno in 0..relsize {
             let data = format!("foo blk {} at {}", blkno, lsn);
-            walingest.put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data))?;
+            walingest
+                .put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data))
+                .no_ondemand_download()?;
         }
         m.commit()?;
 
-        assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x80), false)?, true);
-        assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x80), false)?, relsize);
+        assert_eq!(
+            tline
+                .get_rel_exists(TESTREL_A, Lsn(0x80), false)
+                .no_ondemand_download()?,
+            true
+        );
+        assert_eq!(
+            tline
+                .get_rel_size(TESTREL_A, Lsn(0x80), false)
+                .no_ondemand_download()?,
+            relsize
+        );
         // Check relation content
         for blkno in 0..relsize {
             let lsn = Lsn(0x80);
             let data = format!("foo blk {} at {}", blkno, lsn);
             assert_eq!(
-                tline.get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x80), false)?,
+                tline
+                    .get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x80), false)
+                    .no_ondemand_download()?,
                 TEST_IMG(&data)
             );
         }
@@ -1358,14 +1561,18 @@ mod tests {
             lsn += 0x10;
             let mut m = tline.begin_modification(Lsn(lsn));
             let img = TEST_IMG(&format!("foo blk {} at {}", blknum, Lsn(lsn)));
-            walingest.put_rel_page_image(&mut m, TESTREL_A, blknum as BlockNumber, img)?;
+            walingest
+                .put_rel_page_image(&mut m, TESTREL_A, blknum as BlockNumber, img)
+                .no_ondemand_download()?;
             m.commit()?;
         }
 
         assert_current_logical_size(&*tline, Lsn(lsn));
 
         assert_eq!(
-            tline.get_rel_size(TESTREL_A, Lsn(lsn), false)?,
+            tline
+                .get_rel_size(TESTREL_A, Lsn(lsn), false)
+                .no_ondemand_download()?,
             RELSEG_SIZE + 1
         );
 
@@ -1374,7 +1581,12 @@ mod tests {
         let mut m = tline.begin_modification(Lsn(lsn));
         walingest.put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE)?;
         m.commit()?;
-        assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(lsn), false)?, RELSEG_SIZE);
+        assert_eq!(
+            tline
+                .get_rel_size(TESTREL_A, Lsn(lsn), false)
+                .no_ondemand_download()?,
+            RELSEG_SIZE
+        );
         assert_current_logical_size(&*tline, Lsn(lsn));
 
         // Truncate another block
@@ -1383,7 +1595,9 @@ mod tests {
         walingest.put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE - 1)?;
         m.commit()?;
         assert_eq!(
-            tline.get_rel_size(TESTREL_A, Lsn(lsn), false)?,
+            tline
+                .get_rel_size(TESTREL_A, Lsn(lsn), false)
+                .no_ondemand_download()?,
             RELSEG_SIZE - 1
         );
         assert_current_logical_size(&*tline, Lsn(lsn));
@@ -1397,7 +1611,9 @@ mod tests {
             walingest.put_rel_truncation(&mut m, TESTREL_A, size as BlockNumber)?;
             m.commit()?;
             assert_eq!(
-                tline.get_rel_size(TESTREL_A, Lsn(lsn), false)?,
+                tline
+                    .get_rel_size(TESTREL_A, Lsn(lsn), false)
+                    .no_ondemand_download()?,
                 size as BlockNumber
             );
 
diff --git a/pageserver/src/walreceiver/connection_manager.rs b/pageserver/src/walreceiver/connection_manager.rs
index a65703bca9..aeb7601af7 100644
--- a/pageserver/src/walreceiver/connection_manager.rs
+++ b/pageserver/src/walreceiver/connection_manager.rs
@@ -407,7 +407,7 @@ impl WalreceiverState {
                 .await
                 .context("walreceiver connection handling failure")
             }
-            .instrument(info_span!("walreceiver_connection", id = %id))
+            .instrument(info_span!("walreceiver_connection", id = %id, node_id = %new_sk_id))
         });
 
         let now = Utc::now().naive_utc();
diff --git a/pageserver/src/walreceiver/walreceiver_connection.rs b/pageserver/src/walreceiver/walreceiver_connection.rs
index 5b7e60aa5e..cc318cccc8 100644
--- a/pageserver/src/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/walreceiver/walreceiver_connection.rs
@@ -20,7 +20,9 @@ use tokio::{pin, select, sync::watch, time};
 use tokio_postgres::{replication::ReplicationStream, Client};
 use tracing::{debug, error, info, trace, warn};
 
-use crate::{metrics::LIVE_CONNECTIONS_COUNT, walreceiver::TaskStateUpdate};
+use crate::{
+    metrics::LIVE_CONNECTIONS_COUNT, tenant::with_ondemand_download, walreceiver::TaskStateUpdate,
+};
 use crate::{
     task_mgr,
     task_mgr::TaskKind,
@@ -248,9 +250,16 @@ pub async fn handle_walreceiver_connection(
                         // at risk of hitting a deadlock.
                         ensure!(lsn.is_aligned());
 
-                        walingest
-                            .ingest_record(recdata, lsn, &mut modification, &mut decoded)
-                            .context("could not ingest record at {lsn}")?;
+                        with_ondemand_download(|| {
+                            walingest.ingest_record(
+                                recdata.clone(),
+                                lsn,
+                                &mut modification,
+                                &mut decoded,
+                            )
+                        })
+                        .await
+                        .with_context(|| format!("could not ingest record at {lsn}"))?;
 
                         fail_point!("walreceiver-after-ingest");
 
diff --git a/pageserver/src/walrecord.rs b/pageserver/src/walrecord.rs
index 38fb9a4247..7581140934 100644
--- a/pageserver/src/walrecord.rs
+++ b/pageserver/src/walrecord.rs
@@ -1,6 +1,7 @@
 //!
 //! Functions for parsing WAL records.
 //!
+
 use anyhow::Result;
 use bytes::{Buf, Bytes};
 use postgres_ffi::pg_constants;
diff --git a/scripts/export_import_between_pageservers.py b/scripts/export_import_between_pageservers.py
index 8ea3f13bf5..d83a74ae14 100755
--- a/scripts/export_import_between_pageservers.py
+++ b/scripts/export_import_between_pageservers.py
@@ -318,14 +318,8 @@ def remote_consistent_lsn(
     detail = pageserver_http_client.timeline_detail(tenant, timeline)
 
     lsn_str = detail["remote_consistent_lsn"]
-    if lsn_str is None:
-        # No remote information at all. This happens right after creating
-        # a timeline, before any part of it has been uploaded to remote
-        # storage yet.
-        return 0
-    else:
-        assert isinstance(lsn_str, str)
-        return lsn_from_hex(lsn_str)
+    assert isinstance(lsn_str, str)
+    return lsn_from_hex(lsn_str)
 
 
 def wait_for_upload(
diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py
index 5fe6c43528..9236137d19 100644
--- a/test_runner/fixtures/metrics.py
+++ b/test_runner/fixtures/metrics.py
@@ -49,7 +49,7 @@ PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS: Tuple[str, ...] = (
 
 PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
     "pageserver_current_logical_size",
-    "pageserver_current_physical_size",
+    "pageserver_resident_physical_size",
     "pageserver_getpage_reconstruct_seconds_bucket",
     "pageserver_getpage_reconstruct_seconds_count",
     "pageserver_getpage_reconstruct_seconds_sum",
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index d52ca38447..5b00ebdea7 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -26,6 +26,7 @@ import asyncpg
 import backoff  # type: ignore
 import boto3
 import jwt
+import prometheus_client
 import psycopg2
 import pytest
 import requests
@@ -41,6 +42,7 @@ from fixtures.utils import (
     get_self_dir,
     subprocess_capture,
 )
+from prometheus_client.parser import text_string_to_metric_families
 
 # Type-related stuff
 from psycopg2.extensions import connection as PgConnection
@@ -1204,8 +1206,22 @@ class PageserverHttpClient(requests.Session):
         # there are no tests for those right now.
         return size
 
-    def timeline_list(self, tenant_id: TenantId) -> List[Dict[str, Any]]:
-        res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline")
+    def timeline_list(
+        self,
+        tenant_id: TenantId,
+        include_non_incremental_logical_size: bool = False,
+        include_timeline_dir_layer_file_size_sum: bool = False,
+    ) -> List[Dict[str, Any]]:
+
+        params = {}
+        if include_non_incremental_logical_size:
+            params["include-non-incremental-logical-size"] = "yes"
+        if include_timeline_dir_layer_file_size_sum:
+            params["include-timeline-dir-layer-file-size-sum"] = "yes"
+
+        res = self.get(
+            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline", params=params
+        )
         self.verbose_error(res)
         res_json = res.json()
         assert isinstance(res_json, list)
@@ -1239,13 +1255,13 @@ class PageserverHttpClient(requests.Session):
         tenant_id: TenantId,
         timeline_id: TimelineId,
         include_non_incremental_logical_size: bool = False,
-        include_non_incremental_physical_size: bool = False,
+        include_timeline_dir_layer_file_size_sum: bool = False,
     ) -> Dict[Any, Any]:
         params = {}
         if include_non_incremental_logical_size:
             params["include-non-incremental-logical-size"] = "yes"
-        if include_non_incremental_physical_size:
-            params["include-non-incremental-physical-size"] = "yes"
+        if include_timeline_dir_layer_file_size_sum:
+            params["include-timeline-dir-layer-file-size-sum"] = "yes"
 
         res = self.get(
             f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}",
@@ -1320,11 +1336,88 @@ class PageserverHttpClient(requests.Session):
         res_json = res.json()
         assert res_json is None
 
+    def timeline_spawn_download_remote_layers(
+        self, tenant_id: TenantId, timeline_id: TimelineId
+    ) -> dict[str, Any]:
+
+        res = self.post(
+            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/download_remote_layers",
+        )
+        self.verbose_error(res)
+        res_json = res.json()
+        assert res_json is not None
+        assert isinstance(res_json, dict)
+        return res_json
+
+    def timeline_poll_download_remote_layers_status(
+        self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        spawn_response: dict[str, Any],
+        poll_state=None,
+    ) -> None | dict[str, Any]:
+        res = self.get(
+            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/download_remote_layers",
+        )
+        self.verbose_error(res)
+        res_json = res.json()
+        assert res_json is not None
+        assert isinstance(res_json, dict)
+
+        # This API client assumes that nobody else spawns the task
+        assert res_json["task_id"] == spawn_response["task_id"]
+
+        if poll_state is None or res_json["state"] == poll_state:
+            return res_json
+        return None
+
+    def timeline_download_remote_layers(
+        self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        errors_ok=False,
+        at_least_one_download=True,
+    ):
+        res = self.timeline_spawn_download_remote_layers(tenant_id, timeline_id)
+        while True:
+            completed = self.timeline_poll_download_remote_layers_status(
+                tenant_id, timeline_id, res, poll_state="Completed"
+            )
+            if not completed:
+                time.sleep(0.1)
+                continue
+            if not errors_ok:
+                assert completed["failed_download_count"] == 0
+            if at_least_one_download:
+                assert completed["successful_download_count"] > 0
+            return completed
+
     def get_metrics(self) -> str:
         res = self.get(f"http://localhost:{self.port}/metrics")
         self.verbose_error(res)
         return res.text
 
+    def get_timeline_metric(self, tenant_id: TenantId, timeline_id: TimelineId, metric_name: str):
+        raw = self.get_metrics()
+        family: List[prometheus_client.Metric] = list(text_string_to_metric_families(raw))
+        [metric] = [m for m in family if m.name == metric_name]
+        [sample] = [
+            s
+            for s in metric.samples
+            if s.labels["tenant_id"] == str(tenant_id)
+            and s.labels["timeline_id"] == str(timeline_id)
+        ]
+        return sample.value
+
+    def get_metric_value(self, name: str) -> Optional[str]:
+        metrics = self.get_metrics()
+        relevant = [line for line in metrics.splitlines() if line.startswith(name)]
+        if len(relevant) == 0:
+            log.info(f'could not find metric "{name}"')
+            return None
+        assert len(relevant) == 1
+        return relevant[0].lstrip(name).strip()
+
 
 @dataclass
 class PageserverPort:
@@ -1622,7 +1715,12 @@ class NeonCli(AbstractNeonCli):
                 pageserver_config_override=self.env.pageserver.config_override,
             )
 
-            res = self.raw_cli(cmd)
+            s3_env_vars = None
+            if self.env.remote_storage is not None and isinstance(
+                self.env.remote_storage, S3Storage
+            ):
+                s3_env_vars = self.env.remote_storage.access_env_vars()
+            res = self.raw_cli(cmd, extra_env_vars=s3_env_vars)
             res.check_returncode()
             return res
 
@@ -2996,13 +3094,55 @@ def check_restored_datadir_content(
     assert (mismatch, error) == ([], [])
 
 
-def assert_no_in_progress_downloads_for_tenant(
-    pageserver_http_client: PageserverHttpClient,
-    tenant: TenantId,
+def wait_until(number_of_iterations: int, interval: float, func):
+    """
+    Wait until 'func' returns successfully, without exception. Returns the
+    last return value from the function.
+    """
+    last_exception = None
+    for i in range(number_of_iterations):
+        try:
+            res = func()
+        except Exception as e:
+            log.info("waiting for %s iteration %s failed", func, i + 1)
+            last_exception = e
+            time.sleep(interval)
+            continue
+        return res
+    raise Exception("timed out while waiting for %s" % func) from last_exception
+
+
+def wait_while(number_of_iterations: int, interval: float, func):
+    """
+    Wait until 'func' returns false, or throws an exception.
+    """
+    for i in range(number_of_iterations):
+        try:
+            if not func():
+                return
+            log.info("waiting for %s iteration %s failed", func, i + 1)
+            time.sleep(interval)
+            continue
+        except Exception:
+            return
+    raise Exception("timed out while waiting for %s" % func)
+
+
+def assert_tenant_status(
+    pageserver_http_client: PageserverHttpClient, tenant: TenantId, expected_status: str
 ):
     tenant_status = pageserver_http_client.tenant_status(tenant)
-    assert tenant_status["has_in_progress_downloads"] is False, tenant_status
-    assert tenant_status["state"] == "Active"
+    log.info(f"tenant_status: {tenant_status}")
+    assert tenant_status["state"] == expected_status, tenant_status
+
+
+def tenant_exists(ps_http: PageserverHttpClient, tenant_id: TenantId):
+    tenants = ps_http.tenant_list()
+    matching = [t for t in tenants if TenantId(t["id"]) == tenant_id]
+    assert len(matching) < 2
+    if len(matching) == 0:
+        return None
+    return matching[0]
 
 
 def remote_consistent_lsn(
@@ -3010,14 +3150,15 @@ def remote_consistent_lsn(
 ) -> Lsn:
     detail = pageserver_http_client.timeline_detail(tenant, timeline)
 
-    lsn_str = detail["remote_consistent_lsn"]
-    if lsn_str is None:
+    if detail["remote_consistent_lsn"] is None:
         # No remote information at all. This happens right after creating
         # a timeline, before any part of it has been uploaded to remote
         # storage yet.
         return Lsn(0)
-    assert isinstance(lsn_str, str)
-    return Lsn(lsn_str)
+    else:
+        lsn_str = detail["remote_consistent_lsn"]
+        assert isinstance(lsn_str, str)
+        return Lsn(lsn_str)
 
 
 def wait_for_upload(
@@ -3030,6 +3171,7 @@ def wait_for_upload(
     for i in range(20):
         current_lsn = remote_consistent_lsn(pageserver_http_client, tenant, timeline)
         if current_lsn >= lsn:
+            log.info("wait finished")
             return
         log.info(
             "waiting for remote_consistent_lsn to reach {}, now {}, iteration {}".format(
diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py
index 71964f622f..05d5788028 100644
--- a/test_runner/regress/test_broken_timeline.py
+++ b/test_runner/regress/test_broken_timeline.py
@@ -15,7 +15,7 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder):
 
     env.pageserver.allowed_errors.extend(
         [
-            ".*Failed to load delta layer.*",
+            ".*Failed to reconstruct the page.*",
             ".*could not find data for key.*",
             ".*is not active. Current state: Broken.*",
             ".*will not become active. Current state: Broken.*",
@@ -87,9 +87,9 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder):
         f"As expected, compute startup failed eagerly for timeline with corrupt metadata: {err}"
     )
 
-    # Second timeline has no ancestors, only the metadata file and no layer files.
-    # That is checked explicitly in the pageserver, and causes the tenant to be marked
-    # as broken.
+    # Second timeline has no ancestors, only the metadata file and no layer files locally,
+    # and we don't have the remote storage enabled. It is loaded into memory, but getting
+    # the basebackup from it will fail.
     with pytest.raises(
         Exception, match=f"Tenant {tenant2} will not become active. Current state: Broken"
     ) as err:
@@ -97,8 +97,9 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder):
     log.info(f"As expected, compute startup failed for timeline with missing layers: {err}")
 
     # Third timeline will also fail during basebackup, because the layer file is corrupt.
+    # It will fail when we try to read (and reconstruct) a page from it, ergo the error message.
     # (We don't check layer file contents on startup, when loading the timeline)
-    with pytest.raises(Exception, match="Failed to load delta layer") as err:
+    with pytest.raises(Exception, match="Failed to reconstruct the page") as err:
         pg3.start()
     log.info(
         f"As expected, compute startup failed for timeline {tenant3}/{timeline3} with corrupt layers: {err}"
diff --git a/test_runner/regress/test_metric_collection.py b/test_runner/regress/test_metric_collection.py
index 7f86d92962..fa1bf0fbb2 100644
--- a/test_runner/regress/test_metric_collection.py
+++ b/test_runner/regress/test_metric_collection.py
@@ -37,7 +37,7 @@ def metrics_handler(request: Request) -> Response:
 
     checks = {
         "written_size": lambda value: value > 0,
-        "physical_size": lambda value: value >= 0,
+        "resident_size": lambda value: value >= 0,
         # >= 0 check here is to avoid race condition when we receive metrics before
         # remote_uploaded is updated
         "remote_storage_size": lambda value: value > 0 if remote_uploaded > 0 else value >= 0,
diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py
new file mode 100644
index 0000000000..352ae4b95c
--- /dev/null
+++ b/test_runner/regress/test_ondemand_download.py
@@ -0,0 +1,437 @@
+# It's possible to run any regular test with the local fs remote storage via
+# env ZENITH_PAGESERVER_OVERRIDES="remote_storage={local_path='/tmp/neon_zzz/'}" poetry ......
+
+from pathlib import Path
+
+import pytest
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import (
+    NeonEnvBuilder,
+    RemoteStorageKind,
+    assert_tenant_status,
+    available_remote_storages,
+    wait_for_last_record_lsn,
+    wait_for_sk_commit_lsn_to_reach_remote_storage,
+    wait_for_upload,
+    wait_until,
+)
+from fixtures.types import Lsn
+from fixtures.utils import query_scalar
+
+
+def get_num_downloaded_layers(client, tenant_id, timeline_id):
+    value = client.get_metric_value(
+        f'pageserver_remote_operation_seconds_count{{file_kind="layer",op_kind="download",status="success",tenant_id="{tenant_id}",timeline_id="{timeline_id}"}}'
+    )
+    if value is None:
+        return 0
+    return int(value)
+
+
+#
+# If you have a large relation, check that the pageserver downloads parts of it as
+# required by queries.
+#
+@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
+def test_ondemand_download_large_rel(
+    neon_env_builder: NeonEnvBuilder,
+    remote_storage_kind: RemoteStorageKind,
+):
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=remote_storage_kind,
+        test_name="test_ondemand_download_large_rel",
+    )
+
+    ##### First start, insert data and upload it to the remote storage
+    env = neon_env_builder.init_start()
+
+    # Override defaults, to create more layers
+    tenant, _ = env.neon_cli.create_tenant(
+        conf={
+            # disable background GC
+            "gc_period": "10 m",
+            "gc_horizon": f"{10 * 1024 ** 3}",  # 10 GB
+            # small checkpoint distance to create more delta layer files
+            "checkpoint_distance": f"{10 * 1024 ** 2}",  # 10 MB
+            "compaction_threshold": "3",
+            "compaction_target_size": f"{10 * 1024 ** 2}",  # 10 MB
+        }
+    )
+    env.initial_tenant = tenant
+
+    pg = env.postgres.create_start("main")
+
+    client = env.pageserver.http_client()
+
+    tenant_id = pg.safe_psql("show neon.tenant_id")[0][0]
+    timeline_id = pg.safe_psql("show neon.timeline_id")[0][0]
+
+    # We want to make sure that the data is large enough that the keyspace is partitioned.
+    num_rows = 1000000
+
+    with pg.cursor() as cur:
+        # data loading may take a while, so increase statement timeout
+        cur.execute("SET statement_timeout='300s'")
+        cur.execute(
+            f"""CREATE TABLE tbl AS SELECT g as id, 'long string to consume some space' || g
+        from generate_series(1,{num_rows}) g"""
+        )
+        cur.execute("CREATE INDEX ON tbl (id)")
+        cur.execute("VACUUM tbl")
+
+        current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
+
+    # wait until pageserver receives that data
+    wait_for_last_record_lsn(client, tenant_id, timeline_id, current_lsn)
+
+    # run checkpoint manually to be sure that data landed in remote storage
+    client.timeline_checkpoint(tenant_id, timeline_id)
+
+    # wait until pageserver successfully uploaded a checkpoint to remote storage
+    wait_for_upload(client, tenant_id, timeline_id, current_lsn)
+    log.info("uploads have finished")
+
+    ##### Stop the first pageserver instance, erase all its data
+    pg.stop()
+    env.pageserver.stop()
+
+    # remove all the layer files
+    for layer in (Path(env.repo_dir) / "tenants").glob("*/timelines/*/*-*_*"):
+        log.info(f"unlinking layer {layer}")
+        layer.unlink()
+
+    ##### Second start, restore the data and ensure it's the same
+    env.pageserver.start()
+
+    pg.start()
+    before_downloads = get_num_downloaded_layers(client, tenant_id, timeline_id)
+
+    # Probe in the middle of the table. There's a high chance that the beginning
+    # and end of the table were stored together in the same layer files with data
+    # from other tables, and with the entry that stores the size of the
+    # relation, so they are likely already downloaded. But the middle of the
+    # table should not have been needed by anything yet.
+    with pg.cursor() as cur:
+        assert query_scalar(cur, "select count(*) from tbl where id = 500000") == 1
+
+    after_downloads = get_num_downloaded_layers(client, tenant_id, timeline_id)
+    log.info(f"layers downloaded before {before_downloads} and after {after_downloads}")
+    assert after_downloads > before_downloads
+
+
+#
+# If you have a relation with a long history of updates, the pageserver downloads the layer
+# files containing the history as needed by timetravel queries.
+#
+@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
+def test_ondemand_download_timetravel(
+    neon_env_builder: NeonEnvBuilder,
+    remote_storage_kind: RemoteStorageKind,
+):
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=remote_storage_kind,
+        test_name="test_ondemand_download_timetravel",
+    )
+
+    ##### First start, insert data and upload it to the remote storage
+    env = neon_env_builder.init_start()
+
+    # Override defaults, to create more layers
+    tenant, _ = env.neon_cli.create_tenant(
+        conf={
+            # Disable background GC & compaction
+            # We don't want GC, that would break the assertion about num downloads.
+            # We don't want background compaction, we force a compaction every time we do explicit checkpoint.
+            "gc_period": "0s",
+            "compaction_period": "0s",
+            # small checkpoint distance to create more delta layer files
+            "checkpoint_distance": f"{1 * 1024 ** 2}",  # 1 MB
+            "compaction_threshold": "1",
+            "image_creation_threshold": "1",
+            "compaction_target_size": f"{1 * 1024 ** 2}",  # 1 MB
+        }
+    )
+    env.initial_tenant = tenant
+
+    pg = env.postgres.create_start("main")
+
+    client = env.pageserver.http_client()
+
+    tenant_id = pg.safe_psql("show neon.tenant_id")[0][0]
+    timeline_id = pg.safe_psql("show neon.timeline_id")[0][0]
+
+    lsns = []
+
+    table_len = 10000
+    with pg.cursor() as cur:
+        cur.execute(
+            f"""
+        CREATE TABLE testtab(id serial primary key, checkpoint_number int, data text);
+        INSERT INTO testtab (checkpoint_number, data) SELECT 0, 'data' FROM generate_series(1, {table_len});
+        """
+        )
+        current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
+    # wait until pageserver receives that data
+    wait_for_last_record_lsn(client, tenant_id, timeline_id, current_lsn)
+    # run checkpoint manually to be sure that data landed in remote storage
+    client.timeline_checkpoint(tenant_id, timeline_id)
+    lsns.append((0, current_lsn))
+
+    for checkpoint_number in range(1, 20):
+        with pg.cursor() as cur:
+            cur.execute(f"UPDATE testtab SET checkpoint_number = {checkpoint_number}")
+            current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
+        lsns.append((checkpoint_number, current_lsn))
+
+        # wait until pageserver receives that data
+        wait_for_last_record_lsn(client, tenant_id, timeline_id, current_lsn)
+
+        # run checkpoint manually to be sure that data landed in remote storage
+        client.timeline_checkpoint(tenant_id, timeline_id)
+
+    # wait until pageserver successfully uploaded a checkpoint to remote storage
+    wait_for_upload(client, tenant_id, timeline_id, current_lsn)
+    log.info("uploads have finished")
+
+    ##### Stop the first pageserver instance, erase all its data
+    env.postgres.stop_all()
+
+    wait_for_sk_commit_lsn_to_reach_remote_storage(
+        tenant_id, timeline_id, env.safekeepers, env.pageserver
+    )
+
+    def get_api_current_physical_size():
+        d = client.timeline_detail(tenant_id, timeline_id)
+        return d["current_physical_size"]
+
+    def get_resident_physical_size():
+        return client.get_timeline_metric(
+            tenant_id, timeline_id, "pageserver_resident_physical_size"
+        )
+
+    filled_current_physical = get_api_current_physical_size()
+    log.info(filled_current_physical)
+    filled_size = get_resident_physical_size()
+    log.info(filled_size)
+    assert filled_current_physical == filled_size, "we don't yet do layer eviction"
+
+    env.pageserver.stop()
+
+    # remove all the layer files
+    for layer in (Path(env.repo_dir) / "tenants").glob("*/timelines/*/*-*_*"):
+        log.info(f"unlinking layer {layer}")
+        layer.unlink()
+
+    ##### Second start, restore the data and ensure it's the same
+    env.pageserver.start()
+
+    wait_until(10, 0.2, lambda: assert_tenant_status(client, tenant_id, "Active"))
+
+    # current_physical_size reports sum of layer file sizes, regardless of local or remote
+    assert filled_current_physical == get_api_current_physical_size()
+
+    num_layers_downloaded = [0]
+    physical_size = [get_resident_physical_size()]
+    for (checkpoint_number, lsn) in lsns:
+        pg_old = env.postgres.create_start(
+            branch_name="main", node_name=f"test_old_lsn_{checkpoint_number}", lsn=lsn
+        )
+        with pg_old.cursor() as cur:
+            # assert query_scalar(cur, f"select count(*) from testtab where checkpoint_number={checkpoint_number}") == 100000
+            assert (
+                query_scalar(
+                    cur,
+                    f"select count(*) from testtab where checkpoint_number<>{checkpoint_number}",
+                )
+                == 0
+            )
+            assert (
+                query_scalar(
+                    cur,
+                    f"select count(*) from testtab where checkpoint_number={checkpoint_number}",
+                )
+                == table_len
+            )
+
+        after_downloads = get_num_downloaded_layers(client, tenant_id, timeline_id)
+        num_layers_downloaded.append(after_downloads)
+        log.info(f"num_layers_downloaded[-1]={num_layers_downloaded[-1]}")
+
+        # Check that on each query, we need to download at least one more layer file. However in
+        # practice, thanks to compaction and the fact that some requests need to download
+        # more history, some points-in-time are covered by earlier downloads already. But
+        # in broad strokes, as we query more points-in-time, more layers need to be downloaded.
+        #
+        # Do a fuzzy check on that, by checking that after each point-in-time, we have downloaded
+        # more files than we had three iterations ago.
+        log.info(f"layers downloaded after checkpoint {checkpoint_number}: {after_downloads}")
+        if len(num_layers_downloaded) > 4:
+            assert after_downloads > num_layers_downloaded[len(num_layers_downloaded) - 4]
+
+        # Likewise, assert that the physical_size metric grows as layers are downloaded
+        physical_size.append(get_resident_physical_size())
+        log.info(f"physical_size[-1]={physical_size[-1]}")
+        if len(physical_size) > 4:
+            assert physical_size[-1] > physical_size[len(physical_size) - 4]
+
+        # current_physical_size reports sum of layer file sizes, regardless of local or remote
+        assert filled_current_physical == get_api_current_physical_size()
+
+
+#
+# Ensure that the `download_remote_layers` API works
+#
+@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
+def test_download_remote_layers_api(
+    neon_env_builder: NeonEnvBuilder,
+    remote_storage_kind: RemoteStorageKind,
+):
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=remote_storage_kind,
+        test_name="test_download_remote_layers_api",
+    )
+
+    ##### First start, insert data and upload it to the remote storage
+    env = neon_env_builder.init_start()
+
+    # Override defaults, to create more layers
+    tenant, _ = env.neon_cli.create_tenant(
+        conf={
+            # Disable background GC & compaction
+            # We don't want GC, that would break the assertion about num downloads.
+            # We don't want background compaction, we force a compaction every time we do explicit checkpoint.
+            "gc_period": "0s",
+            "compaction_period": "0s",
+            # small checkpoint distance to create more delta layer files
+            "checkpoint_distance": f"{1 * 1024 ** 2}",  # 1 MB
+            "compaction_threshold": "1",
+            "image_creation_threshold": "1",
+            "compaction_target_size": f"{1 * 1024 ** 2}",  # 1 MB
+        }
+    )
+    env.initial_tenant = tenant
+
+    pg = env.postgres.create_start("main")
+
+    client = env.pageserver.http_client()
+
+    tenant_id = pg.safe_psql("show neon.tenant_id")[0][0]
+    timeline_id = pg.safe_psql("show neon.timeline_id")[0][0]
+
+    table_len = 10000
+    with pg.cursor() as cur:
+        cur.execute(
+            f"""
+        CREATE TABLE testtab(id serial primary key, checkpoint_number int, data text);
+        INSERT INTO testtab (checkpoint_number, data) SELECT 0, 'data' FROM generate_series(1, {table_len});
+        """
+        )
+
+    env.postgres.stop_all()
+
+    wait_for_sk_commit_lsn_to_reach_remote_storage(
+        tenant_id, timeline_id, env.safekeepers, env.pageserver
+    )
+
+    def get_api_current_physical_size():
+        d = client.timeline_detail(tenant_id, timeline_id)
+        return d["current_physical_size"]
+
+    def get_resident_physical_size():
+        return client.get_timeline_metric(
+            tenant_id, timeline_id, "pageserver_resident_physical_size"
+        )
+
+    filled_current_physical = get_api_current_physical_size()
+    log.info(filled_current_physical)
+    filled_size = get_resident_physical_size()
+    log.info(filled_size)
+    assert filled_current_physical == filled_size, "we don't yet do layer eviction"
+
+    env.pageserver.stop()
+
+    # remove all the layer files
+    # XXX only delete some of the layer files, to show that it really just downloads all the layers
+    for layer in (Path(env.repo_dir) / "tenants").glob("*/timelines/*/*-*_*"):
+        log.info(f"unlinking layer {layer}")
+        layer.unlink()
+
+    # Shut down safekeepers before starting the pageserver.
+    # If we don't, the tenant's walreceiver handler will trigger the
+    # logical size computation task, and that downloads layers,
+    # which makes our assertions on size fail.
+    for sk in env.safekeepers:
+        sk.stop(immediate=True)
+
+    ##### Second start, restore the data and ensure it's the same
+    env.pageserver.start(extra_env_vars={"FAILPOINTS": "remote-storage-download-pre-rename=return"})
+    env.pageserver.allowed_errors.extend(
+        [
+            f".*download_all_remote_layers.*{tenant_id}.*{timeline_id}.*layer download failed.*remote-storage-download-pre-rename failpoint",
+            f".*initial size calculation.*{tenant_id}.*{timeline_id}.*Failed to calculate logical size",
+        ]
+    )
+
+    wait_until(10, 0.2, lambda: assert_tenant_status(client, tenant_id, "Active"))
+
+    ###### Phase 1: exercise download error code path
+    assert (
+        filled_current_physical == get_api_current_physical_size()
+    ), "current_physical_size is sum of loaded layer sizes, independent of whether local or remote"
+    post_unlink_size = get_resident_physical_size()
+    log.info(post_unlink_size)
+    assert (
+        post_unlink_size < filled_size
+    ), "we just deleted layers and didn't cause anything to re-download them yet"
+    assert filled_size - post_unlink_size > 5 * (
+        1024**2
+    ), "we may be downloading some layers as part of tenant activation"
+
+    # issue downloads that we know will fail
+    info = client.timeline_download_remote_layers(
+        tenant_id, timeline_id, errors_ok=True, at_least_one_download=False
+    )
+    log.info(f"info={info}")
+    assert info["state"] == "Completed"
+    assert info["total_layer_count"] > 0
+    assert info["successful_download_count"] == 0
+    assert (
+        info["failed_download_count"] > 0
+    )  # can't assert == total_layer_count because attach + tenant status downloads some layers
+    assert (
+        info["total_layer_count"]
+        == info["successful_download_count"] + info["failed_download_count"]
+    )
+    assert get_api_current_physical_size() == filled_current_physical
+    assert (
+        get_resident_physical_size() == post_unlink_size
+    ), "didn't download anything new due to failpoint"
+    # would be nice to assert that the layers in the layer map are still RemoteLayer
+
+    ##### Retry, this time without failpoints
+    client.configure_failpoints(("remote-storage-download-pre-rename", "off"))
+    info = client.timeline_download_remote_layers(tenant_id, timeline_id, errors_ok=False)
+    log.info(f"info={info}")
+
+    assert info["state"] == "Completed"
+    assert info["total_layer_count"] > 0
+    assert info["successful_download_count"] > 0
+    assert info["failed_download_count"] == 0
+    assert (
+        info["total_layer_count"]
+        == info["successful_download_count"] + info["failed_download_count"]
+    )
+
+    refilled_size = get_resident_physical_size()
+    log.info(refilled_size)
+
+    assert filled_size == refilled_size, "we redownloaded all the layers"
+    assert get_api_current_physical_size() == filled_current_physical
+
+    for sk in env.safekeepers:
+        sk.start()
+
+    # ensure that all the data is back
+    pg_old = env.postgres.create_start(branch_name="main")
+    with pg_old.cursor() as cur:
+        assert query_scalar(cur, "select count(*) from testtab") == table_len
diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py
index 94e483cdb5..32c25b2e8c 100644
--- a/test_runner/regress/test_remote_storage.py
+++ b/test_runner/regress/test_remote_storage.py
@@ -14,7 +14,6 @@ from fixtures.neon_fixtures import (
     NeonEnvBuilder,
     PageserverApiException,
     RemoteStorageKind,
-    assert_no_in_progress_downloads_for_tenant,
     available_remote_storages,
     wait_for_last_flush_lsn,
     wait_for_last_record_lsn,
@@ -62,9 +61,9 @@ def test_remote_storage_backup_and_restore(
     neon_env_builder.pageserver_config_override = "test_remote_failures=1"
 
     data_id = 1
-    data_secret = "very secret secret"
+    data = "just some data"
 
-    ##### First start, insert secret data and upload it to the remote storage
+    ##### First start, insert data and upload it to the remote storage
     env = neon_env_builder.init_start()
 
     # FIXME: Is this expected?
@@ -97,8 +96,8 @@ def test_remote_storage_backup_and_restore(
         with pg.cursor() as cur:
             cur.execute(
                 f"""
-                CREATE TABLE t{checkpoint_number}(id int primary key, secret text);
-                INSERT INTO t{checkpoint_number} VALUES ({data_id}, '{data_secret}|{checkpoint_number}');
+                CREATE TABLE t{checkpoint_number}(id int primary key, data text);
+                INSERT INTO t{checkpoint_number} VALUES ({data_id}, '{data}|{checkpoint_number}');
             """
             )
             current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
@@ -133,36 +132,53 @@ def test_remote_storage_backup_and_restore(
     ##### Second start, restore the data and ensure it's the same
     env.pageserver.start()
 
-    # Introduce failpoint in download
-    pageserver_http.configure_failpoints(("remote-storage-download-pre-rename", "return"))
-
+    # Introduce failpoint in list remote timelines code path to make tenant_attach fail.
+    # This is before the failures injected by test_remote_failures, so it's a permanent error.
+    pageserver_http.configure_failpoints(("storage-sync-list-remote-timelines", "return"))
+    env.pageserver.allowed_errors.append(
+        ".*error attaching tenant: storage-sync-list-remote-timelines",
+    )
+    # Attach it. This HTTP request will succeed and launch a
+    # background task to load the tenant. In that background task,
+    # listing the remote timelines will fail because of the failpoint,
+    # and the tenant will be marked as Broken.
     client.tenant_attach(tenant_id)
-
-    # is there a better way to assert that failpoint triggered?
     wait_until_tenant_state(pageserver_http, tenant_id, "Broken", 15)
 
-    # assert cannot attach timeline that is scheduled for download
-    # FIXME implement layer download retries
+    # Ensure that even though the tenant is broken, we can't attach it again.
     with pytest.raises(Exception, match=f"tenant {tenant_id} already exists, state: Broken"):
         client.tenant_attach(tenant_id)
 
-    tenant_status = client.tenant_status(tenant_id)
-    log.info("Tenant status with active failpoint: %s", tenant_status)
-    # FIXME implement layer download retries
-    # assert tenant_status["has_in_progress_downloads"] is True
-
-    # trigger temporary download files removal
+    # Restart again, this implicitly clears the failpoint.
+    # test_remote_failures=1 remains active, though, as it's in the pageserver config.
+    # This means that any of the remote client operations after restart will exercise the
+    # retry code path.
+    #
+    # The initiated attach operation should survive the restart, and continue from where it was.
     env.pageserver.stop()
+    layer_download_failed_regex = (
+        r"download.*[0-9A-F]+-[0-9A-F]+.*open a download stream for layer.*simulated failure"
+    )
+    assert not env.pageserver.log_contains(
+        layer_download_failed_regex
+    ), "we shouldn't have tried any layer downloads yet since list remote timelines has a failpoint"
     env.pageserver.start()
 
-    # ensure that an initiated attach operation survives pageserver restart
+    # Ensure that the pageserver remembers that the tenant was attaching, by
+    # trying to attach it again. It should fail.
     with pytest.raises(Exception, match=f"tenant {tenant_id} already exists, state:"):
         client.tenant_attach(tenant_id)
-    log.info("waiting for timeline redownload")
+    log.info("waiting for tenant to become active. this should be quick with on-demand download")
+
+    def tenant_active():
+        all_states = client.tenant_list()
+        [tenant] = [t for t in all_states if TenantId(t["id"]) == tenant_id]
+        assert tenant["state"] == "Active"
+
     wait_until(
-        number_of_iterations=20,
+        number_of_iterations=5,
         interval=1,
-        func=lambda: assert_no_in_progress_downloads_for_tenant(client, tenant_id),
+        func=tenant_active,
     )
 
     detail = client.timeline_detail(tenant_id, timeline_id)
@@ -171,14 +187,18 @@ def test_remote_storage_backup_and_restore(
         Lsn(detail["last_record_lsn"]) >= current_lsn
     ), "current db Lsn should should not be less than the one stored on remote storage"
 
+    log.info("select some data, this will cause layers to be downloaded")
     pg = env.postgres.create_start("main")
     with pg.cursor() as cur:
         for checkpoint_number in checkpoint_numbers:
             assert (
-                query_scalar(cur, f"SELECT secret FROM t{checkpoint_number} WHERE id = {data_id};")
-                == f"{data_secret}|{checkpoint_number}"
+                query_scalar(cur, f"SELECT data FROM t{checkpoint_number} WHERE id = {data_id};")
+                == f"{data}|{checkpoint_number}"
             )
 
+    log.info("ensure that we neede to retry downloads due to test_remote_failures=1")
+    assert env.pageserver.log_contains(layer_download_failed_regex)
+
 
 # Exercises the upload queue retry code paths.
 # - Use failpoints to cause all storage ops to fail
@@ -338,7 +358,6 @@ def test_remote_storage_upload_queue_retries(
     def tenant_active():
         all_states = client.tenant_list()
         [tenant] = [t for t in all_states if TenantId(t["id"]) == tenant_id]
-        assert tenant["has_in_progress_downloads"] is False
         assert tenant["state"] == "Active"
 
     wait_until(30, 1, tenant_active)
diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py
index 081fd0fc2f..1b58937e2a 100644
--- a/test_runner/regress/test_tenant_relocation.py
+++ b/test_runner/regress/test_tenant_relocation.py
@@ -13,12 +13,15 @@ from fixtures.neon_fixtures import (
     PageserverHttpClient,
     PortDistributor,
     Postgres,
-    assert_no_in_progress_downloads_for_tenant,
+    assert_tenant_status,
+    tenant_exists,
     wait_for_last_record_lsn,
     wait_for_upload,
+    wait_until,
+    wait_while,
 )
 from fixtures.types import Lsn, TenantId, TimelineId
-from fixtures.utils import query_scalar, start_in_background, subprocess_capture, wait_until
+from fixtures.utils import query_scalar, start_in_background, subprocess_capture
 
 
 def assert_abs_margin_ratio(a: float, b: float, margin_ratio: float):
@@ -406,17 +409,13 @@ def test_tenant_relocation(
             # call to attach timeline to new pageserver
             new_pageserver_http.tenant_attach(tenant_id)
 
-            # check that it shows that download is in progress
+            # wait for tenant to finish attaching
             tenant_status = new_pageserver_http.tenant_status(tenant_id=tenant_id)
-            assert tenant_status.get("has_in_progress_downloads"), tenant_status
-
-            # wait until tenant is downloaded
+            assert tenant_status["state"] in ["Attaching", "Active"]
             wait_until(
                 number_of_iterations=10,
                 interval=1,
-                func=lambda: assert_no_in_progress_downloads_for_tenant(
-                    new_pageserver_http, tenant_id
-                ),
+                func=lambda: assert_tenant_status(new_pageserver_http, tenant_id, "Active"),
             )
 
             check_timeline_attached(
@@ -459,9 +458,15 @@ def test_tenant_relocation(
 
         # detach tenant from old pageserver before we check
         # that all the data is there to be sure that old pageserver
-        # is no longer involved, and if it is, we will see the errors
+        # is no longer involved, and if it is, we will see the error
         pageserver_http.tenant_detach(tenant_id)
 
+        # Wait a little, so that the detach operation has time to finish.
+        wait_while(
+            number_of_iterations=100,
+            interval=1,
+            func=lambda: tenant_exists(pageserver_http, tenant_id),
+        )
         post_migration_check(pg_main, 500500, old_local_path_main)
         post_migration_check(pg_second, 1001000, old_local_path_second)
 
diff --git a/test_runner/regress/test_tenant_tasks.py b/test_runner/regress/test_tenant_tasks.py
index ddae1a67ff..4eba4ce942 100644
--- a/test_runner/regress/test_tenant_tasks.py
+++ b/test_runner/regress/test_tenant_tasks.py
@@ -20,44 +20,48 @@ def test_tenant_tasks(neon_env_builder: NeonEnvBuilder):
         matching = [t for t in all_states if TenantId(t["id"]) == tenant]
         return get_only_element(matching)["state"]
 
-    def get_metric_value(name):
-        metrics = client.get_metrics()
-        relevant = [line for line in metrics.splitlines() if line.startswith(name)]
-        if len(relevant) == 0:
-            return 0
-        line = get_only_element(relevant)
-        value = line.lstrip(name).strip()
-        return int(value)
-
     def delete_all_timelines(tenant: TenantId):
         timelines = [TimelineId(t["timeline_id"]) for t in client.timeline_list(tenant)]
         for t in timelines:
             client.timeline_delete(tenant, t)
 
+    def assert_active(tenant):
+        assert get_state(tenant) == "Active"
+
     # Create tenant, start compute
     tenant, _ = env.neon_cli.create_tenant()
     env.neon_cli.create_timeline(name, tenant_id=tenant)
     pg = env.postgres.create_start(name, tenant_id=tenant)
+    assert (
+        get_state(tenant) == "Active"
+    ), "Pageserver should activate a tenant and start background jobs if timelines are loaded"
 
     # Stop compute
     pg.stop()
 
-    # Delete all timelines on all tenants
+    # Delete all timelines on all tenants.
+    #
+    # FIXME: we used to check that the background jobs are stopped when all timelines
+    # are removed, but we don't stop them anymore. Not sure if this test still makes sense
+    # or we should just remove it.
     for tenant_info in client.tenant_list():
         tenant_id = TenantId(tenant_info["id"])
         delete_all_timelines(tenant_id)
+        wait_until(10, 0.2, lambda: assert_active(tenant_id))
 
     # Assert that all tasks finish quickly after tenant is detached
-    assert get_metric_value('pageserver_tenant_task_events{event="start"}') > 0
+    task_starts = client.get_metric_value('pageserver_tenant_task_events{event="start"}')
+    assert task_starts is not None
+    assert int(task_starts) > 0
     client.tenant_detach(tenant)
     client.tenant_detach(env.initial_tenant)
 
     def assert_tasks_finish():
-        tasks_started = get_metric_value('pageserver_tenant_task_events{event="start"}')
-        tasks_ended = get_metric_value('pageserver_tenant_task_events{event="stop"}')
-        tasks_panicked = get_metric_value('pageserver_tenant_task_events{event="panic"}')
+        tasks_started = client.get_metric_value('pageserver_tenant_task_events{event="start"}')
+        tasks_ended = client.get_metric_value('pageserver_tenant_task_events{event="stop"}')
+        tasks_panicked = client.get_metric_value('pageserver_tenant_task_events{event="panic"}')
         log.info(f"started {tasks_started}, ended {tasks_ended}, panicked {tasks_panicked}")
         assert tasks_started == tasks_ended
-        assert tasks_panicked == 0
+        assert tasks_panicked is None or int(tasks_panicked) == 0
 
     wait_until(10, 0.2, assert_tasks_finish)
diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py
index 4cd74e17e9..6a5b4278da 100644
--- a/test_runner/regress/test_tenants_with_remote_storage.py
+++ b/test_runner/regress/test_tenants_with_remote_storage.py
@@ -21,7 +21,7 @@ from fixtures.neon_fixtures import (
     NeonEnvBuilder,
     Postgres,
     RemoteStorageKind,
-    assert_no_in_progress_downloads_for_tenant,
+    assert_tenant_status,
     available_remote_storages,
     wait_for_last_record_lsn,
     wait_for_sk_commit_lsn_to_reach_remote_storage,
@@ -179,14 +179,6 @@ def test_tenants_attached_after_download(
         tenant_id, timeline_id, env.safekeepers, env.pageserver
     )
 
-    detail_before = client.timeline_detail(
-        tenant_id, timeline_id, include_non_incremental_physical_size=True
-    )
-    assert (
-        detail_before["current_physical_size_non_incremental"]
-        == detail_before["current_physical_size"]
-    )
-
     env.pageserver.stop()
 
     timeline_dir = Path(env.repo_dir) / "tenants" / str(tenant_id) / "timelines" / str(timeline_id)
@@ -200,13 +192,16 @@ def test_tenants_attached_after_download(
     assert local_layer_deleted, f"Found no local layer files to delete in directory {timeline_dir}"
 
     ##### Start the pageserver, forcing it to download the layer file and load the timeline into memory
+    # FIXME: just starting the pageserver no longer downloads the
+    # layer files. Do we want to force download, or maybe run some
+    # queries, or is it enough that it starts up without layer files?
     env.pageserver.start()
     client = env.pageserver.http_client()
 
     wait_until(
         number_of_iterations=5,
         interval=1,
-        func=lambda: assert_no_in_progress_downloads_for_tenant(client, tenant_id),
+        func=lambda: assert_tenant_status(client, tenant_id, "Active"),
     )
 
     restored_timelines = client.timeline_list(tenant_id)
@@ -218,12 +213,6 @@ def test_tenants_attached_after_download(
         timeline_id
     ), f"Tenant {tenant_id} should have its old timeline {timeline_id} restored from the remote storage"
 
-    # Check that the physical size matches after re-downloading
-    detail_after = client.timeline_detail(
-        tenant_id, timeline_id, include_non_incremental_physical_size=True
-    )
-    assert detail_before["current_physical_size"] == detail_after["current_physical_size"]
-
     # Check that we had to retry the downloads
     assert env.pageserver.log_contains(".*download .* succeeded after 1 retries.*")
 
@@ -297,7 +286,7 @@ def test_tenant_upgrades_index_json_from_v0(
     wait_until(
         number_of_iterations=5,
         interval=1,
-        func=lambda: assert_no_in_progress_downloads_for_tenant(pageserver_http, tenant_id),
+        func=lambda: assert_tenant_status(pageserver_http, tenant_id, "Active"),
     )
 
     pg = env.postgres.create_start("main")
@@ -404,7 +393,7 @@ def test_tenant_ignores_backup_file(
     wait_until(
         number_of_iterations=5,
         interval=1,
-        func=lambda: assert_no_in_progress_downloads_for_tenant(pageserver_http, tenant_id),
+        func=lambda: assert_tenant_status(pageserver_http, tenant_id, "Active"),
     )
 
     pg = env.postgres.create_start("main")
@@ -484,14 +473,15 @@ def test_tenant_redownloads_truncated_file_on_startup(
     index_part = local_fs_index_part(env, tenant_id, timeline_id)
     assert index_part["layer_metadata"][path.name]["file_size"] == expected_size
 
-    ##### Start the pageserver, forcing it to download the layer file and load the timeline into memory
+    ## Start the pageserver. It will notice that the file size doesn't match, and
+    ## rename away the local file. It will be re-downloaded when it's needed.
     env.pageserver.start()
     client = env.pageserver.http_client()
 
     wait_until(
         number_of_iterations=5,
         interval=1,
-        func=lambda: assert_no_in_progress_downloads_for_tenant(client, tenant_id),
+        func=lambda: assert_tenant_status(client, tenant_id, "Active"),
     )
 
     restored_timelines = client.timeline_list(tenant_id)
@@ -503,6 +493,10 @@ def test_tenant_redownloads_truncated_file_on_startup(
         timeline_id
     ), f"Tenant {tenant_id} should have its old timeline {timeline_id} restored from the remote storage"
 
+    # Request non-incremental logical size. Calculating it needs the layer file that
+    # we corrupted, forcing it to be redownloaded.
+    client.timeline_detail(tenant_id, timeline_id, include_non_incremental_logical_size=True)
+
     assert os.stat(path).st_size == expected_size, "truncated layer should had been re-downloaded"
 
     # the remote side of local_layer_truncated
diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py
index 523c946a68..3b41cc5c90 100644
--- a/test_runner/regress/test_timeline_size.py
+++ b/test_runner/regress/test_timeline_size.py
@@ -20,10 +20,12 @@ from fixtures.neon_fixtures import (
     PortDistributor,
     Postgres,
     VanillaPostgres,
+    assert_tenant_status,
     wait_for_last_flush_lsn,
+    wait_until,
 )
 from fixtures.types import TenantId, TimelineId
-from fixtures.utils import get_timeline_dir_size, wait_until
+from fixtures.utils import get_timeline_dir_size
 
 
 def test_timeline_size(neon_simple_env: NeonEnv):
@@ -320,7 +322,17 @@ def test_timeline_physical_size_init(neon_simple_env: NeonEnv):
     env.pageserver.stop()
     env.pageserver.start()
 
-    assert_physical_size(env, env.initial_tenant, new_timeline_id)
+    # Wait for the tenant to be loaded
+    client = env.pageserver.http_client()
+    wait_until(
+        number_of_iterations=5,
+        interval=1,
+        func=lambda: assert_tenant_status(client, env.initial_tenant, "Active"),
+    )
+
+    assert_physical_size_invariants(
+        get_physical_size_values(env, env.initial_tenant, new_timeline_id)
+    )
 
 
 def test_timeline_physical_size_post_checkpoint(neon_simple_env: NeonEnv):
@@ -341,7 +353,9 @@ def test_timeline_physical_size_post_checkpoint(neon_simple_env: NeonEnv):
     wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id)
     pageserver_http.timeline_checkpoint(env.initial_tenant, new_timeline_id)
 
-    assert_physical_size(env, env.initial_tenant, new_timeline_id)
+    assert_physical_size_invariants(
+        get_physical_size_values(env, env.initial_tenant, new_timeline_id)
+    )
 
 
 def test_timeline_physical_size_post_compaction(neon_env_builder: NeonEnvBuilder):
@@ -376,7 +390,9 @@ def test_timeline_physical_size_post_compaction(neon_env_builder: NeonEnvBuilder
     pageserver_http.timeline_checkpoint(env.initial_tenant, new_timeline_id)
     pageserver_http.timeline_compact(env.initial_tenant, new_timeline_id)
 
-    assert_physical_size(env, env.initial_tenant, new_timeline_id)
+    assert_physical_size_invariants(
+        get_physical_size_values(env, env.initial_tenant, new_timeline_id)
+    )
 
 
 def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder):
@@ -415,7 +431,9 @@ def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder):
     pageserver_http.timeline_checkpoint(env.initial_tenant, new_timeline_id)
     pageserver_http.timeline_gc(env.initial_tenant, new_timeline_id, gc_horizon=None)
 
-    assert_physical_size(env, env.initial_tenant, new_timeline_id)
+    assert_physical_size_invariants(
+        get_physical_size_values(env, env.initial_tenant, new_timeline_id)
+    )
 
 
 # The timeline logical and physical sizes are also exposed as prometheus metrics.
@@ -448,7 +466,7 @@ def test_timeline_size_metrics(
     # get the metrics and parse the metric for the current timeline's physical size
     metrics = env.pageserver.http_client().get_metrics()
     matches = re.search(
-        f'^pageserver_current_physical_size{{tenant_id="{env.initial_tenant}",timeline_id="{new_timeline_id}"}} (\\S+)$',
+        f'^pageserver_resident_physical_size{{tenant_id="{env.initial_tenant}",timeline_id="{new_timeline_id}"}} (\\S+)$',
         metrics,
         re.MULTILINE,
     )
@@ -507,11 +525,12 @@ def test_tenant_physical_size(neon_simple_env: NeonEnv):
 
     tenant, timeline = env.neon_cli.create_tenant()
 
-    def get_timeline_physical_size(timeline: TimelineId):
-        res = client.timeline_detail(tenant, timeline, include_non_incremental_physical_size=True)
-        return res["current_physical_size_non_incremental"]
+    def get_timeline_resident_physical_size(timeline: TimelineId):
+        sizes = get_physical_size_values(env, tenant, timeline)
+        assert_physical_size_invariants(sizes)
+        return sizes.prometheus_resident_physical
 
-    timeline_total_size = get_timeline_physical_size(timeline)
+    timeline_total_resident_physical_size = get_timeline_resident_physical_size(timeline)
     for i in range(10):
         n_rows = random.randint(100, 1000)
 
@@ -528,22 +547,54 @@ def test_tenant_physical_size(neon_simple_env: NeonEnv):
         wait_for_last_flush_lsn(env, pg, tenant, timeline)
         pageserver_http.timeline_checkpoint(tenant, timeline)
 
-        timeline_total_size += get_timeline_physical_size(timeline)
+        timeline_total_resident_physical_size += get_timeline_resident_physical_size(timeline)
 
         pg.stop()
 
-    tenant_physical_size = int(client.tenant_status(tenant_id=tenant)["current_physical_size"])
-    assert tenant_physical_size == timeline_total_size
+    # ensure that tenant_status current_physical size reports sum of timeline current_physical_size
+    tenant_current_physical_size = int(
+        client.tenant_status(tenant_id=tenant)["current_physical_size"]
+    )
+    assert tenant_current_physical_size == sum(
+        [tl["current_physical_size"] for tl in client.timeline_list(tenant_id=tenant)]
+    )
+    # since we don't do layer eviction, current_physical_size is identical to resident physical size
+    assert timeline_total_resident_physical_size == tenant_current_physical_size
 
 
-def assert_physical_size(env: NeonEnv, tenant_id: TenantId, timeline_id: TimelineId):
-    """Check the current physical size returned from timeline API
-    matches the total physical size of the timeline on disk"""
+class TimelinePhysicalSizeValues:
+    api_current_physical: int
+    prometheus_resident_physical: int
+    python_timelinedir_layerfiles_physical: int
+
+
+def get_physical_size_values(
+    env: NeonEnv, tenant_id: TenantId, timeline_id: TimelineId
+) -> TimelinePhysicalSizeValues:
+    res = TimelinePhysicalSizeValues()
+
     client = env.pageserver.http_client()
-    res = client.timeline_detail(tenant_id, timeline_id, include_non_incremental_physical_size=True)
+
+    res.prometheus_resident_physical = client.get_timeline_metric(
+        tenant_id, timeline_id, "pageserver_resident_physical_size"
+    )
+
+    detail = client.timeline_detail(
+        tenant_id, timeline_id, include_timeline_dir_layer_file_size_sum=True
+    )
+    res.api_current_physical = detail["current_physical_size"]
+
     timeline_path = env.timeline_dir(tenant_id, timeline_id)
-    assert res["current_physical_size"] == res["current_physical_size_non_incremental"]
-    assert res["current_physical_size"] == get_timeline_dir_size(timeline_path)
+    res.python_timelinedir_layerfiles_physical = get_timeline_dir_size(timeline_path)
+
+    return res
+
+
+def assert_physical_size_invariants(sizes: TimelinePhysicalSizeValues):
+    # resident physical size is defined as the size of the layer files in the timeline directory
+    assert sizes.python_timelinedir_layerfiles_physical == sizes.prometheus_resident_physical
+    # we don't do layer eviction, so, all layers are resident
+    assert sizes.api_current_physical == sizes.prometheus_resident_physical
 
 
 # Timeline logical size initialization is an asynchronous background task that runs once,
diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py
index d88ed319b5..77ec33f8b0 100644
--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -585,17 +585,23 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Re
         if elapsed > wait_lsn_timeout:
             raise RuntimeError("Timed out waiting for WAL redo")
 
-        pageserver_lsn = Lsn(
-            env.pageserver.http_client().timeline_detail(tenant_id, timeline_id)["last_record_lsn"]
-        )
-        lag = last_lsn - pageserver_lsn
+        tenant_status = ps_cli.tenant_status(tenant_id)
+        if tenant_status["state"] == "Loading":
+            log.debug(f"Tenant {tenant_id} is still loading, retrying")
+        else:
+            pageserver_lsn = Lsn(
+                env.pageserver.http_client().timeline_detail(tenant_id, timeline_id)[
+                    "last_record_lsn"
+                ]
+            )
+            lag = last_lsn - pageserver_lsn
 
-        if time.time() > last_debug_print + 10 or lag <= 0:
-            last_debug_print = time.time()
-            log.info(f"Pageserver last_record_lsn={pageserver_lsn}; lag is {lag / 1024}kb")
+            if time.time() > last_debug_print + 10 or lag <= 0:
+                last_debug_print = time.time()
+                log.info(f"Pageserver last_record_lsn={pageserver_lsn}; lag is {lag / 1024}kb")
 
-        if lag <= 0:
-            break
+                if lag <= 0:
+                    break
 
         time.sleep(1)
 

From f5f1197e15cc68bbf47ef91653b50b60d99ec7eb Mon Sep 17 00:00:00 2001
From: Sergey Melnikov <sergey@neon.tech>
Date: Thu, 22 Dec 2022 11:25:56 +0100
Subject: [PATCH 056/132] Build vm-compute-node images (#3174)

---
 .github/workflows/build_and_test.yml | 64 ++++++++++++++++++++--------
 Dockerfile.compute-node-v14          |  3 --
 Dockerfile.compute-node-v15          |  3 --
 3 files changed, 46 insertions(+), 24 deletions(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 43b855a2b0..6443a56afc 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -555,10 +555,14 @@ jobs:
       - name: Kaniko build compute tools
         run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-tools --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}}
 
-  compute-node-image-v14:
+  compute-node-image:
     runs-on: [ self-hosted, dev, x64 ]
     container: gcr.io/kaniko-project/executor:v1.9.0-debug
     needs: [ tag ]
+    strategy:
+      fail-fast: false
+      matrix:
+        version: [ v14, v15 ]
     defaults:
       run:
         shell: sh -eu {0}
@@ -573,32 +577,40 @@ jobs:
       - name: Configure ECR login
         run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
 
-      - name: Kaniko build compute node with extensions v14
-        run: /kaniko/executor --skip-unused-stages  --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache  --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v14 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}}
+      - name: Kaniko build compute node with extensions
+        run: /kaniko/executor --skip-unused-stages  --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache  --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-${{ matrix.version }} --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
 
-  compute-node-image-v15:
+  vm-compute-node-image:
     runs-on: [ self-hosted, dev, x64 ]
-    container: gcr.io/kaniko-project/executor:v1.9.0-debug
-    needs: [ tag ]
+    needs: [ tag, compute-node-image ]
+    strategy:
+      fail-fast: false
+      matrix:
+        version: [ v14, v15 ]
     defaults:
       run:
         shell: sh -eu {0}
 
     steps:
-      - name: Checkout
-        uses: actions/checkout@v1 # v3 won't work with kaniko
-        with:
-          submodules: true
-          fetch-depth: 0
+      - name: Downloading latest vm-builder
+        run: |
+          curl -L https://github.com/neondatabase/neonvm/releases/latest/download/vm-builder -o vm-builder
+          chmod +x vm-builder
 
-      - name: Configure ECR login
-        run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
+      - name: Pulling compute-node image
+        run: |
+          docker pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
 
-      - name: Kaniko build compute node with extensions v15
-        run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v15 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}}
+      - name: Build vm image
+        run: |
+          ./vm-builder -src=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
+
+      - name: Pushing vm-compute-node image
+        run: |
+          docker push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
 
   test-images:
-    needs: [ tag, neon-image, compute-node-image-v14, compute-node-image-v15, compute-tools-image ]
+    needs: [ tag, neon-image, compute-node-image, compute-tools-image ]
     runs-on: [ self-hosted, dev, x64 ]
 
     steps:
@@ -642,13 +654,13 @@ jobs:
 
   promote-images:
     runs-on: [ self-hosted, dev, x64 ]
-    needs: [ tag, test-images ]
+    needs: [ tag, test-images, vm-compute-node-image ]
     if: github.event_name != 'workflow_dispatch'
     container: amazon/aws-cli
     strategy:
       fail-fast: false
       matrix:
-        name: [ neon, compute-node-v14, compute-node-v15, compute-tools ]
+        name: [ neon, compute-node-v14, vm-compute-node-v14, compute-node-v15, vm-compute-node-v15, compute-tools]
 
     steps:
       - name: Promote image to latest
@@ -681,9 +693,15 @@ jobs:
       - name: Pull compute node v14 image from ECR
         run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} compute-node-v14
 
+      - name: Pull vm compute node v14 image from ECR
+        run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} vm-compute-node-v14
+
       - name: Pull compute node v15 image from ECR
         run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} compute-node-v15
 
+      - name: Pull vm compute node v15 image from ECR
+        run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} vm-compute-node-v15
+
       - name: Pull rust image from ECR
         run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned rust
 
@@ -695,7 +713,9 @@ jobs:
           crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/neon:latest
           crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest
           crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:latest
+          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:latest
           crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:latest
+          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:latest
 
       - name: Configure Docker Hub login
         run: |
@@ -712,9 +732,15 @@ jobs:
       - name: Push compute node v14 image to Docker Hub
         run: crane push compute-node-v14 neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}}
 
+      - name: Push vm compute node v14 image to Docker Hub
+        run: crane push vm-compute-node-v14 neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}}
+
       - name: Push compute node v15 image to Docker Hub
         run: crane push compute-node-v15 neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}}
 
+      - name: Push vm compute node v15 image to Docker Hub
+        run: crane push vm-compute-node-v15 neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}}
+
       - name: Push rust image to Docker Hub
         run: crane push rust neondatabase/rust:pinned
 
@@ -726,7 +752,9 @@ jobs:
           crane tag neondatabase/neon:${{needs.tag.outputs.build-tag}} latest
           crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest
           crane tag neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}} latest
+          crane tag neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest
           crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
+          crane tag neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest
 
   calculate-deploy-targets:
     runs-on: [ self-hosted, dev, x64 ]
diff --git a/Dockerfile.compute-node-v14 b/Dockerfile.compute-node-v14
index ad036338a0..1ffabafd51 100644
--- a/Dockerfile.compute-node-v14
+++ b/Dockerfile.compute-node-v14
@@ -170,9 +170,6 @@ RUN cd /usr/local/pgsql/bin && rm ecpg raster2pgsql shp2pgsql pgtopo_export pgto
 # Remove headers that we won't need anymore - we've completed installation of all extensions
 RUN rm -r /usr/local/pgsql/include
 
-# Remove now-useless PGXS src infrastructure
-RUN rm -r /usr/local/pgsql/lib/pgxs/src
-
 # Remove static postgresql libraries - all compilation is finished, so we
 # can now remove these files - they must be included in other binaries by now
 # if they were to be used by other libraries.
diff --git a/Dockerfile.compute-node-v15 b/Dockerfile.compute-node-v15
index 4526644421..11cefcc2da 100644
--- a/Dockerfile.compute-node-v15
+++ b/Dockerfile.compute-node-v15
@@ -170,9 +170,6 @@ RUN cd /usr/local/pgsql/bin && rm ecpg raster2pgsql shp2pgsql pgtopo_export pgto
 # Remove headers that we won't need anymore - we've completed installation of all extensions
 RUN rm -r /usr/local/pgsql/include
 
-# Remove now-useless PGXS src infrastructure
-RUN rm -r /usr/local/pgsql/lib/pgxs/src
-
 # Remove static postgresql libraries - all compilation is finished, so we
 # can now remove these files - they must be included in other binaries by now
 # if they were to be used by other libraries.

From fca25edae8aea52c764093d0f7677c11f66a7609 Mon Sep 17 00:00:00 2001
From: Kirill Bulatov <kirill@neon.tech>
Date: Thu, 22 Dec 2022 14:27:48 +0200
Subject: [PATCH 057/132] Fix 1.66 Clippy warnings (#3178)

The 1.66 release speeds up compile times by over 10% according to tests.

Also its Clippy finds plenty of old nits in our code:
* useless conversion, `foo as u8` where `foo: u8` and similar, removed
`as u8` and similar
* useless references and dereferences (that were automatically adjusted
by the compiler), removed various `&` and `*`
* bool -> u8 conversion via `if/else`, changed to `u8::from`
* Map `.iter()` calls where only values were used, changed to
`.values()` instead

Standing out lints:
* `Eq` is missing in our protoc generated structs. Silenced, does not
seem crucial for us.
* `fn default` looks like the one from `Default` trait, so I've
implemented that instead and replaced the `dummy_*` method in tests with
`::default()` invocation
* Clippy detected that
```
if retry_attempt < u32::MAX {
    retry_attempt += 1;
}
```
is a saturating add and proposed to replace it.
---
 compute_tools/src/compute.rs                  |  4 +--
 control_plane/src/bin/neon_local.rs           |  4 +--
 control_plane/src/broker.rs                   |  2 +-
 control_plane/src/compute.rs                  | 14 ++++----
 control_plane/src/local_env.rs                | 10 +++---
 control_plane/src/pageserver.rs               |  2 +-
 libs/pageserver_api/src/models.rs             |  8 ++---
 libs/postgres_ffi/src/nonrelfile_utils.rs     | 10 +++---
 libs/postgres_ffi/src/xlog_utils.rs           |  4 +--
 libs/postgres_ffi/wal_craft/src/lib.rs        |  8 ++---
 libs/pq_proto/src/lib.rs                      |  2 +-
 libs/utils/src/crashsafe.rs                   | 12 +++----
 libs/utils/src/sock_split.rs                  |  2 +-
 pageserver/benches/bench_walredo.rs           |  2 +-
 pageserver/src/basebackup.rs                  |  2 +-
 pageserver/src/bin/pageserver.rs              |  4 +--
 pageserver/src/bin/pageserver_binutils.rs     |  6 ++--
 pageserver/src/config.rs                      |  2 +-
 pageserver/src/import_datadir.rs              |  2 +-
 pageserver/src/page_service.rs                |  6 ++--
 pageserver/src/pgdatadir_mapping.rs           | 10 +++---
 pageserver/src/storage_sync2.rs               |  4 +--
 pageserver/src/tenant.rs                      | 24 +++++++-------
 pageserver/src/tenant/disk_btree.rs           | 16 +++++-----
 pageserver/src/tenant/ephemeral_file.rs       |  2 +-
 pageserver/src/tenant/image_layer.rs          |  2 +-
 pageserver/src/tenant/metadata.rs             |  3 +-
 pageserver/src/tenant/timeline.rs             | 10 +++---
 pageserver/src/tenant_config.rs               | 32 ++-----------------
 pageserver/src/walingest.rs                   | 22 ++++++-------
 .../src/walreceiver/connection_manager.rs     |  2 +-
 pageserver/src/walredo.rs                     |  6 ++--
 proxy/src/scram/secret.rs                     |  4 +--
 safekeeper/src/control_file.rs                |  4 +--
 safekeeper/src/metrics.rs                     |  2 +-
 safekeeper/src/wal_backup.rs                  |  6 ++--
 safekeeper/src/wal_storage.rs                 | 14 ++++----
 storage_broker/benches/rps.rs                 |  2 +-
 storage_broker/src/lib.rs                     |  4 +++
 39 files changed, 123 insertions(+), 152 deletions(-)

diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs
index bfdd2340ec..eceff0fc4e 100644
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -175,7 +175,7 @@ impl ComputeNode {
         let start_time = Utc::now();
 
         let sync_handle = Command::new(&self.pgbin)
-            .args(&["--sync-safekeepers"])
+            .args(["--sync-safekeepers"])
             .env("PGDATA", &self.pgdata) // we cannot use -D in this mode
             .stdout(Stdio::piped())
             .spawn()
@@ -253,7 +253,7 @@ impl ComputeNode {
 
         // Run postgres as a child process.
         let mut pg = Command::new(&self.pgbin)
-            .args(&["-D", &self.pgdata])
+            .args(["-D", &self.pgdata])
             .spawn()
             .expect("cannot start postgres process");
 
diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs
index 53fd3100c7..71de741640 100644
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -549,7 +549,7 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
 
             table.load_preset(comfy_table::presets::NOTHING);
 
-            table.set_header(&[
+            table.set_header([
                 "NODE",
                 "ADDRESS",
                 "TIMELINE",
@@ -584,7 +584,7 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
                     .map(|name| name.as_str())
                     .unwrap_or("?");
 
-                table.add_row(&[
+                table.add_row([
                     node_name.as_str(),
                     &node.address.to_string(),
                     &node.timeline_id.to_string(),
diff --git a/control_plane/src/broker.rs b/control_plane/src/broker.rs
index bd60580012..6c0604a076 100644
--- a/control_plane/src/broker.rs
+++ b/control_plane/src/broker.rs
@@ -17,7 +17,7 @@ pub fn start_broker_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
         "storage_broker",
         &env.base_data_dir,
         &env.storage_broker_bin(),
-        &args,
+        args,
         [],
         background_process::InitialPidFile::Create(&storage_broker_pid_file_path(env)),
         || {
diff --git a/control_plane/src/compute.rs b/control_plane/src/compute.rs
index 0eec25c51e..547aa14d39 100644
--- a/control_plane/src/compute.rs
+++ b/control_plane/src/compute.rs
@@ -44,7 +44,7 @@ impl ComputeControlPlane {
         let mut nodes = BTreeMap::default();
         let pgdatadirspath = &env.pg_data_dirs_path();
 
-        for tenant_dir in fs::read_dir(&pgdatadirspath)
+        for tenant_dir in fs::read_dir(pgdatadirspath)
             .with_context(|| format!("failed to list {}", pgdatadirspath.display()))?
         {
             let tenant_dir = tenant_dir?;
@@ -67,8 +67,8 @@ impl ComputeControlPlane {
     fn get_port(&mut self) -> u16 {
         1 + self
             .nodes
-            .iter()
-            .map(|(_name, node)| node.address.port())
+            .values()
+            .map(|node| node.address.port())
             .max()
             .unwrap_or(self.base_port)
     }
@@ -183,7 +183,7 @@ impl PostgresNode {
 
     fn sync_safekeepers(&self, auth_token: &Option<String>, pg_version: u32) -> Result<Lsn> {
         let pg_path = self.env.pg_bin_dir(pg_version)?.join("postgres");
-        let mut cmd = Command::new(&pg_path);
+        let mut cmd = Command::new(pg_path);
 
         cmd.arg("--sync-safekeepers")
             .env_clear()
@@ -261,7 +261,7 @@ impl PostgresNode {
     }
 
     fn create_pgdata(&self) -> Result<()> {
-        fs::create_dir_all(&self.pgdata()).with_context(|| {
+        fs::create_dir_all(self.pgdata()).with_context(|| {
             format!(
                 "could not create data directory {}",
                 self.pgdata().display()
@@ -478,7 +478,7 @@ impl PostgresNode {
                 postgresql_conf_path.to_str().unwrap()
             )
         })?;
-        fs::remove_dir_all(&self.pgdata())?;
+        fs::remove_dir_all(self.pgdata())?;
         self.create_pgdata()?;
 
         // 2. Bring back config files
@@ -514,7 +514,7 @@ impl PostgresNode {
                 "Destroying postgres data directory '{}'",
                 self.pgdata().to_str().unwrap()
             );
-            fs::remove_dir_all(&self.pgdata())?;
+            fs::remove_dir_all(self.pgdata())?;
         } else {
             self.pg_ctl(&["stop"], &None)?;
         }
diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs
index ed9e467eee..ea936640ec 100644
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -404,7 +404,7 @@ impl LocalEnv {
             }
         }
 
-        fs::create_dir(&base_path)?;
+        fs::create_dir(base_path)?;
 
         // generate keys for jwt
         // openssl genrsa -out private_key.pem 2048
@@ -413,7 +413,7 @@ impl LocalEnv {
             private_key_path = base_path.join("auth_private_key.pem");
             let keygen_output = Command::new("openssl")
                 .arg("genrsa")
-                .args(&["-out", private_key_path.to_str().unwrap()])
+                .args(["-out", private_key_path.to_str().unwrap()])
                 .arg("2048")
                 .stdout(Stdio::null())
                 .output()
@@ -430,10 +430,10 @@ impl LocalEnv {
             // openssl rsa -in private_key.pem -pubout -outform PEM -out public_key.pem
             let keygen_output = Command::new("openssl")
                 .arg("rsa")
-                .args(&["-in", private_key_path.to_str().unwrap()])
+                .args(["-in", private_key_path.to_str().unwrap()])
                 .arg("-pubout")
-                .args(&["-outform", "PEM"])
-                .args(&["-out", public_key_path.to_str().unwrap()])
+                .args(["-outform", "PEM"])
+                .args(["-out", public_key_path.to_str().unwrap()])
                 .stdout(Stdio::null())
                 .output()
                 .context("failed to generate auth private key")?;
diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs
index 3575e75db9..0c2415965a 100644
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -241,7 +241,7 @@ impl PageServerNode {
         let mut args = self.pageserver_basic_args(config_overrides, datadir_path_str);
         args.push(Cow::Borrowed("--init"));
 
-        let init_output = Command::new(&self.env.pageserver_bin())
+        let init_output = Command::new(self.env.pageserver_bin())
             .args(args.iter().map(Cow::as_ref))
             .envs(self.pageserver_env_variables()?)
             .output()
diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index 88603d9539..d954e5d21f 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -323,7 +323,7 @@ impl PagestreamFeMessage {
         match self {
             Self::Exists(req) => {
                 bytes.put_u8(0);
-                bytes.put_u8(if req.latest { 1 } else { 0 });
+                bytes.put_u8(u8::from(req.latest));
                 bytes.put_u64(req.lsn.0);
                 bytes.put_u32(req.rel.spcnode);
                 bytes.put_u32(req.rel.dbnode);
@@ -333,7 +333,7 @@ impl PagestreamFeMessage {
 
             Self::Nblocks(req) => {
                 bytes.put_u8(1);
-                bytes.put_u8(if req.latest { 1 } else { 0 });
+                bytes.put_u8(u8::from(req.latest));
                 bytes.put_u64(req.lsn.0);
                 bytes.put_u32(req.rel.spcnode);
                 bytes.put_u32(req.rel.dbnode);
@@ -343,7 +343,7 @@ impl PagestreamFeMessage {
 
             Self::GetPage(req) => {
                 bytes.put_u8(2);
-                bytes.put_u8(if req.latest { 1 } else { 0 });
+                bytes.put_u8(u8::from(req.latest));
                 bytes.put_u64(req.lsn.0);
                 bytes.put_u32(req.rel.spcnode);
                 bytes.put_u32(req.rel.dbnode);
@@ -354,7 +354,7 @@ impl PagestreamFeMessage {
 
             Self::DbSize(req) => {
                 bytes.put_u8(3);
-                bytes.put_u8(if req.latest { 1 } else { 0 });
+                bytes.put_u8(u8::from(req.latest));
                 bytes.put_u64(req.lsn.0);
                 bytes.put_u32(req.dbnode);
             }
diff --git a/libs/postgres_ffi/src/nonrelfile_utils.rs b/libs/postgres_ffi/src/nonrelfile_utils.rs
index 01e5554b8a..5acf90be70 100644
--- a/libs/postgres_ffi/src/nonrelfile_utils.rs
+++ b/libs/postgres_ffi/src/nonrelfile_utils.rs
@@ -14,8 +14,8 @@ pub fn transaction_id_set_status(xid: u32, status: u8, page: &mut BytesMut) {
         status
     );
 
-    let byteno: usize = ((xid as u32 % pg_constants::CLOG_XACTS_PER_PAGE as u32)
-        / pg_constants::CLOG_XACTS_PER_BYTE) as usize;
+    let byteno: usize =
+        ((xid % pg_constants::CLOG_XACTS_PER_PAGE) / pg_constants::CLOG_XACTS_PER_BYTE) as usize;
 
     let bshift: u8 =
         ((xid % pg_constants::CLOG_XACTS_PER_BYTE) * pg_constants::CLOG_BITS_PER_XACT as u32) as u8;
@@ -25,13 +25,13 @@ pub fn transaction_id_set_status(xid: u32, status: u8, page: &mut BytesMut) {
 }
 
 pub fn transaction_id_get_status(xid: u32, page: &[u8]) -> u8 {
-    let byteno: usize = ((xid as u32 % pg_constants::CLOG_XACTS_PER_PAGE as u32)
-        / pg_constants::CLOG_XACTS_PER_BYTE) as usize;
+    let byteno: usize =
+        ((xid % pg_constants::CLOG_XACTS_PER_PAGE) / pg_constants::CLOG_XACTS_PER_BYTE) as usize;
 
     let bshift: u8 =
         ((xid % pg_constants::CLOG_XACTS_PER_BYTE) * pg_constants::CLOG_BITS_PER_XACT as u32) as u8;
 
-    ((page[byteno] >> bshift) & pg_constants::CLOG_XACT_BITMASK) as u8
+    (page[byteno] >> bshift) & pg_constants::CLOG_XACT_BITMASK
 }
 
 // See CLOGPagePrecedes in clog.c
diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs
index 953723a8f0..272c4d6dcc 100644
--- a/libs/postgres_ffi/src/xlog_utils.rs
+++ b/libs/postgres_ffi/src/xlog_utils.rs
@@ -333,7 +333,7 @@ impl CheckPoint {
 // We need this segment to start compute node.
 //
 pub fn generate_wal_segment(segno: u64, system_id: u64) -> Result<Bytes, SerializeError> {
-    let mut seg_buf = BytesMut::with_capacity(WAL_SEGMENT_SIZE as usize);
+    let mut seg_buf = BytesMut::with_capacity(WAL_SEGMENT_SIZE);
 
     let pageaddr = XLogSegNoOffsetToRecPtr(segno, 0, WAL_SEGMENT_SIZE);
     let hdr = XLogLongPageHeaderData {
@@ -574,7 +574,7 @@ mod tests {
 
         // Rename file to partial to actually find last valid lsn, then rename it back.
         fs::rename(
-            cfg.wal_dir().join(&last_segment),
+            cfg.wal_dir().join(last_segment),
             cfg.wal_dir().join(format!("{}.partial", last_segment)),
         )
         .unwrap();
diff --git a/libs/postgres_ffi/wal_craft/src/lib.rs b/libs/postgres_ffi/wal_craft/src/lib.rs
index feec3b2ace..969befc8e7 100644
--- a/libs/postgres_ffi/wal_craft/src/lib.rs
+++ b/libs/postgres_ffi/wal_craft/src/lib.rs
@@ -81,7 +81,7 @@ impl Conf {
             .new_pg_command("initdb")?
             .arg("-D")
             .arg(self.datadir.as_os_str())
-            .args(&["-U", "postgres", "--no-instructions", "--no-sync"])
+            .args(["-U", "postgres", "--no-instructions", "--no-sync"])
             .output()?;
         debug!("initdb output: {:?}", output);
         ensure!(
@@ -105,12 +105,12 @@ impl Conf {
         let unix_socket_dir_path = unix_socket_dir.path().to_owned();
         let server_process = self
             .new_pg_command("postgres")?
-            .args(&["-c", "listen_addresses="])
+            .args(["-c", "listen_addresses="])
             .arg("-k")
             .arg(unix_socket_dir_path.as_os_str())
             .arg("-D")
             .arg(self.datadir.as_os_str())
-            .args(&["-c", "logging_collector=on"]) // stderr will mess up with tests output
+            .args(["-c", "logging_collector=on"]) // stderr will mess up with tests output
             .args(REQUIRED_POSTGRES_CONFIG.iter().flat_map(|cfg| ["-c", cfg]))
             .stderr(Stdio::from(log_file))
             .spawn()?;
@@ -142,7 +142,7 @@ impl Conf {
         );
         let output = self
             .new_pg_command("pg_waldump")?
-            .args(&[
+            .args([
                 &first_segment_file.as_os_str(),
                 &last_segment_file.as_os_str(),
             ])
diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs
index 0d698127b9..278f044c15 100644
--- a/libs/pq_proto/src/lib.rs
+++ b/libs/pq_proto/src/lib.rs
@@ -881,7 +881,7 @@ impl<'a> BeMessage<'a> {
                     buf.put_u8(b'k');
                     buf.put_u64(req.sent_ptr);
                     buf.put_i64(req.timestamp);
-                    buf.put_u8(if req.request_reply { 1 } else { 0 });
+                    buf.put_u8(u8::from(req.request_reply));
                 });
             }
         }
diff --git a/libs/utils/src/crashsafe.rs b/libs/utils/src/crashsafe.rs
index 3726779cb2..2c7e6e20ab 100644
--- a/libs/utils/src/crashsafe.rs
+++ b/libs/utils/src/crashsafe.rs
@@ -157,34 +157,34 @@ mod tests {
         assert_eq!(err.kind(), io::ErrorKind::AlreadyExists);
 
         let invalid_dir_path = file_path.join("folder");
-        create_dir_all(&invalid_dir_path).unwrap_err();
+        create_dir_all(invalid_dir_path).unwrap_err();
     }
 
     #[test]
     fn test_path_with_suffix_extension() {
         let p = PathBuf::from("/foo/bar");
         assert_eq!(
-            &path_with_suffix_extension(&p, "temp").to_string_lossy(),
+            &path_with_suffix_extension(p, "temp").to_string_lossy(),
             "/foo/bar.temp"
         );
         let p = PathBuf::from("/foo/bar");
         assert_eq!(
-            &path_with_suffix_extension(&p, "temp.temp").to_string_lossy(),
+            &path_with_suffix_extension(p, "temp.temp").to_string_lossy(),
             "/foo/bar.temp.temp"
         );
         let p = PathBuf::from("/foo/bar.baz");
         assert_eq!(
-            &path_with_suffix_extension(&p, "temp.temp").to_string_lossy(),
+            &path_with_suffix_extension(p, "temp.temp").to_string_lossy(),
             "/foo/bar.baz.temp.temp"
         );
         let p = PathBuf::from("/foo/bar.baz");
         assert_eq!(
-            &path_with_suffix_extension(&p, ".temp").to_string_lossy(),
+            &path_with_suffix_extension(p, ".temp").to_string_lossy(),
             "/foo/bar.baz..temp"
         );
         let p = PathBuf::from("/foo/bar/dir/");
         assert_eq!(
-            &path_with_suffix_extension(&p, ".temp").to_string_lossy(),
+            &path_with_suffix_extension(p, ".temp").to_string_lossy(),
             "/foo/bar/dir..temp"
         );
     }
diff --git a/libs/utils/src/sock_split.rs b/libs/utils/src/sock_split.rs
index 5e4598daf1..b0e5a0bf6a 100644
--- a/libs/utils/src/sock_split.rs
+++ b/libs/utils/src/sock_split.rs
@@ -50,7 +50,7 @@ impl BufStream {
 
     /// Returns a reference to the underlying TcpStream.
     fn get_ref(&self) -> &TcpStream {
-        &*self.0.get_ref().0
+        &self.0.get_ref().0
     }
 }
 
diff --git a/pageserver/benches/bench_walredo.rs b/pageserver/benches/bench_walredo.rs
index 8f53fce027..61011c9f36 100644
--- a/pageserver/benches/bench_walredo.rs
+++ b/pageserver/benches/bench_walredo.rs
@@ -84,7 +84,7 @@ fn add_multithreaded_walredo_requesters(
 
                             barrier.wait();
 
-                            execute_all(input, &*manager).unwrap();
+                            execute_all(input, &manager).unwrap();
 
                             barrier.wait();
                         }
diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs
index aa87865a8a..36664e119e 100644
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -131,7 +131,7 @@ where
 
         // Create pgdata subdirs structure
         for dir in PGDATA_SUBDIRS.iter() {
-            let header = new_tar_header_dir(*dir)?;
+            let header = new_tar_header_dir(dir)?;
             self.ar.append(&header, &mut io::empty())?;
         }
 
diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs
index cc403ec2ea..e72a861be0 100644
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -126,7 +126,7 @@ fn initialize_config(
             );
         }
         // Supplement the CLI arguments with the config file
-        let cfg_file_contents = std::fs::read_to_string(&cfg_file_path).with_context(|| {
+        let cfg_file_contents = std::fs::read_to_string(cfg_file_path).with_context(|| {
             format!(
                 "Failed to read pageserver config at '{}'",
                 cfg_file_path.display()
@@ -180,7 +180,7 @@ fn initialize_config(
     if update_config {
         info!("Writing pageserver config to '{}'", cfg_file_path.display());
 
-        std::fs::write(&cfg_file_path, toml.to_string()).with_context(|| {
+        std::fs::write(cfg_file_path, toml.to_string()).with_context(|| {
             format!(
                 "Failed to write pageserver config to '{}'",
                 cfg_file_path.display()
diff --git a/pageserver/src/bin/pageserver_binutils.rs b/pageserver/src/bin/pageserver_binutils.rs
index b1484ac45a..9da173c873 100644
--- a/pageserver/src/bin/pageserver_binutils.rs
+++ b/pageserver/src/bin/pageserver_binutils.rs
@@ -60,7 +60,7 @@ fn main() -> anyhow::Result<()> {
 }
 
 fn read_pg_control_file(control_file_path: &Path) -> anyhow::Result<()> {
-    let control_file = ControlFileData::decode(&std::fs::read(&control_file_path)?)?;
+    let control_file = ControlFileData::decode(&std::fs::read(control_file_path)?)?;
     println!("{control_file:?}");
     let control_file_initdb = Lsn(control_file.checkPoint);
     println!(
@@ -79,7 +79,7 @@ fn print_layerfile(path: &Path) -> anyhow::Result<()> {
 }
 
 fn handle_metadata(path: &Path, arg_matches: &clap::ArgMatches) -> Result<(), anyhow::Error> {
-    let metadata_bytes = std::fs::read(&path)?;
+    let metadata_bytes = std::fs::read(path)?;
     let mut meta = TimelineMetadata::from_bytes(&metadata_bytes)?;
     println!("Current metadata:\n{meta:?}");
     let mut update_meta = false;
@@ -110,7 +110,7 @@ fn handle_metadata(path: &Path, arg_matches: &clap::ArgMatches) -> Result<(), an
 
     if update_meta {
         let metadata_bytes = meta.to_bytes()?;
-        std::fs::write(&path, &metadata_bytes)?;
+        std::fs::write(path, metadata_bytes)?;
     }
 
     Ok(())
diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index c6f417390f..9334f88a7e 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -722,7 +722,7 @@ impl PageServerConf {
             auth_validation_public_key_path: None,
             remote_storage_config: None,
             profiling: ProfilingConfig::Disabled,
-            default_tenant_conf: TenantConf::dummy_conf(),
+            default_tenant_conf: TenantConf::default(),
             broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(),
             broker_keepalive_interval: Duration::from_secs(5000),
             log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(),
diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs
index 1684ca3c64..76ca183c9a 100644
--- a/pageserver/src/import_datadir.rs
+++ b/pageserver/src/import_datadir.rs
@@ -267,7 +267,7 @@ fn import_wal(walpath: &Path, tline: &Timeline, startpoint: Lsn, endpoint: Lsn)
         }
 
         let nread = file.read_to_end(&mut buf)?;
-        if nread != WAL_SEGMENT_SIZE - offset as usize {
+        if nread != WAL_SEGMENT_SIZE - offset {
             // Maybe allow this for .partial files?
             error!("read only {} bytes from WAL file", nread);
         }
diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index fd4353a421..9b52fdaf68 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -444,9 +444,7 @@ impl PageServerHandler {
         pgb.flush().await?;
         let mut copyin_stream = Box::pin(copyin_stream(pgb));
         let reader = SyncIoBridge::new(StreamReader::new(&mut copyin_stream));
-        tokio::task::block_in_place(|| {
-            import_wal_from_tar(&*timeline, reader, start_lsn, end_lsn)
-        })?;
+        tokio::task::block_in_place(|| import_wal_from_tar(&timeline, reader, start_lsn, end_lsn))?;
         info!("wal import complete");
 
         // Drain the rest of the Copy data
@@ -658,7 +656,7 @@ impl PageServerHandler {
         tokio::task::block_in_place(|| {
             let basebackup =
                 basebackup::Basebackup::new(&mut writer, &timeline, lsn, prev_lsn, full_backup)?;
-            tracing::Span::current().record("lsn", &basebackup.lsn.to_string().as_str());
+            tracing::Span::current().record("lsn", basebackup.lsn.to_string().as_str());
             basebackup.send_tarball()
         })?;
         pgb.write_message(&BeMessage::CopyDone)?;
diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs
index 77910bceda..793dddef01 100644
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -710,14 +710,14 @@ impl<'a> DatadirModification<'a> {
         let mut dbdir = DbDirectory::des(&buf)?;
 
         let r = dbdir.dbdirs.insert((spcnode, dbnode), true);
-        if r == None || r == Some(false) {
+        if r.is_none() || r == Some(false) {
             // The dbdir entry didn't exist, or it contained a
             // 'false'. The 'insert' call already updated it with
             // 'true', now write the updated 'dbdirs' map back.
             let buf = DbDirectory::ser(&dbdir)?;
             self.put(DBDIR_KEY, Value::Image(buf.into()));
         }
-        if r == None {
+        if r.is_none() {
             // Create RelDirectory
             let buf = RelDirectory::ser(&RelDirectory {
                 rels: HashSet::new(),
@@ -1095,9 +1095,7 @@ impl<'a> DatadirModification<'a> {
                 // work directly with Images, and we never need to read actual
                 // data pages. We could handle this if we had to, by calling
                 // the walredo manager, but let's keep it simple for now.
-                return PageReconstructResult::from(anyhow::anyhow!(
-                    "unexpected pending WAL record"
-                ));
+                PageReconstructResult::from(anyhow::anyhow!("unexpected pending WAL record"))
             }
         } else {
             let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn);
@@ -1425,7 +1423,7 @@ fn twophase_key_range(xid: TransactionId) -> Range<Key> {
         field2: 0,
         field3: 0,
         field4: 0,
-        field5: if overflowed { 1 } else { 0 },
+        field5: u8::from(overflowed),
         field6: next_xid,
     }
 }
diff --git a/pageserver/src/storage_sync2.rs b/pageserver/src/storage_sync2.rs
index a2337e8fd6..6883c11473 100644
--- a/pageserver/src/storage_sync2.rs
+++ b/pageserver/src/storage_sync2.rs
@@ -519,9 +519,9 @@ impl RemoteTimelineClient {
         let size: u64 = if let Some(current_remote_index_part) = current_remote_index_part {
             current_remote_index_part
                 .layer_metadata
-                .iter()
+                .values()
                 // If we don't have the file size for the layer, don't account for it in the metric.
-                .map(|(_, ilmd)| ilmd.file_size.unwrap_or(0))
+                .map(|ilmd| ilmd.file_size.unwrap_or(0))
                 .sum()
         } else {
             0
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 1240a3b4fb..4129c205ad 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -337,7 +337,7 @@ impl TimelineUninitMark {
         let uninit_mark_parent = uninit_mark_file
             .parent()
             .with_context(|| format!("Uninit mark file {uninit_mark_file:?} has no parent"))?;
-        ignore_absent_files(|| fs::remove_file(&uninit_mark_file)).with_context(|| {
+        ignore_absent_files(|| fs::remove_file(uninit_mark_file)).with_context(|| {
             format!("Failed to remove uninit mark file at path {uninit_mark_file:?}")
         })?;
         crashsafe::fsync(uninit_mark_parent).context("Failed to fsync uninit mark parent")?;
@@ -2321,12 +2321,12 @@ impl Tenant {
         // See more for on the issue #2748 condenced out of the initial PR review.
         let mut shared_cache = self.cached_logical_sizes.lock().await;
 
-        size::gather_inputs(self, logical_sizes_at_once, &mut *shared_cache).await
+        size::gather_inputs(self, logical_sizes_at_once, &mut shared_cache).await
     }
 }
 
 fn remove_timeline_and_uninit_mark(timeline_dir: &Path, uninit_mark: &Path) -> anyhow::Result<()> {
-    fs::remove_dir_all(&timeline_dir)
+    fs::remove_dir_all(timeline_dir)
         .or_else(|e| {
             if e.kind() == std::io::ErrorKind::NotFound {
                 // we can leave the uninit mark without a timeline dir,
@@ -2342,7 +2342,7 @@ fn remove_timeline_and_uninit_mark(timeline_dir: &Path, uninit_mark: &Path) -> a
                 timeline_dir.display()
             )
         })?;
-    fs::remove_file(&uninit_mark).with_context(|| {
+    fs::remove_file(uninit_mark).with_context(|| {
         format!(
             "Failed to remove timeline uninit mark file {}",
             uninit_mark.display()
@@ -2442,7 +2442,7 @@ fn try_create_target_tenant_dir(
         anyhow::bail!("failpoint tenant-creation-before-tmp-rename");
     });
 
-    fs::rename(&temporary_tenant_dir, target_tenant_directory).with_context(|| {
+    fs::rename(temporary_tenant_dir, target_tenant_directory).with_context(|| {
         format!(
             "failed to move tenant {} temporary directory {} into the permanent one {}",
             tenant_id,
@@ -2496,9 +2496,9 @@ fn run_initdb(
     );
 
     let initdb_output = Command::new(&initdb_bin_path)
-        .args(&["-D", &initdb_target_dir.to_string_lossy()])
-        .args(&["-U", &conf.superuser])
-        .args(&["-E", "utf8"])
+        .args(["-D", &initdb_target_dir.to_string_lossy()])
+        .args(["-U", &conf.superuser])
+        .args(["-E", "utf8"])
         .arg("--no-instructions")
         // This is only used for a temporary installation that is deleted shortly after,
         // so no need to fsync it
@@ -2660,9 +2660,11 @@ pub mod harness {
 
             // Disable automatic GC and compaction to make the unit tests more deterministic.
             // The tests perform them manually if needed.
-            let mut tenant_conf = TenantConf::dummy_conf();
-            tenant_conf.gc_period = Duration::ZERO;
-            tenant_conf.compaction_period = Duration::ZERO;
+            let tenant_conf = TenantConf {
+                gc_period: Duration::ZERO,
+                compaction_period: Duration::ZERO,
+                ..TenantConf::default()
+            };
 
             let tenant_id = TenantId::generate();
             fs::create_dir_all(conf.tenant_path(&tenant_id))?;
diff --git a/pageserver/src/tenant/disk_btree.rs b/pageserver/src/tenant/disk_btree.rs
index 33255dbd82..88dff32b76 100644
--- a/pageserver/src/tenant/disk_btree.rs
+++ b/pageserver/src/tenant/disk_btree.rs
@@ -139,7 +139,7 @@ impl<'a, const L: usize> OnDiskNode<'a, L> {
         off += keys_len as u64;
 
         let values_off = off as usize;
-        let values_len = num_children as usize * VALUE_SZ as usize;
+        let values_len = num_children as usize * VALUE_SZ;
         //off += values_len as u64;
 
         let prefix = &buf[prefix_off..prefix_off + prefix_len as usize];
@@ -177,7 +177,7 @@ impl<'a, const L: usize> OnDiskNode<'a, L> {
         while low < high {
             let mid = low + size / 2;
 
-            let key_off = mid as usize * self.suffix_len as usize;
+            let key_off = mid * self.suffix_len as usize;
             let suffix = &self.keys[key_off..key_off + self.suffix_len as usize];
             // Does this match?
             keybuf[self.prefix_len as usize..].copy_from_slice(suffix);
@@ -328,7 +328,7 @@ where
             while idx < node.num_children as usize {
                 let suffix = &node.keys[key_off..key_off + suffix_len];
                 keybuf[prefix_len..].copy_from_slice(suffix);
-                let value = node.value(idx as usize);
+                let value = node.value(idx);
                 #[allow(clippy::collapsible_if)]
                 if node.level == 0 {
                     // leaf
@@ -368,7 +368,7 @@ where
                 key_off -= suffix_len;
                 let suffix = &node.keys[key_off..key_off + suffix_len];
                 keybuf[prefix_len..].copy_from_slice(suffix);
-                let value = node.value(idx as usize);
+                let value = node.value(idx);
                 #[allow(clippy::collapsible_if)]
                 if node.level == 0 {
                     // leaf
@@ -629,7 +629,7 @@ impl<const L: usize> BuildNode<L> {
         self.keys.extend(&key[self.prefix.len()..]);
         self.values.extend(value.0);
 
-        assert!(self.keys.len() == self.num_children as usize * self.suffix_len as usize);
+        assert!(self.keys.len() == self.num_children as usize * self.suffix_len);
         assert!(self.values.len() == self.num_children as usize * VALUE_SZ);
 
         self.size += self.suffix_len + VALUE_SZ;
@@ -674,7 +674,7 @@ impl<const L: usize> BuildNode<L> {
         self.size -= prefix_len * self.num_children as usize;
         self.size += prefix_len;
 
-        assert!(self.keys.len() == self.num_children as usize * self.suffix_len as usize);
+        assert!(self.keys.len() == self.num_children as usize * self.suffix_len);
         assert!(self.values.len() == self.num_children as usize * VALUE_SZ);
 
         true
@@ -684,7 +684,7 @@ impl<const L: usize> BuildNode<L> {
     /// Serialize the node to on-disk format.
     ///
     fn pack(&self) -> Bytes {
-        assert!(self.keys.len() == self.num_children as usize * self.suffix_len as usize);
+        assert!(self.keys.len() == self.num_children as usize * self.suffix_len);
         assert!(self.values.len() == self.num_children as usize * VALUE_SZ);
         assert!(self.num_children > 0);
 
@@ -940,7 +940,7 @@ mod tests {
             let t = -(f64::ln(u));
             let key_int = (t * 1000000.0) as u128;
 
-            all_data.insert(key_int as u128, idx as u64);
+            all_data.insert(key_int, idx as u64);
         }
 
         // Build a tree from it
diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs
index 0774fa42a6..c433e65ad2 100644
--- a/pageserver/src/tenant/ephemeral_file.rs
+++ b/pageserver/src/tenant/ephemeral_file.rs
@@ -91,7 +91,7 @@ impl EphemeralFile {
                 break;
             }
 
-            off += n as usize;
+            off += n;
         }
         Ok(())
     }
diff --git a/pageserver/src/tenant/image_layer.rs b/pageserver/src/tenant/image_layer.rs
index 1e129fc01d..4b43328f35 100644
--- a/pageserver/src/tenant/image_layer.rs
+++ b/pageserver/src/tenant/image_layer.rs
@@ -569,7 +569,7 @@ impl ImageLayerWriterInner {
                 lsn: self.lsn,
             },
         );
-        std::fs::rename(self.path, &final_path)?;
+        std::fs::rename(self.path, final_path)?;
 
         trace!("created image layer {}", layer.path().display());
 
diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs
index f3a0a5171a..297cccbe30 100644
--- a/pageserver/src/tenant/metadata.rs
+++ b/pageserver/src/tenant/metadata.rs
@@ -255,8 +255,7 @@ pub fn save_metadata(
     // fsync the parent directory to ensure the directory entry is durable
     if first_save {
         let timeline_dir = File::open(
-            &path
-                .parent()
+            path.parent()
                 .expect("Metadata should always have a parent dir"),
         )?;
         timeline_dir.sync_all()?;
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index f4288fea36..25a9e1ec51 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -1327,10 +1327,8 @@ impl Timeline {
                     index_part.timeline_layers.len()
                 );
                 remote_client.init_upload_queue(index_part)?;
-                let local_only_filenames = self
-                    .create_remote_layers(index_part, local_layers, disk_consistent_lsn)
-                    .await?;
-                local_only_filenames
+                self.create_remote_layers(index_part, local_layers, disk_consistent_lsn)
+                    .await?
             }
             None => {
                 info!("initializing upload queue as empty");
@@ -3425,9 +3423,9 @@ fn rename_to_backup(path: &Path) -> anyhow::Result<()> {
     let mut new_path = path.to_owned();
 
     for i in 0u32.. {
-        new_path.set_file_name(format!("{}.{}.old", filename, i));
+        new_path.set_file_name(format!("{filename}.{i}.old"));
         if !new_path.exists() {
-            std::fs::rename(&path, &new_path)?;
+            std::fs::rename(path, &new_path)?;
             return Ok(());
         }
     }
diff --git a/pageserver/src/tenant_config.rs b/pageserver/src/tenant_config.rs
index 1204d1abd8..8569c70217 100644
--- a/pageserver/src/tenant_config.rs
+++ b/pageserver/src/tenant_config.rs
@@ -191,11 +191,10 @@ impl TenantConfOpt {
     }
 }
 
-impl TenantConf {
-    pub fn default() -> TenantConf {
+impl Default for TenantConf {
+    fn default() -> Self {
         use defaults::*;
-
-        TenantConf {
+        Self {
             checkpoint_distance: DEFAULT_CHECKPOINT_DISTANCE,
             checkpoint_timeout: humantime::parse_duration(DEFAULT_CHECKPOINT_TIMEOUT)
                 .expect("cannot parse default checkpoint timeout"),
@@ -220,29 +219,4 @@ impl TenantConf {
             trace_read_requests: false,
         }
     }
-
-    pub fn dummy_conf() -> Self {
-        TenantConf {
-            checkpoint_distance: defaults::DEFAULT_CHECKPOINT_DISTANCE,
-            checkpoint_timeout: Duration::from_secs(600),
-            compaction_target_size: 4 * 1024 * 1024,
-            compaction_period: Duration::from_secs(10),
-            compaction_threshold: defaults::DEFAULT_COMPACTION_THRESHOLD,
-            gc_horizon: defaults::DEFAULT_GC_HORIZON,
-            gc_period: Duration::from_secs(10),
-            image_creation_threshold: defaults::DEFAULT_IMAGE_CREATION_THRESHOLD,
-            pitr_interval: Duration::from_secs(60 * 60),
-            walreceiver_connect_timeout: humantime::parse_duration(
-                defaults::DEFAULT_WALRECEIVER_CONNECT_TIMEOUT,
-            )
-            .unwrap(),
-            lagging_wal_timeout: humantime::parse_duration(
-                defaults::DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT,
-            )
-            .unwrap(),
-            max_lsn_wal_lag: NonZeroU64::new(defaults::DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG)
-                .unwrap(),
-            trace_read_requests: false,
-        }
-    }
 }
diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs
index e3453dfe06..26a77c02d4 100644
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -317,7 +317,7 @@ impl<'a> WalIngest<'a> {
             spcnode: blk.rnode_spcnode,
             dbnode: blk.rnode_dbnode,
             relnode: blk.rnode_relnode,
-            forknum: blk.forknum as u8,
+            forknum: blk.forknum,
         };
 
         //
@@ -1131,7 +1131,7 @@ mod tests {
     async fn test_relsize() -> Result<()> {
         let tenant = TenantHarness::create("test_relsize")?.load().await;
         let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?;
-        let mut walingest = init_walingest_test(&*tline)?;
+        let mut walingest = init_walingest_test(&tline)?;
 
         let mut m = tline.begin_modification(Lsn(0x20));
         walingest.put_rel_creation(&mut m, TESTREL_A)?;
@@ -1155,7 +1155,7 @@ mod tests {
             .no_ondemand_download()?;
         m.commit()?;
 
-        assert_current_logical_size(&*tline, Lsn(0x50));
+        assert_current_logical_size(&tline, Lsn(0x50));
 
         // The relation was created at LSN 2, not visible at LSN 1 yet.
         assert_eq!(
@@ -1239,7 +1239,7 @@ mod tests {
         let mut m = tline.begin_modification(Lsn(0x60));
         walingest.put_rel_truncation(&mut m, TESTREL_A, 2)?;
         m.commit()?;
-        assert_current_logical_size(&*tline, Lsn(0x60));
+        assert_current_logical_size(&tline, Lsn(0x60));
 
         // Check reported size and contents after truncation
         assert_eq!(
@@ -1347,7 +1347,7 @@ mod tests {
     async fn test_drop_extend() -> Result<()> {
         let tenant = TenantHarness::create("test_drop_extend")?.load().await;
         let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?;
-        let mut walingest = init_walingest_test(&*tline)?;
+        let mut walingest = init_walingest_test(&tline)?;
 
         let mut m = tline.begin_modification(Lsn(0x20));
         walingest
@@ -1416,7 +1416,7 @@ mod tests {
     async fn test_truncate_extend() -> Result<()> {
         let tenant = TenantHarness::create("test_truncate_extend")?.load().await;
         let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?;
-        let mut walingest = init_walingest_test(&*tline)?;
+        let mut walingest = init_walingest_test(&tline)?;
 
         // Create a 20 MB relation (the size is arbitrary)
         let relsize = 20 * 1024 * 1024 / 8192;
@@ -1554,7 +1554,7 @@ mod tests {
     async fn test_large_rel() -> Result<()> {
         let tenant = TenantHarness::create("test_large_rel")?.load().await;
         let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?;
-        let mut walingest = init_walingest_test(&*tline)?;
+        let mut walingest = init_walingest_test(&tline)?;
 
         let mut lsn = 0x10;
         for blknum in 0..RELSEG_SIZE + 1 {
@@ -1567,7 +1567,7 @@ mod tests {
             m.commit()?;
         }
 
-        assert_current_logical_size(&*tline, Lsn(lsn));
+        assert_current_logical_size(&tline, Lsn(lsn));
 
         assert_eq!(
             tline
@@ -1587,7 +1587,7 @@ mod tests {
                 .no_ondemand_download()?,
             RELSEG_SIZE
         );
-        assert_current_logical_size(&*tline, Lsn(lsn));
+        assert_current_logical_size(&tline, Lsn(lsn));
 
         // Truncate another block
         lsn += 0x10;
@@ -1600,7 +1600,7 @@ mod tests {
                 .no_ondemand_download()?,
             RELSEG_SIZE - 1
         );
-        assert_current_logical_size(&*tline, Lsn(lsn));
+        assert_current_logical_size(&tline, Lsn(lsn));
 
         // Truncate to 1500, and then truncate all the way down to 0, one block at a time
         // This tests the behavior at segment boundaries
@@ -1619,7 +1619,7 @@ mod tests {
 
             size -= 1;
         }
-        assert_current_logical_size(&*tline, Lsn(lsn));
+        assert_current_logical_size(&tline, Lsn(lsn));
 
         Ok(())
     }
diff --git a/pageserver/src/walreceiver/connection_manager.rs b/pageserver/src/walreceiver/connection_manager.rs
index aeb7601af7..8b60e59305 100644
--- a/pageserver/src/walreceiver/connection_manager.rs
+++ b/pageserver/src/walreceiver/connection_manager.rs
@@ -805,7 +805,7 @@ fn wal_stream_connection_config(
     auth_token: Option<&str>,
 ) -> anyhow::Result<PgConnectionConfig> {
     let (host, port) =
-        parse_host_port(&listen_pg_addr_str).context("Unable to parse listen_pg_addr_str")?;
+        parse_host_port(listen_pg_addr_str).context("Unable to parse listen_pg_addr_str")?;
     let port = port.unwrap_or(5432);
     Ok(PgConnectionConfig::new_host_port(host, port)
         .extend_options([
diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs
index ca7cfb7413..7cf489562b 100644
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -409,7 +409,7 @@ impl PostgresRedoManager {
                     key
                 );
                 for &xid in xids {
-                    let pageno = xid as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
+                    let pageno = xid / pg_constants::CLOG_XACTS_PER_PAGE;
                     let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
                     let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
 
@@ -459,7 +459,7 @@ impl PostgresRedoManager {
                     key
                 );
                 for &xid in xids {
-                    let pageno = xid as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
+                    let pageno = xid / pg_constants::CLOG_XACTS_PER_PAGE;
                     let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
                     let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
 
@@ -647,7 +647,7 @@ impl PostgresRedoProcess {
 
         info!("running initdb in {}", datadir.display());
         let initdb = Command::new(pg_bin_dir_path.join("initdb"))
-            .args(&["-D", &datadir.to_string_lossy()])
+            .args(["-D", &datadir.to_string_lossy()])
             .arg("-N")
             .env_clear()
             .env("LD_LIBRARY_PATH", &pg_lib_dir_path)
diff --git a/proxy/src/scram/secret.rs b/proxy/src/scram/secret.rs
index 89668465fa..424beccec9 100644
--- a/proxy/src/scram/secret.rs
+++ b/proxy/src/scram/secret.rs
@@ -48,7 +48,7 @@ impl ServerSecret {
 
         Self {
             iterations: 4096,
-            salt_base64: base64::encode(&mocked_salt),
+            salt_base64: base64::encode(mocked_salt),
             stored_key: ScramKey::default(),
             server_key: ScramKey::default(),
             doomed: true,
@@ -68,7 +68,7 @@ impl ServerSecret {
 
         Some(Self {
             iterations,
-            salt_base64: base64::encode(&salt),
+            salt_base64: base64::encode(salt),
             stored_key: password.client_key().sha256(),
             server_key: password.server_key(),
             doomed: false,
diff --git a/safekeeper/src/control_file.rs b/safekeeper/src/control_file.rs
index f4a0f8520c..ba5e453e41 100644
--- a/safekeeper/src/control_file.rs
+++ b/safekeeper/src/control_file.rs
@@ -239,7 +239,7 @@ mod test {
         conf: &SafeKeeperConf,
         ttid: &TenantTimelineId,
     ) -> Result<(FileStorage, SafeKeeperState)> {
-        fs::create_dir_all(&conf.timeline_dir(ttid)).expect("failed to create timeline dir");
+        fs::create_dir_all(conf.timeline_dir(ttid)).expect("failed to create timeline dir");
         Ok((
             FileStorage::restore_new(ttid, conf)?,
             FileStorage::load_control_file_conf(conf, ttid)?,
@@ -250,7 +250,7 @@ mod test {
         conf: &SafeKeeperConf,
         ttid: &TenantTimelineId,
     ) -> Result<(FileStorage, SafeKeeperState)> {
-        fs::create_dir_all(&conf.timeline_dir(ttid)).expect("failed to create timeline dir");
+        fs::create_dir_all(conf.timeline_dir(ttid)).expect("failed to create timeline dir");
         let state = SafeKeeperState::empty();
         let storage = FileStorage::create_new(ttid, conf, state.clone())?;
         Ok((storage, state))
diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs
index d4d3d37737..b21770686c 100644
--- a/safekeeper/src/metrics.rs
+++ b/safekeeper/src/metrics.rs
@@ -425,7 +425,7 @@ impl Collector for TimelineCollector {
                 .set(tli.num_computes as i64);
             self.acceptor_term
                 .with_label_values(labels)
-                .set(tli.persisted_state.acceptor_state.term as u64);
+                .set(tli.persisted_state.acceptor_state.term);
             self.written_wal_bytes
                 .with_label_values(labels)
                 .set(tli.wal_storage.write_wal_bytes);
diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs
index ae4d4cce09..fc971ca753 100644
--- a/safekeeper/src/wal_backup.rs
+++ b/safekeeper/src/wal_backup.rs
@@ -346,9 +346,7 @@ impl WalBackupTask {
                         backup_lsn, commit_lsn, e
                     );
 
-                    if retry_attempt < u32::MAX {
-                        retry_attempt += 1;
-                    }
+                    retry_attempt = retry_attempt.saturating_add(1);
                 }
             }
         }
@@ -387,7 +385,7 @@ async fn backup_single_segment(
 ) -> Result<()> {
     let segment_file_path = seg.file_path(timeline_dir)?;
     let remote_segment_path = segment_file_path
-        .strip_prefix(&workspace_dir)
+        .strip_prefix(workspace_dir)
         .context("Failed to strip workspace dir prefix")
         .and_then(RemotePath::new)
         .with_context(|| {
diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs
index 52368bb719..41457868fe 100644
--- a/safekeeper/src/wal_storage.rs
+++ b/safekeeper/src/wal_storage.rs
@@ -223,7 +223,7 @@ impl PhysicalStorage {
             // Rename partial file to completed file
             let (wal_file_path, wal_file_partial_path) =
                 wal_file_paths(&self.timeline_dir, segno, self.wal_seg_size)?;
-            fs::rename(&wal_file_partial_path, &wal_file_path)?;
+            fs::rename(wal_file_partial_path, wal_file_path)?;
         } else {
             // otherwise, file can be reused later
             self.file = Some(file);
@@ -249,7 +249,7 @@ impl PhysicalStorage {
 
         while !buf.is_empty() {
             // Extract WAL location for this block
-            let xlogoff = self.write_lsn.segment_offset(self.wal_seg_size) as usize;
+            let xlogoff = self.write_lsn.segment_offset(self.wal_seg_size);
             let segno = self.write_lsn.segment_number(self.wal_seg_size);
 
             // If crossing a WAL boundary, only write up until we reach wal segment size.
@@ -366,7 +366,7 @@ impl Storage for PhysicalStorage {
             self.fdatasync_file(&mut unflushed_file)?;
         }
 
-        let xlogoff = end_pos.segment_offset(self.wal_seg_size) as usize;
+        let xlogoff = end_pos.segment_offset(self.wal_seg_size);
         let segno = end_pos.segment_number(self.wal_seg_size);
 
         // Remove all segments after the given LSN.
@@ -383,7 +383,7 @@ impl Storage for PhysicalStorage {
             // Make segment partial once again
             let (wal_file_path, wal_file_partial_path) =
                 wal_file_paths(&self.timeline_dir, segno, self.wal_seg_size)?;
-            fs::rename(&wal_file_path, &wal_file_partial_path)?;
+            fs::rename(wal_file_path, wal_file_partial_path)?;
         }
 
         // Update LSNs
@@ -416,7 +416,7 @@ fn remove_segments_from_disk(
     let mut min_removed = u64::MAX;
     let mut max_removed = u64::MIN;
 
-    for entry in fs::read_dir(&timeline_dir)? {
+    for entry in fs::read_dir(timeline_dir)? {
         let entry = entry?;
         let entry_path = entry.path();
         let fname = entry_path.file_name().unwrap();
@@ -499,7 +499,7 @@ impl WalReader {
 
         // How much to read and send in message? We cannot cross the WAL file
         // boundary, and we don't want send more than provided buffer.
-        let xlogoff = self.pos.segment_offset(self.wal_seg_size) as usize;
+        let xlogoff = self.pos.segment_offset(self.wal_seg_size);
         let send_size = min(buf.len(), self.wal_seg_size - xlogoff);
 
         // Read some data from the file.
@@ -518,7 +518,7 @@ impl WalReader {
 
     /// Open WAL segment at the current position of the reader.
     async fn open_segment(&self) -> Result<Pin<Box<dyn AsyncRead>>> {
-        let xlogoff = self.pos.segment_offset(self.wal_seg_size) as usize;
+        let xlogoff = self.pos.segment_offset(self.wal_seg_size);
         let segno = self.pos.segment_number(self.wal_seg_size);
         let wal_file_name = XLogFileName(PG_TLI, segno, self.wal_seg_size);
         let wal_file_path = self.timeline_dir.join(wal_file_name);
diff --git a/storage_broker/benches/rps.rs b/storage_broker/benches/rps.rs
index 1262bd9333..f3544a7cb8 100644
--- a/storage_broker/benches/rps.rs
+++ b/storage_broker/benches/rps.rs
@@ -160,7 +160,7 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
     }
     for _i in 0..args.num_pubs {
         let c = None;
-        tokio::spawn(publish(c, args.num_subs as u64));
+        tokio::spawn(publish(c, args.num_subs));
     }
 
     h.await?;
diff --git a/storage_broker/src/lib.rs b/storage_broker/src/lib.rs
index d12a79a69f..8441aaf625 100644
--- a/storage_broker/src/lib.rs
+++ b/storage_broker/src/lib.rs
@@ -13,6 +13,10 @@ use proto::{
 
 // Code generated by protobuf.
 pub mod proto {
+    // Tonic generates derives as `#[derive(Clone, PartialEq, ::prost::Message)]`;
+    // we don't use these types for anything but broker data transmission,
+    // so it's ok to ignore this one.
+    #![allow(clippy::derive_partial_eq_without_eq)]
     tonic::include_proto!("storage_broker");
 }
 

From 707d1c1c948ded4efee06ab18880c876639f3ce1 Mon Sep 17 00:00:00 2001
From: Sergey Melnikov <sergey@neon.tech>
Date: Thu, 22 Dec 2022 13:34:16 +0100
Subject: [PATCH 058/132] Fix vm-compute-image upload to dockerhub (#3181)

---
 .github/workflows/build_and_test.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 6443a56afc..ff433decf7 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -733,13 +733,13 @@ jobs:
         run: crane push compute-node-v14 neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}}
 
       - name: Push vm compute node v14 image to Docker Hub
-        run: crane push compute-node-v14 neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}}
+        run: crane push vm-compute-node-v14 neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}}
 
       - name: Push compute node v15 image to Docker Hub
         run: crane push compute-node-v15 neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}}
 
       - name: Push vm compute node v15 image to Docker Hub
-        run: crane push compute-node-v15 neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}}
+        run: crane push vm-compute-node-v15 neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}}
 
       - name: Push rust image to Docker Hub
         run: crane push rust neondatabase/rust:pinned

From 201fedd65ca281d0e5b4f3d3a8ebef7c1a4108e7 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Thu, 22 Dec 2022 12:40:39 +0000
Subject: [PATCH 059/132] tpch-compare: use rust image instead of rustlegacy
 (#3182)

---
 .github/workflows/benchmarking.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml
index 07e111b67c..59317f0a47 100644
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -407,7 +407,7 @@ jobs:
 
     runs-on: [ self-hosted, us-east-2, x64 ]
     container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rustlegacy:pinned
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
       options: --init
 
     timeout-minutes: 360 # 6h

From 5a762744c7db4356753dcf0efa8e96d72f85d06d Mon Sep 17 00:00:00 2001
From: Stas Kelvich <stas.kelvich@gmail.com>
Date: Wed, 21 Dec 2022 12:08:54 +0200
Subject: [PATCH 060/132] Collect core dump backtraces in compute_ctl.

Scan core dumps directory on exit. In case of existing core dumps
call gdb/lldb to get a backtrace and log it. By default look for
core dumps in postgres data directory as core.<pid>. That is how
core collection is configured in our k8s nodes (and a reasonable
convention in general).
---
 Dockerfile.compute-node-v14  |  3 +-
 Dockerfile.compute-node-v15  |  3 +-
 compute_tools/src/compute.rs | 69 +++++++++++++++++++++++++++++++++++-
 3 files changed, 72 insertions(+), 3 deletions(-)

diff --git a/Dockerfile.compute-node-v14 b/Dockerfile.compute-node-v14
index 1ffabafd51..e7fba49bb1 100644
--- a/Dockerfile.compute-node-v14
+++ b/Dockerfile.compute-node-v14
@@ -204,7 +204,8 @@ RUN apt update &&  \
         libgeos-c1v5 \
         libgdal28 \
         libproj19 \
-        libprotobuf-c1 && \
+        libprotobuf-c1 \
+        gdb && \
     rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
 
 USER postgres
diff --git a/Dockerfile.compute-node-v15 b/Dockerfile.compute-node-v15
index 11cefcc2da..cd03525b97 100644
--- a/Dockerfile.compute-node-v15
+++ b/Dockerfile.compute-node-v15
@@ -204,7 +204,8 @@ RUN apt update &&  \
         libgeos-c1v5 \
         libgdal28 \
         libproj19 \
-        libprotobuf-c1 && \
+        libprotobuf-c1 \
+        gdb && \
     rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
 
 USER postgres
diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs
index eceff0fc4e..7ebb98077a 100644
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -23,7 +23,7 @@ use std::sync::RwLock;
 
 use anyhow::{Context, Result};
 use chrono::{DateTime, Utc};
-use log::info;
+use log::{info, warn};
 use postgres::{Client, NoTls};
 use serde::{Serialize, Serializer};
 
@@ -328,6 +328,9 @@ impl ComputeNode {
             .wait()
             .expect("failed to start waiting on Postgres process");
 
+        self.check_for_core_dumps()
+            .expect("failed to check for core dumps");
+
         Ok(ecode)
     }
 
@@ -343,4 +346,68 @@ impl ComputeNode {
         self.prepare_pgdata()?;
         self.run()
     }
+
+    // Look for core dumps and collect backtraces.
+    //
+    // EKS worker nodes have following core dump settings:
+    //   /proc/sys/kernel/core_pattern -> core
+    //   /proc/sys/kernel/core_uses_pid -> 1
+    //   ulimit -c -> unlimited
+    // which results in core dumps being written to postgres data directory as core.<pid>.
+    //
+    // Use that as a default location and pattern, except macos where core dumps are written
+    // to /cores/ directory by default.
+    fn check_for_core_dumps(&self) -> Result<()> {
+        let core_dump_dir = match std::env::consts::OS {
+            "macos" => Path::new("/cores/"),
+            _ => Path::new(&self.pgdata),
+        };
+
+        // Collect core dump paths if any
+        info!("checking for core dumps in {}", core_dump_dir.display());
+        let files = fs::read_dir(core_dump_dir)?;
+        let cores = files.filter_map(|entry| {
+            let entry = entry.ok()?;
+            let _ = entry.file_name().to_str()?.strip_prefix("core.")?;
+            Some(entry.path())
+        });
+
+        // Print backtrace for each core dump
+        for core_path in cores {
+            warn!(
+                "core dump found: {}, collecting backtrace",
+                core_path.display()
+            );
+
+            // Try first with gdb
+            let backtrace = Command::new("gdb")
+                .args(["--batch", "-q", "-ex", "bt", &self.pgbin])
+                .arg(&core_path)
+                .output();
+
+            // Try lldb if no gdb is found -- that is handy for local testing on macOS
+            let backtrace = match backtrace {
+                Err(ref e) if e.kind() == std::io::ErrorKind::NotFound => {
+                    warn!("cannot find gdb, trying lldb");
+                    Command::new("lldb")
+                        .arg("-c")
+                        .arg(&core_path)
+                        .args(["--batch", "-o", "bt all", "-o", "quit"])
+                        .output()
+                }
+                _ => backtrace,
+            }?;
+
+            warn!(
+                "core dump backtrace: {}",
+                String::from_utf8_lossy(&backtrace.stdout)
+            );
+            warn!(
+                "debugger stderr: {}",
+                String::from_utf8_lossy(&backtrace.stderr)
+            );
+        }
+
+        Ok(())
+    }
 }

From 9b712159063df0571912436c7c807359a5d221d2 Mon Sep 17 00:00:00 2001
From: Vadim Kharitonov <vadim@neon.tech>
Date: Thu, 22 Dec 2022 13:49:52 +0100
Subject: [PATCH 061/132] Simplify some functions in compute_tools and fix typo
 errors in func name

---
 compute_tools/src/bin/compute_ctl.rs    |  2 +-
 compute_tools/src/checker.rs            |  2 +-
 compute_tools/src/compute.rs            | 23 +++-----------------
 compute_tools/src/monitor.rs            |  6 ++---
 compute_tools/src/pg_helpers.rs         | 27 ++++++++++-------------
 compute_tools/tests/pg_helpers_tests.rs | 29 +++++++++++++++++++++++++
 6 files changed, 48 insertions(+), 41 deletions(-)

diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs
index 7786d7af9c..f3b787209d 100644
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -105,7 +105,7 @@ fn main() -> Result<()> {
         tenant,
         timeline,
         pageserver_connstr,
-        metrics: ComputeMetrics::new(),
+        metrics: ComputeMetrics::default(),
         state: RwLock::new(ComputeState::new()),
     };
     let compute = Arc::new(compute_state);
diff --git a/compute_tools/src/checker.rs b/compute_tools/src/checker.rs
index b6ba1692f9..ee1605c814 100644
--- a/compute_tools/src/checker.rs
+++ b/compute_tools/src/checker.rs
@@ -5,7 +5,7 @@ use tokio_postgres::NoTls;
 
 use crate::compute::ComputeNode;
 
-pub fn create_writablity_check_data(client: &mut Client) -> Result<()> {
+pub fn create_writability_check_data(client: &mut Client) -> Result<()> {
     let query = "
     CREATE TABLE IF NOT EXISTS health_check (
         id serial primary key,
diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs
index 7ebb98077a..c2c9ab2230 100644
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -27,7 +27,7 @@ use log::{info, warn};
 use postgres::{Client, NoTls};
 use serde::{Serialize, Serializer};
 
-use crate::checker::create_writablity_check_data;
+use crate::checker::create_writability_check_data;
 use crate::config;
 use crate::pg_helpers::*;
 use crate::spec::*;
@@ -91,7 +91,7 @@ pub enum ComputeStatus {
     Failed,
 }
 
-#[derive(Serialize)]
+#[derive(Default, Serialize)]
 pub struct ComputeMetrics {
     pub sync_safekeepers_ms: AtomicU64,
     pub basebackup_ms: AtomicU64,
@@ -99,23 +99,6 @@ pub struct ComputeMetrics {
     pub total_startup_ms: AtomicU64,
 }
 
-impl ComputeMetrics {
-    pub fn new() -> Self {
-        Self {
-            sync_safekeepers_ms: AtomicU64::new(0),
-            basebackup_ms: AtomicU64::new(0),
-            config_ms: AtomicU64::new(0),
-            total_startup_ms: AtomicU64::new(0),
-        }
-    }
-}
-
-impl Default for ComputeMetrics {
-    fn default() -> Self {
-        Self::new()
-    }
-}
-
 impl ComputeNode {
     pub fn set_status(&self, status: ComputeStatus) {
         self.state.write().unwrap().status = status;
@@ -292,7 +275,7 @@ impl ComputeNode {
         handle_databases(&self.spec, &mut client)?;
         handle_role_deletions(self, &mut client)?;
         handle_grants(self, &mut client)?;
-        create_writablity_check_data(&mut client)?;
+        create_writability_check_data(&mut client)?;
 
         // 'Close' connection
         drop(client);
diff --git a/compute_tools/src/monitor.rs b/compute_tools/src/monitor.rs
index 58cdf796bc..1588f5d62e 100644
--- a/compute_tools/src/monitor.rs
+++ b/compute_tools/src/monitor.rs
@@ -74,10 +74,8 @@ fn watch_compute_activity(compute: &ComputeNode) {
                         }
                     }
 
-                    // Sort idle backend `state_change` timestamps. The last one corresponds
-                    // to the last activity.
-                    idle_backs.sort();
-                    if let Some(last) = idle_backs.last() {
+                    // Get idle backend `state_change` with the max timestamp.
+                    if let Some(last) = idle_backs.iter().max() {
                         last_active = *last;
                     }
                 }
diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs
index 289f223bda..ff422f1cf5 100644
--- a/compute_tools/src/pg_helpers.rs
+++ b/compute_tools/src/pg_helpers.rs
@@ -119,16 +119,9 @@ pub trait GenericOptionsSearch {
 impl GenericOptionsSearch for GenericOptions {
     /// Lookup option by name
     fn find(&self, name: &str) -> Option<String> {
-        match &self {
-            Some(ops) => {
-                let op = ops.iter().find(|s| s.name == name);
-                match op {
-                    Some(op) => op.value.clone(),
-                    None => None,
-                }
-            }
-            None => None,
-        }
+        let ops = self.as_ref()?;
+        let op = ops.iter().find(|s| s.name == name)?;
+        op.value.clone()
     }
 }
 
@@ -161,6 +154,14 @@ impl Role {
 }
 
 impl Database {
+    pub fn new(name: PgIdent, owner: PgIdent) -> Self {
+        Self {
+            name,
+            owner,
+            options: None,
+        }
+    }
+
     /// Serialize a list of database parameters into a Postgres-acceptable
     /// string of arguments.
     /// NB: `TEMPLATE` is actually also an identifier, but so far we only need
@@ -219,11 +220,7 @@ pub fn get_existing_dbs(client: &mut Client) -> Result<Vec<Database>> {
             &[],
         )?
         .iter()
-        .map(|row| Database {
-            name: row.get("datname"),
-            owner: row.get("owner"),
-            options: None,
-        })
+        .map(|row| Database::new(row.get("datname"), row.get("owner")))
         .collect();
 
     Ok(postgres_dbs)
diff --git a/compute_tools/tests/pg_helpers_tests.rs b/compute_tools/tests/pg_helpers_tests.rs
index 24cad4663a..431d9794bc 100644
--- a/compute_tools/tests/pg_helpers_tests.rs
+++ b/compute_tools/tests/pg_helpers_tests.rs
@@ -38,4 +38,33 @@ mod pg_helpers_tests {
 
         assert_eq!(ident.pg_quote(), "\"\"\"name\"\";\\n select 1;\"");
     }
+
+    #[test]
+    fn generic_options_search() {
+        let generic_options: GenericOptions = Some(vec![
+            GenericOption {
+                name: "present_value".into(),
+                value: Some("value".into()),
+                vartype: "string".into(),
+            },
+            GenericOption {
+                name: "missed_value".into(),
+                value: None,
+                vartype: "int".into(),
+            },
+        ]);
+        assert_eq!(generic_options.find("present_value"), Some("value".into()));
+        assert_eq!(generic_options.find("missed_value"), None);
+        assert_eq!(generic_options.find("invalid_value"), None);
+
+        let empty_generic_options: GenericOptions = Some(vec![]);
+        assert_eq!(empty_generic_options.find("present_value"), None);
+        assert_eq!(empty_generic_options.find("missed_value"), None);
+        assert_eq!(empty_generic_options.find("invalid_value"), None);
+
+        let none_generic_options: GenericOptions = None;
+        assert_eq!(none_generic_options.find("present_value"), None);
+        assert_eq!(none_generic_options.find("missed_value"), None);
+        assert_eq!(none_generic_options.find("invalid_value"), None);
+    }
 }

From 63eb87bde35aff98188039bd426879942111446c Mon Sep 17 00:00:00 2001
From: Anastasia Lubennikova <anastasia@neon.tech>
Date: Thu, 22 Dec 2022 15:47:24 +0200
Subject: [PATCH 062/132] Set default metric_collection_interval to 10 min,
 which is more reasonable for real usage

---
 pageserver/src/config.rs                      | 2 +-
 test_runner/regress/test_metric_collection.py | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index 9334f88a7e..66f8a9f4b8 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -56,7 +56,7 @@ pub mod defaults {
     pub const DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES: usize =
         super::ConfigurableSemaphore::DEFAULT_INITIAL.get();
 
-    pub const DEFAULT_METRIC_COLLECTION_INTERVAL: &str = "60 s";
+    pub const DEFAULT_METRIC_COLLECTION_INTERVAL: &str = "10 min";
     pub const DEFAULT_METRIC_COLLECTION_ENDPOINT: Option<reqwest::Url> = None;
     ///
     /// Default built-in configuration file.
diff --git a/test_runner/regress/test_metric_collection.py b/test_runner/regress/test_metric_collection.py
index fa1bf0fbb2..b171be3ac7 100644
--- a/test_runner/regress/test_metric_collection.py
+++ b/test_runner/regress/test_metric_collection.py
@@ -69,7 +69,8 @@ def test_metric_collection(
     # to trigger remote storage operations in a controlled way
     neon_env_builder.pageserver_config_override = (
         f"""
-    metric_collection_endpoint="{metric_collection_endpoint}"
+        metric_collection_interval="60s"
+        metric_collection_endpoint="{metric_collection_endpoint}"
     """
         + "tenant_config={pitr_interval = '0 sec'}"
     )

From 8544c5932937788f0187a389fef37d743472aa1c Mon Sep 17 00:00:00 2001
From: Anastasia Lubennikova <anastasia@neon.tech>
Date: Wed, 21 Dec 2022 18:31:52 +0200
Subject: [PATCH 063/132] Fix flaky test_metrics_collection.py Only check that
 all metrics are present on the first request, because pageserver doesn't send
 unchanged metrics.

---
 test_runner/regress/test_metric_collection.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/test_runner/regress/test_metric_collection.py b/test_runner/regress/test_metric_collection.py
index b171be3ac7..a3b3609153 100644
--- a/test_runner/regress/test_metric_collection.py
+++ b/test_runner/regress/test_metric_collection.py
@@ -22,6 +22,7 @@ def httpserver_listen_address(port_distributor: PortDistributor):
 
 num_metrics_received = 0
 remote_uploaded = 0
+first_request = True
 
 
 #
@@ -46,7 +47,12 @@ def metrics_handler(request: Request) -> Response:
     for event in events:
         assert checks.pop(event["metric"])(event["value"]), f"{event['metric']} isn't valid"
 
-    assert not checks, f"{' '.join(checks.keys())} wasn't/weren't received"
+    global first_request
+    # check that all checks were sent
+    # but only on the first request, because we don't send non-changed metrics
+    if first_request:
+        assert not checks, f"{' '.join(checks.keys())} wasn't/weren't received"
+        first_request = False
 
     global num_metrics_received
     num_metrics_received += 1

From 5a496d82b0967211a4b2bc51dd3ccf71828dd683 Mon Sep 17 00:00:00 2001
From: Sergey Melnikov <sergey@neon.tech>
Date: Thu, 22 Dec 2022 15:37:17 +0100
Subject: [PATCH 064/132] Do not deploy storage and proxies to old staging
 (#3180)

We have fully migrated out; these nodes will soon be decommissioned
---
 .github/workflows/build_and_test.yml | 2 --
 1 file changed, 2 deletions(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index ff433decf7..b98974c5a1 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -767,8 +767,6 @@ jobs:
       - id: set-matrix
         run: |
           if [[ "$GITHUB_REF_NAME" == "main" ]]; then
-            STAGING='{"env_name": "staging", "proxy_job": "neon-proxy", "proxy_config": "staging.proxy", "storage_broker_ns": "neon-storage-broker", "storage_broker_config": "staging.neon-storage-broker", "kubeconfig_secret": "STAGING_KUBECONFIG_DATA", "console_api_key_secret": "NEON_STAGING_API_KEY"}'
-            echo "include=[$STAGING]" >> $GITHUB_OUTPUT
           elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
             PRODUCTION='{"env_name": "production", "proxy_job": "neon-proxy", "proxy_config": "production.proxy", "storage_broker_ns": "neon-storage-broker", "storage_broker_config": "production.neon-storage-broker", "kubeconfig_secret": "PRODUCTION_KUBECONFIG_DATA", "console_api_key_secret": "NEON_PRODUCTION_API_KEY"}'
             echo "include=[$PRODUCTION]" >> $GITHUB_OUTPUT

From 72ab104733ea8ee9b4a59b2ea2ea2669a2437788 Mon Sep 17 00:00:00 2001
From: Arthur Petukhovsky <petuhovskiy@yandex.ru>
Date: Thu, 22 Dec 2022 19:21:53 +0400
Subject: [PATCH 065/132] Move zenith-1-sk-3 to zenith-1-sk-4 (#3164)

---
 .github/ansible/production.hosts.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/ansible/production.hosts.yaml b/.github/ansible/production.hosts.yaml
index d22c845966..3122a43801 100644
--- a/.github/ansible/production.hosts.yaml
+++ b/.github/ansible/production.hosts.yaml
@@ -34,5 +34,5 @@ storage:
           console_region_id: aws-us-west-2
         zenith-1-sk-2:
           console_region_id: aws-us-west-2
-        zenith-1-sk-3:
+        zenith-1-sk-4:
           console_region_id: aws-us-west-2

From 7bc17b373e29c33efe7439a53b84757049415b8d Mon Sep 17 00:00:00 2001
From: Sergey Melnikov <sergey@neon.tech>
Date: Thu, 22 Dec 2022 16:28:36 +0100
Subject: [PATCH 066/132] Fix calculate-deploy-targets (#3189)

Was broken in https://github.com/neondatabase/neon/pull/3180
---
 .github/workflows/build_and_test.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index b98974c5a1..48ed800450 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -767,6 +767,7 @@ jobs:
       - id: set-matrix
         run: |
           if [[ "$GITHUB_REF_NAME" == "main" ]]; then
+            echo "include=[]" >> $GITHUB_OUTPUT
           elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
             PRODUCTION='{"env_name": "production", "proxy_job": "neon-proxy", "proxy_config": "production.proxy", "storage_broker_ns": "neon-storage-broker", "storage_broker_config": "production.neon-storage-broker", "kubeconfig_secret": "PRODUCTION_KUBECONFIG_DATA", "console_api_key_secret": "NEON_PRODUCTION_API_KEY"}'
             echo "include=[$PRODUCTION]" >> $GITHUB_OUTPUT

From c01f92c08141653644501167902bd586139ec9aa Mon Sep 17 00:00:00 2001
From: Sergey Melnikov <sergey@neon.tech>
Date: Thu, 22 Dec 2022 20:09:45 +0100
Subject: [PATCH 067/132] Fully remove old staging deploy (#3191)

---
 .github/ansible/staging.hosts.yaml            | 35 ------------
 .../staging.neon-storage-broker.yaml          | 56 ------------------
 .github/helm-values/staging.proxy-scram.yaml  | 57 -------------------
 .github/helm-values/staging.proxy.yaml        | 57 -------------------
 .github/workflows/build_and_test.yml          | 16 +++---
 5 files changed, 7 insertions(+), 214 deletions(-)
 delete mode 100644 .github/ansible/staging.hosts.yaml
 delete mode 100644 .github/helm-values/staging.neon-storage-broker.yaml
 delete mode 100644 .github/helm-values/staging.proxy-scram.yaml
 delete mode 100644 .github/helm-values/staging.proxy.yaml

diff --git a/.github/ansible/staging.hosts.yaml b/.github/ansible/staging.hosts.yaml
deleted file mode 100644
index 79acfd1d2a..0000000000
--- a/.github/ansible/staging.hosts.yaml
+++ /dev/null
@@ -1,35 +0,0 @@
-storage:
-  vars:
-    bucket_name: zenith-staging-storage-us-east-1
-    bucket_region: us-east-1
-    console_mgmt_base_url: http://console-staging.local
-    broker_endpoint: http://storage-broker.staging.local:50051
-    pageserver_config_stub:
-      pg_distrib_dir: /usr/local
-      remote_storage:
-        bucket_name: "{{ bucket_name }}"
-        bucket_region: "{{ bucket_region }}"
-        prefix_in_bucket: "{{ inventory_hostname }}"
-    safekeeper_s3_prefix: us-stage/wal
-    hostname_suffix: ".local"
-    remote_user: admin
-    sentry_environment: development
-
-  children:
-    pageservers:
-      hosts:
-        zenith-us-stage-ps-2:
-          console_region_id: aws-us-east-1
-        zenith-us-stage-ps-3:
-          console_region_id: aws-us-east-1
-        zenith-us-stage-ps-4:
-          console_region_id: aws-us-east-1
-
-    safekeepers:
-      hosts:
-        zenith-us-stage-sk-4:
-          console_region_id: aws-us-east-1
-        zenith-us-stage-sk-5:
-          console_region_id: aws-us-east-1
-        zenith-us-stage-sk-6:
-          console_region_id: aws-us-east-1
diff --git a/.github/helm-values/staging.neon-storage-broker.yaml b/.github/helm-values/staging.neon-storage-broker.yaml
deleted file mode 100644
index 6b21c286a1..0000000000
--- a/.github/helm-values/staging.neon-storage-broker.yaml
+++ /dev/null
@@ -1,56 +0,0 @@
-# Helm chart values for neon-storage-broker
-podLabels:
-  neon_env: staging
-  neon_service: storage-broker
-
-# Use L4 LB
-service:
-  # service.annotations -- Annotations to add to the service
-  annotations:
-    service.beta.kubernetes.io/aws-load-balancer-type: external  # use newer AWS Load Balancer Controller
-    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
-    service.beta.kubernetes.io/aws-load-balancer-scheme: internal  # deploy LB to private subnet
-    # assign service to this name at external-dns
-    external-dns.alpha.kubernetes.io/hostname: storage-broker.staging.local
-  # service.type -- Service type
-  type: LoadBalancer
-  # service.port -- broker listen port
-  port: 50051
-
-ingress:
-  enabled: false
-
-metrics:
-  enabled: true
-  serviceMonitor:
-    enabled: true
-    selector:
-      release: kube-prometheus-stack
-
-extraManifests:
-  - apiVersion: operator.victoriametrics.com/v1beta1
-    kind: VMServiceScrape
-    metadata:
-      name: "{{ include \"neon-storage-broker.fullname\" . }}"
-      labels:
-        helm.sh/chart: neon-storage-broker-{{ .Chart.Version }}
-        app.kubernetes.io/name: neon-storage-broker
-        app.kubernetes.io/instance: neon-storage-broker
-        app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
-        app.kubernetes.io/managed-by: Helm
-      namespace: "{{ .Release.Namespace }}"
-    spec:
-      selector:
-        matchLabels:
-          app.kubernetes.io/name: "neon-storage-broker"
-      endpoints:
-        - port: broker
-          path: /metrics
-          interval: 10s
-          scrapeTimeout: 10s
-      namespaceSelector:
-        matchNames:
-          - "{{ .Release.Namespace }}"
-
-settings:
-  sentryEnvironment: "development"
diff --git a/.github/helm-values/staging.proxy-scram.yaml b/.github/helm-values/staging.proxy-scram.yaml
deleted file mode 100644
index 66f9921c9a..0000000000
--- a/.github/helm-values/staging.proxy-scram.yaml
+++ /dev/null
@@ -1,57 +0,0 @@
-# Helm chart values for zenith-proxy.
-# This is a YAML-formatted file.
-
-image:
-  repository: neondatabase/neon
-
-settings:
-  authBackend: "console"
-  authEndpoint: "http://console-staging.local/management/api/v2"
-  domain: "*.cloud.stage.neon.tech"
-  sentryEnvironment: "development"
-
-# -- Additional labels for zenith-proxy pods
-podLabels:
-  zenith_service: proxy-scram
-  zenith_env: staging
-  zenith_region: us-east-1
-  zenith_region_slug: virginia
-
-exposedService:
-  annotations:
-    service.beta.kubernetes.io/aws-load-balancer-type: external
-    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
-    service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
-    external-dns.alpha.kubernetes.io/hostname: cloud.stage.neon.tech
-
-metrics:
-  enabled: true
-  serviceMonitor:
-    enabled: true
-    selector:
-      release: kube-prometheus-stack
-
-extraManifests:
-  - apiVersion: operator.victoriametrics.com/v1beta1
-    kind: VMServiceScrape
-    metadata:
-      name: "{{ include \"neon-proxy.fullname\" . }}"
-      labels:
-        helm.sh/chart: neon-proxy-{{ .Chart.Version }}
-        app.kubernetes.io/name: neon-proxy
-        app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}"
-        app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
-        app.kubernetes.io/managed-by: Helm
-      namespace: "{{ .Release.Namespace }}"
-    spec:
-      selector:
-        matchLabels:
-          app.kubernetes.io/name: "neon-proxy"
-      endpoints:
-        - port: http
-          path: /metrics
-          interval: 10s
-          scrapeTimeout: 10s
-      namespaceSelector:
-        matchNames:
-          - "{{ .Release.Namespace }}"
diff --git a/.github/helm-values/staging.proxy.yaml b/.github/helm-values/staging.proxy.yaml
deleted file mode 100644
index a22082e625..0000000000
--- a/.github/helm-values/staging.proxy.yaml
+++ /dev/null
@@ -1,57 +0,0 @@
-# Helm chart values for zenith-proxy.
-# This is a YAML-formatted file.
-
-image:
-  repository: neondatabase/neon
-
-settings:
-  authBackend: "link"
-  authEndpoint: "https://console.stage.neon.tech/authenticate_proxy_request/"
-  uri: "https://console.stage.neon.tech/psql_session/"
-  sentryEnvironment: "development"
-
-# -- Additional labels for zenith-proxy pods
-podLabels:
-  zenith_service: proxy
-  zenith_env: staging
-  zenith_region: us-east-1
-  zenith_region_slug: virginia
-
-exposedService:
-  annotations:
-    service.beta.kubernetes.io/aws-load-balancer-type: external
-    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
-    service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
-    external-dns.alpha.kubernetes.io/hostname: connect.stage.neon.tech
-
-metrics:
-  enabled: true
-  serviceMonitor:
-    enabled: true
-    selector:
-      release: kube-prometheus-stack
-
-extraManifests:
-  - apiVersion: operator.victoriametrics.com/v1beta1
-    kind: VMServiceScrape
-    metadata:
-      name: "{{ include \"neon-proxy.fullname\" . }}"
-      labels:
-        helm.sh/chart: neon-proxy-{{ .Chart.Version }}
-        app.kubernetes.io/name: neon-proxy
-        app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}"
-        app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
-        app.kubernetes.io/managed-by: Helm
-      namespace: "{{ .Release.Namespace }}"
-    spec:
-      selector:
-        matchLabels:
-          app.kubernetes.io/name: "neon-proxy"
-      endpoints:
-        - port: http
-          path: /metrics
-          interval: 10s
-          scrapeTimeout: 10s
-      namespaceSelector:
-        matchNames:
-          - "{{ .Release.Namespace }}"
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 48ed800450..17c698482c 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -759,20 +759,18 @@ jobs:
   calculate-deploy-targets:
     runs-on: [ self-hosted, dev, x64 ]
     if: |
-      (github.ref_name == 'main' || github.ref_name == 'release') &&
+      github.ref_name == 'release' &&
       github.event_name != 'workflow_dispatch'
     outputs:
       matrix-include: ${{ steps.set-matrix.outputs.include }}
     steps:
       - id: set-matrix
         run: |
-          if [[ "$GITHUB_REF_NAME" == "main" ]]; then
-            echo "include=[]" >> $GITHUB_OUTPUT
-          elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
+          if [[ "$GITHUB_REF_NAME" == "release" ]]; then
             PRODUCTION='{"env_name": "production", "proxy_job": "neon-proxy", "proxy_config": "production.proxy", "storage_broker_ns": "neon-storage-broker", "storage_broker_config": "production.neon-storage-broker", "kubeconfig_secret": "PRODUCTION_KUBECONFIG_DATA", "console_api_key_secret": "NEON_PRODUCTION_API_KEY"}'
             echo "include=[$PRODUCTION]" >> $GITHUB_OUTPUT
           else
-            echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
+            echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to 'release'"
             exit 1
           fi
 
@@ -783,7 +781,7 @@ jobs:
     # If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly
     needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
     if: |
-      (github.ref_name == 'main' || github.ref_name == 'release') &&
+      github.ref_name == 'release' &&
       github.event_name != 'workflow_dispatch'
     defaults:
       run:
@@ -827,7 +825,7 @@ jobs:
     container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
     # We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version.
     # If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly
-    needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
+    needs: [ push-docker-hub, tag, regress-tests ]
     if: |
       (github.ref_name == 'main') &&
       github.event_name != 'workflow_dispatch'
@@ -939,7 +937,7 @@ jobs:
     # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently.
     needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
     if: |
-      (github.ref_name == 'main' || github.ref_name == 'release') &&
+      github.ref_name == 'release' &&
       github.event_name != 'workflow_dispatch'
     defaults:
       run:
@@ -982,7 +980,7 @@ jobs:
     # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently.
     needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
     if: |
-      (github.ref_name == 'main' || github.ref_name == 'release') &&
+      github.ref_name == 'release' &&
       github.event_name != 'workflow_dispatch'
     defaults:
       run:

From 0bafb2a6c703f152bdd3a6ff194720d27eff0b4b Mon Sep 17 00:00:00 2001
From: Kirill Bulatov <kirill@neon.tech>
Date: Fri, 23 Dec 2022 15:39:59 +0200
Subject: [PATCH 068/132] Do more on-demand downloads where needed (#3194)

The PR aims to fix two missing redownloads in a flaky
test_remote_storage_upload_queue_retries[local_fs]
([example](https://neon-github-public-dev.s3.amazonaws.com/reports/pr-3190/release/3759194738/index.html#categories/80f1dcdd7c08252126be7e9f44fe84e6/8a70800f7ab13620/))

1. missing redownload during walreceiver work
```
2022-12-22T16:09:51.509891Z ERROR wal_connection_manager{tenant=fb62b97553e40f949de8bdeab7f93563 timeline=4f153bf6a58fd63832f6ee175638d049}: wal receiver task finished with an error: walreceiver connection handling failure

Caused by:
    Layer needs downloading

Stack backtrace:
   0: pageserver::tenant::timeline::PageReconstructResult<T>::no_ondemand_download
             at /__w/neon/neon/pageserver/src/tenant/timeline.rs:467:59
   1: pageserver::walingest::WalIngest::new
             at /__w/neon/neon/pageserver/src/walingest.rs:61:32
   2: pageserver::walreceiver::walreceiver_connection::handle_walreceiver_connection::{{closure}}
             at /__w/neon/neon/pageserver/src/walreceiver/walreceiver_connection.rs:178:25
....
```

That looks sad, but it is inevitable with the current approach: it seems
that we need to wait for old layers to arrive in order to accept new data.

To support that, `WalIngest::new` now returns a
`PageReconstructResult`.
Sync methods from `import_datadir.rs` use `WalIngest::new` too, but both
of them import WAL during timeline creation, so there are no layers to
download there; hence the `PageReconstructResult` is converted to
`anyhow::Result` with `no_ondemand_download`.

2. missing redownload during compaction work
```
2022-12-22T16:09:51.090296Z ERROR compaction_loop{tenant_id=fb62b97553e40f949de8bdeab7f93563}:compact_timeline{timeline=4f153bf6a58fd63832f6ee175638d049}: could not compact, repartitioning keyspace failed: Layer needs downloading

Stack backtrace:
   0: pageserver::tenant::timeline::PageReconstructResult<T>::no_ondemand_download
             at /__w/neon/neon/pageserver/src/tenant/timeline.rs:467:59
   1: pageserver::pgdatadir_mapping::<impl pageserver::tenant::timeline::Timeline>::collect_keyspace::{{closure}}
             at /__w/neon/neon/pageserver/src/pgdatadir_mapping.rs:506:41
      <core::future::from_generator::GenFuture<T> as core::future::future::Future>::poll
             at /rustc/e092d0b6b43f2de967af0887873151bb1c0b18d3/library/core/src/future/mod.rs:91:19
      pageserver::tenant::timeline::Timeline::repartition::{{closure}}
             at /__w/neon/neon/pageserver/src/tenant/timeline.rs:2161:50
      <core::future::from_generator::GenFuture<T> as core::future::future::Future>::poll
             at /rustc/e092d0b6b43f2de967af0887873151bb1c0b18d3/library/core/src/future/mod.rs:91:19
   2: pageserver::tenant::timeline::Timeline::compact::{{closure}}
             at /__w/neon/neon/pageserver/src/tenant/timeline.rs:700:14
      <core::future::from_generator::GenFuture<T> as core::future::future::Future>::poll
             at /rustc/e092d0b6b43f2de967af0887873151bb1c0b18d3/library/core/src/future/mod.rs:91:19
   3: <tracing::instrument::Instrumented<T> as core::future::future::Future>::poll
             at /github/home/.cargo/registry/src/github.com-1ecc6299db9ec823/tracing-0.1.37/src/instrument.rs:272:9
   4: pageserver::tenant::Tenant::compaction_iteration::{{closure}}
             at /__w/neon/neon/pageserver/src/tenant.rs:1232:85
      <core::future::from_generator::GenFuture<T> as core::future::future::Future>::poll
             at /rustc/e092d0b6b43f2de967af0887873151bb1c0b18d3/library/core/src/future/mod.rs:91:19
      pageserver::tenant_tasks::compaction_loop::{{closure}}::{{closure}}
             at /__w/neon/neon/pageserver/src/tenant_tasks.rs:76:62
      <core::future::from_generator::GenFuture<T> as core::future::future::Future>::poll
             at /rustc/e092d0b6b43f2de967af0887873151bb1c0b18d3/library/core/src/future/mod.rs:91:19
      pageserver::tenant_tasks::compaction_loop::{{closure}}
             at /__w/neon/neon/pageserver/src/tenant_tasks.rs:91:6
```
---
 pageserver/src/import_datadir.rs                   | 11 ++++++++---
 pageserver/src/pgdatadir_mapping.rs                | 13 ++++++-------
 pageserver/src/walingest.rs                        | 14 ++++++++------
 .../src/walreceiver/walreceiver_connection.rs      |  3 ++-
 4 files changed, 24 insertions(+), 17 deletions(-)

diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs
index 76ca183c9a..588b92c13f 100644
--- a/pageserver/src/import_datadir.rs
+++ b/pageserver/src/import_datadir.rs
@@ -237,14 +237,19 @@ fn import_slru<Reader: Read>(
 
 /// Scan PostgreSQL WAL files in given directory and load all records between
 /// 'startpoint' and 'endpoint' into the repository.
-fn import_wal(walpath: &Path, tline: &Timeline, startpoint: Lsn, endpoint: Lsn) -> Result<()> {
+fn import_wal(
+    walpath: &Path,
+    tline: &Timeline,
+    startpoint: Lsn,
+    endpoint: Lsn,
+) -> anyhow::Result<()> {
     let mut waldecoder = WalStreamDecoder::new(startpoint, tline.pg_version);
 
     let mut segno = startpoint.segment_number(WAL_SEGMENT_SIZE);
     let mut offset = startpoint.segment_offset(WAL_SEGMENT_SIZE);
     let mut last_lsn = startpoint;
 
-    let mut walingest = WalIngest::new(tline, startpoint)?;
+    let mut walingest = WalIngest::new(tline, startpoint).no_ondemand_download()?;
 
     while last_lsn <= endpoint {
         // FIXME: assume postgresql tli 1 for now
@@ -362,7 +367,7 @@ pub fn import_wal_from_tar<Reader: Read>(
     let mut segno = start_lsn.segment_number(WAL_SEGMENT_SIZE);
     let mut offset = start_lsn.segment_offset(WAL_SEGMENT_SIZE);
     let mut last_lsn = start_lsn;
-    let mut walingest = WalIngest::new(tline, start_lsn)?;
+    let mut walingest = WalIngest::new(tline, start_lsn).no_ondemand_download()?;
 
     // Ingest wal until end_lsn
     info!("importing wal until {}", end_lsn);
diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs
index 793dddef01..82b1576145 100644
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -8,7 +8,7 @@
 //!
 use super::tenant::PageReconstructResult;
 use crate::keyspace::{KeySpace, KeySpaceAccum};
-use crate::tenant::Timeline;
+use crate::tenant::{with_ondemand_download, Timeline};
 use crate::walrecord::NeonWalRecord;
 use crate::{repository::*, try_no_ondemand_download};
 use anyhow::Context;
@@ -503,12 +503,11 @@ impl Timeline {
             result.add_key(relmap_file_key(spcnode, dbnode));
             result.add_key(rel_dir_to_key(spcnode, dbnode));
 
-            let mut rels: Vec<RelTag> = self
-                .list_rels(spcnode, dbnode, lsn)
-                .no_ondemand_download()?
-                .iter()
-                .cloned()
-                .collect();
+            let mut rels: Vec<RelTag> =
+                with_ondemand_download(|| self.list_rels(spcnode, dbnode, lsn))
+                    .await?
+                    .into_iter()
+                    .collect();
             rels.sort_unstable();
             for rel in rels {
                 let relsize_key = rel_size_to_key(rel);
diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs
index 26a77c02d4..031b80a6e0 100644
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -33,10 +33,10 @@ use tracing::*;
 use crate::pgdatadir_mapping::*;
 use crate::tenant::PageReconstructResult;
 use crate::tenant::Timeline;
-use crate::try_no_ondemand_download;
 use crate::try_page_reconstruct_result as try_prr;
 use crate::walrecord::*;
 use crate::ZERO_PAGE;
+use crate::{try_no_ondemand_download, try_page_reconstruct_result};
 use pageserver_api::reltag::{RelTag, SlruKind};
 use postgres_ffi::pg_constants;
 use postgres_ffi::relfile_utils::{FSM_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM};
@@ -55,14 +55,16 @@ pub struct WalIngest<'a> {
 }
 
 impl<'a> WalIngest<'a> {
-    pub fn new(timeline: &Timeline, startpoint: Lsn) -> anyhow::Result<WalIngest> {
+    pub fn new(timeline: &Timeline, startpoint: Lsn) -> PageReconstructResult<WalIngest> {
         // Fetch the latest checkpoint into memory, so that we can compare with it
         // quickly in `ingest_record` and update it when it changes.
-        let checkpoint_bytes = timeline.get_checkpoint(startpoint).no_ondemand_download()?;
-        let checkpoint = CheckPoint::decode(&checkpoint_bytes)?;
+        let checkpoint_bytes = try_no_ondemand_download!(timeline.get_checkpoint(startpoint));
+        let checkpoint = try_page_reconstruct_result!(
+            CheckPoint::decode(&checkpoint_bytes).context("Failed to decode checkpoint bytes")
+        );
         trace!("CheckPoint.nextXid = {}", checkpoint.nextXid.value);
 
-        Ok(WalIngest {
+        PageReconstructResult::Success(WalIngest {
             timeline,
             checkpoint,
             checkpoint_modified: false,
@@ -1122,7 +1124,7 @@ mod tests {
         m.put_checkpoint(ZERO_CHECKPOINT.clone())?;
         m.put_relmap_file(0, 111, Bytes::from(""))?; // dummy relmapper file
         m.commit()?;
-        let walingest = WalIngest::new(tline, Lsn(0x10))?;
+        let walingest = WalIngest::new(tline, Lsn(0x10)).no_ondemand_download()?;
 
         Ok(walingest)
     }
diff --git a/pageserver/src/walreceiver/walreceiver_connection.rs b/pageserver/src/walreceiver/walreceiver_connection.rs
index cc318cccc8..a98126e683 100644
--- a/pageserver/src/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/walreceiver/walreceiver_connection.rs
@@ -175,7 +175,8 @@ pub async fn handle_walreceiver_connection(
 
     let mut waldecoder = WalStreamDecoder::new(startpoint, timeline.pg_version);
 
-    let mut walingest = WalIngest::new(timeline.as_ref(), startpoint)?;
+    let mut walingest =
+        with_ondemand_download(|| WalIngest::new(timeline.as_ref(), startpoint)).await?;
 
     while let Some(replication_message) = {
         select! {

From b77c33ee0644f00ea3fc0af252f11f7454f5c3fb Mon Sep 17 00:00:00 2001
From: Kirill Bulatov <kirill@neon.tech>
Date: Fri, 23 Dec 2022 15:40:37 +0200
Subject: [PATCH 069/132] Move tenant-related modules below `tenant` module
 (#3190)

No real code changes besides moving code around and adjusting the
imports.
---
 pageserver/benches/bench_layer_map.rs         |  3 +-
 pageserver/src/bin/pageserver.rs              | 35 +++++++++++++++-
 pageserver/src/lib.rs                         |  2 -
 pageserver/src/tenant.rs                      | 25 ++++-------
 pageserver/src/tenant/layer_map.rs            |  3 +-
 pageserver/src/tenant/storage_layer.rs        | 15 +++++--
 .../tenant/{ => storage_layer}/delta_layer.rs |  4 +-
 .../tenant/{ => storage_layer}/filename.rs    |  0
 .../tenant/{ => storage_layer}/image_layer.rs |  5 +--
 .../{ => storage_layer}/inmemory_layer.rs     |  3 +-
 .../{ => storage_layer}/remote_layer.rs       | 10 ++---
 .../storage_sync.rs}                          | 42 +++----------------
 .../storage_sync}/delete.rs                   |  0
 .../storage_sync}/download.rs                 |  5 +--
 .../storage_sync}/index.rs                    |  2 +-
 .../storage_sync}/upload.rs                   |  6 +--
 pageserver/src/tenant/timeline.rs             | 25 ++++++-----
 test_runner/regress/test_tenant_detach.py     |  2 +-
 18 files changed, 87 insertions(+), 100 deletions(-)
 rename pageserver/src/tenant/{ => storage_layer}/delta_layer.rs (99%)
 rename pageserver/src/tenant/{ => storage_layer}/filename.rs (100%)
 rename pageserver/src/tenant/{ => storage_layer}/image_layer.rs (99%)
 rename pageserver/src/tenant/{ => storage_layer}/inmemory_layer.rs (99%)
 rename pageserver/src/tenant/{ => storage_layer}/remote_layer.rs (94%)
 rename pageserver/src/{storage_sync2.rs => tenant/storage_sync.rs} (97%)
 rename pageserver/src/{storage_sync2 => tenant/storage_sync}/delete.rs (100%)
 rename pageserver/src/{storage_sync2 => tenant/storage_sync}/download.rs (98%)
 rename pageserver/src/{storage_sync2 => tenant/storage_sync}/index.rs (99%)
 rename pageserver/src/{storage_sync2 => tenant/storage_sync}/upload.rs (96%)

diff --git a/pageserver/benches/bench_layer_map.rs b/pageserver/benches/bench_layer_map.rs
index a0c38e1e3a..6a01fdfc6f 100644
--- a/pageserver/benches/bench_layer_map.rs
+++ b/pageserver/benches/bench_layer_map.rs
@@ -1,8 +1,7 @@
 use anyhow::Result;
 use pageserver::repository::Key;
-use pageserver::tenant::filename::{DeltaFileName, ImageFileName};
 use pageserver::tenant::layer_map::LayerMap;
-use pageserver::tenant::storage_layer::ValueReconstructState;
+use pageserver::tenant::storage_layer::{DeltaFileName, ImageFileName, ValueReconstructState};
 use pageserver::tenant::storage_layer::{Layer, ValueReconstructResult};
 use rand::prelude::{SeedableRng, SliceRandom, StdRng};
 use std::cmp::{max, min};
diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs
index e72a861be0..d12063f5aa 100644
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -7,12 +7,13 @@ use std::{env, ops::ControlFlow, path::Path, str::FromStr};
 use anyhow::{anyhow, Context};
 use clap::{Arg, ArgAction, Command};
 use fail::FailScenario;
+use remote_storage::GenericRemoteStorage;
 use tracing::*;
 
 use metrics::set_build_info_metric;
 use pageserver::{
     config::{defaults::*, PageServerConf},
-    http, page_cache, page_service, profiling, storage_sync2, task_mgr,
+    http, page_cache, page_service, profiling, task_mgr,
     task_mgr::TaskKind,
     task_mgr::{
         BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME, WALRECEIVER_RUNTIME,
@@ -280,7 +281,7 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
     };
 
     // Set up remote storage client
-    let remote_storage = storage_sync2::create_remote_storage_client(conf)?;
+    let remote_storage = create_remote_storage_client(conf)?;
 
     // Scan the local 'tenants/' directory and start loading the tenants
     BACKGROUND_RUNTIME.block_on(tenant_mgr::init_tenant_mgr(conf, remote_storage.clone()))?;
@@ -369,6 +370,36 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
     })
 }
 
+fn create_remote_storage_client(
+    conf: &'static PageServerConf,
+) -> anyhow::Result<Option<GenericRemoteStorage>> {
+    let config = if let Some(config) = &conf.remote_storage_config {
+        config
+    } else {
+        // No remote storage configured.
+        return Ok(None);
+    };
+
+    // Create the client
+    let mut remote_storage = GenericRemoteStorage::from_config(config)?;
+
+    // If `test_remote_failures` is non-zero, wrap the client with a
+    // wrapper that simulates failures.
+    if conf.test_remote_failures > 0 {
+        if !cfg!(feature = "testing") {
+            anyhow::bail!("test_remote_failures option is not available because pageserver was compiled without the 'testing' feature");
+        }
+        info!(
+            "Simulating remote failures for first {} attempts of each op",
+            conf.test_remote_failures
+        );
+        remote_storage =
+            GenericRemoteStorage::unreliable_wrapper(remote_storage, conf.test_remote_failures);
+    }
+
+    Ok(Some(remote_storage))
+}
+
 fn cli() -> Command {
     Command::new("Neon page server")
         .about("Materializes WAL stream to pages and serves them to the postgres")
diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs
index e01eb12b7b..ae815fe421 100644
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -11,8 +11,6 @@ pub mod page_service;
 pub mod pgdatadir_mapping;
 pub mod profiling;
 pub mod repository;
-pub mod storage_sync2;
-pub use storage_sync2 as storage_sync;
 pub mod task_mgr;
 pub mod tenant;
 pub mod tenant_config;
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 4129c205ad..308130c799 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -45,18 +45,19 @@ use std::sync::{Mutex, RwLock};
 use std::time::{Duration, Instant};
 
 use self::metadata::TimelineMetadata;
+use self::storage_sync::create_remote_timeline_client;
+use self::storage_sync::index::IndexPart;
+use self::storage_sync::RemoteTimelineClient;
 use crate::config::PageServerConf;
 use crate::import_datadir;
 use crate::is_uninit_mark;
 use crate::metrics::{remove_tenant_metrics, STORAGE_TIME};
 use crate::repository::GcResult;
-use crate::storage_sync::create_remote_timeline_client;
-use crate::storage_sync::index::IndexPart;
-use crate::storage_sync::list_remote_timelines;
-use crate::storage_sync::RemoteTimelineClient;
 use crate::task_mgr;
 use crate::task_mgr::TaskKind;
 use crate::tenant::metadata::load_metadata;
+use crate::tenant::storage_layer::DeltaLayer;
+use crate::tenant::storage_layer::ImageLayer;
 use crate::tenant::storage_layer::Layer;
 use crate::tenant_config::TenantConfOpt;
 use crate::virtual_file::VirtualFile;
@@ -74,18 +75,14 @@ use utils::{
 
 mod blob_io;
 pub mod block_io;
-mod delta_layer;
 mod disk_btree;
 pub(crate) mod ephemeral_file;
-pub mod filename;
-mod image_layer;
-mod inmemory_layer;
 pub mod layer_map;
-mod remote_layer;
 
 pub mod metadata;
 mod par_fsync;
 pub mod storage_layer;
+mod storage_sync;
 
 mod timeline;
 
@@ -647,7 +644,7 @@ impl Tenant {
             .ok_or_else(|| anyhow::anyhow!("cannot attach without remote storage"))?;
 
         let remote_timelines =
-            list_remote_timelines(remote_storage, self.conf, self.tenant_id).await?;
+            storage_sync::list_remote_timelines(remote_storage, self.conf, self.tenant_id).await?;
 
         info!("found {} timelines", remote_timelines.len());
 
@@ -2541,12 +2538,8 @@ pub fn dump_layerfile_from_path(path: &Path, verbose: bool) -> anyhow::Result<()
     file.read_exact_at(&mut header_buf, 0)?;
 
     match u16::from_be_bytes(header_buf) {
-        crate::IMAGE_FILE_MAGIC => {
-            image_layer::ImageLayer::new_for_path(path, file)?.dump(verbose)?
-        }
-        crate::DELTA_FILE_MAGIC => {
-            delta_layer::DeltaLayer::new_for_path(path, file)?.dump(verbose)?
-        }
+        crate::IMAGE_FILE_MAGIC => ImageLayer::new_for_path(path, file)?.dump(verbose)?,
+        crate::DELTA_FILE_MAGIC => DeltaLayer::new_for_path(path, file)?.dump(verbose)?,
         magic => bail!("unrecognized magic identifier: {:?}", magic),
     }
 
diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs
index 0202ccfa6a..f5182926e4 100644
--- a/pageserver/src/tenant/layer_map.rs
+++ b/pageserver/src/tenant/layer_map.rs
@@ -12,7 +12,6 @@
 
 use crate::metrics::NUM_ONDISK_LAYERS;
 use crate::repository::Key;
-use crate::tenant::inmemory_layer::InMemoryLayer;
 use crate::tenant::storage_layer::{range_eq, range_overlaps};
 use amplify_num::i256;
 use anyhow::Result;
@@ -27,7 +26,7 @@ use std::sync::Arc;
 use tracing::*;
 use utils::lsn::Lsn;
 
-use super::storage_layer::Layer;
+use super::storage_layer::{InMemoryLayer, Layer};
 
 ///
 /// LayerMap tracks what layers exist on a timeline.
diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs
index 8bfac5df8e..d87a248bdf 100644
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -1,6 +1,10 @@
-//!
 //! Common traits and structs for layers
-//!
+
+mod delta_layer;
+mod filename;
+mod image_layer;
+mod inmemory_layer;
+mod remote_layer;
 
 use crate::repository::{Key, Value};
 use crate::walrecord::NeonWalRecord;
@@ -15,8 +19,11 @@ use utils::{
     lsn::Lsn,
 };
 
-use super::filename::LayerFileName;
-use super::remote_layer::RemoteLayer;
+pub use delta_layer::{DeltaLayer, DeltaLayerWriter};
+pub use filename::{DeltaFileName, ImageFileName, LayerFileName, PathOrConf};
+pub use image_layer::{ImageLayer, ImageLayerWriter};
+pub use inmemory_layer::InMemoryLayer;
+pub use remote_layer::RemoteLayer;
 
 pub fn range_overlaps<T>(a: &Range<T>, b: &Range<T>) -> bool
 where
diff --git a/pageserver/src/tenant/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs
similarity index 99%
rename from pageserver/src/tenant/delta_layer.rs
rename to pageserver/src/tenant/storage_layer/delta_layer.rs
index 5b724b6263..302ba2dc78 100644
--- a/pageserver/src/tenant/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -29,7 +29,6 @@ use crate::repository::{Key, Value, KEY_SIZE};
 use crate::tenant::blob_io::{BlobCursor, BlobWriter, WriteBlobWriter};
 use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockReader, FileBlockReader};
 use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
-use crate::tenant::filename::{DeltaFileName, PathOrConf};
 use crate::tenant::storage_layer::{
     PersistentLayer, ValueReconstructResult, ValueReconstructState,
 };
@@ -54,8 +53,7 @@ use utils::{
     lsn::Lsn,
 };
 
-use super::filename::LayerFileName;
-use super::storage_layer::{Layer, LayerIter, LayerKeyIter};
+use super::{DeltaFileName, Layer, LayerFileName, LayerIter, LayerKeyIter, PathOrConf};
 
 ///
 /// Header stored in the beginning of the file
diff --git a/pageserver/src/tenant/filename.rs b/pageserver/src/tenant/storage_layer/filename.rs
similarity index 100%
rename from pageserver/src/tenant/filename.rs
rename to pageserver/src/tenant/storage_layer/filename.rs
diff --git a/pageserver/src/tenant/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs
similarity index 99%
rename from pageserver/src/tenant/image_layer.rs
rename to pageserver/src/tenant/storage_layer/image_layer.rs
index 4b43328f35..9a26fce73b 100644
--- a/pageserver/src/tenant/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -25,7 +25,6 @@ use crate::repository::{Key, KEY_SIZE};
 use crate::tenant::blob_io::{BlobCursor, BlobWriter, WriteBlobWriter};
 use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader};
 use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
-use crate::tenant::filename::{ImageFileName, PathOrConf};
 use crate::tenant::storage_layer::{
     PersistentLayer, ValueReconstructResult, ValueReconstructState,
 };
@@ -51,8 +50,8 @@ use utils::{
     lsn::Lsn,
 };
 
-use super::filename::LayerFileName;
-use super::storage_layer::{Layer, LayerIter};
+use super::filename::{ImageFileName, LayerFileName, PathOrConf};
+use super::{Layer, LayerIter};
 
 ///
 /// Header stored in the beginning of the file
diff --git a/pageserver/src/tenant/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
similarity index 99%
rename from pageserver/src/tenant/inmemory_layer.rs
rename to pageserver/src/tenant/storage_layer/inmemory_layer.rs
index 35b0e98591..93356a9d8c 100644
--- a/pageserver/src/tenant/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -8,7 +8,6 @@ use crate::config::PageServerConf;
 use crate::repository::{Key, Value};
 use crate::tenant::blob_io::{BlobCursor, BlobWriter};
 use crate::tenant::block_io::BlockReader;
-use crate::tenant::delta_layer::{DeltaLayer, DeltaLayerWriter};
 use crate::tenant::ephemeral_file::EphemeralFile;
 use crate::tenant::storage_layer::{ValueReconstructResult, ValueReconstructState};
 use crate::walrecord;
@@ -28,7 +27,7 @@ use std::fmt::Write as _;
 use std::ops::Range;
 use std::sync::RwLock;
 
-use super::storage_layer::Layer;
+use super::{DeltaLayer, DeltaLayerWriter, Layer};
 
 thread_local! {
     /// A buffer for serializing object during [`InMemoryLayer::put_value`].
diff --git a/pageserver/src/tenant/remote_layer.rs b/pageserver/src/tenant/storage_layer/remote_layer.rs
similarity index 94%
rename from pageserver/src/tenant/remote_layer.rs
rename to pageserver/src/tenant/storage_layer/remote_layer.rs
index affe8ca0a8..c2c11d7bff 100644
--- a/pageserver/src/tenant/remote_layer.rs
+++ b/pageserver/src/tenant/storage_layer/remote_layer.rs
@@ -3,11 +3,8 @@
 //!
 use crate::config::PageServerConf;
 use crate::repository::Key;
-use crate::storage_sync::index::LayerFileMetadata;
-use crate::tenant::delta_layer::DeltaLayer;
-use crate::tenant::filename::{DeltaFileName, ImageFileName};
-use crate::tenant::image_layer::ImageLayer;
 use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
+use crate::tenant::storage_sync::index::LayerFileMetadata;
 use anyhow::{bail, Result};
 use std::ops::Range;
 use std::path::PathBuf;
@@ -18,8 +15,9 @@ use utils::{
     lsn::Lsn,
 };
 
-use super::filename::LayerFileName;
-use super::storage_layer::{LayerIter, LayerKeyIter, PersistentLayer};
+use super::filename::{DeltaFileName, ImageFileName, LayerFileName};
+use super::image_layer::ImageLayer;
+use super::{DeltaLayer, LayerIter, LayerKeyIter, PersistentLayer};
 
 #[derive(Debug)]
 pub struct RemoteLayer {
diff --git a/pageserver/src/storage_sync2.rs b/pageserver/src/tenant/storage_sync.rs
similarity index 97%
rename from pageserver/src/storage_sync2.rs
rename to pageserver/src/tenant/storage_sync.rs
index 6883c11473..ef57f91a02 100644
--- a/pageserver/src/storage_sync2.rs
+++ b/pageserver/src/tenant/storage_sync.rs
@@ -221,15 +221,12 @@ use tracing::{info_span, Instrument};
 
 use utils::lsn::Lsn;
 
-use self::index::IndexPart;
-
 use crate::metrics::RemoteOpFileKind;
 use crate::metrics::RemoteOpKind;
 use crate::metrics::{MeasureRemoteOp, RemoteTimelineClientMetrics};
-use crate::tenant::filename::LayerFileName;
+use crate::tenant::storage_sync::index::LayerFileMetadata;
 use crate::{
     config::PageServerConf,
-    storage_sync::index::LayerFileMetadata,
     task_mgr,
     task_mgr::TaskKind,
     task_mgr::BACKGROUND_RUNTIME,
@@ -239,6 +236,10 @@ use crate::{
 
 use utils::id::{TenantId, TimelineId};
 
+use self::index::IndexPart;
+
+use super::storage_layer::LayerFileName;
+
 // Occasional network issues and such can cause remote operations to fail, and
 // that's expected. If a download fails, we log it at info-level, and retry.
 // But after FAILED_DOWNLOAD_WARN_THRESHOLD retries, we start to log it at WARN
@@ -1178,39 +1179,6 @@ pub fn create_remote_timeline_client(
     })
 }
 
-///
-/// Create GenericRemoteStorage client from the pageserver config
-///
-pub fn create_remote_storage_client(
-    conf: &'static PageServerConf,
-) -> anyhow::Result<Option<GenericRemoteStorage>> {
-    let config = if let Some(config) = &conf.remote_storage_config {
-        config
-    } else {
-        // No remote storage configured.
-        return Ok(None);
-    };
-
-    // Create the client
-    let mut remote_storage = GenericRemoteStorage::from_config(config)?;
-
-    // If `test_remote_failures` is non-zero, wrap the client with a
-    // wrapper that simulates failures.
-    if conf.test_remote_failures > 0 {
-        if !cfg!(feature = "testing") {
-            anyhow::bail!("test_remote_failures option is not available because pageserver was compiled without the 'testing' feature");
-        }
-        info!(
-            "Simulating remote failures for first {} attempts of each op",
-            conf.test_remote_failures
-        );
-        remote_storage =
-            GenericRemoteStorage::unreliable_wrapper(remote_storage, conf.test_remote_failures);
-    }
-
-    Ok(Some(remote_storage))
-}
-
 #[cfg(test)]
 mod tests {
     use super::*;
diff --git a/pageserver/src/storage_sync2/delete.rs b/pageserver/src/tenant/storage_sync/delete.rs
similarity index 100%
rename from pageserver/src/storage_sync2/delete.rs
rename to pageserver/src/tenant/storage_sync/delete.rs
diff --git a/pageserver/src/storage_sync2/download.rs b/pageserver/src/tenant/storage_sync/download.rs
similarity index 98%
rename from pageserver/src/storage_sync2/download.rs
rename to pageserver/src/tenant/storage_sync/download.rs
index 4256767020..422728d1f3 100644
--- a/pageserver/src/storage_sync2/download.rs
+++ b/pageserver/src/tenant/storage_sync/download.rs
@@ -14,14 +14,13 @@ use tokio::io::AsyncWriteExt;
 use tracing::{debug, error, info, info_span, warn, Instrument};
 
 use crate::config::PageServerConf;
-use crate::storage_sync::index::LayerFileMetadata;
-use crate::tenant::filename::LayerFileName;
+use crate::tenant::storage_layer::LayerFileName;
 use crate::{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS};
 use remote_storage::{DownloadError, GenericRemoteStorage};
 use utils::crashsafe::path_with_suffix_extension;
 use utils::id::{TenantId, TimelineId};
 
-use super::index::{IndexPart, IndexPartUnclean};
+use super::index::{IndexPart, IndexPartUnclean, LayerFileMetadata};
 use super::{FAILED_DOWNLOAD_RETRIES, FAILED_DOWNLOAD_WARN_THRESHOLD};
 
 async fn fsync_path(path: impl AsRef<std::path::Path>) -> Result<(), std::io::Error> {
diff --git a/pageserver/src/storage_sync2/index.rs b/pageserver/src/tenant/storage_sync/index.rs
similarity index 99%
rename from pageserver/src/storage_sync2/index.rs
rename to pageserver/src/tenant/storage_sync/index.rs
index bb58a34969..017be29726 100644
--- a/pageserver/src/storage_sync2/index.rs
+++ b/pageserver/src/tenant/storage_sync/index.rs
@@ -8,7 +8,7 @@ use serde::{Deserialize, Serialize};
 use serde_with::{serde_as, DisplayFromStr};
 use tracing::warn;
 
-use crate::tenant::{filename::LayerFileName, metadata::TimelineMetadata};
+use crate::tenant::{metadata::TimelineMetadata, storage_layer::LayerFileName};
 
 use utils::lsn::Lsn;
 
diff --git a/pageserver/src/storage_sync2/upload.rs b/pageserver/src/tenant/storage_sync/upload.rs
similarity index 96%
rename from pageserver/src/storage_sync2/upload.rs
rename to pageserver/src/tenant/storage_sync/upload.rs
index 57a524a22d..08cea6268b 100644
--- a/pageserver/src/storage_sync2/upload.rs
+++ b/pageserver/src/tenant/storage_sync/upload.rs
@@ -5,12 +5,12 @@ use fail::fail_point;
 use std::path::Path;
 use tokio::fs;
 
-use super::index::IndexPart;
-use crate::config::PageServerConf;
-use crate::storage_sync::LayerFileMetadata;
+use crate::{config::PageServerConf, tenant::storage_sync::index::IndexPart};
 use remote_storage::GenericRemoteStorage;
 use utils::id::{TenantId, TimelineId};
 
+use super::index::LayerFileMetadata;
+
 /// Serializes and uploads the given index part data to the remote storage.
 pub(super) async fn upload_index_part<'a>(
     conf: &'static PageServerConf,
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 25a9e1ec51..55ede57e53 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -23,15 +23,13 @@ use std::sync::atomic::{AtomicI64, Ordering as AtomicOrdering};
 use std::sync::{Arc, Mutex, MutexGuard, RwLock, Weak};
 use std::time::{Duration, Instant, SystemTime};
 
-use crate::storage_sync::index::IndexPart;
-use crate::storage_sync::RemoteTimelineClient;
-use crate::tenant::remote_layer::RemoteLayer;
+use crate::tenant::storage_layer::{
+    DeltaFileName, DeltaLayerWriter, ImageFileName, ImageLayerWriter, InMemoryLayer, LayerFileName,
+    RemoteLayer,
+};
+use crate::tenant::storage_sync::{self, index::LayerFileMetadata};
 use crate::tenant::{
-    delta_layer::{DeltaLayer, DeltaLayerWriter},
     ephemeral_file::is_ephemeral_file,
-    filename::{DeltaFileName, ImageFileName},
-    image_layer::{ImageLayer, ImageLayerWriter},
-    inmemory_layer::InMemoryLayer,
     layer_map::{LayerMap, SearchResult},
     metadata::{save_metadata, TimelineMetadata},
     par_fsync,
@@ -56,6 +54,7 @@ use utils::{
     simple_rcu::{Rcu, RcuReadGuard},
 };
 
+use crate::page_cache;
 use crate::repository::GcResult;
 use crate::repository::{Key, Value};
 use crate::task_mgr::TaskKind;
@@ -64,10 +63,10 @@ use crate::walredo::WalRedoManager;
 use crate::METADATA_FILE_NAME;
 use crate::ZERO_PAGE;
 use crate::{is_temporary, task_mgr};
-use crate::{page_cache, storage_sync::index::LayerFileMetadata};
 
-use super::filename::LayerFileName;
-use super::storage_layer::Layer;
+use super::storage_layer::{DeltaLayer, ImageLayer, Layer};
+use super::storage_sync::index::IndexPart;
+use super::storage_sync::RemoteTimelineClient;
 
 #[derive(Debug, PartialEq, Eq, Clone, Copy)]
 enum FlushLoopState {
@@ -97,7 +96,7 @@ pub struct Timeline {
     walredo_mgr: Arc<dyn WalRedoManager + Sync + Send>,
 
     /// Remote storage client.
-    /// See [`storage_sync2`] module comment for details.
+    /// See [`storage_sync`] module comment for details.
     pub remote_client: Option<Arc<RemoteTimelineClient>>,
 
     // What page versions do we hold in the repository? If we get a
@@ -1123,7 +1122,7 @@ impl Timeline {
                 num_layers += 1;
             } else if fname == METADATA_FILE_NAME || fname.ends_with(".old") {
                 // ignore these
-            } else if crate::storage_sync::is_temp_download_file(&direntry_path) {
+            } else if storage_sync::is_temp_download_file(&direntry_path) {
                 info!(
                     "skipping temp download file, reconcile_with_remote will resume / clean up: {}",
                     fname
@@ -1293,7 +1292,7 @@ impl Timeline {
     /// 3. Schedule upload of local-only layer files (which will then also update the remote
     ///    IndexPart to include the new layer files).
     ///
-    /// Refer to the `storage_sync2` module comment for more context.
+    /// Refer to the `storage_sync` module comment for more context.
     ///
     /// # TODO
     /// May be a bit cleaner to do things based on populated remote client,
diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py
index 8bf0fb7548..6963a57542 100644
--- a/test_runner/regress/test_tenant_detach.py
+++ b/test_runner/regress/test_tenant_detach.py
@@ -470,7 +470,7 @@ def test_ignore_while_attaching(
     pageserver_http.tenant_attach(tenant_id)
     # Run ignore on the task, thereby cancelling the attach.
     # XXX This should take priority over attach, i.e., it should cancel the attach task.
-    # But neither the failpoint, nor the proper storage_sync2 download functions,
+    # But neither the failpoint, nor the proper storage_sync download functions,
     # are sensitive to task_mgr::shutdown.
     # This problem is tracked in https://github.com/neondatabase/neon/issues/2996 .
     # So, for now, effectively, this ignore here will block until attach task completes.

From 1468c65ffb70dcc072cb341bf6aa6f800bce6840 Mon Sep 17 00:00:00 2001
From: Anastasia Lubennikova <anastasia@neon.tech>
Date: Thu, 22 Dec 2022 16:32:29 +0200
Subject: [PATCH 070/132] Enable billing metric_collection_endpoint on staging

---
 .github/ansible/staging.eu-west-1.hosts.yaml | 2 ++
 .github/ansible/staging.us-east-2.hosts.yaml | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/.github/ansible/staging.eu-west-1.hosts.yaml b/.github/ansible/staging.eu-west-1.hosts.yaml
index cfcc3a9ae8..fce450ed39 100644
--- a/.github/ansible/staging.eu-west-1.hosts.yaml
+++ b/.github/ansible/staging.eu-west-1.hosts.yaml
@@ -6,6 +6,8 @@ storage:
     broker_endpoint: http://storage-broker-lb.zeta.eu-west-1.internal.aws.neon.build:50051
     pageserver_config_stub:
       pg_distrib_dir: /usr/local
+      metric_collection_endpoint: http://console-staging.local/billing/api/v1/usage_events
+      metric_collection_interval: 10min
       remote_storage:
         bucket_name: "{{ bucket_name }}"
         bucket_region: "{{ bucket_region }}"
diff --git a/.github/ansible/staging.us-east-2.hosts.yaml b/.github/ansible/staging.us-east-2.hosts.yaml
index 78a4582e57..11c7992444 100644
--- a/.github/ansible/staging.us-east-2.hosts.yaml
+++ b/.github/ansible/staging.us-east-2.hosts.yaml
@@ -6,6 +6,8 @@ storage:
     broker_endpoint: http://storage-broker-lb.beta.us-east-2.internal.aws.neon.build:50051
     pageserver_config_stub:
       pg_distrib_dir: /usr/local
+      metric_collection_endpoint: http://console-staging.local/billing/api/v1/usage_events
+      metric_collection_interval: 10min
       remote_storage:
         bucket_name: "{{ bucket_name }}"
         bucket_region: "{{ bucket_region }}"

From 1137b58b4d476370c36af93f5da46f9ae2562303 Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik <knizhnik@garret.ru>
Date: Mon, 26 Dec 2022 18:21:41 +0200
Subject: [PATCH 071/132] Fix LayerMap::search to not return delta layer
 preceding image layer (#3197)

While @bojanserafimov is still working on the best replacement for the R-Tree
in layer_map.rs, there is an obvious pitfall in the current `search` method
implementation: it returns a delta layer even if there is an image layer with
a greater LSN. I think that it should be fixed.
---
 pageserver/src/tenant/layer_map.rs | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs
index f5182926e4..4ff2d4b0d8 100644
--- a/pageserver/src/tenant/layer_map.rs
+++ b/pageserver/src/tenant/layer_map.rs
@@ -326,14 +326,16 @@ where
                 latest_delta.replace(Arc::clone(l));
                 break;
             }
-            // this layer's end LSN is smaller than the requested point. If there's
-            // nothing newer, this is what we need to return. Remember this.
-            if let Some(old_candidate) = &latest_delta {
-                if l.get_lsn_range().end > old_candidate.get_lsn_range().end {
+            if l.get_lsn_range().end > latest_img_lsn.unwrap_or(Lsn(0)) {
+                // this layer's end LSN is smaller than the requested point. If there's
+                // nothing newer, this is what we need to return. Remember this.
+                if let Some(old_candidate) = &latest_delta {
+                    if l.get_lsn_range().end > old_candidate.get_lsn_range().end {
+                        latest_delta.replace(Arc::clone(l));
+                    }
+                } else {
                     latest_delta.replace(Arc::clone(l));
                 }
-            } else {
-                latest_delta.replace(Arc::clone(l));
             }
         }
         if let Some(l) = latest_delta {

From 5826e19b56fc86338204c35bd0916143bf425a47 Mon Sep 17 00:00:00 2001
From: Anna Stepanyan <anna.stepanyan@neon.tech>
Date: Tue, 27 Dec 2022 10:25:19 +0100
Subject: [PATCH 072/132] update the grafana links in the PR release template
 (#3156)

---
 .github/PULL_REQUEST_TEMPLATE/release-pr.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/PULL_REQUEST_TEMPLATE/release-pr.md b/.github/PULL_REQUEST_TEMPLATE/release-pr.md
index 8fcc3bd4af..a848077e6a 100644
--- a/.github/PULL_REQUEST_TEMPLATE/release-pr.md
+++ b/.github/PULL_REQUEST_TEMPLATE/release-pr.md
@@ -14,7 +14,7 @@
 - [ ] Check [#dev-production-stream](https://neondb.slack.com/archives/C03F5SM1N02) Slack channel
 - [ ] Check [stuck projects page](https://console.neon.tech/admin/projects?sort=last_active&order=desc&stuck=true)
 - [ ] Check [recent operation failures](https://console.neon.tech/admin/operations?action=create_timeline%2Cstart_compute%2Cstop_compute%2Csuspend_compute%2Capply_config%2Cdelete_timeline%2Cdelete_tenant%2Ccreate_branch%2Ccheck_availability&sort=updated_at&order=desc&had_retries=some)
-- [ ] Check [cloud SLO dashboard](https://observer.zenith.tech/d/_oWcBMJ7k/cloud-slos?orgId=1)
-- [ ] Check [compute startup metrics dashboard](https://observer.zenith.tech/d/5OkYJEmVz/compute-startup-time)
+- [ ] Check [cloud SLO dashboard](https://neonprod.grafana.net/d/_oWcBMJ7k/cloud-slos?orgId=1)
+- [ ] Check [compute startup metrics dashboard](https://neonprod.grafana.net/d/5OkYJEmVz/compute-startup-time)
 
 <!-- List everything that should be done **after** release, any admin UI configuration / Grafana dashboard / alert changes / setting changes / etc -->

From 140c0edac8a8322efaf88fc15d11977b077869a5 Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik <knizhnik@garret.ru>
Date: Tue, 27 Dec 2022 14:42:51 +0200
Subject: [PATCH 073/132] Yet another port of local file system cache (#2622)

---
 pgxn/neon/Makefile           |   3 +-
 pgxn/neon/file_cache.c       | 597 +++++++++++++++++++++++++++++++++++
 pgxn/neon/libpagestore.c     |   1 +
 pgxn/neon/neon--1.0.sql      |  10 +
 pgxn/neon/pagestore_client.h |   7 +
 pgxn/neon/pagestore_smgr.c   |  14 +
 6 files changed, 631 insertions(+), 1 deletion(-)
 create mode 100644 pgxn/neon/file_cache.c

diff --git a/pgxn/neon/Makefile b/pgxn/neon/Makefile
index 7f4e30a12e..ec377dbb1e 100644
--- a/pgxn/neon/Makefile
+++ b/pgxn/neon/Makefile
@@ -4,11 +4,12 @@
 MODULE_big = neon
 OBJS = \
 	$(WIN32RES) \
+	file_cache.o \
 	libpagestore.o \
 	libpqwalproposer.o \
+	neon.o \
 	pagestore_smgr.o \
 	relsize_cache.o \
-	neon.o \
 	walproposer.o \
 	walproposer_utils.o
 
diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c
new file mode 100644
index 0000000000..96c2461e2d
--- /dev/null
+++ b/pgxn/neon/file_cache.c
@@ -0,0 +1,597 @@
+/*
+ *
+ * file_cache.c
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *	  pgxn/neon/file_cache.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include <sys/file.h>
+#include <unistd.h>
+#include <fcntl.h>
+
+#include "postgres.h"
+#include "funcapi.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "pagestore_client.h"
+#include "access/parallel.h"
+#include "postmaster/bgworker.h"
+#include "storage/relfilenode.h"
+#include "storage/buf_internals.h"
+#include "storage/latch.h"
+#include "storage/ipc.h"
+#include "storage/lwlock.h"
+#include "utils/dynahash.h"
+#include "utils/guc.h"
+#include "storage/fd.h"
+#include "storage/pg_shmem.h"
+#include "storage/buf_internals.h"
+
+/*
+ * Local file cache is used to temporarily store relation pages in the local file system.
+ * All blocks of all relations are stored inside one file and addressed using a shared hash map.
+ * Currently an LRU eviction policy based on a doubly-linked (L2) list is used as the replacement algorithm.
+ * Because manipulating the doubly-linked list requires a global critical section, we do not use a partitioned hash.
+ * We also take an exclusive lock even for read operations, because LRU requires relinking the element in the list.
+ * If this lock becomes a bottleneck, we can consider other eviction strategies, for example the clock algorithm.
+ *
+ * The cache is always reconstructed at node startup, so we do not need to persist the mapping or worry about
+ * its consistency.
+ */
+
+/* Local file storage allocation chunk.
+ * Should be a power of two and not less than 32. Using chunks larger than a page can
+ * 1. Reduce hash-map memory footprint: an 8TB database contains a billion pages
+ *    and the size of a hash entry is 40 bytes, so we need 40GB just for the hash map.
+ *    1MB chunks can reduce the hash map size to 320MB.
+ * 2. Improve access locality: subsequent pages are allocated together, improving seqscan speed
+ */
+#define BLOCKS_PER_CHUNK	128 /* 1Mb chunk */
+#define MB					((uint64)1024*1024)
+
+#define SIZE_MB_TO_CHUNKS(size) ((uint32)((size) * MB / BLCKSZ / BLOCKS_PER_CHUNK))
+
+typedef struct FileCacheEntry
+{
+	BufferTag	key;
+	uint32		offset;
+	uint32		access_count;
+	uint32		bitmap[BLOCKS_PER_CHUNK/32];
+	dlist_node	lru_node; /* LRU list node */
+} FileCacheEntry;
+
+typedef struct FileCacheControl
+{
+	uint32 size; /* size of cache file in chunks */
+	dlist_head lru; /* double linked list for LRU replacement algorithm */
+} FileCacheControl;
+
+static HTAB* lfc_hash;
+static int   lfc_desc;
+static LWLockId lfc_lock;
+static int   lfc_max_size;
+static int   lfc_size_limit;
+static char* lfc_path;
+static  FileCacheControl* lfc_ctl;
+static shmem_startup_hook_type prev_shmem_startup_hook;
+#if PG_VERSION_NUM>=150000
+static shmem_request_hook_type prev_shmem_request_hook;
+#endif
+
+static void
+lfc_shmem_startup(void)
+{
+	bool found;
+	static HASHCTL info;
+
+	if (prev_shmem_startup_hook)
+	{
+		prev_shmem_startup_hook();
+	}
+
+	LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
+
+	lfc_ctl = (FileCacheControl*)ShmemInitStruct("lfc", sizeof(FileCacheControl), &found);
+	if (!found)
+	{
+		uint32 lfc_size = SIZE_MB_TO_CHUNKS(lfc_max_size);
+		lfc_lock = (LWLockId)GetNamedLWLockTranche("lfc_lock");
+		info.keysize = sizeof(BufferTag);
+		info.entrysize = sizeof(FileCacheEntry);
+		lfc_hash = ShmemInitHash("lfc_hash",
+								 /* lfc_size+1 because we add new element to hash table before eviction of victim */
+								 lfc_size+1, lfc_size+1,
+								 &info,
+								 HASH_ELEM | HASH_BLOBS);
+		lfc_ctl->size = 0;
+		dlist_init(&lfc_ctl->lru);
+
+		/* Remove file cache on restart */
+		(void)unlink(lfc_path);
+	}
+	LWLockRelease(AddinShmemInitLock);
+}
+
+static void
+lfc_shmem_request(void)
+{
+#if PG_VERSION_NUM>=150000
+	if (prev_shmem_request_hook)
+		prev_shmem_request_hook();
+#endif
+
+	RequestAddinShmemSpace(sizeof(FileCacheControl) + hash_estimate_size(SIZE_MB_TO_CHUNKS(lfc_max_size)+1, sizeof(FileCacheEntry)));
+	RequestNamedLWLockTranche("lfc_lock", 1);
+}
+
+bool
+lfc_check_limit_hook(int *newval, void **extra, GucSource source)
+{
+	if (*newval > lfc_max_size)
+	{
+		elog(ERROR, "neon.file_cache_size_limit can not be larger than neon.max_file_cache_size");
+		return false;
+	}
+	return true;
+}
+
+void
+lfc_change_limit_hook(int newval, void *extra)
+{
+	uint32 new_size = SIZE_MB_TO_CHUNKS(newval);
+	/*
+	 * Stats collector detach shared memory, so we should not try to access shared memory here.
+	 * Parallel workers first assign default value (0), so not perform truncation in parallel workers.
+	 */
+	if (!lfc_ctl || !UsedShmemSegAddr || IsParallelWorker())
+		return;
+
+	/* Open cache file if not done yet */
+	if (lfc_desc == 0)
+	{
+		lfc_desc = BasicOpenFile(lfc_path, O_RDWR|O_CREAT);
+		if (lfc_desc < 0) {
+			elog(LOG, "Failed to open file cache %s: %m", lfc_path);
+			lfc_size_limit = 0; /* disable file cache */
+			return;
+		}
+	}
+	LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
+	while (new_size < lfc_ctl->size && !dlist_is_empty(&lfc_ctl->lru))
+	{
+		/* Shrink cache by throwing away least recently accessed chunks and returning their space to file system */
+		FileCacheEntry* victim = dlist_container(FileCacheEntry, lru_node, dlist_pop_head_node(&lfc_ctl->lru));
+		Assert(victim->access_count == 0);
+#ifdef FALLOC_FL_PUNCH_HOLE
+		if (fallocate(lfc_desc, FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE, (off_t)victim->offset*BLOCKS_PER_CHUNK*BLCKSZ, BLOCKS_PER_CHUNK*BLCKSZ) < 0)
+			elog(LOG, "Failed to punch hole in file: %m");
+#endif
+		hash_search(lfc_hash, &victim->key, HASH_REMOVE, NULL);
+		lfc_ctl->size -= 1;
+	}
+	elog(LOG, "set local file cache limit to %d", new_size);
+	LWLockRelease(lfc_lock);
+}
+
+void
+lfc_init(void)
+{
+	/*
+	 * In order to create our shared memory area, we have to be loaded via
+	 * shared_preload_libraries.
+	 */
+	if (!process_shared_preload_libraries_in_progress)
+		elog(ERROR, "Neon module should be loaded via shared_preload_libraries");
+
+	DefineCustomIntVariable("neon.max_file_cache_size",
+							"Maximal size of Neon local file cache",
+							NULL,
+							&lfc_max_size,
+							0, /* disabled by default */
+							0,
+							INT_MAX,
+							PGC_POSTMASTER,
+							GUC_UNIT_MB,
+							NULL,
+							NULL,
+							NULL);
+
+	DefineCustomIntVariable("neon.file_cache_size_limit",
+							"Current limit for size of Neon local file cache",
+							NULL,
+							&lfc_size_limit,
+							0, /* disabled by default */
+							0,
+							INT_MAX,
+							PGC_SIGHUP,
+							GUC_UNIT_MB,
+							NULL,
+							lfc_change_limit_hook,
+							NULL);
+
+	DefineCustomStringVariable("neon.file_cache_path",
+							   "Path to local file cache (can be raw device)",
+							   NULL,
+							   &lfc_path,
+							   "file.cache",
+							   PGC_POSTMASTER,
+							   0,
+							   NULL,
+							   NULL,
+							   NULL);
+
+	if (lfc_max_size == 0)
+		return;
+
+	prev_shmem_startup_hook = shmem_startup_hook;
+	shmem_startup_hook = lfc_shmem_startup;
+#if PG_VERSION_NUM>=150000
+	prev_shmem_request_hook = shmem_request_hook;
+	shmem_request_hook = lfc_shmem_request;
+#else
+	lfc_shmem_request();
+#endif
+}
+
+/*
+ * Check if page is present in the cache.
+ * Returns true if page is found in local cache.
+ */
+bool
+lfc_cache_contains(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno)
+{
+	BufferTag tag;
+	FileCacheEntry* entry;
+	int chunk_offs = blkno & (BLOCKS_PER_CHUNK-1);
+	bool found;
+	uint32 hash;
+
+	if (lfc_size_limit == 0) /* fast exit if file cache is disabled */
+		return false;
+
+	tag.rnode = rnode;
+	tag.forkNum = forkNum;
+	tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK-1);
+	hash = get_hash_value(lfc_hash, &tag);
+
+	LWLockAcquire(lfc_lock, LW_SHARED);
+	entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL);
+	found = entry != NULL && (entry->bitmap[chunk_offs >> 5] & (1 << (chunk_offs & 31))) != 0;
+	LWLockRelease(lfc_lock);
+	return found;
+}
+
+/*
+ * Try to read page from local cache.
+ * Returns true if page is found in local cache.
+ * In case of error lfc_size_limit is set to zero to disable any further operations with cache.
+ */
+bool
+lfc_read(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
+		 char *buffer)
+{
+	BufferTag tag;
+	FileCacheEntry* entry;
+	ssize_t rc;
+	int chunk_offs = blkno & (BLOCKS_PER_CHUNK-1);
+	bool result = true;
+	uint32 hash;
+
+	if (lfc_size_limit == 0) /* fast exit if file cache is disabled */
+		return false;
+
+	tag.rnode = rnode;
+	tag.forkNum = forkNum;
+	tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK-1);
+	hash = get_hash_value(lfc_hash, &tag);
+
+	LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
+	entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL);
+	if (entry == NULL || (entry->bitmap[chunk_offs >> 5] & (1 << (chunk_offs & 31))) == 0)
+	{
+		/* Page is not cached */
+		LWLockRelease(lfc_lock);
+		return false;
+	}
+	/* Unlink entry from LRU list to pin it for the duration of IO operation */
+	if (entry->access_count++ == 0)
+		dlist_delete(&entry->lru_node);
+	LWLockRelease(lfc_lock);
+
+	/* Open cache file if not done yet */
+	if (lfc_desc == 0)
+	{
+		lfc_desc = BasicOpenFile(lfc_path, O_RDWR|O_CREAT);
+		if (lfc_desc < 0) {
+			elog(LOG, "Failed to open file cache %s: %m", lfc_path);
+			lfc_size_limit = 0; /* disable file cache */
+			result = false;
+		}
+	}
+
+	if (lfc_desc > 0)
+	{
+		rc = pread(lfc_desc, buffer, BLCKSZ, ((off_t)entry->offset*BLOCKS_PER_CHUNK + chunk_offs)*BLCKSZ);
+		if (rc != BLCKSZ)
+		{
+			elog(INFO, "Failed to read file cache: %m");
+			lfc_size_limit = 0; /* disable file cache */
+			result = false;
+		}
+	}
+
+	/* Place entry to the head of LRU list */
+	LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
+	Assert(entry->access_count > 0);
+	if (--entry->access_count == 0)
+		dlist_push_tail(&lfc_ctl->lru, &entry->lru_node);
+	LWLockRelease(lfc_lock);
+
+	return result;
+}
+
+/*
+ * Put page in local file cache.
+ * If cache is full then evict some other page.
+ */
+void
+lfc_write(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
+		  char *buffer)
+{
+	BufferTag tag;
+	FileCacheEntry* entry;
+	ssize_t rc;
+	bool found;
+	int chunk_offs = blkno & (BLOCKS_PER_CHUNK-1);
+	uint32 hash;
+
+	if (lfc_size_limit == 0) /* fast exit if file cache is disabled */
+		return;
+
+	tag.rnode = rnode;
+	tag.forkNum = forkNum;
+	tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK-1);
+	hash = get_hash_value(lfc_hash, &tag);
+
+	LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
+	entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_ENTER, &found);
+
+	if (found)
+	{
+		/* Unlink entry from LRU list to pin it for the duration of IO operation */
+		if (entry->access_count++ == 0)
+			dlist_delete(&entry->lru_node);
+	}
+	else
+	{
+		/*
+		 * We have two choices if all cache pages are pinned (i.e. used in IO operations):
+		 * 1. Wait until some of this operation is completed and pages is unpinned
+		 * 2. Allocate one more chunk, so that specified cache size is more recommendation than hard limit.
+		 * As far as probability of such event (that all pages are pinned) is considered to be very very small:
+		 * there are should be very large number of concurrent IO operations and them are limited by max_connections,
+		 * we prefer not to complicate code and use second approach.
+		 */
+		if (lfc_ctl->size >= SIZE_MB_TO_CHUNKS(lfc_size_limit) && !dlist_is_empty(&lfc_ctl->lru))
+		{
+			/* Cache overflow: evict least recently used chunk */
+			FileCacheEntry* victim = dlist_container(FileCacheEntry, lru_node, dlist_pop_head_node(&lfc_ctl->lru));
+			Assert(victim->access_count == 0);
+			entry->offset = victim->offset; /* grab victim's chunk */
+			hash_search(lfc_hash, &victim->key, HASH_REMOVE, NULL);
+			elog(LOG, "Swap file cache page");
+		}
+		else
+			entry->offset = lfc_ctl->size++; /* allocate new chunk at end of file */
+		entry->access_count = 1;
+		memset(entry->bitmap, 0, sizeof entry->bitmap);
+	}
+	LWLockRelease(lfc_lock);
+
+	/* Open cache file if not done yet */
+	if (lfc_desc == 0)
+	{
+		lfc_desc = BasicOpenFile(lfc_path, O_RDWR|O_CREAT);
+		if (lfc_desc < 0) {
+			elog(LOG, "Failed to open file cache %s: %m", lfc_path);
+			lfc_size_limit = 0; /* disable file cache */
+		}
+	}
+	if (lfc_desc > 0)
+	{
+		rc = pwrite(lfc_desc, buffer, BLCKSZ, ((off_t)entry->offset*BLOCKS_PER_CHUNK + chunk_offs)*BLCKSZ);
+		if (rc != BLCKSZ)
+		{
+			elog(INFO, "Failed to write file cache: %m");
+			lfc_size_limit = 0; /* disable file cache */
+		}
+	}
+	/* Place entry to the head of LRU list */
+	LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
+	Assert(entry->access_count > 0);
+	if (--entry->access_count == 0)
+		dlist_push_tail(&lfc_ctl->lru, &entry->lru_node);
+	if (lfc_size_limit != 0)
+		entry->bitmap[chunk_offs >> 5] |= (1 << (chunk_offs & 31));
+	LWLockRelease(lfc_lock);
+}
+
+
+/*
+ * Record structure holding the to be exposed cache data.
+ */
+typedef struct
+{
+	uint32		pageoffs;
+	Oid			relfilenode;
+	Oid			reltablespace;
+	Oid			reldatabase;
+	ForkNumber	forknum;
+	BlockNumber blocknum;
+	uint16		accesscount;
+} LocalCachePagesRec;
+
+/*
+ * Function context for data persisting over repeated calls.
+ */
+typedef struct
+{
+	TupleDesc	tupdesc;
+	LocalCachePagesRec *record;
+} LocalCachePagesContext;
+
+/*
+ * Set-returning function exposing the contents of the local file cache: one row per cached
+ * page with its cache offset, relation node/tablespace/database, fork, block number and access count.
+ */
+PG_FUNCTION_INFO_V1(local_cache_pages);
+
+#define NUM_LOCALCACHE_PAGES_ELEM	7
+
+Datum
+local_cache_pages(PG_FUNCTION_ARGS)
+{
+	FuncCallContext *funcctx;
+	Datum		result;
+	MemoryContext oldcontext;
+	LocalCachePagesContext *fctx;	/* User function context. */
+	TupleDesc	tupledesc;
+	TupleDesc	expected_tupledesc;
+	HeapTuple	tuple;
+
+	if (SRF_IS_FIRSTCALL())
+	{
+        HASH_SEQ_STATUS status;
+		FileCacheEntry* entry;
+		uint32 n_pages = 0;
+		uint32 i;
+
+		funcctx = SRF_FIRSTCALL_INIT();
+
+		/* Switch context when allocating stuff to be used in later calls */
+		oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
+
+		/* Create a user function context for cross-call persistence */
+		fctx = (LocalCachePagesContext *) palloc(sizeof(LocalCachePagesContext));
+
+		/*
+		 * Fetch the result row type the caller actually expects instead of
+		 * relying on the result type implied by the function definition:
+		 * the SQL-level declaration may be stale or simply wrong, and
+		 * building tuples for a mismatched descriptor could crash. So we
+		 * verify here that the result is a composite type with exactly
+		 * NUM_LOCALCACHE_PAGES_ELEM columns and raise an error otherwise.
+		 */
+		if (get_call_result_type(fcinfo, NULL, &expected_tupledesc) != TYPEFUNC_COMPOSITE)
+			elog(ERROR, "return type must be a row type");
+
+		if (expected_tupledesc->natts != NUM_LOCALCACHE_PAGES_ELEM)
+			elog(ERROR, "incorrect number of output arguments");
+
+		/* Construct a tuple descriptor for the result rows. */
+		tupledesc = CreateTemplateTupleDesc(expected_tupledesc->natts);
+		TupleDescInitEntry(tupledesc, (AttrNumber) 1, "pageoffs",
+						   INT8OID, -1, 0);
+		TupleDescInitEntry(tupledesc, (AttrNumber) 2, "relfilenode",
+						   OIDOID, -1, 0);
+		TupleDescInitEntry(tupledesc, (AttrNumber) 3, "reltablespace",
+						   OIDOID, -1, 0);
+		TupleDescInitEntry(tupledesc, (AttrNumber) 4, "reldatabase",
+						   OIDOID, -1, 0);
+		TupleDescInitEntry(tupledesc, (AttrNumber) 5, "relforknumber",
+						   INT2OID, -1, 0);
+		TupleDescInitEntry(tupledesc, (AttrNumber) 6, "relblocknumber",
+						   INT8OID, -1, 0);
+		TupleDescInitEntry(tupledesc, (AttrNumber) 7, "accesscount",
+						   INT4OID, -1, 0);
+
+		fctx->tupdesc = BlessTupleDesc(tupledesc);
+
+		LWLockAcquire(lfc_lock, LW_SHARED);
+
+        hash_seq_init(&status, lfc_hash);
+        while ((entry = hash_seq_search(&status)) != NULL)
+		{
+			for (int i = 0; i < BLOCKS_PER_CHUNK; i++)
+				n_pages += (entry->bitmap[i >> 5] & (1 << (i & 31))) != 0;
+		}
+		fctx->record = (LocalCachePagesRec *)
+			MemoryContextAllocHuge(CurrentMemoryContext,
+								   sizeof(LocalCachePagesRec) * n_pages);
+
+		/* Set max calls and remember the user function context. */
+		funcctx->max_calls = n_pages;
+		funcctx->user_fctx = fctx;
+
+		/* Return to original context when allocating transient memory */
+		MemoryContextSwitchTo(oldcontext);
+
+		/*
+		 * Scan through all cache entries again, saving the relevant fields of
+		 * every cached page in the fctx->record array.
+		 *
+		 * lfc_lock has been held in shared mode since before the counting
+		 * pass above and is released only after this pass, so both passes are
+		 * expected to see the same set of pages (see the Assert below).
+		 */
+		n_pages = 0;
+        hash_seq_init(&status, lfc_hash);
+        while ((entry = hash_seq_search(&status)) != NULL)
+		{
+			for (int i = 0; i < BLOCKS_PER_CHUNK; i++)
+			{
+				if (entry->bitmap[i >> 5] & (1 << (i & 31)))
+				{
+					fctx->record[n_pages].pageoffs = entry->offset*BLOCKS_PER_CHUNK + i;
+					fctx->record[n_pages].relfilenode = entry->key.rnode.relNode;
+					fctx->record[n_pages].reltablespace = entry->key.rnode.spcNode;
+					fctx->record[n_pages].reldatabase = entry->key.rnode.dbNode;
+					fctx->record[n_pages].forknum = entry->key.forkNum;
+					fctx->record[n_pages].blocknum = entry->key.blockNum + i;
+					fctx->record[n_pages].accesscount = entry->access_count;
+					n_pages += 1;
+				}
+			}
+		}
+		Assert(n_pages == funcctx->max_calls);
+		LWLockRelease(lfc_lock);
+	}
+
+	funcctx = SRF_PERCALL_SETUP();
+
+	/* Get the saved state */
+	fctx = funcctx->user_fctx;
+
+	if (funcctx->call_cntr < funcctx->max_calls)
+	{
+		uint32		i = funcctx->call_cntr;
+		Datum		values[NUM_LOCALCACHE_PAGES_ELEM];
+		bool		nulls[NUM_LOCALCACHE_PAGES_ELEM] = {
+			false, false, false, false, false, false, false
+		};
+
+		values[0] = Int64GetDatum((int64) fctx->record[i].pageoffs);
+		values[1] = ObjectIdGetDatum(fctx->record[i].relfilenode);
+		values[2] = ObjectIdGetDatum(fctx->record[i].reltablespace);
+		values[3] = ObjectIdGetDatum(fctx->record[i].reldatabase);
+		values[4] = ObjectIdGetDatum(fctx->record[i].forknum);
+		values[5] = Int64GetDatum((int64) fctx->record[i].blocknum);
+		values[6] = Int32GetDatum(fctx->record[i].accesscount);
+
+		/* Build and return the tuple. */
+		tuple = heap_form_tuple(fctx->tupdesc, values, nulls);
+		result = HeapTupleGetDatum(tuple);
+
+		SRF_RETURN_NEXT(funcctx, result);
+	}
+	else
+		SRF_RETURN_DONE(funcctx);
+}
diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c
index 1aba2e1ede..5f134e3924 100644
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -516,4 +516,5 @@ pg_init_libpagestore(void)
 		smgr_init_hook = smgr_init_neon;
 		dbsize_hook = neon_dbsize;
 	}
+	lfc_init();
 }
diff --git a/pgxn/neon/neon--1.0.sql b/pgxn/neon/neon--1.0.sql
index 58b98a5923..6cf111ea6a 100644
--- a/pgxn/neon/neon--1.0.sql
+++ b/pgxn/neon/neon--1.0.sql
@@ -22,3 +22,13 @@ AS 'MODULE_PATHNAME', 'backpressure_throttling_time'
 LANGUAGE C STRICT
 PARALLEL UNSAFE;
 
+CREATE FUNCTION local_cache_pages()
+RETURNS SETOF RECORD
+AS 'MODULE_PATHNAME', 'local_cache_pages'
+LANGUAGE C PARALLEL SAFE;
+
+-- Create a view for convenient access.
+CREATE VIEW local_cache AS
+	SELECT P.* FROM local_cache_pages() AS P
+	(pageoffs int8, relfilenode oid, reltablespace oid, reldatabase oid,
+	 relforknumber int2, relblocknumber int8, accesscount int4);
diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h
index 170a0cb72d..831756b849 100644
--- a/pgxn/neon/pagestore_client.h
+++ b/pgxn/neon/pagestore_client.h
@@ -203,4 +203,11 @@ extern void set_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumbe
 extern void update_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size);
 extern void forget_cached_relsize(RelFileNode rnode, ForkNumber forknum);
 
+/* functions for local file cache */
+extern void lfc_write(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, char *buffer);
+extern bool lfc_read(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, char *buffer);
+extern bool lfc_cache_contains(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno);
+extern void lfc_init(void);
+
+
 #endif
diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c
index 900f44ca10..0b34cb3ca9 100644
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -1684,6 +1684,8 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
 		 forkNum, blkno,
 		 (uint32) (lsn >> 32), (uint32) lsn);
 
+	lfc_write(reln->smgr_rnode.node, forkNum, blkno, buffer);
+
 #ifdef DEBUG_COMPARE_LOCAL
 	if (IS_LOCAL_REL(reln))
 		mdextend(reln, forkNum, blkno, buffer, skipFsync);
@@ -1757,6 +1759,9 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
 			elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
 	}
 
+	if (lfc_cache_contains(reln->smgr_rnode.node, forknum, blocknum))
+		return false;
+
 	tag = (BufferTag) {
 		.rnode = reln->smgr_rnode.node,
 		.forkNum = forknum,
@@ -1899,6 +1904,7 @@ neon_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
 	{
 		case T_NeonGetPageResponse:
 			memcpy(buffer, ((NeonGetPageResponse *) resp)->page, BLCKSZ);
+			lfc_write(rnode, forkNum, blkno, buffer);
 			break;
 
 		case T_NeonErrorResponse:
@@ -1950,6 +1956,12 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
 			elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
 	}
 
+	/* Try to read from local file cache */
+	if (lfc_read(reln->smgr_rnode.node, forkNum, blkno, buffer))
+	{
+		return;
+	}
+
 	request_lsn = neon_get_request_lsn(&latest, reln->smgr_rnode.node, forkNum, blkno);
 	neon_read_at_lsn(reln->smgr_rnode.node, forkNum, blkno, request_lsn, latest, buffer);
 
@@ -2111,6 +2123,8 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 		 forknum, blocknum,
 		 (uint32) (lsn >> 32), (uint32) lsn);
 
+	lfc_write(reln->smgr_rnode.node, forknum, blocknum, buffer);
+
 #ifdef DEBUG_COMPARE_LOCAL
 	if (IS_LOCAL_REL(reln))
 		mdwrite(reln, forknum, blocknum, buffer, skipFsync);

From 1ad6e186bcb72f096eebddf022e08f01cc99d1aa Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Tue, 27 Dec 2022 12:34:48 +0400
Subject: [PATCH 074/132] Refuse ProposerElected if it is going to truncate
 correct WAL.

Prevents commit_lsn monotonicity violation (otherwise harmless).

closes https://github.com/neondatabase/neon/issues/3069
---
 safekeeper/src/safekeeper.rs | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs
index 2c13f81476..a70ae247b7 100644
--- a/safekeeper/src/safekeeper.rs
+++ b/safekeeper/src/safekeeper.rs
@@ -727,6 +727,24 @@ where
             return Ok(None);
         }
 
+        // This might happen in a rare race when another (old) connection from
+        // the same walproposer writes + flushes WAL after this connection
+        // already sent flush_lsn in VoteRequest. It is generally safe to
+        // proceed, but to prevent commit_lsn surprisingly going down we should
+        // either refuse the session (simpler) or skip the part we already have
+        // from the stream (can be implemented).
+        if msg.term == self.get_epoch() && self.flush_lsn() > msg.start_streaming_at {
+            bail!("refusing ProposerElected which is going to overwrite correct WAL: term={}, flush_lsn={}, start_streaming_at={}; restarting the handshake should help",
+                   msg.term, self.flush_lsn(), msg.start_streaming_at)
+        }
+        // Otherwise this shouldn't happen.
+        assert!(
+            msg.start_streaming_at >= self.inmem.commit_lsn,
+            "attempt to truncate committed data: start_streaming_at={}, commit_lsn={}",
+            msg.start_streaming_at,
+            self.inmem.commit_lsn
+        );
+
         // TODO: cross check divergence point, check if msg.start_streaming_at corresponds to
         // intersection of our history and history from msg
 

From fee8bf3a1717dd4f997e7a48fb5e3d6a8333a44b Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Tue, 27 Dec 2022 15:55:33 +0400
Subject: [PATCH 075/132] Remove global_commit_lsn.

It is complicated and fragile to maintain and not really needed; update
commit_lsn locally only when we have enough WAL flushed.

ref https://github.com/neondatabase/neon/issues/3069
---
 safekeeper/src/safekeeper.rs | 41 +++++++++++++++++-------------------
 1 file changed, 19 insertions(+), 22 deletions(-)

diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs
index a70ae247b7..5b1b686529 100644
--- a/safekeeper/src/safekeeper.rs
+++ b/safekeeper/src/safekeeper.rs
@@ -182,7 +182,7 @@ pub struct SafeKeeperState {
     /// All WAL segments next to one containing local_start_lsn are
     /// filled with data from the beginning.
     pub local_start_lsn: Lsn,
-    /// Part of WAL acknowledged by quorum and available locally. Always points
+    /// Part of WAL acknowledged by quorum *and available locally*. Always points
     /// to record boundary.
     pub commit_lsn: Lsn,
     /// LSN that points to the end of the last backed up segment. Useful to
@@ -501,10 +501,6 @@ impl AcceptorProposerMessage {
 /// - messages from compute (proposers) and provides replies
 /// - messages from broker peers
 pub struct SafeKeeper<CTRL: control_file::Storage, WAL: wal_storage::Storage> {
-    /// Maximum commit_lsn between all nodes, can be ahead of local flush_lsn.
-    /// Note: be careful to set only if we are sure our WAL (term history) matches
-    /// committed one.
-    pub global_commit_lsn: Lsn,
     /// LSN since the proposer safekeeper currently talking to appends WAL;
     /// determines epoch switch point.
     pub epoch_start_lsn: Lsn,
@@ -537,7 +533,6 @@ where
         }
 
         Ok(SafeKeeper {
-            global_commit_lsn: state.commit_lsn,
             epoch_start_lsn: Lsn(0),
             inmem: SafekeeperMemState {
                 commit_lsn: state.commit_lsn,
@@ -777,7 +772,6 @@ where
             // NB: on new clusters, this happens at the same time as
             // timeline_start_lsn initialization, it is taken outside to provide
             // upgrade.
-            self.global_commit_lsn = max(self.global_commit_lsn, state.timeline_start_lsn);
             self.inmem.commit_lsn = max(self.inmem.commit_lsn, state.timeline_start_lsn);
 
             // Initializing backup_lsn is useful to avoid making backup think it should upload 0 segment.
@@ -796,10 +790,21 @@ where
         Ok(None)
     }
 
-    /// Advance commit_lsn taking into account what we have locally
-    fn update_commit_lsn(&mut self) -> Result<()> {
-        let commit_lsn = min(self.global_commit_lsn, self.flush_lsn());
-        assert!(commit_lsn >= self.inmem.commit_lsn);
+    /// Advance commit_lsn taking into account what we have locally.
+    ///
+    /// Note: it is assumed that 'WAL we have is from the right term' check has
+    /// already been done outside.
+    fn update_commit_lsn(&mut self, mut candidate: Lsn) -> Result<()> {
+        // Both peers and walproposer communicate this value, we might already
+        // have a fresher (higher) version.
+        candidate = max(candidate, self.inmem.commit_lsn);
+        let commit_lsn = min(candidate, self.flush_lsn());
+        assert!(
+            commit_lsn >= self.inmem.commit_lsn,
+            "commit_lsn monotonicity violated: old={} new={}",
+            self.inmem.commit_lsn,
+            commit_lsn
+        );
 
         self.inmem.commit_lsn = commit_lsn;
 
@@ -865,14 +870,11 @@ where
             self.wal_store.flush_wal()?;
         }
 
-        // Update global_commit_lsn
+        // Update commit_lsn.
         if msg.h.commit_lsn != Lsn(0) {
-            // We also obtain commit lsn from peers, so value arrived here might be stale (less)
-            self.global_commit_lsn = max(self.global_commit_lsn, msg.h.commit_lsn);
+            self.update_commit_lsn(msg.h.commit_lsn)?;
         }
-
         self.inmem.peer_horizon_lsn = msg.h.truncate_lsn;
-        self.update_commit_lsn()?;
 
         // Update truncate and commit LSN in control file.
         // To avoid negative impact on performance of extra fsync, do it only
@@ -904,10 +906,6 @@ where
     /// Flush WAL to disk. Return AppendResponse with latest LSNs.
     fn handle_flush(&mut self) -> Result<Option<AcceptorProposerMessage>> {
         self.wal_store.flush_wal()?;
-
-        // commit_lsn can be updated because we have new flushed data locally.
-        self.update_commit_lsn()?;
-
         Ok(Some(AcceptorProposerMessage::AppendResponse(
             self.append_response(),
         )))
@@ -922,8 +920,7 @@ where
             // commit_lsn if our history matches (is part of) history of advanced
             // commit_lsn provider.
             if sk_info.last_log_term == self.get_epoch() {
-                self.global_commit_lsn = max(Lsn(sk_info.commit_lsn), self.global_commit_lsn);
-                self.update_commit_lsn()?;
+                self.update_commit_lsn(Lsn(sk_info.commit_lsn))?;
             }
         }
 

From f6bf7b20030c2520ee952fff78442d77f19506c1 Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Tue, 27 Dec 2022 16:33:19 +0400
Subject: [PATCH 076/132] Add tenant_id to safekeeper spans.

Now that it's hard to map timeline id into project in the console, this should
help a little.
---
 safekeeper/src/receive_wal.rs |  4 ++--
 safekeeper/src/safekeeper.rs  | 15 +++++++++------
 safekeeper/src/send_wal.rs    |  2 +-
 3 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs
index 6577e8c4d6..be7f071abb 100644
--- a/safekeeper/src/receive_wal.rs
+++ b/safekeeper/src/receive_wal.rs
@@ -52,7 +52,7 @@ impl<'pg> ReceiveWalConn<'pg> {
 
     /// Receive WAL from wal_proposer
     pub fn run(&mut self, spg: &mut SafekeeperPostgresHandler) -> Result<()> {
-        let _enter = info_span!("WAL acceptor", timeline = %spg.timeline_id.unwrap()).entered();
+        let _enter = info_span!("WAL acceptor", ttid = %spg.ttid).entered();
 
         // Notify the libpq client that it's allowed to send `CopyData` messages
         self.pg_backend
@@ -69,7 +69,7 @@ impl<'pg> ReceiveWalConn<'pg> {
         let tli = match next_msg {
             ProposerAcceptorMessage::Greeting(ref greeting) => {
                 info!(
-                    "start handshake with wal proposer {} sysid {} timeline {}",
+                    "start handshake with walproposer {} sysid {} timeline {}",
                     self.peer_addr, greeting.system_id, greeting.tli,
                 );
                 let server_info = ServerInfo {
diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs
index 5b1b686529..fa973a3ede 100644
--- a/safekeeper/src/safekeeper.rs
+++ b/safekeeper/src/safekeeper.rs
@@ -634,10 +634,12 @@ where
 
         // system_id will be updated on mismatch
         if self.state.server.system_id != msg.system_id {
-            warn!(
-                "unexpected system ID arrived, got {}, expected {}",
-                msg.system_id, self.state.server.system_id
-            );
+            if self.state.server.system_id != 0 {
+                warn!(
+                    "unexpected system ID arrived, got {}, expected {}",
+                    msg.system_id, self.state.server.system_id
+                );
+            }
 
             let mut state = self.state.clone();
             state.server.system_id = msg.system_id;
@@ -648,8 +650,9 @@ where
         }
 
         info!(
-            "processed greeting from proposer {:?}, sending term {:?}",
-            msg.proposer_id, self.state.acceptor_state.term
+            "processed greeting from walproposer {}, sending term {:?}",
+            msg.proposer_id.map(|b| format!("{:X}", b)).join(""),
+            self.state.acceptor_state.term
         );
         Ok(Some(AcceptorProposerMessage::Greeting(AcceptorGreeting {
             term: self.state.acceptor_state.term,
diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs
index a3481430d0..a054b8fe14 100644
--- a/safekeeper/src/send_wal.rs
+++ b/safekeeper/src/send_wal.rs
@@ -161,7 +161,7 @@ impl ReplicationConn {
         pgb: &mut PostgresBackend,
         mut start_pos: Lsn,
     ) -> Result<()> {
-        let _enter = info_span!("WAL sender", timeline = %spg.timeline_id.unwrap()).entered();
+        let _enter = info_span!("WAL sender", ttid = %spg.ttid).entered();
 
         let tli = GlobalTimelines::get(spg.ttid)?;
 

From 0c7b02ebc35e07bfaad70a0c4912a21a642e8bf8 Mon Sep 17 00:00:00 2001
From: Shany Pozin <shany@neon.tech>
Date: Wed, 28 Dec 2022 09:20:01 +0200
Subject: [PATCH 077/132] Move tenant related files to tenant directory (#3214)

Related to https://github.com/neondatabase/neon/issues/3208
---
 pageserver/src/billing_metrics.rs             |  6 +--
 pageserver/src/bin/pageserver.rs              |  5 ++-
 pageserver/src/config.rs                      |  5 ++-
 pageserver/src/http/routes.rs                 | 42 +++++++++----------
 pageserver/src/lib.rs                         |  9 ++--
 pageserver/src/page_service.rs                |  4 +-
 pageserver/src/tenant.rs                      | 11 +++--
 .../{tenant_config.rs => tenant/config.rs}    |  0
 .../src/{tenant_mgr.rs => tenant/mgr.rs}      |  5 +--
 .../src/{tenant_tasks.rs => tenant/tasks.rs}  |  4 +-
 pageserver/src/tenant/timeline.rs             |  2 +-
 11 files changed, 48 insertions(+), 45 deletions(-)
 rename pageserver/src/{tenant_config.rs => tenant/config.rs} (100%)
 rename pageserver/src/{tenant_mgr.rs => tenant/mgr.rs} (99%)
 rename pageserver/src/{tenant_tasks.rs => tenant/tasks.rs} (98%)

diff --git a/pageserver/src/billing_metrics.rs b/pageserver/src/billing_metrics.rs
index f9d3e8553f..73e27618db 100644
--- a/pageserver/src/billing_metrics.rs
+++ b/pageserver/src/billing_metrics.rs
@@ -9,7 +9,7 @@ use tracing::*;
 use utils::id::TimelineId;
 
 use crate::task_mgr;
-use crate::tenant_mgr;
+use crate::tenant::mgr;
 use pageserver_api::models::TenantState;
 use utils::id::TenantId;
 
@@ -161,7 +161,7 @@ pub async fn collect_metrics_task(
     );
 
     // get list of tenants
-    let tenants = tenant_mgr::list_tenants().await;
+    let tenants = mgr::list_tenants().await;
 
     // iterate through list of Active tenants and collect metrics
     for (tenant_id, tenant_state) in tenants {
@@ -169,7 +169,7 @@ pub async fn collect_metrics_task(
             continue;
         }
 
-        let tenant = tenant_mgr::get_tenant(tenant_id, true).await?;
+        let tenant = mgr::get_tenant(tenant_id, true).await?;
 
         let mut tenant_resident_size = 0;
 
diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs
index d12063f5aa..2b4dcc68f0 100644
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -18,7 +18,8 @@ use pageserver::{
     task_mgr::{
         BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME, WALRECEIVER_RUNTIME,
     },
-    tenant_mgr, virtual_file,
+    tenant::mgr,
+    virtual_file,
 };
 use utils::{
     auth::JwtAuth,
@@ -284,7 +285,7 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
     let remote_storage = create_remote_storage_client(conf)?;
 
     // Scan the local 'tenants/' directory and start loading the tenants
-    BACKGROUND_RUNTIME.block_on(tenant_mgr::init_tenant_mgr(conf, remote_storage.clone()))?;
+    BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(conf, remote_storage.clone()))?;
 
     // Start up the service to handle HTTP mgmt API request. We created the
     // listener earlier already.
diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index 66f8a9f4b8..deb79531a4 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -27,14 +27,15 @@ use utils::{
     postgres_backend::AuthType,
 };
 
+use crate::tenant::config::TenantConf;
+use crate::tenant::config::TenantConfOpt;
 use crate::tenant::{TENANT_ATTACHING_MARKER_FILENAME, TIMELINES_SEGMENT_NAME};
-use crate::tenant_config::{TenantConf, TenantConfOpt};
 use crate::{
     IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TIMELINE_UNINIT_MARK_SUFFIX,
 };
 
 pub mod defaults {
-    use crate::tenant_config::defaults::*;
+    use crate::tenant::config::defaults::*;
     use const_format::formatcp;
 
     pub use pageserver_api::{
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 6d97f3206e..66a1607801 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -12,9 +12,9 @@ use super::models::{
     TimelineCreateRequest, TimelineInfo,
 };
 use crate::pgdatadir_mapping::LsnForTimestamp;
+use crate::tenant::config::TenantConfOpt;
 use crate::tenant::{with_ondemand_download, Timeline};
-use crate::tenant_config::TenantConfOpt;
-use crate::{config::PageServerConf, tenant_mgr};
+use crate::{config::PageServerConf, tenant::mgr};
 use utils::{
     auth::JwtAuth,
     http::{
@@ -170,7 +170,7 @@ async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<
         .new_timeline_id
         .unwrap_or_else(TimelineId::generate);
 
-    let tenant = tenant_mgr::get_tenant(tenant_id, true)
+    let tenant = mgr::get_tenant(tenant_id, true)
         .await
         .map_err(ApiError::NotFound)?;
     match tenant.create_timeline(
@@ -199,7 +199,7 @@ async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>,
     check_permission(&request, Some(tenant_id))?;
 
     let response_data = async {
-        let tenant = tenant_mgr::get_tenant(tenant_id, true)
+        let tenant = mgr::get_tenant(tenant_id, true)
             .await
             .map_err(ApiError::NotFound)?;
         let timelines = tenant.list_timelines();
@@ -262,7 +262,7 @@ async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body
     check_permission(&request, Some(tenant_id))?;
 
     let timeline_info = async {
-        let tenant = tenant_mgr::get_tenant(tenant_id, true)
+        let tenant = mgr::get_tenant(tenant_id, true)
             .await
             .map_err(ApiError::NotFound)?;
 
@@ -294,7 +294,7 @@ async fn get_lsn_by_timestamp_handler(request: Request<Body>) -> Result<Response
         .map_err(ApiError::BadRequest)?;
     let timestamp_pg = postgres_ffi::to_pg_timestamp(timestamp);
 
-    let timeline = tenant_mgr::get_tenant(tenant_id, true)
+    let timeline = mgr::get_tenant(tenant_id, true)
         .await
         .and_then(|tenant| tenant.get_timeline(timeline_id, true))
         .map_err(ApiError::NotFound)?;
@@ -322,7 +322,7 @@ async fn tenant_attach_handler(request: Request<Body>) -> Result<Response<Body>,
 
     if let Some(remote_storage) = &state.remote_storage {
         // FIXME: distinguish between "Tenant already exists" and other errors
-        tenant_mgr::attach_tenant(state.conf, tenant_id, remote_storage.clone())
+        mgr::attach_tenant(state.conf, tenant_id, remote_storage.clone())
             .instrument(info_span!("tenant_attach", tenant = %tenant_id))
             .await
             .map_err(ApiError::InternalServerError)?;
@@ -340,7 +340,7 @@ async fn timeline_delete_handler(request: Request<Body>) -> Result<Response<Body
     let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
     check_permission(&request, Some(tenant_id))?;
 
-    tenant_mgr::delete_timeline(tenant_id, timeline_id)
+    mgr::delete_timeline(tenant_id, timeline_id)
         .instrument(info_span!("timeline_delete", tenant = %tenant_id, timeline = %timeline_id))
         .await
         // FIXME: Errors from `delete_timeline` can occur for a number of reasons, incuding both
@@ -357,7 +357,7 @@ async fn tenant_detach_handler(request: Request<Body>) -> Result<Response<Body>,
 
     let state = get_state(&request);
     let conf = state.conf;
-    tenant_mgr::detach_tenant(conf, tenant_id)
+    mgr::detach_tenant(conf, tenant_id)
         .instrument(info_span!("tenant_detach", tenant = %tenant_id))
         .await
         // FIXME: Errors from `detach_tenant` can be caused by both both user and internal errors.
@@ -372,7 +372,7 @@ async fn tenant_load_handler(request: Request<Body>) -> Result<Response<Body>, A
     check_permission(&request, Some(tenant_id))?;
 
     let state = get_state(&request);
-    tenant_mgr::load_tenant(state.conf, tenant_id, state.remote_storage.clone())
+    mgr::load_tenant(state.conf, tenant_id, state.remote_storage.clone())
         .instrument(info_span!("load", tenant = %tenant_id))
         .await
         .map_err(ApiError::InternalServerError)?;
@@ -386,7 +386,7 @@ async fn tenant_ignore_handler(request: Request<Body>) -> Result<Response<Body>,
 
     let state = get_state(&request);
     let conf = state.conf;
-    tenant_mgr::ignore_tenant(conf, tenant_id)
+    mgr::ignore_tenant(conf, tenant_id)
         .instrument(info_span!("ignore_tenant", tenant = %tenant_id))
         .await
         // FIXME: Errors from `ignore_tenant` can be caused by both both user and internal errors.
@@ -399,7 +399,7 @@ async fn tenant_ignore_handler(request: Request<Body>) -> Result<Response<Body>,
 async fn tenant_list_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permission(&request, None)?;
 
-    let response_data = tenant_mgr::list_tenants()
+    let response_data = mgr::list_tenants()
         .instrument(info_span!("tenant_list"))
         .await
         .iter()
@@ -419,7 +419,7 @@ async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiErro
     check_permission(&request, Some(tenant_id))?;
 
     let tenant_info = async {
-        let tenant = tenant_mgr::get_tenant(tenant_id, false).await?;
+        let tenant = mgr::get_tenant(tenant_id, false).await?;
 
         // Calculate total physical size of all timelines
         let mut current_physical_size = 0;
@@ -446,7 +446,7 @@ async fn tenant_size_handler(request: Request<Body>) -> Result<Response<Body>, A
     let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
     check_permission(&request, Some(tenant_id))?;
 
-    let tenant = tenant_mgr::get_tenant(tenant_id, true)
+    let tenant = mgr::get_tenant(tenant_id, true)
         .await
         .map_err(ApiError::InternalServerError)?;
 
@@ -567,7 +567,7 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
 
     let state = get_state(&request);
 
-    let new_tenant = tenant_mgr::create_tenant(
+    let new_tenant = mgr::create_tenant(
         state.conf,
         tenant_conf,
         target_tenant_id,
@@ -669,7 +669,7 @@ async fn tenant_config_handler(mut request: Request<Body>) -> Result<Response<Bo
     }
 
     let state = get_state(&request);
-    tenant_mgr::update_tenant_config(state.conf, tenant_conf, tenant_id)
+    mgr::update_tenant_config(state.conf, tenant_conf, tenant_id)
         .instrument(info_span!("tenant_config", tenant = ?tenant_id))
         .await
         // FIXME: `update_tenant_config` can fail because of both user and internal errors.
@@ -721,7 +721,7 @@ async fn timeline_gc_handler(mut request: Request<Body>) -> Result<Response<Body
 
     let gc_req: TimelineGcRequest = json_request(&mut request).await?;
 
-    let wait_task_done = tenant_mgr::immediate_gc(tenant_id, timeline_id, gc_req).await?;
+    let wait_task_done = mgr::immediate_gc(tenant_id, timeline_id, gc_req).await?;
     let gc_result = wait_task_done
         .await
         .context("wait for gc task")
@@ -738,7 +738,7 @@ async fn timeline_compact_handler(request: Request<Body>) -> Result<Response<Bod
     let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
     check_permission(&request, Some(tenant_id))?;
 
-    let tenant = tenant_mgr::get_tenant(tenant_id, true)
+    let tenant = mgr::get_tenant(tenant_id, true)
         .await
         .map_err(ApiError::NotFound)?;
     let timeline = tenant
@@ -759,7 +759,7 @@ async fn timeline_checkpoint_handler(request: Request<Body>) -> Result<Response<
     let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
     check_permission(&request, Some(tenant_id))?;
 
-    let tenant = tenant_mgr::get_tenant(tenant_id, true)
+    let tenant = mgr::get_tenant(tenant_id, true)
         .await
         .map_err(ApiError::NotFound)?;
     let timeline = tenant
@@ -784,7 +784,7 @@ async fn timeline_download_remote_layers_handler_post(
     let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
     check_permission(&request, Some(tenant_id))?;
 
-    let tenant = tenant_mgr::get_tenant(tenant_id, true)
+    let tenant = mgr::get_tenant(tenant_id, true)
         .await
         .map_err(ApiError::NotFound)?;
     let timeline = tenant
@@ -803,7 +803,7 @@ async fn timeline_download_remote_layers_handler_get(
     let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
     check_permission(&request, Some(tenant_id))?;
 
-    let tenant = tenant_mgr::get_tenant(tenant_id, true)
+    let tenant = mgr::get_tenant(tenant_id, true)
         .await
         .map_err(ApiError::NotFound)?;
     let timeline = tenant
diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs
index ae815fe421..80b05a76a6 100644
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -13,9 +13,7 @@ pub mod profiling;
 pub mod repository;
 pub mod task_mgr;
 pub mod tenant;
-pub mod tenant_config;
-pub mod tenant_mgr;
-pub mod tenant_tasks;
+
 pub mod trace;
 pub mod virtual_file;
 pub mod walingest;
@@ -25,9 +23,8 @@ pub mod walredo;
 
 use std::path::Path;
 
-use tracing::info;
-
 use crate::task_mgr::TaskKind;
+use tracing::info;
 
 /// Current storage format version
 ///
@@ -56,7 +53,7 @@ pub async fn shutdown_pageserver(exit_code: i32) {
 
     // Shut down all the tenants. This flushes everything to disk and kills
     // the checkpoint and GC tasks.
-    tenant_mgr::shutdown_all_tenants().await;
+    tenant::mgr::shutdown_all_tenants().await;
 
     // Stop syncing with remote storage.
     //
diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index 9b52fdaf68..b84b2694f4 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -48,8 +48,8 @@ use crate::metrics::{LIVE_CONNECTIONS_COUNT, SMGR_QUERY_TIME};
 use crate::profiling::profpoint_start;
 use crate::task_mgr;
 use crate::task_mgr::TaskKind;
+use crate::tenant::mgr;
 use crate::tenant::{Tenant, Timeline};
-use crate::tenant_mgr;
 use crate::trace::Tracer;
 
 use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
@@ -948,7 +948,7 @@ impl postgres_backend_async::Handler for PageServerHandler {
 /// ensures that queries don't fail immediately after pageserver startup, because
 /// all tenants are still loading.
 async fn get_active_tenant_with_timeout(tenant_id: TenantId) -> Result<Arc<Tenant>> {
-    let tenant = tenant_mgr::get_tenant(tenant_id, false).await?;
+    let tenant = mgr::get_tenant(tenant_id, false).await?;
     match tokio::time::timeout(Duration::from_secs(30), tenant.wait_to_become_active()).await {
         Ok(wait_result) => wait_result
             // no .context(), the error message is good enough and some tests depend on it
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 308130c799..eb28e6da0a 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -55,11 +55,12 @@ use crate::metrics::{remove_tenant_metrics, STORAGE_TIME};
 use crate::repository::GcResult;
 use crate::task_mgr;
 use crate::task_mgr::TaskKind;
+use crate::tenant::config::TenantConfOpt;
 use crate::tenant::metadata::load_metadata;
 use crate::tenant::storage_layer::DeltaLayer;
 use crate::tenant::storage_layer::ImageLayer;
 use crate::tenant::storage_layer::Layer;
-use crate::tenant_config::TenantConfOpt;
+
 use crate::virtual_file::VirtualFile;
 use crate::walredo::PostgresRedoManager;
 use crate::walredo::WalRedoManager;
@@ -84,6 +85,10 @@ mod par_fsync;
 pub mod storage_layer;
 mod storage_sync;
 
+pub mod config;
+pub mod mgr;
+pub mod tasks;
+
 mod timeline;
 
 pub mod size;
@@ -1422,7 +1427,7 @@ impl Tenant {
 
                     // Spawn gc and compaction loops. The loops will shut themselves
                     // down when they notice that the tenant is inactive.
-                    crate::tenant_tasks::start_background_loops(self.tenant_id);
+                    tasks::start_background_loops(self.tenant_id);
 
                     for timeline in not_broken_timelines {
                         timeline.set_state(TimelineState::Active);
@@ -2576,7 +2581,7 @@ pub mod harness {
     };
 
     use super::*;
-    use crate::tenant_config::{TenantConf, TenantConfOpt};
+    use crate::tenant::config::{TenantConf, TenantConfOpt};
     use hex_literal::hex;
     use utils::id::{TenantId, TimelineId};
 
diff --git a/pageserver/src/tenant_config.rs b/pageserver/src/tenant/config.rs
similarity index 100%
rename from pageserver/src/tenant_config.rs
rename to pageserver/src/tenant/config.rs
diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant/mgr.rs
similarity index 99%
rename from pageserver/src/tenant_mgr.rs
rename to pageserver/src/tenant/mgr.rs
index e4e9d0c6e8..44849de735 100644
--- a/pageserver/src/tenant_mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -17,8 +17,8 @@ use utils::crashsafe;
 
 use crate::config::PageServerConf;
 use crate::task_mgr::{self, TaskKind};
+use crate::tenant::config::TenantConfOpt;
 use crate::tenant::{Tenant, TenantState};
-use crate::tenant_config::TenantConfOpt;
 use crate::IGNORED_TENANT_FILE_NAME;
 
 use utils::fs_ext::PathExt;
@@ -216,8 +216,7 @@ pub async fn create_tenant(
         hash_map::Entry::Vacant(v) => {
             // Hold the write_tenants() lock, since all of this is local IO.
             // If this section ever becomes contentious, introduce a new `TenantState::Creating`.
-            let tenant_directory =
-                super::tenant::create_tenant_files(conf, tenant_conf, tenant_id)?;
+            let tenant_directory = super::create_tenant_files(conf, tenant_conf, tenant_id)?;
             let created_tenant =
                 schedule_local_tenant_processing(conf, &tenant_directory, remote_storage)?;
             let crated_tenant_id = created_tenant.tenant_id();
diff --git a/pageserver/src/tenant_tasks.rs b/pageserver/src/tenant/tasks.rs
similarity index 98%
rename from pageserver/src/tenant_tasks.rs
rename to pageserver/src/tenant/tasks.rs
index d71f244725..8397d26e5d 100644
--- a/pageserver/src/tenant_tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -8,8 +8,8 @@ use std::time::Duration;
 use crate::metrics::TENANT_TASK_EVENTS;
 use crate::task_mgr;
 use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME};
+use crate::tenant::mgr;
 use crate::tenant::{Tenant, TenantState};
-use crate::tenant_mgr;
 use tracing::*;
 use utils::id::TenantId;
 
@@ -155,7 +155,7 @@ async fn wait_for_active_tenant(
     wait: Duration,
 ) -> ControlFlow<(), Arc<Tenant>> {
     let tenant = loop {
-        match tenant_mgr::get_tenant(tenant_id, false).await {
+        match mgr::get_tenant(tenant_id, false).await {
             Ok(tenant) => break tenant,
             Err(e) => {
                 error!("Failed to get a tenant {tenant_id}: {e:#}");
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 55ede57e53..bbfcad5734 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -42,7 +42,7 @@ use crate::metrics::TimelineMetrics;
 use crate::pgdatadir_mapping::LsnForTimestamp;
 use crate::pgdatadir_mapping::{is_rel_fsm_block_key, is_rel_vm_block_key};
 use crate::pgdatadir_mapping::{BlockNumber, CalculateLogicalSizeError};
-use crate::tenant_config::TenantConfOpt;
+use crate::tenant::config::TenantConfOpt;
 use pageserver_api::reltag::RelTag;
 
 use postgres_connection::PgConnectionConfig;

From 172c7e5f92d03f3a78265771b465ad28b7615e43 Mon Sep 17 00:00:00 2001
From: Shany Pozin <shany@neon.tech>
Date: Wed, 28 Dec 2022 15:12:06 +0200
Subject: [PATCH 078/132] Split upload queue code from storage_sync.rs (#3216)

https://github.com/neondatabase/neon/issues/3208
---
 pageserver/src/lib.rs                         |   1 -
 pageserver/src/tenant.rs                      |  22 +-
 ...rage_sync.rs => remote_timeline_client.rs} | 259 ++----------------
 .../delete.rs                                 |   0
 .../download.rs                               |   0
 .../index.rs                                  |   0
 .../upload.rs                                 |   2 +-
 .../src/tenant/storage_layer/remote_layer.rs  |   2 +-
 pageserver/src/tenant/timeline.rs             |   8 +-
 pageserver/src/tenant/upload_queue.rs         | 213 ++++++++++++++
 10 files changed, 262 insertions(+), 245 deletions(-)
 rename pageserver/src/tenant/{storage_sync.rs => remote_timeline_client.rs} (85%)
 rename pageserver/src/tenant/{storage_sync => remote_timeline_client}/delete.rs (100%)
 rename pageserver/src/tenant/{storage_sync => remote_timeline_client}/download.rs (100%)
 rename pageserver/src/tenant/{storage_sync => remote_timeline_client}/index.rs (100%)
 rename pageserver/src/tenant/{storage_sync => remote_timeline_client}/upload.rs (97%)
 create mode 100644 pageserver/src/tenant/upload_queue.rs

diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs
index 80b05a76a6..29050a5bc2 100644
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -13,7 +13,6 @@ pub mod profiling;
 pub mod repository;
 pub mod task_mgr;
 pub mod tenant;
-
 pub mod trace;
 pub mod virtual_file;
 pub mod walingest;
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index eb28e6da0a..4c93490177 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -45,9 +45,7 @@ use std::sync::{Mutex, RwLock};
 use std::time::{Duration, Instant};
 
 use self::metadata::TimelineMetadata;
-use self::storage_sync::create_remote_timeline_client;
-use self::storage_sync::index::IndexPart;
-use self::storage_sync::RemoteTimelineClient;
+use self::remote_timeline_client::RemoteTimelineClient;
 use crate::config::PageServerConf;
 use crate::import_datadir;
 use crate::is_uninit_mark;
@@ -57,6 +55,7 @@ use crate::task_mgr;
 use crate::task_mgr::TaskKind;
 use crate::tenant::config::TenantConfOpt;
 use crate::tenant::metadata::load_metadata;
+use crate::tenant::remote_timeline_client::index::IndexPart;
 use crate::tenant::storage_layer::DeltaLayer;
 use crate::tenant::storage_layer::ImageLayer;
 use crate::tenant::storage_layer::Layer;
@@ -82,12 +81,13 @@ pub mod layer_map;
 
 pub mod metadata;
 mod par_fsync;
+mod remote_timeline_client;
 pub mod storage_layer;
-mod storage_sync;
 
 pub mod config;
 pub mod mgr;
 pub mod tasks;
+pub mod upload_queue;
 
 mod timeline;
 
@@ -648,8 +648,12 @@ impl Tenant {
             .as_ref()
             .ok_or_else(|| anyhow::anyhow!("cannot attach without remote storage"))?;
 
-        let remote_timelines =
-            storage_sync::list_remote_timelines(remote_storage, self.conf, self.tenant_id).await?;
+        let remote_timelines = remote_timeline_client::list_remote_timelines(
+            remote_storage,
+            self.conf,
+            self.tenant_id,
+        )
+        .await?;
 
         info!("found {} timelines", remote_timelines.len());
 
@@ -733,7 +737,7 @@ impl Tenant {
             .context("Failed to create new timeline directory")?;
 
         let remote_client =
-            create_remote_timeline_client(remote_storage, self.conf, self.tenant_id, timeline_id)?;
+            RemoteTimelineClient::new(remote_storage, self.conf, self.tenant_id, timeline_id)?;
 
         let ancestor = if let Some(ancestor_id) = remote_metadata.ancestor_timeline() {
             let timelines = self.timelines.lock().unwrap();
@@ -995,7 +999,7 @@ impl Tenant {
             .remote_storage
             .as_ref()
             .map(|remote_storage| {
-                create_remote_timeline_client(
+                RemoteTimelineClient::new(
                     remote_storage.clone(),
                     self.conf,
                     self.tenant_id,
@@ -2192,7 +2196,7 @@ impl Tenant {
         let tenant_id = self.tenant_id;
 
         let remote_client = if let Some(remote_storage) = self.remote_storage.as_ref() {
-            let remote_client = create_remote_timeline_client(
+            let remote_client = RemoteTimelineClient::new(
                 remote_storage.clone(),
                 self.conf,
                 tenant_id,
diff --git a/pageserver/src/tenant/storage_sync.rs b/pageserver/src/tenant/remote_timeline_client.rs
similarity index 85%
rename from pageserver/src/tenant/storage_sync.rs
rename to pageserver/src/tenant/remote_timeline_client.rs
index ef57f91a02..e27b0a8133 100644
--- a/pageserver/src/tenant/storage_sync.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -58,7 +58,7 @@
 //! To have a consistent remote structure, it's important that uploads and
 //! deletions are performed in the right order. For example, the index file
 //! contains a list of layer files, so it must not be uploaded until all the
-//! layer files that are in its list have been succesfully uploaded.
+//! layer files that are in its list have been successfully uploaded.
 //!
 //! The contract between client and its user is that the user is responsible of
 //! scheduling operations in an order that keeps the remote consistent as
@@ -140,7 +140,7 @@
 //! Note that if we crash during file deletion between the index update
 //! that removes the file from the list of files, and deleting the remote file,
 //! the file is leaked in the remote storage. Similarly, if a new file is created
-//! and uploaded, but the pageserver dies permantently before updating the
+//! and uploaded, but the pageserver dies permanently before updating the
 //! remote index file, the new file is leaked in remote storage. We accept and
 //! tolerate that for now.
 //! Note further that we cannot easily fix this by scheduling deletes for every
@@ -207,30 +207,30 @@ mod upload;
 // re-export these
 pub use download::{is_temp_download_file, list_remote_timelines};
 
-use std::collections::{HashMap, VecDeque};
-use std::fmt::Debug;
-use std::ops::DerefMut;
 use std::sync::atomic::{AtomicU32, Ordering};
 use std::sync::{Arc, Mutex};
 
 use anyhow::ensure;
 use remote_storage::{DownloadError, GenericRemoteStorage};
+use std::ops::DerefMut;
 use tokio::runtime::Runtime;
 use tracing::{info, warn};
 use tracing::{info_span, Instrument};
-
 use utils::lsn::Lsn;
 
 use crate::metrics::RemoteOpFileKind;
 use crate::metrics::RemoteOpKind;
 use crate::metrics::{MeasureRemoteOp, RemoteTimelineClientMetrics};
-use crate::tenant::storage_sync::index::LayerFileMetadata;
+use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
 use crate::{
     config::PageServerConf,
     task_mgr,
     task_mgr::TaskKind,
     task_mgr::BACKGROUND_RUNTIME,
     tenant::metadata::TimelineMetadata,
+    tenant::upload_queue::{
+        UploadOp, UploadQueue, UploadQueueInitialized, UploadQueueStopped, UploadTask,
+    },
     {exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS},
 };
 
@@ -286,206 +286,30 @@ pub struct RemoteTimelineClient {
     storage_impl: GenericRemoteStorage,
 }
 
-// clippy warns that Uninitialized is much smaller than Initialized, which wastes
-// memory for Uninitialized variants. Doesn't matter in practice, there are not
-// that many upload queues in a running pageserver, and most of them are initialized
-// anyway.
-#[allow(clippy::large_enum_variant)]
-enum UploadQueue {
-    Uninitialized,
-    Initialized(UploadQueueInitialized),
-    Stopped(UploadQueueStopped),
-}
-
-impl UploadQueue {
-    fn as_str(&self) -> &'static str {
-        match self {
-            UploadQueue::Uninitialized => "Uninitialized",
-            UploadQueue::Initialized(_) => "Initialized",
-            UploadQueue::Stopped(_) => "Stopped",
-        }
-    }
-}
-
-/// This keeps track of queued and in-progress tasks.
-struct UploadQueueInitialized {
-    /// Counter to assign task IDs
-    task_counter: u64,
-
-    /// All layer files stored in the remote storage, taking into account all
-    /// in-progress and queued operations
-    latest_files: HashMap<LayerFileName, LayerFileMetadata>,
-
-    /// How many file uploads or deletions been scheduled, since the
-    /// last (scheduling of) metadata index upload?
-    latest_files_changes_since_metadata_upload_scheduled: u64,
-
-    /// Metadata stored in the remote storage, taking into account all
-    /// in-progress and queued operations.
-    /// DANGER: do not return to outside world, e.g., safekeepers.
-    latest_metadata: TimelineMetadata,
-
-    /// `disk_consistent_lsn` from the last metadata file that was successfully
-    /// uploaded. `Lsn(0)` if nothing was uploaded yet.
-    /// Unlike `latest_files` or `latest_metadata`, this value is never ahead.
-    /// Safekeeper can rely on it to make decisions for WAL storage.
-    last_uploaded_consistent_lsn: Lsn,
-
-    // Breakdown of different kinds of tasks currently in-progress
-    num_inprogress_layer_uploads: usize,
-    num_inprogress_metadata_uploads: usize,
-    num_inprogress_deletions: usize,
-
-    /// Tasks that are currently in-progress. In-progress means that a tokio Task
-    /// has been launched for it. An in-progress task can be busy uploading, but it can
-    /// also be waiting on the `concurrency_limiter` Semaphore in S3Bucket, or it can
-    /// be waiting for retry in `exponential_backoff`.
-    inprogress_tasks: HashMap<u64, Arc<UploadTask>>,
-
-    /// Queued operations that have not been launched yet. They might depend on previous
-    /// tasks to finish. For example, metadata upload cannot be performed before all
-    /// preceding layer file uploads have completed.
-    queued_operations: VecDeque<UploadOp>,
-}
-
-struct UploadQueueStopped {
-    last_uploaded_consistent_lsn: Lsn,
-}
-
-impl UploadQueue {
-    fn initialize_empty_remote(
-        &mut self,
-        metadata: &TimelineMetadata,
-    ) -> anyhow::Result<&mut UploadQueueInitialized> {
-        match self {
-            UploadQueue::Uninitialized => (),
-            UploadQueue::Initialized(_) | UploadQueue::Stopped(_) => {
-                anyhow::bail!("already initialized, state {}", self.as_str())
-            }
-        }
-
-        info!("initializing upload queue for empty remote");
-
-        let state = UploadQueueInitialized {
-            // As described in the doc comment, it's ok for `latest_files` and `latest_metadata` to be ahead.
-            latest_files: HashMap::new(),
-            latest_files_changes_since_metadata_upload_scheduled: 0,
-            latest_metadata: metadata.clone(),
-            // We haven't uploaded anything yet, so, `last_uploaded_consistent_lsn` must be 0 to prevent
-            // safekeepers from garbage-collecting anything.
-            last_uploaded_consistent_lsn: Lsn(0),
-            // what follows are boring default initializations
-            task_counter: 0,
-            num_inprogress_layer_uploads: 0,
-            num_inprogress_metadata_uploads: 0,
-            num_inprogress_deletions: 0,
-            inprogress_tasks: HashMap::new(),
-            queued_operations: VecDeque::new(),
-        };
-
-        *self = UploadQueue::Initialized(state);
-        Ok(self.initialized_mut().expect("we just set it"))
-    }
-
-    fn initialize_with_current_remote_index_part(
-        &mut self,
-        index_part: &IndexPart,
-    ) -> anyhow::Result<&mut UploadQueueInitialized> {
-        match self {
-            UploadQueue::Uninitialized => (),
-            UploadQueue::Initialized(_) | UploadQueue::Stopped(_) => {
-                anyhow::bail!("already initialized, state {}", self.as_str())
-            }
-        }
-
-        let mut files = HashMap::with_capacity(index_part.timeline_layers.len());
-        for layer_name in &index_part.timeline_layers {
-            let layer_metadata = index_part
-                .layer_metadata
-                .get(layer_name)
-                .map(LayerFileMetadata::from)
-                .unwrap_or(LayerFileMetadata::MISSING);
-            files.insert(layer_name.to_owned(), layer_metadata);
-        }
-
-        let index_part_metadata = index_part.parse_metadata()?;
-        info!(
-            "initializing upload queue with remote index_part.disk_consistent_lsn: {}",
-            index_part_metadata.disk_consistent_lsn()
-        );
-
-        let state = UploadQueueInitialized {
-            latest_files: files,
-            latest_files_changes_since_metadata_upload_scheduled: 0,
-            latest_metadata: index_part_metadata.clone(),
-            last_uploaded_consistent_lsn: index_part_metadata.disk_consistent_lsn(),
-            // what follows are boring default initializations
-            task_counter: 0,
-            num_inprogress_layer_uploads: 0,
-            num_inprogress_metadata_uploads: 0,
-            num_inprogress_deletions: 0,
-            inprogress_tasks: HashMap::new(),
-            queued_operations: VecDeque::new(),
-        };
-
-        *self = UploadQueue::Initialized(state);
-        Ok(self.initialized_mut().expect("we just set it"))
-    }
-
-    fn initialized_mut(&mut self) -> anyhow::Result<&mut UploadQueueInitialized> {
-        match self {
-            UploadQueue::Uninitialized | UploadQueue::Stopped(_) => {
-                anyhow::bail!("queue is in state {}", self.as_str())
-            }
-            UploadQueue::Initialized(x) => Ok(x),
-        }
-    }
-}
-
-/// An in-progress upload or delete task.
-#[derive(Debug)]
-struct UploadTask {
-    /// Unique ID of this task. Used as the key in `inprogress_tasks` above.
-    task_id: u64,
-    retries: AtomicU32,
-
-    op: UploadOp,
-}
-
-#[derive(Debug)]
-enum UploadOp {
-    /// Upload a layer file
-    UploadLayer(LayerFileName, LayerFileMetadata),
-
-    /// Upload the metadata file
-    UploadMetadata(IndexPart, Lsn),
-
-    /// Delete a file.
-    Delete(RemoteOpFileKind, LayerFileName),
-
-    /// Barrier. When the barrier operation is reached,
-    Barrier(tokio::sync::watch::Sender<()>),
-}
-
-impl std::fmt::Display for UploadOp {
-    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
-        match self {
-            UploadOp::UploadLayer(path, metadata) => {
-                write!(
-                    f,
-                    "UploadLayer({}, size={:?})",
-                    path.file_name(),
-                    metadata.file_size()
-                )
-            }
-            UploadOp::UploadMetadata(_, lsn) => write!(f, "UploadMetadata(lsn: {})", lsn),
-            UploadOp::Delete(_, path) => write!(f, "Delete({})", path.file_name()),
-            UploadOp::Barrier(_) => write!(f, "Barrier"),
-        }
-    }
-}
-
 impl RemoteTimelineClient {
+    ///
+    /// Create a remote storage client for given timeline
+    ///
+    /// Note: the caller must initialize the upload queue before any uploads can be scheduled,
+    /// by calling init_upload_queue.
+    ///
+    pub fn new(
+        remote_storage: GenericRemoteStorage,
+        conf: &'static PageServerConf,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+    ) -> anyhow::Result<RemoteTimelineClient> {
+        Ok(RemoteTimelineClient {
+            conf,
+            runtime: &BACKGROUND_RUNTIME,
+            tenant_id,
+            timeline_id,
+            storage_impl: remote_storage,
+            upload_queue: Mutex::new(UploadQueue::Uninitialized),
+            metrics: Arc::new(RemoteTimelineClientMetrics::new(&tenant_id, &timeline_id)),
+        })
+    }
+
     /// Initialize the upload queue for a remote storage that already received
     /// an index file upload, i.e., it's not empty.
     /// The given `index_part` must be the one on the remote.
@@ -1156,29 +980,6 @@ impl RemoteTimelineClient {
     }
 }
 
-///
-/// Create a remote storage client for given timeline
-///
-/// Note: the caller must initialize the upload queue before any uploads can be scheduled,
-/// by calling init_upload_queue.
-///
-pub fn create_remote_timeline_client(
-    remote_storage: GenericRemoteStorage,
-    conf: &'static PageServerConf,
-    tenant_id: TenantId,
-    timeline_id: TimelineId,
-) -> anyhow::Result<RemoteTimelineClient> {
-    Ok(RemoteTimelineClient {
-        conf,
-        runtime: &BACKGROUND_RUNTIME,
-        tenant_id,
-        timeline_id,
-        storage_impl: remote_storage,
-        upload_queue: Mutex::new(UploadQueue::Uninitialized),
-        metrics: Arc::new(RemoteTimelineClientMetrics::new(&tenant_id, &timeline_id)),
-    })
-}
-
 #[cfg(test)]
 mod tests {
     use super::*;
diff --git a/pageserver/src/tenant/storage_sync/delete.rs b/pageserver/src/tenant/remote_timeline_client/delete.rs
similarity index 100%
rename from pageserver/src/tenant/storage_sync/delete.rs
rename to pageserver/src/tenant/remote_timeline_client/delete.rs
diff --git a/pageserver/src/tenant/storage_sync/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs
similarity index 100%
rename from pageserver/src/tenant/storage_sync/download.rs
rename to pageserver/src/tenant/remote_timeline_client/download.rs
diff --git a/pageserver/src/tenant/storage_sync/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs
similarity index 100%
rename from pageserver/src/tenant/storage_sync/index.rs
rename to pageserver/src/tenant/remote_timeline_client/index.rs
diff --git a/pageserver/src/tenant/storage_sync/upload.rs b/pageserver/src/tenant/remote_timeline_client/upload.rs
similarity index 97%
rename from pageserver/src/tenant/storage_sync/upload.rs
rename to pageserver/src/tenant/remote_timeline_client/upload.rs
index 08cea6268b..5082fa1634 100644
--- a/pageserver/src/tenant/storage_sync/upload.rs
+++ b/pageserver/src/tenant/remote_timeline_client/upload.rs
@@ -5,7 +5,7 @@ use fail::fail_point;
 use std::path::Path;
 use tokio::fs;
 
-use crate::{config::PageServerConf, tenant::storage_sync::index::IndexPart};
+use crate::{config::PageServerConf, tenant::remote_timeline_client::index::IndexPart};
 use remote_storage::GenericRemoteStorage;
 use utils::id::{TenantId, TimelineId};
 
diff --git a/pageserver/src/tenant/storage_layer/remote_layer.rs b/pageserver/src/tenant/storage_layer/remote_layer.rs
index c2c11d7bff..33474bb4a2 100644
--- a/pageserver/src/tenant/storage_layer/remote_layer.rs
+++ b/pageserver/src/tenant/storage_layer/remote_layer.rs
@@ -3,8 +3,8 @@
 //!
 use crate::config::PageServerConf;
 use crate::repository::Key;
+use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
 use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
-use crate::tenant::storage_sync::index::LayerFileMetadata;
 use anyhow::{bail, Result};
 use std::ops::Range;
 use std::path::PathBuf;
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index bbfcad5734..93eb643d12 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -23,11 +23,11 @@ use std::sync::atomic::{AtomicI64, Ordering as AtomicOrdering};
 use std::sync::{Arc, Mutex, MutexGuard, RwLock, Weak};
 use std::time::{Duration, Instant, SystemTime};
 
+use crate::tenant::remote_timeline_client::{self, index::LayerFileMetadata};
 use crate::tenant::storage_layer::{
     DeltaFileName, DeltaLayerWriter, ImageFileName, ImageLayerWriter, InMemoryLayer, LayerFileName,
     RemoteLayer,
 };
-use crate::tenant::storage_sync::{self, index::LayerFileMetadata};
 use crate::tenant::{
     ephemeral_file::is_ephemeral_file,
     layer_map::{LayerMap, SearchResult},
@@ -64,9 +64,9 @@ use crate::METADATA_FILE_NAME;
 use crate::ZERO_PAGE;
 use crate::{is_temporary, task_mgr};
 
+use super::remote_timeline_client::index::IndexPart;
+use super::remote_timeline_client::RemoteTimelineClient;
 use super::storage_layer::{DeltaLayer, ImageLayer, Layer};
-use super::storage_sync::index::IndexPart;
-use super::storage_sync::RemoteTimelineClient;
 
 #[derive(Debug, PartialEq, Eq, Clone, Copy)]
 enum FlushLoopState {
@@ -1122,7 +1122,7 @@ impl Timeline {
                 num_layers += 1;
             } else if fname == METADATA_FILE_NAME || fname.ends_with(".old") {
                 // ignore these
-            } else if storage_sync::is_temp_download_file(&direntry_path) {
+            } else if remote_timeline_client::is_temp_download_file(&direntry_path) {
                 info!(
                     "skipping temp download file, reconcile_with_remote will resume / clean up: {}",
                     fname
diff --git a/pageserver/src/tenant/upload_queue.rs b/pageserver/src/tenant/upload_queue.rs
new file mode 100644
index 0000000000..790b2f59aa
--- /dev/null
+++ b/pageserver/src/tenant/upload_queue.rs
@@ -0,0 +1,213 @@
+use crate::metrics::RemoteOpFileKind;
+
+use super::storage_layer::LayerFileName;
+use crate::tenant::metadata::TimelineMetadata;
+use crate::tenant::remote_timeline_client::index::IndexPart;
+use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
+use std::collections::{HashMap, VecDeque};
+use std::fmt::Debug;
+
+use std::sync::Arc;
+use tracing::info;
+
+use std::sync::atomic::AtomicU32;
+use utils::lsn::Lsn;
+
+// clippy warns that Uninitialized is much smaller than Initialized, which wastes
+// memory for Uninitialized variants. Doesn't matter in practice, there are not
+// that many upload queues in a running pageserver, and most of them are initialized
+// anyway.
+#[allow(clippy::large_enum_variant)]
+pub(crate) enum UploadQueue {
+    Uninitialized,
+    Initialized(UploadQueueInitialized),
+    Stopped(UploadQueueStopped),
+}
+
+impl UploadQueue {
+    fn as_str(&self) -> &'static str {
+        match self {
+            UploadQueue::Uninitialized => "Uninitialized",
+            UploadQueue::Initialized(_) => "Initialized",
+            UploadQueue::Stopped(_) => "Stopped",
+        }
+    }
+}
+
+/// This keeps track of queued and in-progress tasks.
+pub(crate) struct UploadQueueInitialized {
+    /// Counter to assign task IDs
+    pub(crate) task_counter: u64,
+
+    /// All layer files stored in the remote storage, taking into account all
+    /// in-progress and queued operations
+    pub(crate) latest_files: HashMap<LayerFileName, LayerFileMetadata>,
+
+    /// How many file uploads or deletions have been scheduled since the
+    /// last (scheduling of) metadata index upload?
+    pub(crate) latest_files_changes_since_metadata_upload_scheduled: u64,
+
+    /// Metadata stored in the remote storage, taking into account all
+    /// in-progress and queued operations.
+    /// DANGER: do not return to outside world, e.g., safekeepers.
+    pub(crate) latest_metadata: TimelineMetadata,
+
+    /// `disk_consistent_lsn` from the last metadata file that was successfully
+    /// uploaded. `Lsn(0)` if nothing was uploaded yet.
+    /// Unlike `latest_files` or `latest_metadata`, this value is never ahead.
+    /// Safekeeper can rely on it to make decisions for WAL storage.
+    pub(crate) last_uploaded_consistent_lsn: Lsn,
+
+    // Breakdown of different kinds of tasks currently in-progress
+    pub(crate) num_inprogress_layer_uploads: usize,
+    pub(crate) num_inprogress_metadata_uploads: usize,
+    pub(crate) num_inprogress_deletions: usize,
+
+    /// Tasks that are currently in-progress. In-progress means that a tokio Task
+    /// has been launched for it. An in-progress task can be busy uploading, but it can
+    /// also be waiting on the `concurrency_limiter` Semaphore in S3Bucket, or it can
+    /// be waiting for retry in `exponential_backoff`.
+    pub(crate) inprogress_tasks: HashMap<u64, Arc<UploadTask>>,
+
+    /// Queued operations that have not been launched yet. They might depend on previous
+    /// tasks to finish. For example, metadata upload cannot be performed before all
+    /// preceding layer file uploads have completed.
+    pub(crate) queued_operations: VecDeque<UploadOp>,
+}
+
+pub(crate) struct UploadQueueStopped {
+    pub(crate) last_uploaded_consistent_lsn: Lsn,
+}
+
+impl UploadQueue {
+    pub(crate) fn initialize_empty_remote(
+        &mut self,
+        metadata: &TimelineMetadata,
+    ) -> anyhow::Result<&mut UploadQueueInitialized> {
+        match self {
+            UploadQueue::Uninitialized => (),
+            UploadQueue::Initialized(_) | UploadQueue::Stopped(_) => {
+                anyhow::bail!("already initialized, state {}", self.as_str())
+            }
+        }
+
+        info!("initializing upload queue for empty remote");
+
+        let state = UploadQueueInitialized {
+            // As described in the doc comment, it's ok for `latest_files` and `latest_metadata` to be ahead.
+            latest_files: HashMap::new(),
+            latest_files_changes_since_metadata_upload_scheduled: 0,
+            latest_metadata: metadata.clone(),
+            // We haven't uploaded anything yet, so, `last_uploaded_consistent_lsn` must be 0 to prevent
+            // safekeepers from garbage-collecting anything.
+            last_uploaded_consistent_lsn: Lsn(0),
+            // what follows are boring default initializations
+            task_counter: 0,
+            num_inprogress_layer_uploads: 0,
+            num_inprogress_metadata_uploads: 0,
+            num_inprogress_deletions: 0,
+            inprogress_tasks: HashMap::new(),
+            queued_operations: VecDeque::new(),
+        };
+
+        *self = UploadQueue::Initialized(state);
+        Ok(self.initialized_mut().expect("we just set it"))
+    }
+
+    pub(crate) fn initialize_with_current_remote_index_part(
+        &mut self,
+        index_part: &IndexPart,
+    ) -> anyhow::Result<&mut UploadQueueInitialized> {
+        match self {
+            UploadQueue::Uninitialized => (),
+            UploadQueue::Initialized(_) | UploadQueue::Stopped(_) => {
+                anyhow::bail!("already initialized, state {}", self.as_str())
+            }
+        }
+
+        let mut files = HashMap::with_capacity(index_part.timeline_layers.len());
+        for layer_name in &index_part.timeline_layers {
+            let layer_metadata = index_part
+                .layer_metadata
+                .get(layer_name)
+                .map(LayerFileMetadata::from)
+                .unwrap_or(LayerFileMetadata::MISSING);
+            files.insert(layer_name.to_owned(), layer_metadata);
+        }
+
+        let index_part_metadata = index_part.parse_metadata()?;
+        info!(
+            "initializing upload queue with remote index_part.disk_consistent_lsn: {}",
+            index_part_metadata.disk_consistent_lsn()
+        );
+
+        let state = UploadQueueInitialized {
+            latest_files: files,
+            latest_files_changes_since_metadata_upload_scheduled: 0,
+            latest_metadata: index_part_metadata.clone(),
+            last_uploaded_consistent_lsn: index_part_metadata.disk_consistent_lsn(),
+            // what follows are boring default initializations
+            task_counter: 0,
+            num_inprogress_layer_uploads: 0,
+            num_inprogress_metadata_uploads: 0,
+            num_inprogress_deletions: 0,
+            inprogress_tasks: HashMap::new(),
+            queued_operations: VecDeque::new(),
+        };
+
+        *self = UploadQueue::Initialized(state);
+        Ok(self.initialized_mut().expect("we just set it"))
+    }
+
+    pub(crate) fn initialized_mut(&mut self) -> anyhow::Result<&mut UploadQueueInitialized> {
+        match self {
+            UploadQueue::Uninitialized | UploadQueue::Stopped(_) => {
+                anyhow::bail!("queue is in state {}", self.as_str())
+            }
+            UploadQueue::Initialized(x) => Ok(x),
+        }
+    }
+}
+
+/// An in-progress upload or delete task.
+#[derive(Debug)]
+pub(crate) struct UploadTask {
+    /// Unique ID of this task. Used as the key in `inprogress_tasks` above.
+    pub(crate) task_id: u64,
+    pub(crate) retries: AtomicU32,
+
+    pub(crate) op: UploadOp,
+}
+
+#[derive(Debug)]
+pub(crate) enum UploadOp {
+    /// Upload a layer file
+    UploadLayer(LayerFileName, LayerFileMetadata),
+
+    /// Upload the metadata file
+    UploadMetadata(IndexPart, Lsn),
+
+    /// Delete a file.
+    Delete(RemoteOpFileKind, LayerFileName),
+
+    /// Barrier. Holds a watch-channel sender, presumably used to notify waiters once the barrier operation is reached.
+    Barrier(tokio::sync::watch::Sender<()>),
+}
+
+impl std::fmt::Display for UploadOp {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        match self {
+            UploadOp::UploadLayer(path, metadata) => {
+                write!(
+                    f,
+                    "UploadLayer({}, size={:?})",
+                    path.file_name(),
+                    metadata.file_size()
+                )
+            }
+            UploadOp::UploadMetadata(_, lsn) => write!(f, "UploadMetadata(lsn: {})", lsn),
+            UploadOp::Delete(_, path) => write!(f, "Delete({})", path.file_name()),
+            UploadOp::Barrier(_) => write!(f, "Barrier"),
+        }
+    }
+}

From 42c6ddef8edcd1e0dae8cba5a4c8c1f2b3d70589 Mon Sep 17 00:00:00 2001
From: Egor Suvorov <egor@neon.tech>
Date: Wed, 28 Dec 2022 20:52:24 +0200
Subject: [PATCH 079/132] Rename ZENITH_AUTH_TOKEN to NEON_AUTH_TOKEN

Changes are:

* Pageserver: start reading from NEON_AUTH_TOKEN by default.
  Warn if ZENITH_AUTH_TOKEN is used instead.
* Compute, Docs: fix the default token name.
* Control plane: change name of the token in configs and start
  sequences.

Compatibility:

* Control plane in tests: works, no compatibility expected.
* Control plane for local installations: never officially supported
  auth anyways. If someone did enable it, `pageserver.toml` should be updated
  with the new `neon.pageserver_connstring` and `neon.safekeeper_token_env`.
* Pageserver is backward compatible: you can run new Pageserver with old
  commands and environment configurations, but not vice-versa.
  The culprit is the hard-coded `NEON_AUTH_TOKEN`.
* Compute has no code changes. As long as you update its configuration
  file with `pageserver_connstring` in sync with the start up scripts,
  you are good to go.
* Safekeeper has no code changes and has never used `ZENITH_AUTH_TOKEN` in
  the first place.
---
 control_plane/src/compute.rs     | 12 ++++++------
 control_plane/src/pageserver.rs  |  2 +-
 docs/authentication.md           |  6 +++---
 pageserver/src/bin/pageserver.rs | 26 +++++++++++++++++++++-----
 pgxn/neon/libpagestore.c         |  2 +-
 5 files changed, 32 insertions(+), 16 deletions(-)

diff --git a/control_plane/src/compute.rs b/control_plane/src/compute.rs
index 547aa14d39..8731cf2583 100644
--- a/control_plane/src/compute.rs
+++ b/control_plane/src/compute.rs
@@ -201,7 +201,7 @@ impl PostgresNode {
             .stderr(Stdio::piped());
 
         if let Some(token) = auth_token {
-            cmd.env("ZENITH_AUTH_TOKEN", token);
+            cmd.env("NEON_AUTH_TOKEN", token);
         }
 
         let sync_handle = cmd
@@ -304,17 +304,17 @@ impl PostgresNode {
 
             // Set up authentication
             //
-            // $ZENITH_AUTH_TOKEN will be replaced with value from environment
+            // $NEON_AUTH_TOKEN will be replaced with value from environment
             // variable during compute pg startup. It is done this way because
             // otherwise user will be able to retrieve the value using SHOW
             // command or pg_settings
             let password = if let AuthType::NeonJWT = auth_type {
-                "$ZENITH_AUTH_TOKEN"
+                "$NEON_AUTH_TOKEN"
             } else {
                 ""
             };
             // NOTE avoiding spaces in connection string, because it is less error prone if we forward it somewhere.
-            // Also note that not all parameters are supported here. Because in compute we substitute $ZENITH_AUTH_TOKEN
+            // Also note that not all parameters are supported here. Because in compute we substitute $NEON_AUTH_TOKEN
             // We parse this string and build it back with token from env var, and for simplicity rebuild
             // uses only needed variables namely host, port, user, password.
             format!("postgresql://no_user:{password}@{host}:{port}")
@@ -323,7 +323,7 @@ impl PostgresNode {
         conf.append_line("");
         conf.append("neon.pageserver_connstring", &pageserver_connstr);
         if let AuthType::NeonJWT = auth_type {
-            conf.append("neon.safekeeper_token_env", "$ZENITH_AUTH_TOKEN");
+            conf.append("neon.safekeeper_token_env", "$NEON_AUTH_TOKEN");
         }
         conf.append("neon.tenant_id", &self.tenant_id.to_string());
         conf.append("neon.timeline_id", &self.timeline_id.to_string());
@@ -448,7 +448,7 @@ impl PostgresNode {
             self.env.pg_lib_dir(self.pg_version)?.to_str().unwrap(),
         );
         if let Some(token) = auth_token {
-            cmd.env("ZENITH_AUTH_TOKEN", token);
+            cmd.env("NEON_AUTH_TOKEN", token);
         }
 
         let pg_ctl = cmd.output().context("pg_ctl failed")?;
diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs
index 0c2415965a..68e94b2fdc 100644
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -320,7 +320,7 @@ impl PageServerNode {
             let token = self
                 .env
                 .generate_auth_token(&Claims::new(None, Scope::SafekeeperData))?;
-            vec![("ZENITH_AUTH_TOKEN".to_owned(), token)]
+            vec![("NEON_AUTH_TOKEN".to_owned(), token)]
         } else {
             Vec::new()
         })
diff --git a/docs/authentication.md b/docs/authentication.md
index 0752fae19f..e22d7b700f 100644
--- a/docs/authentication.md
+++ b/docs/authentication.md
@@ -65,7 +65,7 @@ There is no administrative API except those provided by PostgreSQL.
 
 #### Outgoing connections
 Compute connects to Pageserver for getting pages.
-The connection string is configured by the `neon.pageserver_connstring` PostgreSQL GUC, e.g. `postgresql://no_user:$ZENITH_AUTH_TOKEN@localhost:15028`.
+The connection string is configured by the `neon.pageserver_connstring` PostgreSQL GUC, e.g. `postgresql://no_user:$NEON_AUTH_TOKEN@localhost:15028`.
 The environment variable inside the connection string is substituted with
 the JWT token.
 
@@ -77,7 +77,7 @@ If the GUC is unset, no token is passed.
 
 Note that both tokens can be (and typically are) the same;
 the scope is the tenant and the token is usually passed through the
-`$ZENITH_AUTH_TOKEN` environment variable.
+`$NEON_AUTH_TOKEN` environment variable.
 
 ### Pageserver
 #### Overview
@@ -114,7 +114,7 @@ either of three values:
 Pageserver makes a connection to a Safekeeper for each active timeline.
 As Pageserver may want to access any timeline it has on the disk,
 it is given a blanket JWT token to access any data on any Safekeeper.
-This token is passed through an environment variable called `ZENITH_AUTH_TOKEN`
+This token is passed through an environment variable called `NEON_AUTH_TOKEN`
 (non-configurable as of writing this text).
 
 A better way _may be_ to store JWT token for each timeline next to it,
diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs
index 2b4dcc68f0..5246541375 100644
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -264,19 +264,35 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
     };
     info!("Using auth: {:#?}", conf.auth_type);
 
-    match var("ZENITH_AUTH_TOKEN") {
-        Ok(v) => {
+    // TODO: remove ZENITH_AUTH_TOKEN once it's not used anywhere in development/staging/prod configuration.
+    match (var("ZENITH_AUTH_TOKEN"), var("NEON_AUTH_TOKEN")) {
+        (old, Ok(v)) => {
             info!("Loaded JWT token for authentication with Safekeeper");
+            if let Ok(v_old) = old {
+                warn!(
+                    "JWT token for Safekeeper is specified twice, ZENITH_AUTH_TOKEN is deprecated"
+                );
+                if v_old != v {
+                    warn!("JWT token for Safekeeper has two different values, choosing NEON_AUTH_TOKEN");
+                }
+            }
             pageserver::config::SAFEKEEPER_AUTH_TOKEN
                 .set(Arc::new(v))
                 .map_err(|_| anyhow!("Could not initialize SAFEKEEPER_AUTH_TOKEN"))?;
         }
-        Err(VarError::NotPresent) => {
+        (Ok(v), _) => {
+            info!("Loaded JWT token for authentication with Safekeeper");
+            warn!("Please update pageserver configuration: the JWT token should be NEON_AUTH_TOKEN, not ZENITH_AUTH_TOKEN");
+            pageserver::config::SAFEKEEPER_AUTH_TOKEN
+                .set(Arc::new(v))
+                .map_err(|_| anyhow!("Could not initialize SAFEKEEPER_AUTH_TOKEN"))?;
+        }
+        (_, Err(VarError::NotPresent)) => {
             info!("No JWT token for authentication with Safekeeper detected");
         }
-        Err(e) => {
+        (_, Err(e)) => {
             return Err(e).with_context(|| {
-                "Failed to either load to detect non-present ZENITH_AUTH_TOKEN environment variable"
+                "Failed to either load to detect non-present NEON_AUTH_TOKEN environment variable"
             })
         }
     };
diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c
index 5f134e3924..c6199dddc0 100644
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -420,7 +420,7 @@ pg_init_libpagestore(void)
 							   NULL, NULL, NULL);
 
     DefineCustomStringVariable("neon.safekeeper_token_env",
-                               "the environment variable containing JWT token for authentication with Safekeepers, the convention is to either unset or set to $ZENITH_AUTH_TOKEN",
+                               "the environment variable containing JWT token for authentication with Safekeepers, the convention is to either unset or set to $NEON_AUTH_TOKEN",
                                NULL,
                                &safekeeper_token_env,
                                NULL,

From bd7a9e6274225eb1346811e661f6afb66d17a591 Mon Sep 17 00:00:00 2001
From: Dmitry Rodionov <dmitry@neon.tech>
Date: Wed, 28 Dec 2022 17:48:49 +0200
Subject: [PATCH 080/132] switch to debug from info to produce less noise

---
 pageserver/src/tenant/remote_timeline_client.rs | 6 +++---
 pageserver/src/tenant/timeline.rs               | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index e27b0a8133..45988ff47a 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -214,7 +214,7 @@ use anyhow::ensure;
 use remote_storage::{DownloadError, GenericRemoteStorage};
 use std::ops::DerefMut;
 use tokio::runtime::Runtime;
-use tracing::{info, warn};
+use tracing::{debug, info, warn};
 use tracing::{info_span, Instrument};
 use utils::lsn::Lsn;
 
@@ -675,7 +675,7 @@ impl RemoteTimelineClient {
             // We can launch this task. Remove it from the queue first.
             let next_op = upload_queue.queued_operations.pop_front().unwrap();
 
-            info!("starting op: {}", next_op);
+            debug!("starting op: {}", next_op);
 
             // Update the counters
             match next_op {
@@ -867,7 +867,7 @@ impl RemoteTimelineClient {
                 task.op, retries
             );
         } else {
-            info!("remote task {} completed successfully", task.op);
+            debug!("remote task {} completed successfully", task.op);
         }
 
         // The task has completed succesfully. Remove it from the in-progress list.
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 93eb643d12..137c38ca85 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -2593,7 +2593,7 @@ impl Timeline {
         // See storage_sync module level comment on consistency.
         // Do it here because we don't want to hold self.layers.write() while waiting.
         if let Some(remote_client) = &self.remote_client {
-            info!("waiting for upload ops to complete");
+            debug!("waiting for upload ops to complete");
             remote_client
                 .wait_completion()
                 .await
@@ -2807,7 +2807,7 @@ impl Timeline {
         // See storage_sync module level comment on consistency.
         // Do it here because we don't want to hold self.layers.write() while waiting.
         if let Some(remote_client) = &self.remote_client {
-            info!("waiting for upload ops to complete");
+            debug!("waiting for upload ops to complete");
             remote_client
                 .wait_completion()
                 .await

From f731e9b3de1089f4bc5d6fb683395eb428b516fe Mon Sep 17 00:00:00 2001
From: Anastasia Lubennikova <anastasia@neon.tech>
Date: Thu, 29 Dec 2022 12:11:04 +0200
Subject: [PATCH 081/132] Fix serialization of billing metrics (#3215)

Fixes:
- serialize TenantId and TimelineId as strings,
- skip TimelineId if none
- serialize `metric_type` field as `type`
- add `idempotency_key` field to uniquely identify metrics
---
 pageserver/src/billing_metrics.rs | 50 +++++++++++++++++++++++--------
 pageserver/src/bin/pageserver.rs  |  1 +
 2 files changed, 38 insertions(+), 13 deletions(-)

diff --git a/pageserver/src/billing_metrics.rs b/pageserver/src/billing_metrics.rs
index 73e27618db..3a6b83773d 100644
--- a/pageserver/src/billing_metrics.rs
+++ b/pageserver/src/billing_metrics.rs
@@ -6,6 +6,7 @@
 
 use anyhow;
 use tracing::*;
+use utils::id::NodeId;
 use utils::id::TimelineId;
 
 use crate::task_mgr;
@@ -14,12 +15,14 @@ use pageserver_api::models::TenantState;
 use utils::id::TenantId;
 
 use serde::{Deserialize, Serialize};
+use serde_with::{serde_as, DisplayFromStr};
 use std::collections::HashMap;
 use std::fmt;
 use std::str::FromStr;
 use std::time::Duration;
 
 use chrono::{DateTime, Utc};
+use rand::Rng;
 use reqwest::Url;
 
 /// BillingMetric struct that defines the format for one metric entry
@@ -30,27 +33,36 @@ use reqwest::Url;
 /// "metric": "remote_storage_size",
 /// "type": "absolute",
 /// "tenant_id": "5d07d9ce9237c4cd845ea7918c0afa7d",
-/// "timeline_id": "00000000000000000000000000000000",
-/// "time": ...,
+/// "timeline_id": "a03ebb4f5922a1c56ff7485cc8854143",
+/// "time": "2022-12-28T11:07:19.317310284Z",
+/// "idempotency_key": "2022-12-28 11:07:19.317310324 UTC-1-4019",
 /// "value": 12345454,
 /// }
 /// ```
+#[serde_as]
 #[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]
 pub struct BillingMetric {
     pub metric: BillingMetricKind,
+    #[serde(rename = "type")]
     pub metric_type: &'static str,
+    #[serde_as(as = "DisplayFromStr")]
     pub tenant_id: TenantId,
+    #[serde_as(as = "Option<DisplayFromStr>")]
+    #[serde(skip_serializing_if = "Option::is_none")]
     pub timeline_id: Option<TimelineId>,
     pub time: DateTime<Utc>,
+    pub idempotency_key: String,
     pub value: u64,
 }
 
 impl BillingMetric {
-    pub fn new_absolute(
+    pub fn new_absolute<R: Rng + ?Sized>(
         metric: BillingMetricKind,
         tenant_id: TenantId,
         timeline_id: Option<TimelineId>,
         value: u64,
+        node_id: NodeId,
+        rng: &mut R,
     ) -> Self {
         Self {
             metric,
@@ -58,6 +70,8 @@ impl BillingMetric {
             tenant_id,
             timeline_id,
             time: Utc::now(),
+            // key that allows metric collector to distinguish unique events
+            idempotency_key: format!("{}-{}-{:04}", Utc::now(), node_id, rng.gen_range(0..=9999)),
             value,
         }
     }
@@ -123,6 +137,7 @@ struct EventChunk<'a> {
 pub async fn collect_metrics(
     metric_collection_endpoint: &Url,
     metric_collection_interval: Duration,
+    node_id: NodeId,
 ) -> anyhow::Result<()> {
     let mut ticker = tokio::time::interval(metric_collection_interval);
 
@@ -139,7 +154,7 @@ pub async fn collect_metrics(
                 return Ok(());
             },
             _ = ticker.tick() => {
-                collect_metrics_task(&client, &mut cached_metrics, metric_collection_endpoint).await?;
+                collect_metrics_task(&client, &mut cached_metrics, metric_collection_endpoint, node_id).await?;
             }
         }
     }
@@ -153,6 +168,7 @@ pub async fn collect_metrics_task(
     client: &reqwest::Client,
     cached_metrics: &mut HashMap<BillingMetricsKey, u64>,
     metric_collection_endpoint: &reqwest::Url,
+    node_id: NodeId,
 ) -> anyhow::Result<()> {
     let mut current_metrics: Vec<(BillingMetricsKey, u64)> = Vec::new();
     trace!(
@@ -241,15 +257,23 @@ pub async fn collect_metrics_task(
 
     for chunk in chunks {
         chunk_to_send.clear();
-        // enrich metrics with timestamp and metric_kind before sending
-        chunk_to_send.extend(chunk.iter().map(|(curr_key, curr_val)| {
-            BillingMetric::new_absolute(
-                curr_key.metric,
-                curr_key.tenant_id,
-                curr_key.timeline_id,
-                *curr_val,
-            )
-        }));
+
+        // this code block is needed to convince compiler
+        // that rng is not reused around an await point
+        {
+            // enrich metrics with timestamp and metric_kind before sending
+            let mut rng = rand::thread_rng();
+            chunk_to_send.extend(chunk.iter().map(|(curr_key, curr_val)| {
+                BillingMetric::new_absolute(
+                    curr_key.metric,
+                    curr_key.tenant_id,
+                    curr_key.timeline_id,
+                    *curr_val,
+                    node_id,
+                    &mut rng,
+                )
+            }));
+        }
 
         let chunk_json = serde_json::value::to_raw_value(&EventChunk {
             events: &chunk_to_send,
diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs
index 5246541375..4b71874bdf 100644
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -341,6 +341,7 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
                     pageserver::billing_metrics::collect_metrics(
                         metric_collection_endpoint,
                         conf.metric_collection_interval,
+                        conf.id,
                     )
                     .instrument(info_span!("metrics_collection"))
                     .await?;

From 0e7c03370e9df37ee9fea7a667d6fbb2885aec08 Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik <knizhnik@garret.ru>
Date: Thu, 29 Dec 2022 12:20:28 +0200
Subject: [PATCH 082/132] Lazy calculation of traversal_id which is needed only
 for error reporting (#3221)

See
https://neondb.slack.com/archives/C0277TKAJCA/p1672245908989789
and
https://neondb.slack.com/archives/C033RQ5SPDH/p1671885245981359
---
 pageserver/src/tenant/timeline.rs | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 137c38ca85..951f217cf9 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -1588,7 +1588,7 @@ trait TraversalLayerExt {
 }
 
 impl TraversalLayerExt for Arc<dyn PersistentLayer> {
-    fn traversal_id(&self) -> String {
+    fn traversal_id(&self) -> TraversalId {
         match self.local_path() {
             Some(local_path) => {
                 debug_assert!(local_path.to_str().unwrap().contains(&format!("{}", self.get_timeline_id())),
@@ -1608,7 +1608,7 @@ impl TraversalLayerExt for Arc<dyn PersistentLayer> {
 }
 
 impl TraversalLayerExt for Arc<InMemoryLayer> {
-    fn traversal_id(&self) -> String {
+    fn traversal_id(&self) -> TraversalId {
         format!(
             "timeline {} in-memory {}",
             self.get_timeline_id(),
@@ -1638,7 +1638,8 @@ impl Timeline {
 
         // For debugging purposes, collect the path of layers that we traversed
         // through. It's included in the error message if we fail to find the key.
-        let mut traversal_path = Vec::<(ValueReconstructResult, Lsn, TraversalId)>::new();
+        let mut traversal_path =
+            Vec::<(ValueReconstructResult, Lsn, Box<dyn TraversalLayerExt>)>::new();
 
         let cached_lsn = if let Some((cached_lsn, _)) = &reconstruct_state.img {
             *cached_lsn
@@ -1726,7 +1727,7 @@ impl Timeline {
                         Err(e) => return PageReconstructResult::from(e),
                     };
                     cont_lsn = lsn_floor;
-                    traversal_path.push((result, cont_lsn, open_layer.traversal_id()));
+                    traversal_path.push((result, cont_lsn, Box::new(open_layer.clone())));
                     continue;
                 }
             }
@@ -1744,7 +1745,7 @@ impl Timeline {
                         Err(e) => return PageReconstructResult::from(e),
                     };
                     cont_lsn = lsn_floor;
-                    traversal_path.push((result, cont_lsn, frozen_layer.traversal_id()));
+                    traversal_path.push((result, cont_lsn, Box::new(frozen_layer.clone())));
                     continue 'outer;
                 }
             }
@@ -1771,7 +1772,7 @@ impl Timeline {
                     Err(e) => return PageReconstructResult::from(e),
                 };
                 cont_lsn = lsn_floor;
-                traversal_path.push((result, cont_lsn, layer.traversal_id()));
+                traversal_path.push((result, cont_lsn, Box::new(layer.clone())));
             } else if timeline.ancestor_timeline.is_some() {
                 // Nothing on this timeline. Traverse to parent
                 result = ValueReconstructResult::Continue;
@@ -3344,7 +3345,7 @@ where
 /// to an error, as anyhow context information.
 fn layer_traversal_error(
     msg: String,
-    path: Vec<(ValueReconstructResult, Lsn, TraversalId)>,
+    path: Vec<(ValueReconstructResult, Lsn, Box<dyn TraversalLayerExt>)>,
 ) -> PageReconstructResult<()> {
     // We want the original 'msg' to be the outermost context. The outermost context
     // is the most high-level information, which also gets propagated to the client.
@@ -3353,7 +3354,9 @@ fn layer_traversal_error(
         .map(|(r, c, l)| {
             format!(
                 "layer traversal: result {:?}, cont_lsn {}, layer: {}",
-                r, c, l,
+                r,
+                c,
+                l.traversal_id(),
             )
         })
         .chain(std::iter::once(msg));

From c0290467fa100c7e2c147e3b804e9a45d108b0b0 Mon Sep 17 00:00:00 2001
From: Shany Pozin <shany@neon.tech>
Date: Thu, 29 Dec 2022 12:33:30 +0200
Subject: [PATCH 083/132] Fix #2907 Remove missing_layers from IndexPart
 (#3217)

#2907
---
 .../src/tenant/remote_timeline_client/index.rs    | 15 +--------------
 .../regress/test_tenants_with_remote_storage.py   | 10 ++++++----
 2 files changed, 7 insertions(+), 18 deletions(-)

diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs
index 017be29726..c199b7e10b 100644
--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -83,11 +83,6 @@ where
     /// Additional metadata can might exist in `layer_metadata`.
     pub timeline_layers: HashSet<L>,
 
-    /// FIXME: unused field. This should be removed, but that changes the on-disk format,
-    /// so we need to make sure we're backwards-` (and maybe forwards-) compatible
-    /// First pass is to move it to Optional and the next would be its removal
-    missing_layers: Option<HashSet<L>>,
-
     /// Per layer file name metadata, which can be present for a present or missing layer file.
     ///
     /// Older versions of `IndexPart` will not have this property or have only a part of metadata
@@ -167,8 +162,6 @@ impl IndexPartUnclean {
         let IndexPartUnclean {
             version,
             timeline_layers,
-            // this is an unused field, ignore it on cleaning
-            missing_layers: _,
             layer_metadata,
             disk_consistent_lsn,
             metadata_bytes,
@@ -189,7 +182,6 @@ impl IndexPartUnclean {
                     }
                 })
                 .collect(),
-            missing_layers: None,
             layer_metadata: layer_metadata
                 .into_iter()
                 .filter_map(|(l, m)| l.into_clean().map(|l| (l, m)))
@@ -225,7 +217,6 @@ impl IndexPart {
         Self {
             version: Self::LATEST_VERSION,
             timeline_layers,
-            missing_layers: Some(HashSet::new()),
             layer_metadata,
             disk_consistent_lsn,
             metadata_bytes,
@@ -259,7 +250,6 @@ mod tests {
     fn v0_indexpart_is_parsed() {
         let example = r#"{
             "timeline_layers":["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"],
-            "missing_layers":["LAYER_FILE_NAME::test/not_a_real_layer_but_adding_coverage"],
             "disk_consistent_lsn":"0/16960E8",
             "metadata_bytes":[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
         }"#;
@@ -267,7 +257,6 @@ mod tests {
         let expected = IndexPart {
             version: 0,
             timeline_layers: HashSet::from(["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap()]),
-            missing_layers: None, // disabled fields should not carry unused values further
             layer_metadata: HashMap::default(),
             disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
             metadata_bytes: [113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
@@ -283,7 +272,6 @@ mod tests {
         let example = r#"{
             "version":1,
             "timeline_layers":["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"],
-            "missing_layers":["LAYER_FILE_NAME::test/not_a_real_layer_but_adding_coverage"],
             "layer_metadata":{
                 "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 },
                 "LAYER_FILE_NAME::test/not_a_real_layer_but_adding_coverage": { "file_size": 9007199254741001 }
@@ -296,7 +284,6 @@ mod tests {
             // note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
             version: 1,
             timeline_layers: HashSet::from(["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap()]),
-            missing_layers: None,
             layer_metadata: HashMap::from([
                 ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
                     file_size: Some(25600000),
@@ -322,6 +309,7 @@ mod tests {
         let example = r#"{
             "version":1,
             "timeline_layers":["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"],
+            "missing_layers":["This shouldn't fail deserialization"],
             "layer_metadata":{
                 "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 },
                 "LAYER_FILE_NAME::test/not_a_real_layer_but_adding_coverage": { "file_size": 9007199254741001 }
@@ -346,7 +334,6 @@ mod tests {
             ]),
             disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
             metadata_bytes: [112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
-            missing_layers: None,
         };
 
         let part = serde_json::from_str::<IndexPartUnclean>(example).unwrap();
diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py
index 6a5b4278da..6da6a4d446 100644
--- a/test_runner/regress/test_tenants_with_remote_storage.py
+++ b/test_runner/regress/test_tenants_with_remote_storage.py
@@ -229,7 +229,7 @@ def test_tenant_upgrades_index_json_from_v0(
         "timeline_layers":[
             "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"
         ],
-        "missing_layers":[],
+        "missing_layers":["This should not fail as its not used anymore"],
         "disk_consistent_lsn":"0/16960E8",
         "metadata_bytes":[]
     }"""
@@ -261,7 +261,6 @@ def test_tenant_upgrades_index_json_from_v0(
     wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, current_lsn)
     pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
     wait_for_upload(pageserver_http, tenant_id, timeline_id, current_lsn)
-
     env.postgres.stop_all()
     env.pageserver.stop()
 
@@ -274,7 +273,10 @@ def test_tenant_upgrades_index_json_from_v0(
         # keep the deserialized for later inspection
         orig_index_part = json.load(timeline_file)
 
-        v0_index_part = {key: orig_index_part[key] for key in v0_skeleton}
+        v0_index_part = {
+            key: orig_index_part[key]
+            for key in v0_skeleton.keys() - ["missing_layers"]  # pgserver doesn't have it anymore
+        }
 
         timeline_file.seek(0)
         json.dump(v0_index_part, timeline_file)
@@ -306,7 +308,7 @@ def test_tenant_upgrades_index_json_from_v0(
     # make sure the file has been upgraded back to how it started
     index_part = local_fs_index_part(env, tenant_id, timeline_id)
     assert index_part["version"] == orig_index_part["version"]
-    assert index_part["missing_layers"] == orig_index_part["missing_layers"]
+    assert "missing_layers" not in index_part.keys()
 
     # expect one more layer because of the forced checkpoint
     assert len(index_part["timeline_layers"]) == len(orig_index_part["timeline_layers"]) + 1

From 894ac30734663f9b17645535771a47952d84b8f5 Mon Sep 17 00:00:00 2001
From: Anastasia Lubennikova <anastasia@neon.tech>
Date: Thu, 29 Dec 2022 12:28:58 +0200
Subject: [PATCH 084/132] Rename billing_metrics to consumption_metrics. Use
 more appropriate term, because not all of these metrics are used for billing.

---
 pageserver/src/bin/pageserver.rs              |  2 +-
 ...ling_metrics.rs => consumption_metrics.rs} | 54 +++++++++----------
 pageserver/src/lib.rs                         |  2 +-
 3 files changed, 29 insertions(+), 29 deletions(-)
 rename pageserver/src/{billing_metrics.rs => consumption_metrics.rs} (85%)

diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs
index 4b71874bdf..b3d9b0f809 100644
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -338,7 +338,7 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
                 "consumption metrics collection",
                 true,
                 async move {
-                    pageserver::billing_metrics::collect_metrics(
+                    pageserver::consumption_metrics::collect_metrics(
                         metric_collection_endpoint,
                         conf.metric_collection_interval,
                         conf.id,
diff --git a/pageserver/src/billing_metrics.rs b/pageserver/src/consumption_metrics.rs
similarity index 85%
rename from pageserver/src/billing_metrics.rs
rename to pageserver/src/consumption_metrics.rs
index 3a6b83773d..0d96eb431d 100644
--- a/pageserver/src/billing_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -25,7 +25,7 @@ use chrono::{DateTime, Utc};
 use rand::Rng;
 use reqwest::Url;
 
-/// BillingMetric struct that defines the format for one metric entry
+/// ConsumptionMetric struct that defines the format for one metric entry
 /// i.e.
 ///
 /// ```json
@@ -41,8 +41,8 @@ use reqwest::Url;
 /// ```
 #[serde_as]
 #[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]
-pub struct BillingMetric {
-    pub metric: BillingMetricKind,
+pub struct ConsumptionMetric {
+    pub metric: ConsumptionMetricKind,
     #[serde(rename = "type")]
     pub metric_type: &'static str,
     #[serde_as(as = "DisplayFromStr")]
@@ -55,9 +55,9 @@ pub struct BillingMetric {
     pub value: u64,
 }
 
-impl BillingMetric {
+impl ConsumptionMetric {
     pub fn new_absolute<R: Rng + ?Sized>(
-        metric: BillingMetricKind,
+        metric: ConsumptionMetricKind,
         tenant_id: TenantId,
         timeline_id: Option<TimelineId>,
         value: u64,
@@ -79,7 +79,7 @@ impl BillingMetric {
 
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Ord, PartialOrd, Serialize, Deserialize)]
 #[serde(rename_all = "snake_case")]
-pub enum BillingMetricKind {
+pub enum ConsumptionMetricKind {
     /// Amount of WAL produced , by a timeline, i.e. last_record_lsn
     /// This is an absolute, per-timeline metric.
     WrittenSize,
@@ -96,7 +96,7 @@ pub enum BillingMetricKind {
     RemoteStorageSize,
 }
 
-impl FromStr for BillingMetricKind {
+impl FromStr for ConsumptionMetricKind {
     type Err = anyhow::Error;
 
     fn from_str(s: &str) -> Result<Self, Self::Err> {
@@ -110,27 +110,27 @@ impl FromStr for BillingMetricKind {
     }
 }
 
-impl fmt::Display for BillingMetricKind {
+impl fmt::Display for ConsumptionMetricKind {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         f.write_str(match self {
-            BillingMetricKind::WrittenSize => "written_size",
-            BillingMetricKind::SyntheticStorageSize => "synthetic_storage_size",
-            BillingMetricKind::ResidentSize => "resident_size",
-            BillingMetricKind::RemoteStorageSize => "remote_storage_size",
+            ConsumptionMetricKind::WrittenSize => "written_size",
+            ConsumptionMetricKind::SyntheticStorageSize => "synthetic_storage_size",
+            ConsumptionMetricKind::ResidentSize => "resident_size",
+            ConsumptionMetricKind::RemoteStorageSize => "remote_storage_size",
         })
     }
 }
 
 #[derive(Debug, Clone, PartialEq, Eq, Hash)]
-pub struct BillingMetricsKey {
+pub struct ConsumptionMetricsKey {
     tenant_id: TenantId,
     timeline_id: Option<TimelineId>,
-    metric: BillingMetricKind,
+    metric: ConsumptionMetricKind,
 }
 
 #[derive(serde::Serialize)]
 struct EventChunk<'a> {
-    events: &'a [BillingMetric],
+    events: &'a [ConsumptionMetric],
 }
 
 /// Main thread that serves metrics collection
@@ -145,7 +145,7 @@ pub async fn collect_metrics(
 
     // define client here to reuse it for all requests
     let client = reqwest::Client::new();
-    let mut cached_metrics: HashMap<BillingMetricsKey, u64> = HashMap::new();
+    let mut cached_metrics: HashMap<ConsumptionMetricsKey, u64> = HashMap::new();
 
     loop {
         tokio::select! {
@@ -166,11 +166,11 @@ pub async fn collect_metrics(
 /// Cache metrics to avoid sending the same metrics multiple times.
 pub async fn collect_metrics_task(
     client: &reqwest::Client,
-    cached_metrics: &mut HashMap<BillingMetricsKey, u64>,
+    cached_metrics: &mut HashMap<ConsumptionMetricsKey, u64>,
     metric_collection_endpoint: &reqwest::Url,
     node_id: NodeId,
 ) -> anyhow::Result<()> {
-    let mut current_metrics: Vec<(BillingMetricsKey, u64)> = Vec::new();
+    let mut current_metrics: Vec<(ConsumptionMetricsKey, u64)> = Vec::new();
     trace!(
         "starting collect_metrics_task. metric_collection_endpoint: {}",
         metric_collection_endpoint
@@ -194,10 +194,10 @@ pub async fn collect_metrics_task(
             let timeline_written_size = u64::from(timeline.get_last_record_lsn());
 
             current_metrics.push((
-                BillingMetricsKey {
+                ConsumptionMetricsKey {
                     tenant_id,
                     timeline_id: Some(timeline.timeline_id),
-                    metric: BillingMetricKind::WrittenSize,
+                    metric: ConsumptionMetricKind::WrittenSize,
                 },
                 timeline_written_size,
             ));
@@ -217,19 +217,19 @@ pub async fn collect_metrics_task(
         );
 
         current_metrics.push((
-            BillingMetricsKey {
+            ConsumptionMetricsKey {
                 tenant_id,
                 timeline_id: None,
-                metric: BillingMetricKind::ResidentSize,
+                metric: ConsumptionMetricKind::ResidentSize,
             },
             tenant_resident_size,
         ));
 
         current_metrics.push((
-            BillingMetricsKey {
+            ConsumptionMetricsKey {
                 tenant_id,
                 timeline_id: None,
-                metric: BillingMetricKind::RemoteStorageSize,
+                metric: ConsumptionMetricKind::RemoteStorageSize,
             },
             tenant_remote_size,
         ));
@@ -253,7 +253,7 @@ pub async fn collect_metrics_task(
     const CHUNK_SIZE: usize = 1000;
     let chunks = current_metrics.chunks(CHUNK_SIZE);
 
-    let mut chunk_to_send: Vec<BillingMetric> = Vec::with_capacity(1000);
+    let mut chunk_to_send: Vec<ConsumptionMetric> = Vec::with_capacity(1000);
 
     for chunk in chunks {
         chunk_to_send.clear();
@@ -264,7 +264,7 @@ pub async fn collect_metrics_task(
             // enrich metrics with timestamp and metric_kind before sending
             let mut rng = rand::thread_rng();
             chunk_to_send.extend(chunk.iter().map(|(curr_key, curr_val)| {
-                BillingMetric::new_absolute(
+                ConsumptionMetric::new_absolute(
                     curr_key.metric,
                     curr_key.tenant_id,
                     curr_key.timeline_id,
@@ -278,7 +278,7 @@ pub async fn collect_metrics_task(
         let chunk_json = serde_json::value::to_raw_value(&EventChunk {
             events: &chunk_to_send,
         })
-        .expect("BillingMetric should not fail serialization");
+        .expect("ConsumptionMetric should not fail serialization");
 
         let res = client
             .post(metric_collection_endpoint.clone())
diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs
index 29050a5bc2..2f78c199b9 100644
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -1,7 +1,7 @@
 mod auth;
 pub mod basebackup;
-pub mod billing_metrics;
 pub mod config;
+pub mod consumption_metrics;
 pub mod http;
 pub mod import_datadir;
 pub mod keyspace;

From 434fcac357ef0e5826be93fb7695e6faa1c70c38 Mon Sep 17 00:00:00 2001
From: Vadim Kharitonov <vadim@neon.tech>
Date: Fri, 23 Dec 2022 10:23:28 +0100
Subject: [PATCH 085/132] Remove unused HTTP endpoints from compute_ctl

---
 compute_tools/src/http/api.rs            | 30 +-------------
 compute_tools/src/http/openapi_spec.yaml | 51 ------------------------
 2 files changed, 1 insertion(+), 80 deletions(-)

diff --git a/compute_tools/src/http/api.rs b/compute_tools/src/http/api.rs
index 4c8bbc608b..44f83e5003 100644
--- a/compute_tools/src/http/api.rs
+++ b/compute_tools/src/http/api.rs
@@ -9,29 +9,11 @@ use hyper::{Body, Method, Request, Response, Server, StatusCode};
 use log::{error, info};
 use serde_json;
 
-use crate::compute::{ComputeNode, ComputeStatus};
+use crate::compute::ComputeNode;
 
 // Service function to handle all available routes.
 async fn routes(req: Request<Body>, compute: Arc<ComputeNode>) -> Response<Body> {
     match (req.method(), req.uri().path()) {
-        // Timestamp of the last Postgres activity in the plain text.
-        // DEPRECATED in favour of /status
-        (&Method::GET, "/last_activity") => {
-            info!("serving /last_active GET request");
-            let state = compute.state.read().unwrap();
-
-            // Use RFC3339 format for consistency.
-            Response::new(Body::from(state.last_active.to_rfc3339()))
-        }
-
-        // Has compute setup process finished? -> true/false.
-        // DEPRECATED in favour of /status
-        (&Method::GET, "/ready") => {
-            info!("serving /ready GET request");
-            let status = compute.get_status();
-            Response::new(Body::from(format!("{}", status == ComputeStatus::Running)))
-        }
-
         // Serialized compute state.
         (&Method::GET, "/status") => {
             info!("serving /status GET request");
@@ -46,16 +28,6 @@ async fn routes(req: Request<Body>, compute: Arc<ComputeNode>) -> Response<Body>
             Response::new(Body::from(serde_json::to_string(&compute.metrics).unwrap()))
         }
 
-        // DEPRECATED, use POST instead
-        (&Method::GET, "/check_writability") => {
-            info!("serving /check_writability GET request");
-            let res = crate::checker::check_writability(&compute).await;
-            match res {
-                Ok(_) => Response::new(Body::from("true")),
-                Err(e) => Response::new(Body::from(e.to_string())),
-            }
-        }
-
         (&Method::POST, "/check_writability") => {
             info!("serving /check_writability POST request");
             let res = crate::checker::check_writability(&compute).await;
diff --git a/compute_tools/src/http/openapi_spec.yaml b/compute_tools/src/http/openapi_spec.yaml
index 9c0f8e3ccd..a857531d26 100644
--- a/compute_tools/src/http/openapi_spec.yaml
+++ b/compute_tools/src/http/openapi_spec.yaml
@@ -37,58 +37,7 @@ paths:
               schema:
                 $ref: "#/components/schemas/ComputeMetrics"
 
-  /ready:
-    get:
-      deprecated: true
-      tags:
-      - "info"
-      summary: Check whether compute startup process finished successfully
-      description: ""
-      operationId: computeIsReady
-      responses:
-        "200":
-          description: Compute is ready ('true') or not ('false')
-          content:
-            text/plain:
-              schema:
-                type: string
-                example: "true"
-
-  /last_activity:
-    get:
-      deprecated: true
-      tags:
-      - "info"
-      summary: Get timestamp of the last compute activity
-      description: ""
-      operationId: getLastComputeActivityTS
-      responses:
-        "200":
-          description: Timestamp of the last compute activity
-          content:
-            text/plain:
-              schema:
-                type: string
-                example: "2022-10-12T07:20:50.52Z"
-
   /check_writability:
-    get:
-      deprecated: true
-      tags:
-      - "check"
-      summary: Check that we can write new data on this compute
-      description: ""
-      operationId: checkComputeWritabilityDeprecated
-      responses:
-        "200":
-          description: Check result
-          content:
-            text/plain:
-              schema:
-                type: string
-                description: Error text or 'true' if check passed
-                example: "true"
-
     post:
       tags:
       - "check"

From fefe19a284c851a4f74abe83f8d478263163260d Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Wed, 28 Dec 2022 10:48:27 +0200
Subject: [PATCH 086/132] Avoid calling find_lsn_for_timestamp while holding
 lock.

Refactor update_gc_info function so that it calls the potentially
expensive find_lsn_for_timestamp() function before acquiring the
lock. This will also be needed if we make find_lsn_for_timestamp()
async in the future; it cannot be awaited while holding the lock.
---
 pageserver/src/tenant/timeline.rs | 67 +++++++++++++++++++------------
 1 file changed, 41 insertions(+), 26 deletions(-)

diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 951f217cf9..ec4b3ae665 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -2674,29 +2674,27 @@ impl Timeline {
     ///
     /// The 'pitr' duration is used to calculate a 'pitr_cutoff', which can be used to determine
     /// whether a record is needed for PITR.
+    ///
+    /// NOTE: This function holds a short-lived lock to protect the 'gc_info'
+    /// field, so that the three values passed as argument are stored
+    /// atomically. But the caller is responsible for ensuring that no new
+    /// branches are created that would need to be included in 'retain_lsns',
+    /// for example. The caller should hold `Tenant::gc_cs` lock to ensure
+    /// that.
+    ///
     pub(super) async fn update_gc_info(
         &self,
         retain_lsns: Vec<Lsn>,
         cutoff_horizon: Lsn,
         pitr: Duration,
     ) -> anyhow::Result<()> {
-        let mut gc_info = self.gc_info.write().unwrap();
-
-        gc_info.horizon_cutoff = cutoff_horizon;
-        gc_info.retain_lsns = retain_lsns;
-
-        // Calculate pitr cutoff point.
-        // If we cannot determine a cutoff LSN, be conservative and don't GC anything.
-        let mut pitr_cutoff_lsn: Lsn;
-
-        if pitr != Duration::ZERO {
-            // conservative, safe default is to remove nothing, when we have no
-            // commit timestamp data available
-            pitr_cutoff_lsn = *self.get_latest_gc_cutoff_lsn();
-
-            // First, calculate pitr_cutoff_timestamp and then convert it to LSN.
-            // If we don't have enough data to convert to LSN,
-            // play safe and don't remove any layers.
+        // First, calculate pitr_cutoff_timestamp and then convert it to LSN.
+        //
+        // Some unit tests depend on garbage-collection working even when
+        // CLOG data is missing, so that find_lsn_for_timestamp() doesn't
+        // work, so avoid calling it altogether if time-based retention is not
+        // configured. It would be pointless anyway.
+        let pitr_cutoff = if pitr != Duration::ZERO {
             let now = SystemTime::now();
             if let Some(pitr_cutoff_timestamp) = now.checked_sub(pitr) {
                 let pitr_timestamp = to_pg_timestamp(pitr_cutoff_timestamp);
@@ -2705,27 +2703,44 @@ impl Timeline {
                     .find_lsn_for_timestamp(pitr_timestamp)
                     .no_ondemand_download()?
                 {
-                    LsnForTimestamp::Present(lsn) => pitr_cutoff_lsn = lsn,
+                    LsnForTimestamp::Present(lsn) => lsn,
                     LsnForTimestamp::Future(lsn) => {
+                        // The timestamp is in the future. That sounds impossible,
+                        // but what it really means is that there hasn't been
+                        // any commits since the cutoff timestamp.
                         debug!("future({})", lsn);
-                        pitr_cutoff_lsn = gc_info.horizon_cutoff;
+                        cutoff_horizon
                     }
                     LsnForTimestamp::Past(lsn) => {
                         debug!("past({})", lsn);
+                        // conservative, safe default is to remove nothing, when we
+                        // have no commit timestamp data available
+                        *self.get_latest_gc_cutoff_lsn()
                     }
                     LsnForTimestamp::NoData(lsn) => {
                         debug!("nodata({})", lsn);
+                        // conservative, safe default is to remove nothing, when we
+                        // have no commit timestamp data available
+                        *self.get_latest_gc_cutoff_lsn()
                     }
                 }
-                debug!("pitr_cutoff_lsn = {:?}", pitr_cutoff_lsn)
+            } else {
+                // If we don't have enough data to convert to LSN,
+                // play safe and don't remove any layers.
+                *self.get_latest_gc_cutoff_lsn()
             }
         } else {
-            // No time-based retention. (Some unit tests depend on garbage-collection
-            // working even when CLOG data is missing, so that find_lsn_for_timestamp()
-            // above doesn't work.)
-            pitr_cutoff_lsn = gc_info.horizon_cutoff;
-        }
-        gc_info.pitr_cutoff = pitr_cutoff_lsn;
+            // No time-based retention was configured. Set time-based cutoff to
+            // same as LSN based.
+            cutoff_horizon
+        };
+
+        // Grab the lock and update the values
+        *self.gc_info.write().unwrap() = GcInfo {
+            retain_lsns,
+            horizon_cutoff: cutoff_horizon,
+            pitr_cutoff,
+        };
 
         Ok(())
     }

From 890ff3803e2413be9e641ce0d2be23b3ea9b5a6d Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Wed, 28 Dec 2022 10:48:33 +0200
Subject: [PATCH 087/132] Allow update_gc_info to download files on-demand.

---
 pageserver/src/tenant/timeline.rs | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index ec4b3ae665..df02f24239 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -2699,9 +2699,7 @@ impl Timeline {
             if let Some(pitr_cutoff_timestamp) = now.checked_sub(pitr) {
                 let pitr_timestamp = to_pg_timestamp(pitr_cutoff_timestamp);
 
-                match self
-                    .find_lsn_for_timestamp(pitr_timestamp)
-                    .no_ondemand_download()?
+                match with_ondemand_download(|| self.find_lsn_for_timestamp(pitr_timestamp)).await?
                 {
                     LsnForTimestamp::Present(lsn) => lsn,
                     LsnForTimestamp::Future(lsn) => {

From 8ff7bc5df1b7644825c5379474b7715f3bb2ab21 Mon Sep 17 00:00:00 2001
From: Anastasia Lubennikova <anastasia@neon.tech>
Date: Tue, 27 Dec 2022 19:45:09 +0200
Subject: [PATCH 088/132] Add timeline_logical_size metric. Send this metric
 only when it is fully calculated.

Make consumption metrics more stable:
- Send per-timeline metrics only for active timelines.
- Adjust test assertions to make test_metric_collection test more stable.
---
 pageserver/src/consumption_metrics.rs         | 43 +++++++++++++------
 pageserver/src/http/routes.rs                 |  2 +-
 pageserver/src/tenant/timeline.rs             |  8 +++-
 .../src/walreceiver/walreceiver_connection.rs |  7 +--
 test_runner/regress/test_metric_collection.py | 16 ++++++-
 5 files changed, 55 insertions(+), 21 deletions(-)

diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs
index 0d96eb431d..c411a9e025 100644
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -94,6 +94,9 @@ pub enum ConsumptionMetricKind {
     /// Size of the remote storage (S3) directory.
     /// This is an absolute, per-tenant metric.
     RemoteStorageSize,
+    /// Logical size of the data in the timeline
+    /// This is an absolute, per-timeline metric
+    TimelineLogicalSize,
 }
 
 impl FromStr for ConsumptionMetricKind {
@@ -105,6 +108,7 @@ impl FromStr for ConsumptionMetricKind {
             "synthetic_storage_size" => Ok(Self::SyntheticStorageSize),
             "resident_size" => Ok(Self::ResidentSize),
             "remote_storage_size" => Ok(Self::RemoteStorageSize),
+            "timeline_logical_size" => Ok(Self::TimelineLogicalSize),
             _ => anyhow::bail!("invalid value \"{s}\" for metric type"),
         }
     }
@@ -117,6 +121,7 @@ impl fmt::Display for ConsumptionMetricKind {
             ConsumptionMetricKind::SyntheticStorageSize => "synthetic_storage_size",
             ConsumptionMetricKind::ResidentSize => "resident_size",
             ConsumptionMetricKind::RemoteStorageSize => "remote_storage_size",
+            ConsumptionMetricKind::TimelineLogicalSize => "timeline_logical_size",
         })
     }
 }
@@ -191,23 +196,35 @@ pub async fn collect_metrics_task(
 
         // iterate through list of timelines in tenant
         for timeline in tenant.list_timelines().iter() {
-            let timeline_written_size = u64::from(timeline.get_last_record_lsn());
+            // collect per-timeline metrics only for active timelines
+            if timeline.is_active() {
+                let timeline_written_size = u64::from(timeline.get_last_record_lsn());
 
-            current_metrics.push((
-                ConsumptionMetricsKey {
-                    tenant_id,
-                    timeline_id: Some(timeline.timeline_id),
-                    metric: ConsumptionMetricKind::WrittenSize,
-                },
-                timeline_written_size,
-            ));
+                current_metrics.push((
+                    ConsumptionMetricsKey {
+                        tenant_id,
+                        timeline_id: Some(timeline.timeline_id),
+                        metric: ConsumptionMetricKind::WrittenSize,
+                    },
+                    timeline_written_size,
+                ));
+
+                let (timeline_logical_size, is_exact) = timeline.get_current_logical_size()?;
+                // Only send timeline logical size when it is fully calculated.
+                if is_exact {
+                    current_metrics.push((
+                        ConsumptionMetricsKey {
+                            tenant_id,
+                            timeline_id: Some(timeline.timeline_id),
+                            metric: ConsumptionMetricKind::TimelineLogicalSize,
+                        },
+                        timeline_logical_size,
+                    ));
+                }
+            }
 
             let timeline_resident_size = timeline.get_resident_physical_size();
             tenant_resident_size += timeline_resident_size;
-
-            debug!(
-                "per-timeline current metrics for tenant: {}: timeline {} resident_size={} last_record_lsn {} (as bytes)",
-                tenant_id, timeline.timeline_id, timeline_resident_size, timeline_written_size)
         }
 
         let tenant_remote_size = tenant.get_remote_size().await?;
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 66a1607801..4f4c397abe 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -120,7 +120,7 @@ fn build_timeline_info_common(timeline: &Arc<Timeline>) -> anyhow::Result<Timeli
         lsn @ Lsn(_) => Some(lsn),
     };
     let current_logical_size = match timeline.get_current_logical_size() {
-        Ok(size) => Some(size),
+        Ok((size, _)) => Some(size),
         Err(err) => {
             error!("Timeline info creation failed to get current logical size: {err:?}");
             None
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index df02f24239..2c22c6694d 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -752,18 +752,22 @@ impl Timeline {
     ///
     /// The size could be lagging behind the actual number, in case
     /// the initial size calculation has not been run (gets triggered on the first size access).
-    pub fn get_current_logical_size(self: &Arc<Self>) -> anyhow::Result<u64> {
+    ///
+    /// Returns the size and a boolean flag indicating whether the size is exact.
+    pub fn get_current_logical_size(self: &Arc<Self>) -> anyhow::Result<(u64, bool)> {
         let current_size = self.current_logical_size.current_size()?;
         debug!("Current size: {current_size:?}");
 
+        let mut is_exact = true;
         let size = current_size.size();
         if let (CurrentLogicalSize::Approximate(_), Some(init_lsn)) =
             (current_size, self.current_logical_size.initial_part_end)
         {
+            is_exact = false;
             self.try_spawn_size_init_task(init_lsn);
         }
 
-        Ok(size)
+        Ok((size, is_exact))
     }
 
     /// Check if more than 'checkpoint_distance' of WAL has been accumulated in
diff --git a/pageserver/src/walreceiver/walreceiver_connection.rs b/pageserver/src/walreceiver/walreceiver_connection.rs
index a98126e683..3753807327 100644
--- a/pageserver/src/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/walreceiver/walreceiver_connection.rs
@@ -335,10 +335,11 @@ pub async fn handle_walreceiver_connection(
 
             // Send the replication feedback message.
             // Regular standby_status_update fields are put into this message.
+            let (timeline_logical_size, _) = timeline
+                .get_current_logical_size()
+                .context("Status update creation failed to get current logical size")?;
             let status_update = ReplicationFeedback {
-                current_timeline_size: timeline
-                    .get_current_logical_size()
-                    .context("Status update creation failed to get current logical size")?,
+                current_timeline_size: timeline_logical_size,
                 ps_writelsn: write_lsn,
                 ps_flushlsn: flush_lsn,
                 ps_applylsn: apply_lsn,
diff --git a/test_runner/regress/test_metric_collection.py b/test_runner/regress/test_metric_collection.py
index a3b3609153..ac9f163801 100644
--- a/test_runner/regress/test_metric_collection.py
+++ b/test_runner/regress/test_metric_collection.py
@@ -42,16 +42,28 @@ def metrics_handler(request: Request) -> Response:
         # >= 0 check here is to avoid race condition when we receive metrics before
         # remote_uploaded is updated
         "remote_storage_size": lambda value: value > 0 if remote_uploaded > 0 else value >= 0,
+        # logical size may lag behind the actual size, so allow 0 here
+        "timeline_logical_size": lambda value: value >= 0,
     }
 
+    events_received = 0
     for event in events:
-        assert checks.pop(event["metric"])(event["value"]), f"{event['metric']} isn't valid"
+        check = checks.get(event["metric"])
+        # calm down mypy
+        if check is not None:
+            assert check(event["value"]), f"{event['metric']} isn't valid"
+            events_received += 1
 
     global first_request
     # check that all checks were sent
     # but only on the first request, because we don't send non-changed metrics
     if first_request:
-        assert not checks, f"{' '.join(checks.keys())} wasn't/weren't received"
+        # we may receive more metrics than we check,
+        # because there are two timelines
+        # and we may receive per-timeline metrics from both
+        # if the test was slow enough for these metrics to be collected
+        # -1 because it is ok not to receive timeline_logical_size
+        assert events_received >= len(checks) - 1
         first_request = False
 
     global num_metrics_received

From 7c7d225d9805d34cfe5071798e0ae6af4cf38df5 Mon Sep 17 00:00:00 2001
From: Dmitry Rodionov <dmitry@neon.tech>
Date: Thu, 29 Dec 2022 16:08:21 +0200
Subject: [PATCH 089/132] add pageserver to new region see
 https://github.com/neondatabase/aws/pull/116

---
 .github/ansible/prod.us-west-2.hosts.yaml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/ansible/prod.us-west-2.hosts.yaml b/.github/ansible/prod.us-west-2.hosts.yaml
index 7d6e49bf9c..9eb422a3ae 100644
--- a/.github/ansible/prod.us-west-2.hosts.yaml
+++ b/.github/ansible/prod.us-west-2.hosts.yaml
@@ -25,6 +25,8 @@ storage:
           ansible_host: i-0d9f6dfae0e1c780d 
         pageserver-1.us-west-2.aws.neon.tech:
           ansible_host: i-0c834be1dddba8b3f
+        pageserver-2.us-west-2.aws.neon.tech:
+          ansible_host: i-051642d372c0a4f32
 
     safekeepers:
       hosts:

From c700c7db2e89056fa53a89d59ca42a143c46cea7 Mon Sep 17 00:00:00 2001
From: Dmitry Ivanov <ivadmi5@gmail.com>
Date: Mon, 26 Dec 2022 22:10:28 +0300
Subject: [PATCH 090/132] [proxy] Add more labels to the pricing metrics

---
 proxy/src/auth/backend.rs          |  45 ++-----
 proxy/src/auth/backend/console.rs  |  44 +++----
 proxy/src/auth/backend/link.rs     |   2 +-
 proxy/src/auth/backend/postgres.rs |  11 +-
 proxy/src/compute.rs               |   2 +-
 proxy/src/console.rs               |   5 +
 proxy/src/console/messages.rs      | 190 +++++++++++++++++++++++++++++
 proxy/src/main.rs                  |   1 +
 proxy/src/mgmt.rs                  | 100 +--------------
 proxy/src/proxy.rs                 |  18 +--
 proxy/src/proxy/tests.rs           |   2 +-
 test_runner/regress/test_proxy.py  |   6 +-
 12 files changed, 249 insertions(+), 177 deletions(-)
 create mode 100644 proxy/src/console.rs
 create mode 100644 proxy/src/console/messages.rs

diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs
index 4b937f017a..4adf0ed940 100644
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -8,7 +8,9 @@ pub use console::{GetAuthInfoError, WakeComputeError};
 
 use crate::{
     auth::{self, AuthFlow, ClientCredentials},
-    compute, http, mgmt, stream, url,
+    compute,
+    console::messages::MetricsAuxInfo,
+    http, mgmt, stream, url,
     waiters::{self, Waiter, Waiters},
 };
 use once_cell::sync::Lazy;
@@ -126,25 +128,13 @@ pub struct AuthSuccess<T> {
     pub value: T,
 }
 
-impl<T> AuthSuccess<T> {
-    /// Very similar to [`std::option::Option::map`].
-    /// Maps [`AuthSuccess<T>`] to [`AuthSuccess<R>`] by applying
-    /// a function to a contained value.
-    pub fn map<R>(self, f: impl FnOnce(T) -> R) -> AuthSuccess<R> {
-        AuthSuccess {
-            reported_auth_ok: self.reported_auth_ok,
-            value: f(self.value),
-        }
-    }
-}
-
 /// Info for establishing a connection to a compute node.
 /// This is what we get after auth succeeded, but not before!
 pub struct NodeInfo {
-    /// Project from [`auth::ClientCredentials`].
-    pub project: String,
     /// Compute node connection params.
     pub config: compute::ConnCfg,
+    /// Labels for proxy's metrics.
+    pub aux: MetricsAuxInfo,
 }
 
 impl BackendType<'_, ClientCredentials<'_>> {
@@ -172,37 +162,34 @@ impl BackendType<'_, ClientCredentials<'_>> {
         };
 
         // TODO: find a proper way to merge those very similar blocks.
-        let (mut config, payload) = match self {
+        let (mut node, payload) = match self {
             Console(endpoint, creds) if creds.project.is_none() => {
                 let payload = fetch_magic_payload.await?;
 
                 let mut creds = creds.as_ref();
                 creds.project = Some(payload.project.as_str().into());
-                let config = console::Api::new(endpoint, extra, &creds)
+                let node = console::Api::new(endpoint, extra, &creds)
                     .wake_compute()
                     .await?;
 
-                (config, payload)
+                (node, payload)
             }
             Postgres(endpoint, creds) if creds.project.is_none() => {
                 let payload = fetch_magic_payload.await?;
 
                 let mut creds = creds.as_ref();
                 creds.project = Some(payload.project.as_str().into());
-                let config = postgres::Api::new(endpoint, &creds).wake_compute().await?;
+                let node = postgres::Api::new(endpoint, &creds).wake_compute().await?;
 
-                (config, payload)
+                (node, payload)
             }
             _ => return Ok(None),
         };
 
-        config.password(payload.password);
+        node.config.password(payload.password);
         Ok(Some(AuthSuccess {
             reported_auth_ok: false,
-            value: NodeInfo {
-                project: payload.project,
-                config,
-            },
+            value: node,
         }))
     }
 
@@ -233,10 +220,6 @@ impl BackendType<'_, ClientCredentials<'_>> {
                 console::Api::new(&endpoint, extra, &creds)
                     .handle_user(client)
                     .await?
-                    .map(|config| NodeInfo {
-                        project: creds.project.unwrap().into_owned(),
-                        config,
-                    })
             }
             Postgres(endpoint, creds) => {
                 info!("performing mock authentication using a local postgres instance");
@@ -245,10 +228,6 @@ impl BackendType<'_, ClientCredentials<'_>> {
                 postgres::Api::new(&endpoint, &creds)
                     .handle_user(client)
                     .await?
-                    .map(|config| NodeInfo {
-                        project: creds.project.unwrap().into_owned(),
-                        config,
-                    })
             }
             // NOTE: this auth backend doesn't use client credentials.
             Link(url) => {
diff --git a/proxy/src/auth/backend/console.rs b/proxy/src/auth/backend/console.rs
index 040870fc8e..b3e3fd0c10 100644
--- a/proxy/src/auth/backend/console.rs
+++ b/proxy/src/auth/backend/console.rs
@@ -1,16 +1,16 @@
 //! Cloud API V2.
 
-use super::{AuthSuccess, ConsoleReqExtra};
+use super::{AuthSuccess, ConsoleReqExtra, NodeInfo};
 use crate::{
     auth::{self, AuthFlow, ClientCredentials},
     compute,
+    console::messages::{ConsoleError, GetRoleSecret, WakeCompute},
     error::{io_error, UserFacingError},
     http, sasl, scram,
     stream::PqStream,
 };
 use futures::TryFutureExt;
 use reqwest::StatusCode as HttpStatusCode;
-use serde::Deserialize;
 use std::future::Future;
 use thiserror::Error;
 use tokio::io::{AsyncRead, AsyncWrite};
@@ -136,24 +136,6 @@ impl UserFacingError for WakeComputeError {
     }
 }
 
-/// Console's response which holds client's auth secret.
-#[derive(Deserialize, Debug)]
-struct GetRoleSecret {
-    role_secret: Box<str>,
-}
-
-/// Console's response which holds compute node's `host:port` pair.
-#[derive(Deserialize, Debug)]
-struct WakeCompute {
-    address: Box<str>,
-}
-
-/// Console's error response with human-readable description.
-#[derive(Deserialize, Debug)]
-struct ConsoleError {
-    error: Box<str>,
-}
-
 /// Auth secret which is managed by the cloud.
 pub enum AuthInfo {
     /// Md5 hash of user's password.
@@ -194,7 +176,7 @@ impl<'a> Api<'a> {
     pub(super) async fn handle_user(
         &'a self,
         client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin + Send>,
-    ) -> auth::Result<AuthSuccess<compute::ConnCfg>> {
+    ) -> auth::Result<AuthSuccess<NodeInfo>> {
         handle_user(client, self, Self::get_auth_info, Self::wake_compute).await
     }
 }
@@ -238,7 +220,7 @@ impl Api<'_> {
     }
 
     /// Wake up the compute node and return the corresponding connection info.
-    pub async fn wake_compute(&self) -> Result<compute::ConnCfg, WakeComputeError> {
+    pub async fn wake_compute(&self) -> Result<NodeInfo, WakeComputeError> {
         let request_id = uuid::Uuid::new_v4().to_string();
         async {
             let request = self
@@ -269,7 +251,10 @@ impl Api<'_> {
                 .dbname(self.creds.dbname)
                 .user(self.creds.user);
 
-            Ok(config)
+            Ok(NodeInfo {
+                config,
+                aux: body.aux,
+            })
         }
         .map_err(crate::error::log_error)
         .instrument(info_span!("wake_compute", id = request_id))
@@ -284,11 +269,11 @@ pub(super) async fn handle_user<'a, Endpoint, GetAuthInfo, WakeCompute>(
     endpoint: &'a Endpoint,
     get_auth_info: impl FnOnce(&'a Endpoint) -> GetAuthInfo,
     wake_compute: impl FnOnce(&'a Endpoint) -> WakeCompute,
-) -> auth::Result<AuthSuccess<compute::ConnCfg>>
+) -> auth::Result<AuthSuccess<NodeInfo>>
 where
     Endpoint: AsRef<ClientCredentials<'a>>,
     GetAuthInfo: Future<Output = Result<Option<AuthInfo>, GetAuthInfoError>>,
-    WakeCompute: Future<Output = Result<compute::ConnCfg, WakeComputeError>>,
+    WakeCompute: Future<Output = Result<NodeInfo, WakeComputeError>>,
 {
     let creds = endpoint.as_ref();
 
@@ -325,19 +310,20 @@ where
         }
     };
 
-    let mut config = wake_compute(endpoint).await?;
+    let mut node = wake_compute(endpoint).await?;
     if let Some(keys) = scram_keys {
-        config.auth_keys(tokio_postgres::config::AuthKeys::ScramSha256(keys));
+        use tokio_postgres::config::AuthKeys;
+        node.config.auth_keys(AuthKeys::ScramSha256(keys));
     }
 
     Ok(AuthSuccess {
         reported_auth_ok: false,
-        value: config,
+        value: node,
     })
 }
 
 /// Parse http response body, taking status code into account.
-async fn parse_body<T: for<'a> Deserialize<'a>>(
+async fn parse_body<T: for<'a> serde::Deserialize<'a>>(
     response: reqwest::Response,
 ) -> Result<T, ApiError> {
     let status = response.status();
diff --git a/proxy/src/auth/backend/link.rs b/proxy/src/auth/backend/link.rs
index 641519ac50..e16bbc70e4 100644
--- a/proxy/src/auth/backend/link.rs
+++ b/proxy/src/auth/backend/link.rs
@@ -86,8 +86,8 @@ pub async fn handle_user(
     Ok(AuthSuccess {
         reported_auth_ok: true,
         value: NodeInfo {
-            project: db_info.project,
             config,
+            aux: db_info.aux,
         },
     })
 }
diff --git a/proxy/src/auth/backend/postgres.rs b/proxy/src/auth/backend/postgres.rs
index 8f16dc9fa8..260342f103 100644
--- a/proxy/src/auth/backend/postgres.rs
+++ b/proxy/src/auth/backend/postgres.rs
@@ -2,7 +2,7 @@
 
 use super::{
     console::{self, AuthInfo, GetAuthInfoError, WakeComputeError},
-    AuthSuccess,
+    AuthSuccess, NodeInfo,
 };
 use crate::{
     auth::{self, ClientCredentials},
@@ -57,7 +57,7 @@ impl<'a> Api<'a> {
     pub(super) async fn handle_user(
         &'a self,
         client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin + Send>,
-    ) -> auth::Result<AuthSuccess<compute::ConnCfg>> {
+    ) -> auth::Result<AuthSuccess<NodeInfo>> {
         // We reuse user handling logic from a production module.
         console::handle_user(client, self, Self::get_auth_info, Self::wake_compute).await
     }
@@ -103,7 +103,7 @@ impl Api<'_> {
     }
 
     /// We don't need to wake anything locally, so we just return the connection info.
-    pub async fn wake_compute(&self) -> Result<compute::ConnCfg, WakeComputeError> {
+    pub async fn wake_compute(&self) -> Result<NodeInfo, WakeComputeError> {
         let mut config = compute::ConnCfg::new();
         config
             .host(self.endpoint.host_str().unwrap_or("localhost"))
@@ -111,7 +111,10 @@ impl Api<'_> {
             .dbname(self.creds.dbname)
             .user(self.creds.user);
 
-        Ok(config)
+        Ok(NodeInfo {
+            config,
+            aux: Default::default(),
+        })
     }
 }
 
diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs
index 71421a4a65..094db73061 100644
--- a/proxy/src/compute.rs
+++ b/proxy/src/compute.rs
@@ -43,7 +43,7 @@ pub type ScramKeys = tokio_postgres::config::ScramKeys<32>;
 /// Eventually, `tokio_postgres` will be replaced with something better.
 /// Newtype allows us to implement methods on top of it.
 #[repr(transparent)]
-pub struct ConnCfg(pub tokio_postgres::Config);
+pub struct ConnCfg(Box<tokio_postgres::Config>);
 
 impl ConnCfg {
     /// Construct a new connection config.
diff --git a/proxy/src/console.rs b/proxy/src/console.rs
new file mode 100644
index 0000000000..78f09ac9e1
--- /dev/null
+++ b/proxy/src/console.rs
@@ -0,0 +1,5 @@
+//! Various stuff for dealing with the Neon Console.
+//! Later we might move some API wrappers here.
+
+/// Payloads used in the console's APIs.
+pub mod messages;
diff --git a/proxy/src/console/messages.rs b/proxy/src/console/messages.rs
new file mode 100644
index 0000000000..63a97069b8
--- /dev/null
+++ b/proxy/src/console/messages.rs
@@ -0,0 +1,190 @@
+use serde::Deserialize;
+use std::fmt;
+
+/// Generic error response with human-readable description.
+/// Note that we can't always present it to user as is.
+#[derive(Debug, Deserialize)]
+pub struct ConsoleError {
+    pub error: Box<str>,
+}
+
+/// Response which holds client's auth secret, e.g. [`crate::scram::ServerSecret`].
+/// Returned by the `/proxy_get_role_secret` API method.
+#[derive(Deserialize)]
+pub struct GetRoleSecret {
+    pub role_secret: Box<str>,
+}
+
+// Manually implement debug to omit sensitive info.
+impl fmt::Debug for GetRoleSecret {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.debug_struct("GetRoleSecret").finish_non_exhaustive()
+    }
+}
+
+/// Response which holds compute node's `host:port` pair.
+/// Returned by the `/proxy_wake_compute` API method.
+#[derive(Debug, Deserialize)]
+pub struct WakeCompute {
+    pub address: Box<str>,
+    pub aux: MetricsAuxInfo,
+}
+
+/// Async response which concludes the link auth flow.
+/// Also known as `kickResponse` in the console.
+#[derive(Debug, Deserialize)]
+pub struct KickSession<'a> {
+    /// Session ID is assigned by the proxy.
+    pub session_id: &'a str,
+
+    /// Compute node connection params.
+    #[serde(deserialize_with = "KickSession::parse_db_info")]
+    pub result: DatabaseInfo,
+}
+
+impl KickSession<'_> {
+    fn parse_db_info<'de, D>(des: D) -> Result<DatabaseInfo, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        #[derive(Deserialize)]
+        enum Wrapper {
+            // Currently, console only reports `Success`.
+            // `Failure(String)` used to be here... RIP.
+            Success(DatabaseInfo),
+        }
+
+        Wrapper::deserialize(des).map(|x| match x {
+            Wrapper::Success(info) => info,
+        })
+    }
+}
+
+/// Compute node connection params.
+#[derive(Deserialize)]
+pub struct DatabaseInfo {
+    pub host: String,
+    pub port: u16,
+    pub dbname: String,
+    pub user: String,
+    /// Console always provides a password, but it might
+    /// be inconvenient for debug with local PG instance.
+    pub password: Option<String>,
+    pub aux: MetricsAuxInfo,
+}
+
+// Manually implement debug to omit sensitive info.
+impl fmt::Debug for DatabaseInfo {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        f.debug_struct("DatabaseInfo")
+            .field("host", &self.host)
+            .field("port", &self.port)
+            .field("dbname", &self.dbname)
+            .field("user", &self.user)
+            .finish_non_exhaustive()
+    }
+}
+
+/// Various labels for prometheus metrics.
+/// Also known as `ProxyMetricsAuxInfo` in the console.
+#[derive(Debug, Deserialize, Default)]
+pub struct MetricsAuxInfo {
+    pub endpoint_id: Box<str>,
+    pub project_id: Box<str>,
+    pub branch_id: Box<str>,
+}
+
+impl MetricsAuxInfo {
+    /// Definitions of labels for traffic metric.
+    pub const TRAFFIC_LABELS: &'static [&'static str] = &[
+        // Received (rx) / sent (tx).
+        "direction",
+        // ID of a project.
+        "project_id",
+        // ID of an endpoint within a project.
+        "endpoint_id",
+        // ID of a branch within a project (snapshot).
+        "branch_id",
+    ];
+
+    /// Values of labels for traffic metric.
+    // TODO: add more type safety (validate arity & positions).
+    pub fn traffic_labels(&self, direction: &'static str) -> [&str; 4] {
+        [
+            direction,
+            &self.project_id,
+            &self.endpoint_id,
+            &self.branch_id,
+        ]
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use serde_json::json;
+
+    fn dummy_aux() -> serde_json::Value {
+        json!({
+            "endpoint_id": "endpoint",
+            "project_id": "project",
+            "branch_id": "branch",
+        })
+    }
+
+    #[test]
+    fn parse_kick_session() -> anyhow::Result<()> {
+        // This is what the console's kickResponse looks like.
+        let json = json!({
+            "session_id": "deadbeef",
+            "result": {
+                "Success": {
+                    "host": "localhost",
+                    "port": 5432,
+                    "dbname": "postgres",
+                    "user": "john_doe",
+                    "password": "password",
+                    "aux": dummy_aux(),
+                }
+            }
+        });
+        let _: KickSession = serde_json::from_str(&json.to_string())?;
+
+        Ok(())
+    }
+
+    #[test]
+    fn parse_db_info() -> anyhow::Result<()> {
+        // with password
+        let _: DatabaseInfo = serde_json::from_value(json!({
+            "host": "localhost",
+            "port": 5432,
+            "dbname": "postgres",
+            "user": "john_doe",
+            "password": "password",
+            "aux": dummy_aux(),
+        }))?;
+
+        // without password
+        let _: DatabaseInfo = serde_json::from_value(json!({
+            "host": "localhost",
+            "port": 5432,
+            "dbname": "postgres",
+            "user": "john_doe",
+            "aux": dummy_aux(),
+        }))?;
+
+        // new field (forward compatibility)
+        let _: DatabaseInfo = serde_json::from_value(json!({
+            "host": "localhost",
+            "port": 5432,
+            "dbname": "postgres",
+            "user": "john_doe",
+            "project": "hello_world",
+            "N.E.W": "forward compatibility check",
+            "aux": dummy_aux(),
+        }))?;
+
+        Ok(())
+    }
+}
diff --git a/proxy/src/main.rs b/proxy/src/main.rs
index 2855d1f900..89ea9142a9 100644
--- a/proxy/src/main.rs
+++ b/proxy/src/main.rs
@@ -8,6 +8,7 @@ mod auth;
 mod cancellation;
 mod compute;
 mod config;
+mod console;
 mod error;
 mod http;
 mod mgmt;
diff --git a/proxy/src/mgmt.rs b/proxy/src/mgmt.rs
index 23e10b5a9b..2e0a502e7f 100644
--- a/proxy/src/mgmt.rs
+++ b/proxy/src/mgmt.rs
@@ -1,7 +1,9 @@
-use crate::auth;
+use crate::{
+    auth,
+    console::messages::{DatabaseInfo, KickSession},
+};
 use anyhow::Context;
 use pq_proto::{BeMessage, SINGLE_COL_ROWDESC};
-use serde::Deserialize;
 use std::{
     net::{TcpListener, TcpStream},
     thread,
@@ -50,59 +52,9 @@ fn handle_connection(socket: TcpStream) -> anyhow::Result<()> {
     pgbackend.run(&mut MgmtHandler)
 }
 
-/// Known as `kickResponse` in the console.
-#[derive(Debug, Deserialize)]
-struct PsqlSessionResponse {
-    session_id: String,
-    result: PsqlSessionResult,
-}
-
-#[derive(Debug, Deserialize)]
-enum PsqlSessionResult {
-    Success(DatabaseInfo),
-    Failure(String),
-}
-
 /// A message received by `mgmt` when a compute node is ready.
 pub type ComputeReady = Result<DatabaseInfo, String>;
 
-impl PsqlSessionResult {
-    fn into_compute_ready(self) -> ComputeReady {
-        match self {
-            Self::Success(db_info) => Ok(db_info),
-            Self::Failure(message) => Err(message),
-        }
-    }
-}
-
-/// Compute node connection params provided by the console.
-/// This struct and its parents are mgmt API implementation
-/// detail and thus should remain in this module.
-// TODO: restore deserialization tests from git history.
-#[derive(Deserialize)]
-pub struct DatabaseInfo {
-    pub host: String,
-    pub port: u16,
-    pub dbname: String,
-    pub user: String,
-    /// Console always provides a password, but it might
-    /// be inconvenient for debug with local PG instance.
-    pub password: Option<String>,
-    pub project: String,
-}
-
-// Manually implement debug to omit sensitive info.
-impl std::fmt::Debug for DatabaseInfo {
-    fn fmt(&self, fmt: &mut std::fmt::Formatter) -> std::fmt::Result {
-        fmt.debug_struct("DatabaseInfo")
-            .field("host", &self.host)
-            .field("port", &self.port)
-            .field("dbname", &self.dbname)
-            .field("user", &self.user)
-            .finish_non_exhaustive()
-    }
-}
-
 // TODO: replace with an http-based protocol.
 struct MgmtHandler;
 impl postgres_backend::Handler for MgmtHandler {
@@ -115,13 +67,13 @@ impl postgres_backend::Handler for MgmtHandler {
 }
 
 fn try_process_query(pgb: &mut PostgresBackend, query: &str) -> anyhow::Result<()> {
-    let resp: PsqlSessionResponse = serde_json::from_str(query)?;
+    let resp: KickSession = serde_json::from_str(query)?;
 
     let span = info_span!("event", session_id = resp.session_id);
     let _enter = span.enter();
     info!("got response: {:?}", resp.result);
 
-    match auth::backend::notify(&resp.session_id, resp.result.into_compute_ready()) {
+    match auth::backend::notify(resp.session_id, Ok(resp.result)) {
         Ok(()) => {
             pgb.write_message_noflush(&SINGLE_COL_ROWDESC)?
                 .write_message_noflush(&BeMessage::DataRow(&[Some(b"ok")]))?
@@ -135,43 +87,3 @@ fn try_process_query(pgb: &mut PostgresBackend, query: &str) -> anyhow::Result<(
 
     Ok(())
 }
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use serde_json::json;
-
-    #[test]
-    fn parse_db_info() -> anyhow::Result<()> {
-        // with password
-        let _: DatabaseInfo = serde_json::from_value(json!({
-            "host": "localhost",
-            "port": 5432,
-            "dbname": "postgres",
-            "user": "john_doe",
-            "password": "password",
-            "project": "hello_world",
-        }))?;
-
-        // without password
-        let _: DatabaseInfo = serde_json::from_value(json!({
-            "host": "localhost",
-            "port": 5432,
-            "dbname": "postgres",
-            "user": "john_doe",
-            "project": "hello_world",
-        }))?;
-
-        // new field (forward compatibility)
-        let _: DatabaseInfo = serde_json::from_value(json!({
-            "host": "localhost",
-            "port": 5432,
-            "dbname": "postgres",
-            "user": "john_doe",
-            "project": "hello_world",
-            "N.E.W": "forward compatibility check",
-        }))?;
-
-        Ok(())
-    }
-}
diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs
index 713388c625..382f7cd918 100644
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -11,7 +11,7 @@ use anyhow::{bail, Context};
 use futures::TryFutureExt;
 use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec};
 use once_cell::sync::Lazy;
-use pq_proto::{BeMessage as Be, *};
+use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams};
 use std::sync::Arc;
 use tokio::io::{AsyncRead, AsyncWrite};
 use tracing::{error, info, info_span, Instrument};
@@ -39,12 +39,7 @@ static NUM_BYTES_PROXIED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
     register_int_counter_vec!(
         "proxy_io_bytes_per_client",
         "Number of bytes sent/received between client and backend.",
-        &[
-            // Received (rx) / sent (tx).
-            "direction",
-            // Proxy can keep calling it `project` internally.
-            "endpoint_id"
-        ]
+        crate::console::messages::MetricsAuxInfo::TRAFFIC_LABELS,
     )
     .unwrap()
 });
@@ -271,19 +266,16 @@ impl<S: AsyncRead + AsyncWrite + Unpin + Send> Client<'_, S> {
 
         stream
             .write_message_noflush(&Be::BackendKeyData(cancel_key_data))?
-            .write_message(&BeMessage::ReadyForQuery)
+            .write_message(&Be::ReadyForQuery)
             .await?;
 
-        // TODO: add more identifiers.
-        let metric_id = node.project;
-
-        let m_sent = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["tx", &metric_id]);
+        let m_sent = NUM_BYTES_PROXIED_COUNTER.with_label_values(&node.aux.traffic_labels("tx"));
         let mut client = MeasuredStream::new(stream.into_inner(), |cnt| {
             // Number of bytes we sent to the client (outbound).
             m_sent.inc_by(cnt as u64);
         });
 
-        let m_recv = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["rx", &metric_id]);
+        let m_recv = NUM_BYTES_PROXIED_COUNTER.with_label_values(&node.aux.traffic_labels("rx"));
         let mut db = MeasuredStream::new(db.stream, |cnt| {
             // Number of bytes the client sent to the compute node (inbound).
             m_recv.inc_by(cnt as u64);
diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs
index 2f023844d0..ed429df421 100644
--- a/proxy/src/proxy/tests.rs
+++ b/proxy/src/proxy/tests.rs
@@ -140,7 +140,7 @@ async fn dummy_proxy(
     stream
         .write_message_noflush(&Be::AuthenticationOk)?
         .write_message_noflush(&Be::CLIENT_ENCODING)?
-        .write_message(&BeMessage::ReadyForQuery)
+        .write_message(&Be::ReadyForQuery)
         .await?;
 
     Ok(())
diff --git a/test_runner/regress/test_proxy.py b/test_runner/regress/test_proxy.py
index bcea4d970c..e13ba51f4b 100644
--- a/test_runner/regress/test_proxy.py
+++ b/test_runner/regress/test_proxy.py
@@ -63,7 +63,11 @@ async def test_psql_session_id(vanilla_pg: VanillaPostgres, link_proxy: NeonProx
                         "port": local_vanilla_pg.default_options["port"],
                         "dbname": local_vanilla_pg.default_options["dbname"],
                         "user": pg_user,
-                        "project": "irrelevant",
+                        "aux": {
+                            "project_id": "project",
+                            "endpoint_id": "endpoint",
+                            "branch_id": "branch",
+                        },
                     }
                 },
             }

From cb619449822d223a5012feb292ba2ea9975df33f Mon Sep 17 00:00:00 2001
From: Egor Suvorov <egor@neon.tech>
Date: Fri, 16 Dec 2022 19:39:38 +0200
Subject: [PATCH 091/132] Safekeeper: refactor auth validation

* Load public auth key on startup and store it in the config.
* Get rid of a separate `auth` parameter which was passed all over the place.
---
 safekeeper/src/bin/safekeeper.rs | 34 +++++++++++++++-----------------
 safekeeper/src/handler.rs        | 10 ++++------
 safekeeper/src/http/routes.rs    |  8 +++-----
 safekeeper/src/lib.rs            |  6 ++++--
 safekeeper/src/wal_service.rs    | 33 +++++++++----------------------
 5 files changed, 36 insertions(+), 55 deletions(-)

diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs
index 5ad88276e8..394a4815bb 100644
--- a/safekeeper/src/bin/safekeeper.rs
+++ b/safekeeper/src/bin/safekeeper.rs
@@ -143,6 +143,19 @@ fn main() -> anyhow::Result<()> {
         return Ok(());
     }
 
+    let auth = match args.auth_validation_public_key_path.as_ref() {
+        None => {
+            info!("auth is disabled");
+            None
+        }
+        Some(path) => {
+            info!("loading JWT auth key from {}", path.display());
+            Some(Arc::new(
+                JwtAuth::from_key_path(path).context("failed to load the auth key")?,
+            ))
+        }
+    };
+
     let conf = SafeKeeperConf {
         workdir,
         my_id: id,
@@ -156,7 +169,7 @@ fn main() -> anyhow::Result<()> {
         max_offloader_lag_bytes: args.max_offloader_lag,
         backup_runtime_threads: args.wal_backup_threads,
         wal_backup_enabled: !args.disable_wal_backup,
-        auth_validation_public_key_path: args.auth_validation_public_key_path,
+        auth,
     };
 
     // initialize sentry if SENTRY_DSN is provided
@@ -186,19 +199,6 @@ fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
         e
     })?;
 
-    let auth = match conf.auth_validation_public_key_path.as_ref() {
-        None => {
-            info!("auth is disabled");
-            None
-        }
-        Some(path) => {
-            info!("loading JWT auth key from {}", path.display());
-            Some(Arc::new(
-                JwtAuth::from_key_path(path).context("failed to load the auth key")?,
-            ))
-        }
-    };
-
     // Register metrics collector for active timelines. It's important to do this
     // after daemonizing, otherwise process collector will be upset.
     let timeline_collector = safekeeper::metrics::TimelineCollector::new();
@@ -212,12 +212,11 @@ fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
     GlobalTimelines::init(conf.clone(), wal_backup_launcher_tx)?;
 
     let conf_ = conf.clone();
-    let auth_ = auth.clone();
     threads.push(
         thread::Builder::new()
             .name("http_endpoint_thread".into())
             .spawn(|| {
-                let router = http::make_router(conf_, auth_);
+                let router = http::make_router(conf_);
                 endpoint::serve_thread_main(
                     router,
                     http_listener,
@@ -231,7 +230,7 @@ fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
     let safekeeper_thread = thread::Builder::new()
         .name("safekeeper thread".into())
         .spawn(|| {
-            if let Err(e) = wal_service::thread_main(conf_cloned, pg_listener, auth) {
+            if let Err(e) = wal_service::thread_main(conf_cloned, pg_listener) {
                 info!("safekeeper thread terminated: {e}");
             }
         })
@@ -244,7 +243,6 @@ fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
         thread::Builder::new()
             .name("broker thread".into())
             .spawn(|| {
-                // TODO: add auth?
                 broker::thread_main(conf_);
             })?,
     );
diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs
index 05527303ca..c692e9fc12 100644
--- a/safekeeper/src/handler.rs
+++ b/safekeeper/src/handler.rs
@@ -15,9 +15,8 @@ use regex::Regex;
 
 use pq_proto::{BeMessage, FeStartupPacket, RowDescriptor, INT4_OID, TEXT_OID};
 use std::str;
-use std::sync::Arc;
 use tracing::info;
-use utils::auth::{Claims, JwtAuth, Scope};
+use utils::auth::{Claims, Scope};
 use utils::{
     id::{TenantId, TenantTimelineId, TimelineId},
     lsn::Lsn,
@@ -32,7 +31,6 @@ pub struct SafekeeperPostgresHandler {
     pub tenant_id: Option<TenantId>,
     pub timeline_id: Option<TimelineId>,
     pub ttid: TenantTimelineId,
-    auth: Option<Arc<JwtAuth>>,
     claims: Option<Claims>,
 }
 
@@ -107,6 +105,7 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler {
         // this unwrap is never triggered, because check_auth_jwt only called when auth_type is NeonJWT
         // which requires auth to be present
         let data = self
+            .conf
             .auth
             .as_ref()
             .unwrap()
@@ -166,14 +165,13 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler {
 }
 
 impl SafekeeperPostgresHandler {
-    pub fn new(conf: SafeKeeperConf, auth: Option<Arc<JwtAuth>>) -> Self {
+    pub fn new(conf: SafeKeeperConf) -> Self {
         SafekeeperPostgresHandler {
             conf,
             appname: None,
             tenant_id: None,
             timeline_id: None,
             ttid: TenantTimelineId::empty(),
-            auth,
             claims: None,
         }
     }
@@ -181,7 +179,7 @@ impl SafekeeperPostgresHandler {
     // when accessing management api supply None as an argument
     // when using to authorize tenant pass corresponding tenant id
     fn check_permission(&self, tenant_id: Option<TenantId>) -> Result<()> {
-        if self.auth.is_none() {
+        if self.conf.auth.is_none() {
             // auth is set to Trust, nothing to check so just return ok
             return Ok(());
         }
diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs
index a9a9eb3388..a917d61678 100644
--- a/safekeeper/src/http/routes.rs
+++ b/safekeeper/src/http/routes.rs
@@ -277,12 +277,9 @@ async fn record_safekeeper_info(mut request: Request<Body>) -> Result<Response<B
 }
 
 /// Safekeeper http router.
-pub fn make_router(
-    conf: SafeKeeperConf,
-    auth: Option<Arc<JwtAuth>>,
-) -> RouterBuilder<hyper::Body, ApiError> {
+pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError> {
     let mut router = endpoint::make_router();
-    if auth.is_some() {
+    if conf.auth.is_some() {
         router = router.middleware(auth_middleware(|request| {
             #[allow(clippy::mutable_key_type)]
             static ALLOWLIST_ROUTES: Lazy<HashSet<Uri>> =
@@ -298,6 +295,7 @@ pub fn make_router(
 
     // NB: on any changes do not forget to update the OpenAPI spec
     // located nearby (/safekeeper/src/http/openapi_spec.yaml).
+    let auth = conf.auth.clone();
     router
         .data(Arc::new(conf))
         .data(auth)
diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs
index 5decfe64de..891d73533f 100644
--- a/safekeeper/src/lib.rs
+++ b/safekeeper/src/lib.rs
@@ -24,7 +24,9 @@ pub mod wal_service;
 pub mod wal_storage;
 
 mod timelines_global_map;
+use std::sync::Arc;
 pub use timelines_global_map::GlobalTimelines;
+use utils::auth::JwtAuth;
 
 pub mod defaults {
     pub use safekeeper_api::{
@@ -57,7 +59,7 @@ pub struct SafeKeeperConf {
     pub max_offloader_lag_bytes: u64,
     pub backup_runtime_threads: Option<usize>,
     pub wal_backup_enabled: bool,
-    pub auth_validation_public_key_path: Option<PathBuf>,
+    pub auth: Option<Arc<JwtAuth>>,
 }
 
 impl SafeKeeperConf {
@@ -87,7 +89,7 @@ impl SafeKeeperConf {
             broker_keepalive_interval: Duration::from_secs(5),
             backup_runtime_threads: None,
             wal_backup_enabled: true,
-            auth_validation_public_key_path: None,
+            auth: None,
             heartbeat_timeout: Duration::new(5, 0),
             max_offloader_lag_bytes: defaults::DEFAULT_MAX_OFFLOADER_LAG_BYTES,
         }
diff --git a/safekeeper/src/wal_service.rs b/safekeeper/src/wal_service.rs
index fd8f9d9dcf..0fea00fe1b 100644
--- a/safekeeper/src/wal_service.rs
+++ b/safekeeper/src/wal_service.rs
@@ -5,32 +5,25 @@
 use anyhow::Result;
 use regex::Regex;
 use std::net::{TcpListener, TcpStream};
-use std::sync::Arc;
 use std::thread;
 use tracing::*;
-use utils::auth::JwtAuth;
 
 use crate::handler::SafekeeperPostgresHandler;
 use crate::SafeKeeperConf;
 use utils::postgres_backend::{AuthType, PostgresBackend};
 
 /// Accept incoming TCP connections and spawn them into a background thread.
-pub fn thread_main(
-    conf: SafeKeeperConf,
-    listener: TcpListener,
-    auth: Option<Arc<JwtAuth>>,
-) -> Result<()> {
+pub fn thread_main(conf: SafeKeeperConf, listener: TcpListener) -> Result<()> {
     loop {
         match listener.accept() {
             Ok((socket, peer_addr)) => {
                 debug!("accepted connection from {}", peer_addr);
                 let conf = conf.clone();
 
-                let auth = auth.clone();
                 let _ = thread::Builder::new()
                     .name("WAL service thread".into())
                     .spawn(move || {
-                        if let Err(err) = handle_socket(socket, conf, auth) {
+                        if let Err(err) = handle_socket(socket, conf) {
                             error!("connection handler exited: {}", err);
                         }
                     })
@@ -51,25 +44,17 @@ fn get_tid() -> u64 {
 
 /// This is run by `thread_main` above, inside a background thread.
 ///
-fn handle_socket(
-    socket: TcpStream,
-    conf: SafeKeeperConf,
-    auth: Option<Arc<JwtAuth>>,
-) -> Result<()> {
+fn handle_socket(socket: TcpStream, conf: SafeKeeperConf) -> Result<()> {
     let _enter = info_span!("", tid = ?get_tid()).entered();
 
     socket.set_nodelay(true)?;
 
-    let mut conn_handler = SafekeeperPostgresHandler::new(conf, auth.clone());
-    let pgbackend = PostgresBackend::new(
-        socket,
-        match auth {
-            None => AuthType::Trust,
-            Some(_) => AuthType::NeonJWT,
-        },
-        None,
-        false,
-    )?;
+    let auth_type = match conf.auth {
+        None => AuthType::Trust,
+        Some(_) => AuthType::NeonJWT,
+    };
+    let mut conn_handler = SafekeeperPostgresHandler::new(conf);
+    let pgbackend = PostgresBackend::new(socket, auth_type, None, false)?;
     // libpq replication protocol between safekeeper and replicas/pagers
     pgbackend.run(&mut conn_handler)?;
 

From 9f94d098aa7e843428085acc8cf80550ad35219a Mon Sep 17 00:00:00 2001
From: Egor Suvorov <egor@neon.tech>
Date: Fri, 16 Dec 2022 20:18:15 +0200
Subject: [PATCH 092/132] Remove unused AuthType::MD5

---
 libs/utils/src/postgres_backend.rs       | 26 ------------------------
 libs/utils/src/postgres_backend_async.rs | 23 ---------------------
 pageserver/src/bin/pageserver.rs         |  2 +-
 3 files changed, 1 insertion(+), 50 deletions(-)

diff --git a/libs/utils/src/postgres_backend.rs b/libs/utils/src/postgres_backend.rs
index 5b34c7adfb..bac6f861c3 100644
--- a/libs/utils/src/postgres_backend.rs
+++ b/libs/utils/src/postgres_backend.rs
@@ -7,7 +7,6 @@ use crate::sock_split::{BidiStream, ReadStream, WriteStream};
 use anyhow::{bail, ensure, Context, Result};
 use bytes::{Bytes, BytesMut};
 use pq_proto::{BeMessage, FeMessage, FeStartupPacket};
-use rand::Rng;
 use serde::{Deserialize, Serialize};
 use std::fmt;
 use std::io::{self, Write};
@@ -33,11 +32,6 @@ pub trait Handler {
         Ok(())
     }
 
-    /// Check auth md5
-    fn check_auth_md5(&mut self, _pgb: &mut PostgresBackend, _md5_response: &[u8]) -> Result<()> {
-        bail!("MD5 auth failed")
-    }
-
     /// Check auth jwt
     fn check_auth_jwt(&mut self, _pgb: &mut PostgresBackend, _jwt_response: &[u8]) -> Result<()> {
         bail!("JWT auth failed")
@@ -61,7 +55,6 @@ pub enum ProtoState {
 #[derive(Debug, PartialEq, Eq, Clone, Copy, Serialize, Deserialize)]
 pub enum AuthType {
     Trust,
-    MD5,
     // This mimics postgres's AuthenticationCleartextPassword but instead of password expects JWT
     NeonJWT,
 }
@@ -72,7 +65,6 @@ impl FromStr for AuthType {
     fn from_str(s: &str) -> Result<Self, Self::Err> {
         match s {
             "Trust" => Ok(Self::Trust),
-            "MD5" => Ok(Self::MD5),
             "NeonJWT" => Ok(Self::NeonJWT),
             _ => bail!("invalid value \"{s}\" for auth type"),
         }
@@ -83,7 +75,6 @@ impl fmt::Display for AuthType {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         f.write_str(match self {
             AuthType::Trust => "Trust",
-            AuthType::MD5 => "MD5",
             AuthType::NeonJWT => "NeonJWT",
         })
     }
@@ -134,7 +125,6 @@ pub struct PostgresBackend {
 
     pub state: ProtoState,
 
-    md5_salt: [u8; 4],
     auth_type: AuthType,
 
     peer_addr: SocketAddr,
@@ -187,7 +177,6 @@ impl PostgresBackend {
             stream: Some(Stream::Bidirectional(BidiStream::from_tcp(socket))),
             buf_out: BytesMut::with_capacity(10 * 1024),
             state: ProtoState::Initialization,
-            md5_salt: [0u8; 4],
             auth_type,
             tls_config,
             peer_addr,
@@ -367,13 +356,6 @@ impl PostgresBackend {
                                     .write_message(&BeMessage::ReadyForQuery)?;
                                 self.state = ProtoState::Established;
                             }
-                            AuthType::MD5 => {
-                                rand::thread_rng().fill(&mut self.md5_salt);
-                                self.write_message(&BeMessage::AuthenticationMD5Password(
-                                    self.md5_salt,
-                                ))?;
-                                self.state = ProtoState::Authentication;
-                            }
                             AuthType::NeonJWT => {
                                 self.write_message(&BeMessage::AuthenticationCleartextPassword)?;
                                 self.state = ProtoState::Authentication;
@@ -393,14 +375,6 @@ impl PostgresBackend {
 
                 match self.auth_type {
                     AuthType::Trust => unreachable!(),
-                    AuthType::MD5 => {
-                        let (_, md5_response) = m.split_last().context("protocol violation")?;
-
-                        if let Err(e) = handler.check_auth_md5(self, md5_response) {
-                            self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?;
-                            bail!("auth failed: {}", e);
-                        }
-                    }
                     AuthType::NeonJWT => {
                         let (_, jwt_response) = m.split_last().context("protocol violation")?;
 
diff --git a/libs/utils/src/postgres_backend_async.rs b/libs/utils/src/postgres_backend_async.rs
index a22774c69e..dc93131b61 100644
--- a/libs/utils/src/postgres_backend_async.rs
+++ b/libs/utils/src/postgres_backend_async.rs
@@ -7,7 +7,6 @@ use crate::postgres_backend::AuthType;
 use anyhow::{bail, Context, Result};
 use bytes::{Bytes, BytesMut};
 use pq_proto::{BeMessage, FeMessage, FeStartupPacket};
-use rand::Rng;
 use std::future::Future;
 use std::net::SocketAddr;
 use std::pin::Pin;
@@ -35,11 +34,6 @@ pub trait Handler {
         Ok(())
     }
 
-    /// Check auth md5
-    fn check_auth_md5(&mut self, _pgb: &mut PostgresBackend, _md5_response: &[u8]) -> Result<()> {
-        bail!("MD5 auth failed")
-    }
-
     /// Check auth jwt
     fn check_auth_jwt(&mut self, _pgb: &mut PostgresBackend, _jwt_response: &[u8]) -> Result<()> {
         bail!("JWT auth failed")
@@ -125,7 +119,6 @@ pub struct PostgresBackend {
 
     pub state: ProtoState,
 
-    md5_salt: [u8; 4],
     auth_type: AuthType,
 
     peer_addr: SocketAddr,
@@ -160,7 +153,6 @@ impl PostgresBackend {
             stream: Stream::Unencrypted(BufReader::new(socket)),
             buf_out: BytesMut::with_capacity(10 * 1024),
             state: ProtoState::Initialization,
-            md5_salt: [0u8; 4],
             auth_type,
             tls_config,
             peer_addr,
@@ -337,13 +329,6 @@ impl PostgresBackend {
                                     .write_message(&BeMessage::ReadyForQuery)?;
                                 self.state = ProtoState::Established;
                             }
-                            AuthType::MD5 => {
-                                rand::thread_rng().fill(&mut self.md5_salt);
-                                self.write_message(&BeMessage::AuthenticationMD5Password(
-                                    self.md5_salt,
-                                ))?;
-                                self.state = ProtoState::Authentication;
-                            }
                             AuthType::NeonJWT => {
                                 self.write_message(&BeMessage::AuthenticationCleartextPassword)?;
                                 self.state = ProtoState::Authentication;
@@ -364,14 +349,6 @@ impl PostgresBackend {
 
                 match self.auth_type {
                     AuthType::Trust => unreachable!(),
-                    AuthType::MD5 => {
-                        let (_, md5_response) = m.split_last().context("protocol violation")?;
-
-                        if let Err(e) = handler.check_auth_md5(self, md5_response) {
-                            self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?;
-                            bail!("auth failed: {}", e);
-                        }
-                    }
                     AuthType::NeonJWT => {
                         let (_, jwt_response) = m.split_last().context("protocol violation")?;
 
diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs
index b3d9b0f809..a124bf85c2 100644
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -255,7 +255,7 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
 
     // Initialize authentication for incoming connections
     let auth = match &conf.auth_type {
-        AuthType::Trust | AuthType::MD5 => None,
+        AuthType::Trust => None,
         AuthType::NeonJWT => {
             // unwrap is ok because check is performed when creating config, so path is set and file exists
             let key_path = conf.auth_validation_public_key_path.as_ref().unwrap();

From 3468db8a2beed8977c29597af4c58286c6f9f0ff Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Mon, 2 Jan 2023 08:47:28 +0100
Subject: [PATCH 093/132] Bump setuptools from 65.5.0 to 65.5.1 (#3212)

---
 poetry.lock | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/poetry.lock b/poetry.lock
index 1b04230cef..edbcddd576 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1418,7 +1418,7 @@ pbr = "*"
 
 [[package]]
 name = "setuptools"
-version = "65.5.0"
+version = "65.5.1"
 description = "Easily download, build, install, upgrade, and uninstall Python packages"
 category = "main"
 optional = false
@@ -1426,7 +1426,7 @@ python-versions = ">=3.7"
 
 [package.extras]
 docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-hoverxref (<2)", "sphinx-inline-tabs", "sphinx-notfound-page (==0.8.3)", "sphinx-reredirects", "sphinxcontrib-towncrier"]
-testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8 (<5)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "mock", "pip (>=19.1)", "pip-run (>=8.8)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"]
+testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8 (<5)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pip (>=19.1)", "pip-run (>=8.8)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-timeout", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"]
 testing-integration = ["build[virtualenv]", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"]
 
 [[package]]
@@ -2283,8 +2283,8 @@ sarif-om = [
     {file = "sarif_om-1.0.4.tar.gz", hash = "sha256:cd5f416b3083e00d402a92e449a7ff67af46f11241073eea0461802a3b5aef98"},
 ]
 setuptools = [
-    {file = "setuptools-65.5.0-py3-none-any.whl", hash = "sha256:f62ea9da9ed6289bfe868cd6845968a2c854d1427f8548d52cae02a42b4f0356"},
-    {file = "setuptools-65.5.0.tar.gz", hash = "sha256:512e5536220e38146176efb833d4a62aa726b7bbff82cfbc8ba9eaa3996e0b17"},
+    {file = "setuptools-65.5.1-py3-none-any.whl", hash = "sha256:d0b9a8433464d5800cbe05094acf5c6d52a91bfac9b52bcfc4d41382be5d5d31"},
+    {file = "setuptools-65.5.1.tar.gz", hash = "sha256:e197a19aa8ec9722928f2206f8de752def0e4c9fc6953527360d1c36d94ddb2f"},
 ]
 six = [
     {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"},

From 81afd7011c512db1114063ae568feba1af7c3125 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Sat, 31 Dec 2022 02:45:36 +0200
Subject: [PATCH 094/132] Use rustls for everything.

I looked at "cargo tree" output and noticed that through various
dependencies, we are depending on both native-tls and rustls. We have
tried to standardize on rustls for everything, but dependencies on
native-tls have crept in recently. One such dependency came from
'reqwest' with default features in pageserver, used for
consumption_metrics. Another dependency was from 'sentry'. Both
'reqwest' and 'sentry' use native-tls by default, but can use 'rustls'
if compiled with the right feature flags.
---
 Cargo.lock                | 119 ++------------------------------------
 libs/utils/Cargo.toml     |   2 +-
 pageserver/Cargo.toml     |   2 +-
 workspace_hack/Cargo.toml |   2 +-
 4 files changed, 8 insertions(+), 117 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 2737a4d934..4daeef1f06 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1342,21 +1342,6 @@ version = "1.0.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
 
-[[package]]
-name = "foreign-types"
-version = "0.3.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1"
-dependencies = [
- "foreign-types-shared",
-]
-
-[[package]]
-name = "foreign-types-shared"
-version = "0.1.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b"
-
 [[package]]
 name = "form_urlencoded"
 version = "1.1.0"
@@ -1757,19 +1742,6 @@ dependencies = [
  "tokio-io-timeout",
 ]
 
-[[package]]
-name = "hyper-tls"
-version = "0.5.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d6183ddfa99b85da61a140bea0efc93fdf56ceaa041b37d553518030827f9905"
-dependencies = [
- "bytes",
- "hyper",
- "native-tls",
- "tokio",
- "tokio-native-tls",
-]
-
 [[package]]
 name = "iana-time-zone"
 version = "0.1.53"
@@ -2141,24 +2113,6 @@ version = "0.8.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a"
 
-[[package]]
-name = "native-tls"
-version = "0.2.11"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "07226173c32f2926027b63cce4bcd8076c3552846cbe7925f3aaffeac0a3b92e"
-dependencies = [
- "lazy_static",
- "libc",
- "log",
- "openssl",
- "openssl-probe",
- "openssl-sys",
- "schannel",
- "security-framework",
- "security-framework-sys",
- "tempfile",
-]
-
 [[package]]
 name = "nix"
 version = "0.23.2"
@@ -2305,51 +2259,12 @@ version = "11.1.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575"
 
-[[package]]
-name = "openssl"
-version = "0.10.44"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "29d971fd5722fec23977260f6e81aa67d2f22cadbdc2aa049f1022d9a3be1566"
-dependencies = [
- "bitflags",
- "cfg-if",
- "foreign-types",
- "libc",
- "once_cell",
- "openssl-macros",
- "openssl-sys",
-]
-
-[[package]]
-name = "openssl-macros"
-version = "0.1.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b501e44f11665960c7e7fcf062c7d96a14ade4aa98116c004b2e37b5be7d736c"
-dependencies = [
- "proc-macro2",
- "quote",
- "syn",
-]
-
 [[package]]
 name = "openssl-probe"
 version = "0.1.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf"
 
-[[package]]
-name = "openssl-sys"
-version = "0.9.79"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5454462c0eced1e97f2ec09036abc8da362e66802f66fd20f86854d9d8cbcbc4"
-dependencies = [
- "autocfg",
- "cc",
- "libc",
- "pkg-config",
- "vcpkg",
-]
-
 [[package]]
 name = "os_info"
 version = "3.5.1"
@@ -2583,12 +2498,6 @@ version = "0.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184"
 
-[[package]]
-name = "pkg-config"
-version = "0.3.26"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6ac9a59f73473f1b8d852421e59e64809f025994837ef743615c6d0c5b305160"
-
 [[package]]
 name = "plotters"
 version = "0.3.4"
@@ -3095,12 +3004,10 @@ dependencies = [
  "http-body",
  "hyper",
  "hyper-rustls",
- "hyper-tls",
  "ipnet",
  "js-sys",
  "log",
  "mime",
- "native-tls",
  "once_cell",
  "percent-encoding",
  "pin-project-lite",
@@ -3110,7 +3017,6 @@ dependencies = [
  "serde_json",
  "serde_urlencoded",
  "tokio",
- "tokio-native-tls",
  "tokio-rustls",
  "tower-service",
  "url",
@@ -3423,15 +3329,14 @@ version = "0.29.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "17ad137b9df78294b98cab1a650bef237cc6c950e82e5ce164655e674d07c5cc"
 dependencies = [
- "httpdate",
- "native-tls",
  "reqwest",
+ "rustls",
  "sentry-backtrace",
  "sentry-contexts",
  "sentry-core",
  "sentry-panic",
- "tokio",
  "ureq",
+ "webpki-roots",
 ]
 
 [[package]]
@@ -4004,16 +3909,6 @@ dependencies = [
  "syn",
 ]
 
-[[package]]
-name = "tokio-native-tls"
-version = "0.3.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f7d995660bd2b7f8c1568414c1126076c13fbb725c40112dc0120b78eb9b717b"
-dependencies = [
- "native-tls",
- "tokio",
-]
-
 [[package]]
 name = "tokio-postgres"
 version = "0.7.7"
@@ -4362,9 +4257,11 @@ dependencies = [
  "base64 0.13.1",
  "chunked_transfer",
  "log",
- "native-tls",
  "once_cell",
+ "rustls",
  "url",
+ "webpki",
+ "webpki-roots",
 ]
 
 [[package]]
@@ -4447,12 +4344,6 @@ version = "0.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d"
 
-[[package]]
-name = "vcpkg"
-version = "0.2.15"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426"
-
 [[package]]
 name = "version_check"
 version = "0.9.4"
diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml
index 47639e8205..9324a862b4 100644
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -4,7 +4,7 @@ version = "0.1.0"
 edition = "2021"
 
 [dependencies]
-sentry = "0.29.0"
+sentry = { version = "0.29.0", default-features = false, features = ["backtrace", "contexts", "panic", "rustls" ] }
 async-trait = "0.1"
 anyhow = "1.0"
 bincode = "1.3"
diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml
index f5acfcbdc0..cd12ee0cc9 100644
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -69,7 +69,7 @@ storage_broker = { version = "0.1", path = "../storage_broker" }
 tenant_size_model = { path = "../libs/tenant_size_model" }
 utils = { path = "../libs/utils" }
 workspace_hack = { version = "0.1", path = "../workspace_hack" }
-reqwest = "0.11.13"
+reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] }
 
 [dev-dependencies]
 criterion = "0.4"
diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml
index 6c81756fe1..e36075921f 100644
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -37,7 +37,7 @@ prost = { version = "0.11", features = ["prost-derive", "std"] }
 rand = { version = "0.8", features = ["alloc", "getrandom", "libc", "rand_chacha", "rand_hc", "small_rng", "std", "std_rng"] }
 regex = { version = "1", features = ["aho-corasick", "memchr", "perf", "perf-cache", "perf-dfa", "perf-inline", "perf-literal", "std", "unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] }
 regex-syntax = { version = "0.6", features = ["unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] }
-reqwest = { version = "0.11", features = ["__rustls", "__tls", "blocking", "default-tls", "hyper-rustls", "hyper-tls", "json", "native-tls-crate", "rustls", "rustls-pemfile", "rustls-tls", "rustls-tls-webpki-roots", "serde_json", "tokio-native-tls", "tokio-rustls", "webpki-roots"] }
+reqwest = { version = "0.11", default-features = false, features = ["__rustls", "__tls", "blocking", "hyper-rustls", "json", "rustls", "rustls-pemfile", "rustls-tls", "rustls-tls-webpki-roots", "serde_json", "tokio-rustls", "webpki-roots"] }
 scopeguard = { version = "1", features = ["use_std"] }
 serde = { version = "1", features = ["alloc", "derive", "serde_derive", "std"] }
 serde_json = { version = "1", features = ["raw_value", "std"] }

From 41b8e673052e55a77b1403f3eac087446c2dbc38 Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Mon, 2 Jan 2023 14:50:51 +0400
Subject: [PATCH 095/132] Fix 81afd7011 by enabling reqwest feature for sentry.

It disabled transport altogether.
---
 Cargo.lock                | 3 ++-
 libs/utils/Cargo.toml     | 2 +-
 workspace_hack/Cargo.toml | 1 -
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 4daeef1f06..46170717d6 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3329,12 +3329,14 @@ version = "0.29.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "17ad137b9df78294b98cab1a650bef237cc6c950e82e5ce164655e674d07c5cc"
 dependencies = [
+ "httpdate",
  "reqwest",
  "rustls",
  "sentry-backtrace",
  "sentry-contexts",
  "sentry-core",
  "sentry-panic",
+ "tokio",
  "ureq",
  "webpki-roots",
 ]
@@ -4666,7 +4668,6 @@ dependencies = [
  "rand",
  "regex",
  "regex-syntax",
- "reqwest",
  "scopeguard",
  "serde",
  "serde_json",
diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml
index 9324a862b4..670270b63e 100644
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -4,7 +4,7 @@ version = "0.1.0"
 edition = "2021"
 
 [dependencies]
-sentry = { version = "0.29.0", default-features = false, features = ["backtrace", "contexts", "panic", "rustls" ] }
+sentry = { version = "0.29.0", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] }
 async-trait = "0.1"
 anyhow = "1.0"
 bincode = "1.3"
diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml
index e36075921f..4c7fbd8333 100644
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -37,7 +37,6 @@ prost = { version = "0.11", features = ["prost-derive", "std"] }
 rand = { version = "0.8", features = ["alloc", "getrandom", "libc", "rand_chacha", "rand_hc", "small_rng", "std", "std_rng"] }
 regex = { version = "1", features = ["aho-corasick", "memchr", "perf", "perf-cache", "perf-dfa", "perf-inline", "perf-literal", "std", "unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] }
 regex-syntax = { version = "0.6", features = ["unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] }
-reqwest = { version = "0.11", default-features = false, features = ["__rustls", "__tls", "blocking", "hyper-rustls", "json", "rustls", "rustls-pemfile", "rustls-tls", "rustls-tls-webpki-roots", "serde_json", "tokio-rustls", "webpki-roots"] }
 scopeguard = { version = "1", features = ["use_std"] }
 serde = { version = "1", features = ["alloc", "derive", "serde_derive", "std"] }
 serde_json = { version = "1", features = ["raw_value", "std"] }

From 56a4466d0a85a9498bfd2a78a4ad3a2facb58167 Mon Sep 17 00:00:00 2001
From: Kirill Bulatov <kirill@neon.tech>
Date: Mon, 2 Jan 2023 14:34:06 +0200
Subject: [PATCH 096/132] Run Python tests in 8 threads (#3206)

I have experimented with the number of runner threads, and it looks like 8
threads win us a few seconds.

Bumping the thread count more did not improve the situation much:
* 20 threads were not allowed by pytest
* 16 threads made the tests noticeably flaky

My guess is that the pageservers, safekeepers, and other nodes we start
occupy too much of the CPU and other resources for this approach to
scale any further.
---
 .github/actions/run-python-test-set/action.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml
index 990c7e25a9..95167ecf6c 100644
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -123,8 +123,8 @@ runs:
           exit 1
         fi
         if [[ "${{ inputs.run_in_parallel }}" == "true" ]]; then
-          # -n4 uses four processes to run tests via pytest-xdist
-          EXTRA_PARAMS="-n4 $EXTRA_PARAMS"
+          # -n8 uses eight processes to run tests via pytest-xdist
+          EXTRA_PARAMS="-n8 $EXTRA_PARAMS"
 
           # --dist=loadgroup points tests marked with @pytest.mark.xdist_group
           # to the same worker to make @pytest.mark.order work with xdist

From 6fd64cd5f67fbc6cfc8286138004e900a28e4d3b Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Mon, 2 Jan 2023 16:03:26 +0400
Subject: [PATCH 097/132] Allow failure to report metrics in
 test_metric_collection.

Per CI
https://github.com/neondatabase/neon/actions/runs/3822039946/attempts/1
shutdown seems to be racy.
---
 test_runner/regress/test_metric_collection.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/test_runner/regress/test_metric_collection.py b/test_runner/regress/test_metric_collection.py
index ac9f163801..0fff86f268 100644
--- a/test_runner/regress/test_metric_collection.py
+++ b/test_runner/regress/test_metric_collection.py
@@ -107,6 +107,9 @@ def test_metric_collection(
 
     # spin up neon,  after http server is ready
     env = neon_env_builder.init_start()
+    # The order of fixture shutdown is not specified, and if the http server goes
+    # down before the pageserver, the pageserver log might end with such errors.
+    env.pageserver.allowed_errors.append(".*metrics endpoint refused the sent metrics*")
     env.neon_cli.create_branch("test_metric_collection")
     pg = env.postgres.create_start("test_metric_collection")
 

From a9cca7a0fd7c7585334ae9e5cdb3f13b20db324a Mon Sep 17 00:00:00 2001
From: Kirill Bulatov <kirill@neon.tech>
Date: Mon, 2 Jan 2023 16:51:05 +0200
Subject: [PATCH 098/132] Use proper error code for BeMessage error responses
 (#3240)

Based on
https://github.com/neondatabase/neon/pull/3227#discussion_r1059430067

It seems that the constant used for the internal error code during BeMessage
error response serialization is incorrect.
The one currently used is `CXX000`, yet all docs mention `XX000` instead:

* https://www.postgresql.org/docs/current/errcodes-appendix.html
* https://docs.rs/postgres/latest/postgres/error/struct.SqlState.html#associatedconstant.INTERNAL_ERROR

I have checked it with the patch and logs described in
https://github.com/neondatabase/neon/pull/3227#discussion_r1059949982
---
 libs/pq_proto/src/lib.rs | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs
index 278f044c15..d31a2d51f2 100644
--- a/libs/pq_proto/src/lib.rs
+++ b/libs/pq_proto/src/lib.rs
@@ -626,6 +626,8 @@ fn read_cstr(buf: &mut Bytes) -> anyhow::Result<Bytes> {
     Ok(result)
 }
 
+const SQLSTATE_INTERNAL_ERROR: &str = "XX000\0";
+
 impl<'a> BeMessage<'a> {
     /// Write message to the given buf.
     // Unlike the reading side, we use BytesMut
@@ -776,7 +778,7 @@ impl<'a> BeMessage<'a> {
                     buf.put_slice(b"ERROR\0");
 
                     buf.put_u8(b'C'); // SQLSTATE error code
-                    buf.put_slice(b"CXX000\0");
+                    buf.put_slice(SQLSTATE_INTERNAL_ERROR.as_bytes());
 
                     buf.put_u8(b'M'); // the message
                     write_cstr(error_msg, buf)?;
@@ -799,7 +801,7 @@ impl<'a> BeMessage<'a> {
                     buf.put_slice(b"NOTICE\0");
 
                     buf.put_u8(b'C'); // SQLSTATE error code
-                    buf.put_slice(b"CXX000\0");
+                    buf.put_slice(SQLSTATE_INTERNAL_ERROR.as_bytes());
 
                     buf.put_u8(b'M'); // the message
                     write_cstr(error_msg.as_bytes(), buf)?;

From 182dc785d6d5af3ef91e83f94688d71f9652175f Mon Sep 17 00:00:00 2001
From: Shany Pozin <shany@neon.tech>
Date: Mon, 2 Jan 2023 18:05:23 +0200
Subject: [PATCH 099/132] Set PITR  default to 7 days (#3245)

https://github.com/neondatabase/cloud/issues/3406
---
 pageserver/src/tenant/config.rs         | 2 +-
 test_runner/regress/test_tenant_conf.py | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs
index 8569c70217..c95a98fbc7 100644
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -30,7 +30,7 @@ pub mod defaults {
     pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024;
     pub const DEFAULT_GC_PERIOD: &str = "100 s";
     pub const DEFAULT_IMAGE_CREATION_THRESHOLD: usize = 3;
-    pub const DEFAULT_PITR_INTERVAL: &str = "30 days";
+    pub const DEFAULT_PITR_INTERVAL: &str = "7 days";
     pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "2 seconds";
     pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "3 seconds";
     pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 10 * 1024 * 1024;
diff --git a/test_runner/regress/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py
index 6d621fbb77..29cdcb18ce 100644
--- a/test_runner/regress/test_tenant_conf.py
+++ b/test_runner/regress/test_tenant_conf.py
@@ -59,7 +59,7 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}"""
                     "gc_horizon": 67108864,
                     "gc_period": 100,
                     "image_creation_threshold": 3,
-                    "pitr_interval": 2592000,
+                    "pitr_interval": 604800,  # 7 days
                 }.items()
             )
 
@@ -79,7 +79,7 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}"""
                     "gc_horizon": 67108864,
                     "gc_period": 30,
                     "image_creation_threshold": 3,
-                    "pitr_interval": 2592000,
+                    "pitr_interval": 604800,
                 }.items()
             )
 
@@ -107,7 +107,7 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}"""
                     "gc_horizon": 67108864,
                     "gc_period": 80,
                     "image_creation_threshold": 3,
-                    "pitr_interval": 2592000,
+                    "pitr_interval": 604800,
                 }.items()
             )
 
@@ -130,7 +130,7 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}"""
                     "gc_horizon": 67108864,
                     "gc_period": 80,
                     "image_creation_threshold": 3,
-                    "pitr_interval": 2592000,
+                    "pitr_interval": 604800,
                 }.items()
             )
 

From 4c4d3dc87a731734881ed7a88d535f89f02b046f Mon Sep 17 00:00:00 2001
From: Sergey Melnikov <sergey@neon.tech>
Date: Mon, 2 Jan 2023 22:14:05 +0400
Subject: [PATCH 100/132] Add new pageserver to us-east-2 staging (#3248)

---
 .github/ansible/staging.us-east-2.hosts.yaml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/ansible/staging.us-east-2.hosts.yaml b/.github/ansible/staging.us-east-2.hosts.yaml
index 11c7992444..1d1b8dbfa4 100644
--- a/.github/ansible/staging.us-east-2.hosts.yaml
+++ b/.github/ansible/staging.us-east-2.hosts.yaml
@@ -27,6 +27,8 @@ storage:
           ansible_host: i-0c3e70929edb5d691
         pageserver-1.us-east-2.aws.neon.build:
           ansible_host: i-0565a8b4008aa3f40
+        pageserver-2.us-east-2.aws.neon.build:
+          ansible_host: i-01e31cdf7e970586a
 
     safekeepers:
       hosts:

From 5bc9f8eae01deffc0d8cc95dfae482b922bb773e Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Mon, 2 Jan 2023 16:54:39 +0100
Subject: [PATCH 101/132] README: Fedora needs protobuf-devel

Otherwise, common protobufs such as Google's empty.proto are missing,
resulting in storage_broker build.rs failure.

I encountered this on Fedora 36.
---
 README.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 30bde949a9..fa5c1626e4 100644
--- a/README.md
+++ b/README.md
@@ -31,7 +31,8 @@ libssl-dev clang pkg-config libpq-dev cmake postgresql-client protobuf-compiler
 * On Fedora, these packages are needed:
 ```bash
 dnf install flex bison readline-devel zlib-devel openssl-devel \
-  libseccomp-devel perl clang cmake postgresql postgresql-contrib protobuf-compiler
+  libseccomp-devel perl clang cmake postgresql postgresql-contrib protobuf-compiler \
+  protobuf-devel
 ```
 
 2. [Install Rust](https://www.rust-lang.org/tools/install)

From 0a0e55c3d08ce8b6956b9235888946d4ff97a4f7 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Tue, 3 Jan 2023 12:39:11 +0200
Subject: [PATCH 102/132] Replace 'tar' crate with 'tokio-tar' (#3202)

The synchronous 'tar' crate has required us to use block_in_place and
SyncIoBridge to work together with the async I/O in the client
connection. Switch to 'tokio-tar' crate that uses async I/O natively.

As part of this, move the CopyDataWriter implementation to
postgres_backend_async.rs. Even though it's only used in one place
currently, it's in principle generally applicable whenever you want to
use COPY out.

Unfortunately we cannot use the 'tokio-tar' as it is: the Builder
implementation requires the writer to have 'static lifetime. So we
have to use a modified version without that requirement. The 'static
lifetime was required just for the Drop implementation that writes
the end-of-archive sections if the Builder is dropped without calling
`finish`. But we don't actually want that behavior anyway; in fact
we had to jump through some hoops with the AbortableWrite hack to skip
those. With the modified version of 'tokio-tar' without that Drop
implementation, we don't need AbortableWrite either.

Co-authored-by: Kirill Bulatov <kirill@neon.tech>
---
 Cargo.lock                               |  16 +-
 libs/utils/src/postgres_backend_async.rs | 105 ++++++++++-
 pageserver/Cargo.toml                    |   2 +-
 pageserver/src/basebackup.rs             | 229 +++++++++--------------
 pageserver/src/import_datadir.rs         | 103 +++++-----
 pageserver/src/page_service.rs           |  52 +----
 pageserver/src/tenant.rs                 |  32 ++--
 test_runner/regress/test_config.py       |   0
 8 files changed, 278 insertions(+), 261 deletions(-)
 mode change 100644 => 100755 test_runner/regress/test_config.py

diff --git a/Cargo.lock b/Cargo.lock
index 46170717d6..ad1fc67219 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2339,12 +2339,12 @@ dependencies = [
  "signal-hook",
  "storage_broker",
  "svg_fmt",
- "tar",
  "tempfile",
  "tenant_size_model",
  "thiserror",
  "tokio",
  "tokio-postgres",
+ "tokio-tar",
  "tokio-util",
  "toml_edit",
  "tracing",
@@ -3970,6 +3970,20 @@ dependencies = [
  "tokio",
 ]
 
+[[package]]
+name = "tokio-tar"
+version = "0.3.0"
+source = "git+https://github.com/neondatabase/tokio-tar.git?rev=404df61437de0feef49ba2ccdbdd94eb8ad6e142#404df61437de0feef49ba2ccdbdd94eb8ad6e142"
+dependencies = [
+ "filetime",
+ "futures-core",
+ "libc",
+ "redox_syscall",
+ "tokio",
+ "tokio-stream",
+ "xattr",
+]
+
 [[package]]
 name = "tokio-util"
 version = "0.7.4"
diff --git a/libs/utils/src/postgres_backend_async.rs b/libs/utils/src/postgres_backend_async.rs
index dc93131b61..de547c3242 100644
--- a/libs/utils/src/postgres_backend_async.rs
+++ b/libs/utils/src/postgres_backend_async.rs
@@ -5,7 +5,7 @@
 
 use crate::postgres_backend::AuthType;
 use anyhow::{bail, Context, Result};
-use bytes::{Bytes, BytesMut};
+use bytes::{Buf, Bytes, BytesMut};
 use pq_proto::{BeMessage, FeMessage, FeStartupPacket};
 use std::future::Future;
 use std::net::SocketAddr;
@@ -114,7 +114,10 @@ impl AsyncRead for Stream {
 
 pub struct PostgresBackend {
     stream: Stream,
+
     // Output buffer. c.f. BeMessage::write why we are using BytesMut here.
+    // The data between 0 and the "current position", as tracked by the bytes::Buf
+    // implementation of BytesMut, has already been written.
     buf_out: BytesMut,
 
     pub state: ProtoState,
@@ -174,10 +177,13 @@ impl PostgresBackend {
     }
 
     /// Flush output buffer into the socket.
-    pub async fn flush(&mut self) -> std::io::Result<&mut Self> {
-        self.stream.write_all(&self.buf_out).await?;
+    pub async fn flush(&mut self) -> std::io::Result<()> {
+        while self.buf_out.has_remaining() {
+            let bytes_written = self.stream.write(self.buf_out.chunk()).await?;
+            self.buf_out.advance(bytes_written);
+        }
         self.buf_out.clear();
-        Ok(self)
+        Ok(())
     }
 
     /// Write message into internal output buffer.
@@ -186,6 +192,36 @@ impl PostgresBackend {
         Ok(self)
     }
 
+    /// Returns an AsyncWrite implementation that wraps all the data written
+    /// to it in CopyData messages, and writes them to the connection
+    ///
+    /// The caller is responsible for sending CopyOutResponse and CopyDone messages.
+    pub fn copyout_writer(&mut self) -> CopyDataWriter {
+        CopyDataWriter { pgb: self }
+    }
+
+    /// A polling function that tries to write all the data from 'buf_out' to the
+    /// underlying stream.
+    fn poll_write_buf(
+        &mut self,
+        cx: &mut std::task::Context<'_>,
+    ) -> Poll<Result<(), std::io::Error>> {
+        while self.buf_out.has_remaining() {
+            match Pin::new(&mut self.stream).poll_write(cx, self.buf_out.chunk()) {
+                Poll::Ready(Ok(bytes_written)) => {
+                    self.buf_out.advance(bytes_written);
+                }
+                Poll::Ready(Err(err)) => return Poll::Ready(Err(err)),
+                Poll::Pending => return Poll::Pending,
+            }
+        }
+        Poll::Ready(Ok(()))
+    }
+
+    fn poll_flush(&mut self, cx: &mut std::task::Context<'_>) -> Poll<Result<(), std::io::Error>> {
+        Pin::new(&mut self.stream).poll_flush(cx)
+    }
+
     // Wrapper for run_message_loop() that shuts down socket when we are done
     pub async fn run<F, S>(mut self, handler: &mut impl Handler, shutdown_watcher: F) -> Result<()>
     where
@@ -458,3 +494,64 @@ impl PostgresBackend {
         Ok(ProcessMsgResult::Continue)
     }
 }
+
+///
+/// A tokio::io::AsyncWrite implementation that wraps all data written to it in CopyData
+/// messages.
+///
+
+pub struct CopyDataWriter<'a> {
+    pgb: &'a mut PostgresBackend,
+}
+
+impl<'a> AsyncWrite for CopyDataWriter<'a> {
+    fn poll_write(
+        self: Pin<&mut Self>,
+        cx: &mut std::task::Context<'_>,
+        buf: &[u8],
+    ) -> Poll<Result<usize, std::io::Error>> {
+        let this = self.get_mut();
+
+        // It's not strictly required to flush between each message, but makes it easier
+        // to view in wireshark, and usually the messages that the callers write are
+        // decently-sized anyway.
+        match this.pgb.poll_write_buf(cx) {
+            Poll::Ready(Ok(())) => {}
+            Poll::Ready(Err(err)) => return Poll::Ready(Err(err)),
+            Poll::Pending => return Poll::Pending,
+        }
+
+        // CopyData
+        // XXX: if the input is large, we should split it into multiple messages.
+        // Not sure what the threshold should be, but the ultimate hard limit is that
+        // the length cannot exceed u32.
+        this.pgb.write_message(&BeMessage::CopyData(buf))?;
+
+        Poll::Ready(Ok(buf.len()))
+    }
+
+    fn poll_flush(
+        self: Pin<&mut Self>,
+        cx: &mut std::task::Context<'_>,
+    ) -> Poll<Result<(), std::io::Error>> {
+        let this = self.get_mut();
+        match this.pgb.poll_write_buf(cx) {
+            Poll::Ready(Ok(())) => {}
+            Poll::Ready(Err(err)) => return Poll::Ready(Err(err)),
+            Poll::Pending => return Poll::Pending,
+        }
+        this.pgb.poll_flush(cx)
+    }
+    fn poll_shutdown(
+        self: Pin<&mut Self>,
+        cx: &mut std::task::Context<'_>,
+    ) -> Poll<Result<(), std::io::Error>> {
+        let this = self.get_mut();
+        match this.pgb.poll_write_buf(cx) {
+            Poll::Ready(Ok(())) => {}
+            Poll::Ready(Err(err)) => return Poll::Ready(Err(err)),
+            Poll::Pending => return Poll::Pending,
+        }
+        this.pgb.poll_flush(cx)
+    }
+}
diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml
index cd12ee0cc9..c0f3c76c4e 100644
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -49,7 +49,7 @@ serde_json = { version = "1.0", features = ["raw_value"] }
 serde_with = "2.0"
 signal-hook = "0.3.10"
 svg_fmt = "0.4.1"
-tar = "0.4.33"
+tokio-tar = { git = "https://github.com/neondatabase/tokio-tar.git", rev="404df61437de0feef49ba2ccdbdd94eb8ad6e142" }
 thiserror = "1.0"
 tokio = { version = "1.17", features = ["process", "sync", "macros", "fs", "rt", "io-util", "time"] }
 tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs
index 36664e119e..e537048489 100644
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -13,17 +13,22 @@
 use anyhow::{anyhow, bail, ensure, Context, Result};
 use bytes::{BufMut, BytesMut};
 use fail::fail_point;
-use itertools::Itertools;
 use std::fmt::Write as FmtWrite;
-use std::io;
-use std::io::Write;
 use std::sync::Arc;
 use std::time::SystemTime;
-use tar::{Builder, EntryType, Header};
+use tokio::io;
+use tokio::io::AsyncWrite;
 use tracing::*;
 
-use crate::task_mgr;
-use crate::tenant::{with_ondemand_download, PageReconstructResult, Timeline};
+/// NB: This relies on a modified version of tokio_tar that does *not* write the
+/// end-of-archive marker (1024 zero bytes), when the Builder struct is dropped
+/// without explicitly calling 'finish' or 'into_inner'!
+///
+/// See https://github.com/neondatabase/tokio-tar/pull/1
+///
+use tokio_tar::{Builder, EntryType, Header};
+
+use crate::tenant::{with_ondemand_download, Timeline};
 use pageserver_api::reltag::{RelTag, SlruKind};
 
 use postgres_ffi::pg_constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID};
@@ -39,14 +44,13 @@ use utils::lsn::Lsn;
 /// used for constructing tarball.
 pub struct Basebackup<'a, W>
 where
-    W: Write,
+    W: AsyncWrite + Send + Sync + Unpin,
 {
-    ar: Builder<AbortableWrite<W>>,
+    ar: Builder<&'a mut W>,
     timeline: &'a Arc<Timeline>,
     pub lsn: Lsn,
     prev_record_lsn: Lsn,
     full_backup: bool,
-    finished: bool,
 }
 
 // Create basebackup with non-rel data in it.
@@ -59,10 +63,10 @@ where
 //    to start the replication.
 impl<'a, W> Basebackup<'a, W>
 where
-    W: Write,
+    W: AsyncWrite + Send + Sync + Unpin,
 {
     pub fn new(
-        write: W,
+        write: &'a mut W,
         timeline: &'a Arc<Timeline>,
         req_lsn: Option<Lsn>,
         prev_lsn: Option<Lsn>,
@@ -117,22 +121,21 @@ where
         );
 
         Ok(Basebackup {
-            ar: Builder::new(AbortableWrite::new(write)),
+            ar: Builder::new_non_terminated(write),
             timeline,
             lsn: backup_lsn,
             prev_record_lsn: prev_lsn,
             full_backup,
-            finished: false,
         })
     }
 
-    pub fn send_tarball(mut self) -> anyhow::Result<()> {
+    pub async fn send_tarball(mut self) -> anyhow::Result<()> {
         // TODO include checksum
 
         // Create pgdata subdirs structure
         for dir in PGDATA_SUBDIRS.iter() {
             let header = new_tar_header_dir(dir)?;
-            self.ar.append(&header, &mut io::empty())?;
+            self.ar.append(&header, &mut io::empty()).await?;
         }
 
         // Send empty config files.
@@ -140,10 +143,10 @@ where
             if *filepath == "pg_hba.conf" {
                 let data = PG_HBA.as_bytes();
                 let header = new_tar_header(filepath, data.len() as u64)?;
-                self.ar.append(&header, data)?;
+                self.ar.append(&header, data).await?;
             } else {
                 let header = new_tar_header(filepath, 0)?;
-                self.ar.append(&header, &mut io::empty())?;
+                self.ar.append(&header, &mut io::empty()).await?;
             }
         }
 
@@ -154,29 +157,30 @@ where
             SlruKind::MultiXactMembers,
         ] {
             for segno in
-                with_ondemand_download_sync(|| self.timeline.list_slru_segments(kind, self.lsn))?
+                with_ondemand_download(|| self.timeline.list_slru_segments(kind, self.lsn)).await?
             {
-                self.add_slru_segment(kind, segno)?;
+                self.add_slru_segment(kind, segno).await?;
             }
         }
 
         // Create tablespace directories
         for ((spcnode, dbnode), has_relmap_file) in
-            with_ondemand_download_sync(|| self.timeline.list_dbdirs(self.lsn))?
+            with_ondemand_download(|| self.timeline.list_dbdirs(self.lsn)).await?
         {
-            self.add_dbdir(spcnode, dbnode, has_relmap_file)?;
+            self.add_dbdir(spcnode, dbnode, has_relmap_file).await?;
 
             // Gather and send relational files in each database if full backup is requested.
             if self.full_backup {
-                for rel in with_ondemand_download_sync(|| {
-                    self.timeline.list_rels(spcnode, dbnode, self.lsn)
-                })? {
-                    self.add_rel(rel)?;
+                for rel in
+                    with_ondemand_download(|| self.timeline.list_rels(spcnode, dbnode, self.lsn))
+                        .await?
+                {
+                    self.add_rel(rel).await?;
                 }
             }
         }
-        for xid in with_ondemand_download_sync(|| self.timeline.list_twophase_files(self.lsn))? {
-            self.add_twophase_file(xid)?;
+        for xid in with_ondemand_download(|| self.timeline.list_twophase_files(self.lsn)).await? {
+            self.add_twophase_file(xid).await?;
         }
 
         fail_point!("basebackup-before-control-file", |_| {
@@ -184,36 +188,32 @@ where
         });
 
         // Generate pg_control and bootstrap WAL segment.
-        self.add_pgcontrol_file()?;
-        self.ar.finish()?;
-        self.finished = true;
+        self.add_pgcontrol_file().await?;
+        self.ar.finish().await?;
         debug!("all tarred up!");
         Ok(())
     }
 
-    fn add_rel(&mut self, tag: RelTag) -> anyhow::Result<()> {
+    async fn add_rel(&mut self, tag: RelTag) -> anyhow::Result<()> {
         let nblocks =
-            with_ondemand_download_sync(|| self.timeline.get_rel_size(tag, self.lsn, false))?;
-
-        // Function that adds relation segment data to archive
-        let mut add_file = |segment_index, data: &Vec<u8>| -> anyhow::Result<()> {
-            let file_name = tag.to_segfile_name(segment_index as u32);
-            let header = new_tar_header(&file_name, data.len() as u64)?;
-            self.ar.append(&header, data.as_slice())?;
-            Ok(())
-        };
+            with_ondemand_download(|| self.timeline.get_rel_size(tag, self.lsn, false)).await?;
 
         // If the relation is empty, create an empty file
         if nblocks == 0 {
-            add_file(0, &vec![])?;
+            let file_name = tag.to_segfile_name(0);
+            let header = new_tar_header(&file_name, 0)?;
+            self.ar.append(&header, &mut io::empty()).await?;
             return Ok(());
         }
 
         // Add a file for each chunk of blocks (aka segment)
-        let chunks = (0..nblocks).chunks(RELSEG_SIZE as usize);
-        for (seg, blocks) in chunks.into_iter().enumerate() {
+        let mut startblk = 0;
+        let mut seg = 0;
+        while startblk < nblocks {
+            let endblk = std::cmp::min(startblk + RELSEG_SIZE, nblocks);
+
             let mut segment_data: Vec<u8> = vec![];
-            for blknum in blocks {
+            for blknum in startblk..endblk {
                 let img = self
                     .timeline
                     .get_rel_page_at_lsn(tag, blknum, self.lsn, false)
@@ -221,7 +221,12 @@ where
                 segment_data.extend_from_slice(&img[..]);
             }
 
-            add_file(seg, &segment_data)?;
+            let file_name = tag.to_segfile_name(seg as u32);
+            let header = new_tar_header(&file_name, segment_data.len() as u64)?;
+            self.ar.append(&header, segment_data.as_slice()).await?;
+
+            seg += 1;
+            startblk = endblk;
         }
 
         Ok(())
@@ -230,17 +235,18 @@ where
     //
     // Generate SLRU segment files from repository.
     //
-    fn add_slru_segment(&mut self, slru: SlruKind, segno: u32) -> anyhow::Result<()> {
-        let nblocks = with_ondemand_download_sync(|| {
-            self.timeline.get_slru_segment_size(slru, segno, self.lsn)
-        })?;
+    async fn add_slru_segment(&mut self, slru: SlruKind, segno: u32) -> anyhow::Result<()> {
+        let nblocks =
+            with_ondemand_download(|| self.timeline.get_slru_segment_size(slru, segno, self.lsn))
+                .await?;
 
         let mut slru_buf: Vec<u8> = Vec::with_capacity(nblocks as usize * BLCKSZ as usize);
         for blknum in 0..nblocks {
-            let img = with_ondemand_download_sync(|| {
+            let img = with_ondemand_download(|| {
                 self.timeline
                     .get_slru_page_at_lsn(slru, segno, blknum, self.lsn)
-            })?;
+            })
+            .await?;
 
             if slru == SlruKind::Clog {
                 ensure!(img.len() == BLCKSZ as usize || img.len() == BLCKSZ as usize + 8);
@@ -253,7 +259,7 @@ where
 
         let segname = format!("{}/{:>04X}", slru.to_str(), segno);
         let header = new_tar_header(&segname, slru_buf.len() as u64)?;
-        self.ar.append(&header, slru_buf.as_slice())?;
+        self.ar.append(&header, slru_buf.as_slice()).await?;
 
         trace!("Added to basebackup slru {} relsize {}", segname, nblocks);
         Ok(())
@@ -265,16 +271,16 @@ where
     // Each directory contains a PG_VERSION file, and the default database
     // directories also contain pg_filenode.map files.
     //
-    fn add_dbdir(
+    async fn add_dbdir(
         &mut self,
         spcnode: u32,
         dbnode: u32,
         has_relmap_file: bool,
     ) -> anyhow::Result<()> {
         let relmap_img = if has_relmap_file {
-            let img = with_ondemand_download_sync(|| {
-                self.timeline.get_relmap_file(spcnode, dbnode, self.lsn)
-            })?;
+            let img =
+                with_ondemand_download(|| self.timeline.get_relmap_file(spcnode, dbnode, self.lsn))
+                    .await?;
             ensure!(img.len() == 512);
             Some(img)
         } else {
@@ -284,14 +290,14 @@ where
         if spcnode == GLOBALTABLESPACE_OID {
             let pg_version_str = self.timeline.pg_version.to_string();
             let header = new_tar_header("PG_VERSION", pg_version_str.len() as u64)?;
-            self.ar.append(&header, pg_version_str.as_bytes())?;
+            self.ar.append(&header, pg_version_str.as_bytes()).await?;
 
             info!("timeline.pg_version {}", self.timeline.pg_version);
 
             if let Some(img) = relmap_img {
                 // filenode map for global tablespace
                 let header = new_tar_header("global/pg_filenode.map", img.len() as u64)?;
-                self.ar.append(&header, &img[..])?;
+                self.ar.append(&header, &img[..]).await?;
             } else {
                 warn!("global/pg_filenode.map is missing");
             }
@@ -321,18 +327,18 @@ where
             // Append dir path for each database
             let path = format!("base/{}", dbnode);
             let header = new_tar_header_dir(&path)?;
-            self.ar.append(&header, &mut io::empty())?;
+            self.ar.append(&header, &mut io::empty()).await?;
 
             if let Some(img) = relmap_img {
                 let dst_path = format!("base/{}/PG_VERSION", dbnode);
 
                 let pg_version_str = self.timeline.pg_version.to_string();
                 let header = new_tar_header(&dst_path, pg_version_str.len() as u64)?;
-                self.ar.append(&header, pg_version_str.as_bytes())?;
+                self.ar.append(&header, pg_version_str.as_bytes()).await?;
 
                 let relmap_path = format!("base/{}/pg_filenode.map", dbnode);
                 let header = new_tar_header(&relmap_path, img.len() as u64)?;
-                self.ar.append(&header, &img[..])?;
+                self.ar.append(&header, &img[..]).await?;
             }
         };
         Ok(())
@@ -341,8 +347,8 @@ where
     //
     // Extract twophase state files
     //
-    fn add_twophase_file(&mut self, xid: TransactionId) -> anyhow::Result<()> {
-        let img = with_ondemand_download_sync(|| self.timeline.get_twophase_file(xid, self.lsn))?;
+    async fn add_twophase_file(&mut self, xid: TransactionId) -> anyhow::Result<()> {
+        let img = with_ondemand_download(|| self.timeline.get_twophase_file(xid, self.lsn)).await?;
 
         let mut buf = BytesMut::new();
         buf.extend_from_slice(&img[..]);
@@ -350,7 +356,7 @@ where
         buf.put_u32_le(crc);
         let path = format!("pg_twophase/{:>08X}", xid);
         let header = new_tar_header(&path, buf.len() as u64)?;
-        self.ar.append(&header, &buf[..])?;
+        self.ar.append(&header, &buf[..]).await?;
 
         Ok(())
     }
@@ -359,7 +365,7 @@ where
     // Add generated pg_control file and bootstrap WAL segment.
     // Also send zenith.signal file with extra bootstrap data.
     //
-    fn add_pgcontrol_file(&mut self) -> anyhow::Result<()> {
+    async fn add_pgcontrol_file(&mut self) -> anyhow::Result<()> {
         // add zenith.signal file
         let mut zenith_signal = String::new();
         if self.prev_record_lsn == Lsn(0) {
@@ -371,17 +377,19 @@ where
         } else {
             write!(zenith_signal, "PREV LSN: {}", self.prev_record_lsn)?;
         }
-        self.ar.append(
-            &new_tar_header("zenith.signal", zenith_signal.len() as u64)?,
-            zenith_signal.as_bytes(),
-        )?;
+        self.ar
+            .append(
+                &new_tar_header("zenith.signal", zenith_signal.len() as u64)?,
+                zenith_signal.as_bytes(),
+            )
+            .await?;
 
-        let checkpoint_bytes =
-            with_ondemand_download_sync(|| self.timeline.get_checkpoint(self.lsn))
-                .context("failed to get checkpoint bytes")?;
-        let pg_control_bytes =
-            with_ondemand_download_sync(|| self.timeline.get_control_file(self.lsn))
-                .context("failed get control bytes")?;
+        let checkpoint_bytes = with_ondemand_download(|| self.timeline.get_checkpoint(self.lsn))
+            .await
+            .context("failed to get checkpoint bytes")?;
+        let pg_control_bytes = with_ondemand_download(|| self.timeline.get_control_file(self.lsn))
+            .await
+            .context("failed get control bytes")?;
 
         let (pg_control_bytes, system_identifier) = postgres_ffi::generate_pg_control(
             &pg_control_bytes,
@@ -392,7 +400,7 @@ where
 
         //send pg_control
         let header = new_tar_header("global/pg_control", pg_control_bytes.len() as u64)?;
-        self.ar.append(&header, &pg_control_bytes[..])?;
+        self.ar.append(&header, &pg_control_bytes[..]).await?;
 
         //send wal segment
         let segno = self.lsn.segment_number(WAL_SEGMENT_SIZE);
@@ -404,24 +412,11 @@ where
             postgres_ffi::generate_wal_segment(segno, system_identifier, self.timeline.pg_version)
                 .map_err(|e| anyhow!(e).context("Failed generating wal segment"))?;
         ensure!(wal_seg.len() == WAL_SEGMENT_SIZE);
-        self.ar.append(&header, &wal_seg[..])?;
+        self.ar.append(&header, &wal_seg[..]).await?;
         Ok(())
     }
 }
 
-impl<'a, W> Drop for Basebackup<'a, W>
-where
-    W: Write,
-{
-    /// If the basebackup was not finished, prevent the Archive::drop() from
-    /// writing the end-of-archive marker.
-    fn drop(&mut self) {
-        if !self.finished {
-            self.ar.get_mut().abort();
-        }
-    }
-}
-
 //
 // Create new tarball entry header
 //
@@ -457,57 +452,3 @@ fn new_tar_header_dir(path: &str) -> anyhow::Result<Header> {
     header.set_cksum();
     Ok(header)
 }
-
-/// A wrapper that passes through all data to the underlying Write,
-/// until abort() is called.
-///
-/// tar::Builder has an annoying habit of finishing the archive with
-/// a valid tar end-of-archive marker (two 512-byte sectors of zeros),
-/// even if an error occurs and we don't finish building the archive.
-/// We'd rather abort writing the tarball immediately than construct
-/// a seemingly valid but incomplete archive. This wrapper allows us
-/// to swallow the end-of-archive marker that Builder::drop() emits,
-/// without writing it to the underlying sink.
-///
-struct AbortableWrite<W> {
-    w: W,
-    aborted: bool,
-}
-
-impl<W> AbortableWrite<W> {
-    pub fn new(w: W) -> Self {
-        AbortableWrite { w, aborted: false }
-    }
-
-    pub fn abort(&mut self) {
-        self.aborted = true;
-    }
-}
-
-impl<W> Write for AbortableWrite<W>
-where
-    W: Write,
-{
-    fn write(&mut self, data: &[u8]) -> io::Result<usize> {
-        if self.aborted {
-            Ok(data.len())
-        } else {
-            self.w.write(data)
-        }
-    }
-    fn flush(&mut self) -> io::Result<()> {
-        if self.aborted {
-            Ok(())
-        } else {
-            self.w.flush()
-        }
-    }
-}
-
-fn with_ondemand_download_sync<F, T>(f: F) -> anyhow::Result<T>
-where
-    F: Send + Fn() -> PageReconstructResult<T>,
-    T: Send,
-{
-    task_mgr::COMPUTE_REQUEST_RUNTIME.block_on(with_ondemand_download(f))
-}
diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs
index 588b92c13f..bac27f69de 100644
--- a/pageserver/src/import_datadir.rs
+++ b/pageserver/src/import_datadir.rs
@@ -2,12 +2,13 @@
 //! Import data and WAL from a PostgreSQL data directory and WAL segments into
 //! a neon Timeline.
 //!
-use std::fs::File;
-use std::io::{Read, Seek, SeekFrom};
 use std::path::{Path, PathBuf};
 
 use anyhow::{bail, ensure, Context, Result};
 use bytes::Bytes;
+use futures::StreamExt;
+use tokio::io::{AsyncRead, AsyncReadExt};
+use tokio_tar::Archive;
 use tracing::*;
 use walkdir::WalkDir;
 
@@ -42,7 +43,7 @@ pub fn get_lsn_from_controlfile(path: &Path) -> Result<Lsn> {
 /// This is currently only used to import a cluster freshly created by initdb.
 /// The code that deals with the checkpoint would not work right if the
 /// cluster was not shut down cleanly.
-pub fn import_timeline_from_postgres_datadir(
+pub async fn import_timeline_from_postgres_datadir(
     tline: &Timeline,
     pgdata_path: &Path,
     pgdata_lsn: Lsn,
@@ -65,9 +66,11 @@ pub fn import_timeline_from_postgres_datadir(
             let absolute_path = entry.path();
             let relative_path = absolute_path.strip_prefix(pgdata_path)?;
 
-            let file = File::open(absolute_path)?;
+            let mut file = tokio::fs::File::open(absolute_path).await?;
             let len = metadata.len() as usize;
-            if let Some(control_file) = import_file(&mut modification, relative_path, file, len)? {
+            if let Some(control_file) =
+                import_file(&mut modification, relative_path, &mut file, len).await?
+            {
                 pg_control = Some(control_file);
             }
             modification.flush()?;
@@ -102,12 +105,12 @@ pub fn import_timeline_from_postgres_datadir(
 }
 
 // subroutine of import_timeline_from_postgres_datadir(), to load one relation file.
-fn import_rel<Reader: Read>(
-    modification: &mut DatadirModification,
+async fn import_rel(
+    modification: &mut DatadirModification<'_>,
     path: &Path,
     spcoid: Oid,
     dboid: Oid,
-    mut reader: Reader,
+    reader: &mut (impl AsyncRead + Send + Sync + Unpin),
     len: usize,
 ) -> anyhow::Result<()> {
     // Does it look like a relation file?
@@ -148,7 +151,7 @@ fn import_rel<Reader: Read>(
     }
 
     loop {
-        let r = reader.read_exact(&mut buf);
+        let r = reader.read_exact(&mut buf).await;
         match r {
             Ok(_) => {
                 modification.put_rel_page_image(rel, blknum, Bytes::copy_from_slice(&buf))?;
@@ -181,11 +184,11 @@ fn import_rel<Reader: Read>(
 
 /// Import an SLRU segment file
 ///
-fn import_slru<Reader: Read>(
-    modification: &mut DatadirModification,
+async fn import_slru(
+    modification: &mut DatadirModification<'_>,
     slru: SlruKind,
     path: &Path,
-    mut reader: Reader,
+    reader: &mut (impl AsyncRead + Send + Sync + Unpin),
     len: usize,
 ) -> anyhow::Result<()> {
     info!("importing slru file {path:?}");
@@ -206,7 +209,7 @@ fn import_slru<Reader: Read>(
 
     let mut rpageno = 0;
     loop {
-        let r = reader.read_exact(&mut buf);
+        let r = reader.read_exact(&mut buf).await;
         match r {
             Ok(_) => {
                 modification.put_slru_page_image(
@@ -243,6 +246,7 @@ fn import_wal(
     startpoint: Lsn,
     endpoint: Lsn,
 ) -> anyhow::Result<()> {
+    use std::io::Read;
     let mut waldecoder = WalStreamDecoder::new(startpoint, tline.pg_version);
 
     let mut segno = startpoint.segment_number(WAL_SEGMENT_SIZE);
@@ -265,10 +269,11 @@ fn import_wal(
         }
 
         // Slurp the WAL file
-        let mut file = File::open(&path)?;
+        let mut file = std::fs::File::open(&path)?;
 
         if offset > 0 {
-            file.seek(SeekFrom::Start(offset as u64))?;
+            use std::io::Seek;
+            file.seek(std::io::SeekFrom::Start(offset as u64))?;
         }
 
         let nread = file.read_to_end(&mut buf)?;
@@ -310,9 +315,9 @@ fn import_wal(
     Ok(())
 }
 
-pub fn import_basebackup_from_tar<Reader: Read>(
+pub async fn import_basebackup_from_tar(
     tline: &Timeline,
-    reader: Reader,
+    reader: &mut (impl AsyncRead + Send + Sync + Unpin),
     base_lsn: Lsn,
 ) -> Result<()> {
     info!("importing base at {base_lsn}");
@@ -322,21 +327,24 @@ pub fn import_basebackup_from_tar<Reader: Read>(
     let mut pg_control: Option<ControlFileData> = None;
 
     // Import base
-    for base_tar_entry in tar::Archive::new(reader).entries()? {
-        let entry = base_tar_entry?;
+    let mut entries = Archive::new(reader).entries()?;
+    while let Some(base_tar_entry) = entries.next().await {
+        let mut entry = base_tar_entry?;
         let header = entry.header();
         let len = header.entry_size()? as usize;
         let file_path = header.path()?.into_owned();
 
         match header.entry_type() {
-            tar::EntryType::Regular => {
-                if let Some(res) = import_file(&mut modification, file_path.as_ref(), entry, len)? {
+            tokio_tar::EntryType::Regular => {
+                if let Some(res) =
+                    import_file(&mut modification, file_path.as_ref(), &mut entry, len).await?
+                {
                     // We found the pg_control file.
                     pg_control = Some(res);
                 }
                 modification.flush()?;
             }
-            tar::EntryType::Directory => {
+            tokio_tar::EntryType::Directory => {
                 debug!("directory {:?}", file_path);
             }
             _ => {
@@ -356,9 +364,9 @@ pub fn import_basebackup_from_tar<Reader: Read>(
     Ok(())
 }
 
-pub fn import_wal_from_tar<Reader: Read>(
+pub async fn import_wal_from_tar(
     tline: &Timeline,
-    reader: Reader,
+    reader: &mut (impl AsyncRead + Send + Sync + Unpin),
     start_lsn: Lsn,
     end_lsn: Lsn,
 ) -> Result<()> {
@@ -371,16 +379,19 @@ pub fn import_wal_from_tar<Reader: Read>(
 
     // Ingest wal until end_lsn
     info!("importing wal until {}", end_lsn);
-    let mut pg_wal_tar = tar::Archive::new(reader);
-    let mut pg_wal_entries_iter = pg_wal_tar.entries()?;
+    let mut pg_wal_tar = Archive::new(reader);
+    let mut pg_wal_entries = pg_wal_tar.entries()?;
     while last_lsn <= end_lsn {
         let bytes = {
-            let entry = pg_wal_entries_iter.next().expect("expected more wal")?;
+            let mut entry = pg_wal_entries
+                .next()
+                .await
+                .ok_or_else(|| anyhow::anyhow!("expected more wal"))??;
             let header = entry.header();
             let file_path = header.path()?.into_owned();
 
             match header.entry_type() {
-                tar::EntryType::Regular => {
+                tokio_tar::EntryType::Regular => {
                     // FIXME: assume postgresql tli 1 for now
                     let expected_filename = XLogFileName(1, segno, WAL_SEGMENT_SIZE);
                     let file_name = file_path
@@ -390,9 +401,9 @@ pub fn import_wal_from_tar<Reader: Read>(
                     ensure!(expected_filename == file_name);
 
                     debug!("processing wal file {:?}", file_path);
-                    read_all_bytes(entry)?
+                    read_all_bytes(&mut entry).await?
                 }
-                tar::EntryType::Directory => {
+                tokio_tar::EntryType::Directory => {
                     debug!("directory {:?}", file_path);
                     continue;
                 }
@@ -433,7 +444,7 @@ pub fn import_wal_from_tar<Reader: Read>(
     }
 
     // Log any extra unused files
-    for e in &mut pg_wal_entries_iter {
+    while let Some(e) = pg_wal_entries.next().await {
         let entry = e?;
         let header = entry.header();
         let file_path = header.path()?.into_owned();
@@ -443,10 +454,10 @@ pub fn import_wal_from_tar<Reader: Read>(
     Ok(())
 }
 
-fn import_file<Reader: Read>(
-    modification: &mut DatadirModification,
+async fn import_file(
+    modification: &mut DatadirModification<'_>,
     file_path: &Path,
-    reader: Reader,
+    reader: &mut (impl AsyncRead + Send + Sync + Unpin),
     len: usize,
 ) -> Result<Option<ControlFileData>> {
     let file_name = match file_path.file_name() {
@@ -466,7 +477,7 @@ fn import_file<Reader: Read>(
 
         match file_name.as_ref() {
             "pg_control" => {
-                let bytes = read_all_bytes(reader)?;
+                let bytes = read_all_bytes(reader).await?;
 
                 // Extract the checkpoint record and import it separately.
                 let pg_control = ControlFileData::decode(&bytes[..])?;
@@ -479,7 +490,7 @@ fn import_file<Reader: Read>(
                 return Ok(Some(pg_control));
             }
             "pg_filenode.map" => {
-                let bytes = read_all_bytes(reader)?;
+                let bytes = read_all_bytes(reader).await?;
                 modification.put_relmap_file(spcnode, dbnode, bytes)?;
                 debug!("imported relmap file")
             }
@@ -487,7 +498,7 @@ fn import_file<Reader: Read>(
                 debug!("ignored PG_VERSION file");
             }
             _ => {
-                import_rel(modification, file_path, spcnode, dbnode, reader, len)?;
+                import_rel(modification, file_path, spcnode, dbnode, reader, len).await?;
                 debug!("imported rel creation");
             }
         }
@@ -502,7 +513,7 @@ fn import_file<Reader: Read>(
 
         match file_name.as_ref() {
             "pg_filenode.map" => {
-                let bytes = read_all_bytes(reader)?;
+                let bytes = read_all_bytes(reader).await?;
                 modification.put_relmap_file(spcnode, dbnode, bytes)?;
                 debug!("imported relmap file")
             }
@@ -510,36 +521,36 @@ fn import_file<Reader: Read>(
                 debug!("ignored PG_VERSION file");
             }
             _ => {
-                import_rel(modification, file_path, spcnode, dbnode, reader, len)?;
+                import_rel(modification, file_path, spcnode, dbnode, reader, len).await?;
                 debug!("imported rel creation");
             }
         }
     } else if file_path.starts_with("pg_xact") {
         let slru = SlruKind::Clog;
 
-        import_slru(modification, slru, file_path, reader, len)?;
+        import_slru(modification, slru, file_path, reader, len).await?;
         debug!("imported clog slru");
     } else if file_path.starts_with("pg_multixact/offsets") {
         let slru = SlruKind::MultiXactOffsets;
 
-        import_slru(modification, slru, file_path, reader, len)?;
+        import_slru(modification, slru, file_path, reader, len).await?;
         debug!("imported multixact offsets slru");
     } else if file_path.starts_with("pg_multixact/members") {
         let slru = SlruKind::MultiXactMembers;
 
-        import_slru(modification, slru, file_path, reader, len)?;
+        import_slru(modification, slru, file_path, reader, len).await?;
         debug!("imported multixact members slru");
     } else if file_path.starts_with("pg_twophase") {
         let xid = u32::from_str_radix(file_name.as_ref(), 16)?;
 
-        let bytes = read_all_bytes(reader)?;
+        let bytes = read_all_bytes(reader).await?;
         modification.put_twophase_file(xid, Bytes::copy_from_slice(&bytes[..]))?;
         debug!("imported twophase file");
     } else if file_path.starts_with("pg_wal") {
         debug!("found wal file in base section. ignore it");
     } else if file_path.starts_with("zenith.signal") {
         // Parse zenith signal file to set correct previous LSN
-        let bytes = read_all_bytes(reader)?;
+        let bytes = read_all_bytes(reader).await?;
         // zenith.signal format is "PREV LSN: prev_lsn"
         // TODO write serialization and deserialization in the same place.
         let zenith_signal = std::str::from_utf8(&bytes)?.trim();
@@ -576,8 +587,8 @@ fn import_file<Reader: Read>(
     Ok(None)
 }
 
-fn read_all_bytes<Reader: Read>(mut reader: Reader) -> Result<Bytes> {
+async fn read_all_bytes(reader: &mut (impl AsyncRead + Send + Sync + Unpin)) -> Result<Bytes> {
     let mut buf: Vec<u8> = vec![];
-    reader.read_to_end(&mut buf)?;
+    reader.read_to_end(&mut buf).await?;
     Ok(Bytes::copy_from_slice(&buf[..]))
 }
diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index b84b2694f4..5393fca780 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -26,9 +26,6 @@ use std::str;
 use std::str::FromStr;
 use std::sync::Arc;
 use std::time::Duration;
-use tokio::pin;
-use tokio_util::io::StreamReader;
-use tokio_util::io::SyncIoBridge;
 use tracing::*;
 use utils::id::ConnectionId;
 use utils::{
@@ -395,9 +392,7 @@ impl PageServerHandler {
         pgb.write_message(&BeMessage::CopyInResponse)?;
         pgb.flush().await?;
 
-        let copyin_stream = copyin_stream(pgb);
-        pin!(copyin_stream);
-
+        let mut copyin_stream = Box::pin(copyin_stream(pgb));
         timeline
             .import_basebackup_from_tar(&mut copyin_stream, base_lsn)
             .await?;
@@ -443,8 +438,8 @@ impl PageServerHandler {
         pgb.write_message(&BeMessage::CopyInResponse)?;
         pgb.flush().await?;
         let mut copyin_stream = Box::pin(copyin_stream(pgb));
-        let reader = SyncIoBridge::new(StreamReader::new(&mut copyin_stream));
-        tokio::task::block_in_place(|| import_wal_from_tar(&timeline, reader, start_lsn, end_lsn))?;
+        let mut reader = tokio_util::io::StreamReader::new(&mut copyin_stream);
+        import_wal_from_tar(&timeline, &mut reader, start_lsn, end_lsn).await?;
         info!("wal import complete");
 
         // Drain the rest of the Copy data
@@ -649,16 +644,14 @@ impl PageServerHandler {
         pgb.flush().await?;
 
         /* Send a tarball of the latest layer on the timeline */
-        let mut writer = CopyDataSink {
-            pgb,
-            rt: tokio::runtime::Handle::current(),
-        };
-        tokio::task::block_in_place(|| {
+        {
+            let mut writer = pgb.copyout_writer();
             let basebackup =
                 basebackup::Basebackup::new(&mut writer, &timeline, lsn, prev_lsn, full_backup)?;
             tracing::Span::current().record("lsn", basebackup.lsn.to_string().as_str());
-            basebackup.send_tarball()
-        })?;
+            basebackup.send_tarball().await?;
+        }
+
         pgb.write_message(&BeMessage::CopyDone)?;
         pgb.flush().await?;
         info!("basebackup complete");
@@ -966,32 +959,3 @@ async fn get_active_timeline_with_timeout(
         .await
         .and_then(|tenant| tenant.get_timeline(timeline_id, true))
 }
-
-///
-/// A std::io::Write implementation that wraps all data written to it in CopyData
-/// messages.
-///
-struct CopyDataSink<'a> {
-    pgb: &'a mut PostgresBackend,
-    rt: tokio::runtime::Handle,
-}
-
-impl<'a> io::Write for CopyDataSink<'a> {
-    fn write(&mut self, data: &[u8]) -> io::Result<usize> {
-        // CopyData
-        // FIXME: if the input is large, we should split it into multiple messages.
-        // Not sure what the threshold should be, but the ultimate hard limit is that
-        // the length cannot exceed u32.
-        // FIXME: flush isn't really required, but makes it easier
-        // to view in wireshark
-        self.pgb.write_message(&BeMessage::CopyData(data))?;
-        self.rt.block_on(self.pgb.flush())?;
-        trace!("CopyData sent for {} bytes!", data.len());
-
-        Ok(data.len())
-    }
-    fn flush(&mut self) -> io::Result<()> {
-        // no-op
-        Ok(())
-    }
-}
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 4c93490177..dcaa8ea268 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -18,8 +18,6 @@ use pageserver_api::models::TimelineState;
 use remote_storage::DownloadError;
 use remote_storage::GenericRemoteStorage;
 use tokio::sync::watch;
-use tokio_util::io::StreamReader;
-use tokio_util::io::SyncIoBridge;
 use tracing::*;
 use utils::crashsafe::path_with_suffix_extension;
 
@@ -36,7 +34,6 @@ use std::io::Write;
 use std::ops::Bound::Included;
 use std::path::Path;
 use std::path::PathBuf;
-use std::pin::Pin;
 use std::process::Command;
 use std::process::Stdio;
 use std::sync::Arc;
@@ -236,21 +233,15 @@ impl UninitializedTimeline<'_> {
     /// Prepares timeline data by loading it from the basebackup archive.
     pub async fn import_basebackup_from_tar(
         self,
-        mut copyin_stream: &mut Pin<&mut impl Stream<Item = io::Result<Bytes>>>,
+        copyin_stream: &mut (impl Stream<Item = io::Result<Bytes>> + Sync + Send + Unpin),
         base_lsn: Lsn,
     ) -> anyhow::Result<Arc<Timeline>> {
         let raw_timeline = self.raw_timeline()?;
 
-        // import_basebackup_from_tar() is not async, mainly because the Tar crate
-        // it uses is not async. So we need to jump through some hoops:
-        // - convert the input from client connection to a synchronous Read
-        // - use block_in_place()
-        let reader = SyncIoBridge::new(StreamReader::new(&mut copyin_stream));
-
-        tokio::task::block_in_place(|| {
-            import_datadir::import_basebackup_from_tar(raw_timeline, reader, base_lsn)
-                .context("Failed to import basebackup")
-        })?;
+        let mut reader = tokio_util::io::StreamReader::new(copyin_stream);
+        import_datadir::import_basebackup_from_tar(raw_timeline, &mut reader, base_lsn)
+            .await
+            .context("Failed to import basebackup")?;
 
         // Flush loop needs to be spawned in order to be able to flush.
         // We want to run proper checkpoint before we mark timeline as available to outside world
@@ -2139,13 +2130,12 @@ impl Tenant {
         let tenant_id = raw_timeline.owning_tenant.tenant_id;
         let unfinished_timeline = raw_timeline.raw_timeline()?;
 
-        tokio::task::block_in_place(|| {
-            import_datadir::import_timeline_from_postgres_datadir(
-                unfinished_timeline,
-                pgdata_path,
-                pgdata_lsn,
-            )
-        })
+        import_datadir::import_timeline_from_postgres_datadir(
+            unfinished_timeline,
+            pgdata_path,
+            pgdata_lsn,
+        )
+        .await
         .with_context(|| {
             format!("Failed to import pgdatadir for timeline {tenant_id}/{timeline_id}")
         })?;
diff --git a/test_runner/regress/test_config.py b/test_runner/regress/test_config.py
old mode 100644
new mode 100755

From 8b692e131bdb5010a784032cc5e399f15d256bd6 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Tue, 3 Jan 2023 14:44:42 +0200
Subject: [PATCH 103/132] Enable on-demand download in WalIngest. (#3233)

Makes the top-level functions in WalIngest async, and replaces
no_ondemand_download calls with with_ondemand_download.

This hopefully fixes the problem reported in issue #3230, although I
don't have a self-contained test case for it.
---
 pageserver/src/basebackup.rs                  |  15 +-
 pageserver/src/import_datadir.rs              |  13 +-
 pageserver/src/walingest.rs                   | 332 +++++++++---------
 .../src/walreceiver/walreceiver_connection.rs |  21 +-
 4 files changed, 187 insertions(+), 194 deletions(-)

diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs
index e537048489..4052f13875 100644
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -214,10 +214,11 @@ where
 
             let mut segment_data: Vec<u8> = vec![];
             for blknum in startblk..endblk {
-                let img = self
-                    .timeline
-                    .get_rel_page_at_lsn(tag, blknum, self.lsn, false)
-                    .no_ondemand_download()?;
+                let img = with_ondemand_download(|| {
+                    self.timeline
+                        .get_rel_page_at_lsn(tag, blknum, self.lsn, false)
+                })
+                .await?;
                 segment_data.extend_from_slice(&img[..]);
             }
 
@@ -313,10 +314,8 @@ where
             // XLOG_TBLSPC_DROP records. But we probably should just
             // throw an error on CREATE TABLESPACE in the first place.
             if !has_relmap_file
-                && self
-                    .timeline
-                    .list_rels(spcnode, dbnode, self.lsn)
-                    .no_ondemand_download()?
+                && with_ondemand_download(|| self.timeline.list_rels(spcnode, dbnode, self.lsn))
+                    .await?
                     .is_empty()
             {
                 return Ok(());
diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs
index bac27f69de..ca1514dd00 100644
--- a/pageserver/src/import_datadir.rs
+++ b/pageserver/src/import_datadir.rs
@@ -99,7 +99,8 @@ pub async fn import_timeline_from_postgres_datadir(
         tline,
         Lsn(pg_control.checkPointCopy.redo),
         pgdata_lsn,
-    )?;
+    )
+    .await?;
 
     Ok(())
 }
@@ -240,7 +241,7 @@ async fn import_slru(
 
 /// Scan PostgreSQL WAL files in given directory and load all records between
 /// 'startpoint' and 'endpoint' into the repository.
-fn import_wal(
+async fn import_wal(
     walpath: &Path,
     tline: &Timeline,
     startpoint: Lsn,
@@ -253,7 +254,7 @@ fn import_wal(
     let mut offset = startpoint.segment_offset(WAL_SEGMENT_SIZE);
     let mut last_lsn = startpoint;
 
-    let mut walingest = WalIngest::new(tline, startpoint).no_ondemand_download()?;
+    let mut walingest = WalIngest::new(tline, startpoint).await?;
 
     while last_lsn <= endpoint {
         // FIXME: assume postgresql tli 1 for now
@@ -291,7 +292,7 @@ fn import_wal(
             if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
                 walingest
                     .ingest_record(recdata, lsn, &mut modification, &mut decoded)
-                    .no_ondemand_download()?;
+                    .await?;
                 last_lsn = lsn;
 
                 nrecords += 1;
@@ -375,7 +376,7 @@ pub async fn import_wal_from_tar(
     let mut segno = start_lsn.segment_number(WAL_SEGMENT_SIZE);
     let mut offset = start_lsn.segment_offset(WAL_SEGMENT_SIZE);
     let mut last_lsn = start_lsn;
-    let mut walingest = WalIngest::new(tline, start_lsn).no_ondemand_download()?;
+    let mut walingest = WalIngest::new(tline, start_lsn).await?;
 
     // Ingest wal until end_lsn
     info!("importing wal until {}", end_lsn);
@@ -425,7 +426,7 @@ pub async fn import_wal_from_tar(
             if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
                 walingest
                     .ingest_record(recdata, lsn, &mut modification, &mut decoded)
-                    .no_ondemand_download()?;
+                    .await?;
                 last_lsn = lsn;
 
                 debug!("imported record at {} (end {})", lsn, end_lsn);
diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs
index 031b80a6e0..1c974f7e2a 100644
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -21,7 +21,6 @@
 //! redo Postgres process, but some records it can handle directly with
 //! bespoken Rust code.
 
-use anyhow::Context;
 use postgres_ffi::v14::nonrelfile_utils::clogpage_precedes;
 use postgres_ffi::v14::nonrelfile_utils::slru_may_delete_clogsegment;
 use postgres_ffi::{fsm_logical_to_physical, page_is_new, page_set_lsn};
@@ -31,12 +30,10 @@ use bytes::{Buf, Bytes, BytesMut};
 use tracing::*;
 
 use crate::pgdatadir_mapping::*;
-use crate::tenant::PageReconstructResult;
 use crate::tenant::Timeline;
-use crate::try_page_reconstruct_result as try_prr;
+use crate::tenant::{with_ondemand_download, PageReconstructError};
 use crate::walrecord::*;
 use crate::ZERO_PAGE;
-use crate::{try_no_ondemand_download, try_page_reconstruct_result};
 use pageserver_api::reltag::{RelTag, SlruKind};
 use postgres_ffi::pg_constants;
 use postgres_ffi::relfile_utils::{FSM_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM};
@@ -55,16 +52,15 @@ pub struct WalIngest<'a> {
 }
 
 impl<'a> WalIngest<'a> {
-    pub fn new(timeline: &Timeline, startpoint: Lsn) -> PageReconstructResult<WalIngest> {
+    pub async fn new(timeline: &Timeline, startpoint: Lsn) -> anyhow::Result<WalIngest> {
         // Fetch the latest checkpoint into memory, so that we can compare with it
         // quickly in `ingest_record` and update it when it changes.
-        let checkpoint_bytes = try_no_ondemand_download!(timeline.get_checkpoint(startpoint));
-        let checkpoint = try_page_reconstruct_result!(
-            CheckPoint::decode(&checkpoint_bytes).context("Failed to decode checkpoint bytes")
-        );
+        let checkpoint_bytes =
+            with_ondemand_download(|| timeline.get_checkpoint(startpoint)).await?;
+        let checkpoint = CheckPoint::decode(&checkpoint_bytes)?;
         trace!("CheckPoint.nextXid = {}", checkpoint.nextXid.value);
 
-        PageReconstructResult::Success(WalIngest {
+        Ok(WalIngest {
             timeline,
             checkpoint,
             checkpoint_modified: false,
@@ -79,18 +75,15 @@ impl<'a> WalIngest<'a> {
     /// Helper function to parse a WAL record and call the Timeline's PUT functions for all the
     /// relations/pages that the record affects.
     ///
-    pub fn ingest_record(
+    pub async fn ingest_record(
         &mut self,
         recdata: Bytes,
         lsn: Lsn,
-        modification: &mut DatadirModification,
+        modification: &mut DatadirModification<'_>,
         decoded: &mut DecodedWALRecord,
-    ) -> PageReconstructResult<()> {
+    ) -> anyhow::Result<()> {
         modification.lsn = lsn;
-        try_prr!(
-            decode_wal_record(recdata, decoded, self.timeline.pg_version)
-                .context("failed decoding wal record")
-        );
+        decode_wal_record(recdata, decoded, self.timeline.pg_version)?;
 
         let mut buf = decoded.record.clone();
         buf.advance(decoded.main_data_offset);
@@ -105,7 +98,8 @@ impl<'a> WalIngest<'a> {
         if decoded.xl_rmid == pg_constants::RM_HEAP_ID
             || decoded.xl_rmid == pg_constants::RM_HEAP2_ID
         {
-            try_prr!(self.ingest_heapam_record(&mut buf, modification, decoded));
+            self.ingest_heapam_record(&mut buf, modification, decoded)
+                .await?;
         }
         // Handle other special record types
         if decoded.xl_rmid == pg_constants::RM_SMGR_ID
@@ -113,13 +107,14 @@ impl<'a> WalIngest<'a> {
                 == pg_constants::XLOG_SMGR_CREATE
         {
             let create = XlSmgrCreate::decode(&mut buf);
-            try_prr!(self.ingest_xlog_smgr_create(modification, &create));
+            self.ingest_xlog_smgr_create(modification, &create)?;
         } else if decoded.xl_rmid == pg_constants::RM_SMGR_ID
             && (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
                 == pg_constants::XLOG_SMGR_TRUNCATE
         {
             let truncate = XlSmgrTruncate::decode(&mut buf);
-            try_prr!(self.ingest_xlog_smgr_truncate(modification, &truncate));
+            self.ingest_xlog_smgr_truncate(modification, &truncate)
+                .await?;
         } else if decoded.xl_rmid == pg_constants::RM_DBASE_ID {
             debug!(
                 "handle RM_DBASE_ID for Postgres version {:?}",
@@ -132,14 +127,15 @@ impl<'a> WalIngest<'a> {
                     let createdb = XlCreateDatabase::decode(&mut buf);
                     debug!("XLOG_DBASE_CREATE v14");
 
-                    try_prr!(self.ingest_xlog_dbase_create(modification, &createdb));
+                    self.ingest_xlog_dbase_create(modification, &createdb)
+                        .await?;
                 } else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
                     == postgres_ffi::v14::bindings::XLOG_DBASE_DROP
                 {
                     let dropdb = XlDropDatabase::decode(&mut buf);
                     for tablespace_id in dropdb.tablespace_ids {
                         trace!("Drop db {}, {}", tablespace_id, dropdb.db_id);
-                        try_prr!(modification.drop_dbdir(tablespace_id, dropdb.db_id));
+                        modification.drop_dbdir(tablespace_id, dropdb.db_id)?;
                     }
                 }
             } else if self.timeline.pg_version == 15 {
@@ -155,14 +151,15 @@ impl<'a> WalIngest<'a> {
                     // So we can reuse XlCreateDatabase here.
                     debug!("XLOG_DBASE_CREATE_FILE_COPY");
                     let createdb = XlCreateDatabase::decode(&mut buf);
-                    try_prr!(self.ingest_xlog_dbase_create(modification, &createdb));
+                    self.ingest_xlog_dbase_create(modification, &createdb)
+                        .await?;
                 } else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
                     == postgres_ffi::v15::bindings::XLOG_DBASE_DROP
                 {
                     let dropdb = XlDropDatabase::decode(&mut buf);
                     for tablespace_id in dropdb.tablespace_ids {
                         trace!("Drop db {}, {}", tablespace_id, dropdb.db_id);
-                        try_prr!(modification.drop_dbdir(tablespace_id, dropdb.db_id));
+                        modification.drop_dbdir(tablespace_id, dropdb.db_id)?;
                     }
                 }
             }
@@ -174,38 +171,42 @@ impl<'a> WalIngest<'a> {
                 let pageno = buf.get_u32_le();
                 let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
                 let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
-                try_prr!(self.put_slru_page_image(
+                self.put_slru_page_image(
                     modification,
                     SlruKind::Clog,
                     segno,
                     rpageno,
                     ZERO_PAGE.clone(),
-                ));
+                )
+                .await?;
             } else {
                 assert!(info == pg_constants::CLOG_TRUNCATE);
                 let xlrec = XlClogTruncate::decode(&mut buf);
-                try_prr!(self.ingest_clog_truncate_record(modification, &xlrec));
+                self.ingest_clog_truncate_record(modification, &xlrec)
+                    .await?;
             }
         } else if decoded.xl_rmid == pg_constants::RM_XACT_ID {
             let info = decoded.xl_info & pg_constants::XLOG_XACT_OPMASK;
             if info == pg_constants::XLOG_XACT_COMMIT || info == pg_constants::XLOG_XACT_ABORT {
                 let parsed_xact =
                     XlXactParsedRecord::decode(&mut buf, decoded.xl_xid, decoded.xl_info);
-                try_prr!(self.ingest_xact_record(
+                self.ingest_xact_record(
                     modification,
                     &parsed_xact,
                     info == pg_constants::XLOG_XACT_COMMIT,
-                ));
+                )
+                .await?;
             } else if info == pg_constants::XLOG_XACT_COMMIT_PREPARED
                 || info == pg_constants::XLOG_XACT_ABORT_PREPARED
             {
                 let parsed_xact =
                     XlXactParsedRecord::decode(&mut buf, decoded.xl_xid, decoded.xl_info);
-                try_prr!(self.ingest_xact_record(
+                self.ingest_xact_record(
                     modification,
                     &parsed_xact,
                     info == pg_constants::XLOG_XACT_COMMIT_PREPARED,
-                ));
+                )
+                .await?;
                 // Remove twophase file. see RemoveTwoPhaseFile() in postgres code
                 trace!(
                     "Drop twophaseFile for xid {} parsed_xact.xid {} here at {}",
@@ -213,10 +214,9 @@ impl<'a> WalIngest<'a> {
                     parsed_xact.xid,
                     lsn,
                 );
-                try_prr!(modification.drop_twophase_file(parsed_xact.xid));
+                modification.drop_twophase_file(parsed_xact.xid)?;
             } else if info == pg_constants::XLOG_XACT_PREPARE {
-                try_prr!(modification
-                    .put_twophase_file(decoded.xl_xid, Bytes::copy_from_slice(&buf[..])));
+                modification.put_twophase_file(decoded.xl_xid, Bytes::copy_from_slice(&buf[..]))?;
             }
         } else if decoded.xl_rmid == pg_constants::RM_MULTIXACT_ID {
             let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
@@ -225,34 +225,36 @@ impl<'a> WalIngest<'a> {
                 let pageno = buf.get_u32_le();
                 let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
                 let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
-                try_prr!(self.put_slru_page_image(
+                self.put_slru_page_image(
                     modification,
                     SlruKind::MultiXactOffsets,
                     segno,
                     rpageno,
                     ZERO_PAGE.clone(),
-                ));
+                )
+                .await?;
             } else if info == pg_constants::XLOG_MULTIXACT_ZERO_MEM_PAGE {
                 let pageno = buf.get_u32_le();
                 let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
                 let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
-                try_prr!(self.put_slru_page_image(
+                self.put_slru_page_image(
                     modification,
                     SlruKind::MultiXactMembers,
                     segno,
                     rpageno,
                     ZERO_PAGE.clone(),
-                ));
+                )
+                .await?;
             } else if info == pg_constants::XLOG_MULTIXACT_CREATE_ID {
                 let xlrec = XlMultiXactCreate::decode(&mut buf);
-                try_prr!(self.ingest_multixact_create_record(modification, &xlrec));
+                self.ingest_multixact_create_record(modification, &xlrec)?;
             } else if info == pg_constants::XLOG_MULTIXACT_TRUNCATE_ID {
                 let xlrec = XlMultiXactTruncate::decode(&mut buf);
-                try_prr!(self.ingest_multixact_truncate_record(modification, &xlrec));
+                self.ingest_multixact_truncate_record(modification, &xlrec)?;
             }
         } else if decoded.xl_rmid == pg_constants::RM_RELMAP_ID {
             let xlrec = XlRelmapUpdate::decode(&mut buf);
-            try_prr!(self.ingest_relmap_page(modification, &xlrec, decoded));
+            self.ingest_relmap_page(modification, &xlrec, decoded)?;
         } else if decoded.xl_rmid == pg_constants::RM_XLOG_ID {
             let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
             if info == pg_constants::XLOG_NEXTOID {
@@ -266,9 +268,7 @@ impl<'a> WalIngest<'a> {
             {
                 let mut checkpoint_bytes = [0u8; SIZEOF_CHECKPOINT];
                 buf.copy_to_slice(&mut checkpoint_bytes);
-                let xlog_checkpoint = try_prr!(
-                    CheckPoint::decode(&checkpoint_bytes).context("deserialize CheckPoint")
-                );
+                let xlog_checkpoint = CheckPoint::decode(&checkpoint_bytes)?;
                 trace!(
                     "xlog_checkpoint.oldestXid={}, checkpoint.oldestXid={}",
                     xlog_checkpoint.oldestXid,
@@ -289,32 +289,32 @@ impl<'a> WalIngest<'a> {
         // Iterate through all the blocks that the record modifies, and
         // "put" a separate copy of the record for each block.
         for blk in decoded.blocks.iter() {
-            try_no_ondemand_download!(self.ingest_decoded_block(modification, lsn, decoded, blk));
+            self.ingest_decoded_block(modification, lsn, decoded, blk)
+                .await?;
         }
 
         // If checkpoint data was updated, store the new version in the repository
         if self.checkpoint_modified {
-            let new_checkpoint_bytes =
-                try_prr!(self.checkpoint.encode().context("encode checkpoint"));
+            let new_checkpoint_bytes = self.checkpoint.encode()?;
 
-            try_prr!(modification.put_checkpoint(new_checkpoint_bytes));
+            modification.put_checkpoint(new_checkpoint_bytes)?;
             self.checkpoint_modified = false;
         }
 
         // Now that this record has been fully handled, including updating the
         // checkpoint data, let the repository know that it is up-to-date to this LSN
-        try_prr!(modification.commit());
+        modification.commit()?;
 
-        PageReconstructResult::Success(())
+        Ok(())
     }
 
-    fn ingest_decoded_block(
+    async fn ingest_decoded_block(
         &mut self,
-        modification: &mut DatadirModification,
+        modification: &mut DatadirModification<'_>,
         lsn: Lsn,
         decoded: &DecodedWALRecord,
         blk: &DecodedBkpBlock,
-    ) -> PageReconstructResult<()> {
+    ) -> Result<(), PageReconstructError> {
         let rel = RelTag {
             spcnode: blk.rnode_spcnode,
             dbnode: blk.rnode_dbnode,
@@ -334,7 +334,7 @@ impl<'a> WalIngest<'a> {
             && (decoded.xl_info == pg_constants::XLOG_FPI
                 || decoded.xl_info == pg_constants::XLOG_FPI_FOR_HINT)
         // compression of WAL is not yet supported: fall back to storing the original WAL record
-            && !try_prr!(postgres_ffi::bkpimage_is_compressed(blk.bimg_info, self.timeline.pg_version))
+            && !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, self.timeline.pg_version)?
         {
             // Extract page image from FPI record
             let img_len = blk.bimg_len as usize;
@@ -356,28 +356,25 @@ impl<'a> WalIngest<'a> {
                 page_set_lsn(&mut image, lsn)
             }
             assert_eq!(image.len(), BLCKSZ as usize);
-            try_no_ondemand_download!(self.put_rel_page_image(
-                modification,
-                rel,
-                blk.blkno,
-                image.freeze()
-            ));
+            self.put_rel_page_image(modification, rel, blk.blkno, image.freeze())
+                .await?;
         } else {
             let rec = NeonWalRecord::Postgres {
                 will_init: blk.will_init || blk.apply_image,
                 rec: decoded.record.clone(),
             };
-            try_prr!(self.put_rel_wal_record(modification, rel, blk.blkno, rec));
+            self.put_rel_wal_record(modification, rel, blk.blkno, rec)
+                .await?;
         }
-        PageReconstructResult::Success(())
+        Ok(())
     }
 
-    fn ingest_heapam_record(
+    async fn ingest_heapam_record(
         &mut self,
         buf: &mut Bytes,
-        modification: &mut DatadirModification,
+        modification: &mut DatadirModification<'_>,
         decoded: &mut DecodedWALRecord,
-    ) -> Result<()> {
+    ) -> anyhow::Result<()> {
         // Handle VM bit updates that are implicitly part of heap records.
 
         // First, look at the record to determine which VM bits need
@@ -456,7 +453,7 @@ impl<'a> WalIngest<'a> {
             // replaying it would fail to find the previous image of the page, because
             // it doesn't exist. So check if the VM page(s) exist, and skip the WAL
             // record if it doesn't.
-            let vm_size = self.get_relsize(vm_rel, modification.lsn)?;
+            let vm_size = self.get_relsize(vm_rel, modification.lsn).await?;
             if let Some(blknum) = new_vm_blk {
                 if blknum >= vm_size {
                     new_vm_blk = None;
@@ -481,7 +478,8 @@ impl<'a> WalIngest<'a> {
                             old_heap_blkno,
                             flags: pg_constants::VISIBILITYMAP_VALID_BITS,
                         },
-                    )?;
+                    )
+                    .await?;
                 } else {
                     // Clear VM bits for one heap page, or for two pages that reside on
                     // different VM pages.
@@ -495,7 +493,8 @@ impl<'a> WalIngest<'a> {
                                 old_heap_blkno: None,
                                 flags: pg_constants::VISIBILITYMAP_VALID_BITS,
                             },
-                        )?;
+                        )
+                        .await?;
                     }
                     if let Some(old_vm_blk) = old_vm_blk {
                         self.put_rel_wal_record(
@@ -507,7 +506,8 @@ impl<'a> WalIngest<'a> {
                                 old_heap_blkno,
                                 flags: pg_constants::VISIBILITYMAP_VALID_BITS,
                             },
-                        )?;
+                        )
+                        .await?;
                     }
                 }
             }
@@ -517,9 +517,9 @@ impl<'a> WalIngest<'a> {
     }
 
     /// Subroutine of ingest_record(), to handle an XLOG_DBASE_CREATE record.
-    fn ingest_xlog_dbase_create(
+    async fn ingest_xlog_dbase_create(
         &mut self,
-        modification: &mut DatadirModification,
+        modification: &mut DatadirModification<'_>,
         rec: &XlCreateDatabase,
     ) -> anyhow::Result<()> {
         let db_id = rec.db_id;
@@ -534,18 +534,22 @@ impl<'a> WalIngest<'a> {
         // get calls instead.
         let req_lsn = modification.tline.get_last_record_lsn();
 
-        let rels = modification
-            .tline
-            .list_rels(src_tablespace_id, src_db_id, req_lsn)
-            .no_ondemand_download()?;
+        let rels = with_ondemand_download(|| {
+            modification
+                .tline
+                .list_rels(src_tablespace_id, src_db_id, req_lsn)
+        })
+        .await?;
 
         debug!("ingest_xlog_dbase_create: {} rels", rels.len());
 
         // Copy relfilemap
-        let filemap = modification
-            .tline
-            .get_relmap_file(src_tablespace_id, src_db_id, req_lsn)
-            .no_ondemand_download()?;
+        let filemap = with_ondemand_download(|| {
+            modification
+                .tline
+                .get_relmap_file(src_tablespace_id, src_db_id, req_lsn)
+        })
+        .await?;
         modification.put_relmap_file(tablespace_id, db_id, filemap)?;
 
         let mut num_rels_copied = 0;
@@ -554,10 +558,9 @@ impl<'a> WalIngest<'a> {
             assert_eq!(src_rel.spcnode, src_tablespace_id);
             assert_eq!(src_rel.dbnode, src_db_id);
 
-            let nblocks = modification
-                .tline
-                .get_rel_size(src_rel, req_lsn, true)
-                .no_ondemand_download()?;
+            let nblocks =
+                with_ondemand_download(|| modification.tline.get_rel_size(src_rel, req_lsn, true))
+                    .await?;
             let dst_rel = RelTag {
                 spcnode: tablespace_id,
                 dbnode: db_id,
@@ -572,10 +575,12 @@ impl<'a> WalIngest<'a> {
             for blknum in 0..nblocks {
                 debug!("copying block {} from {} to {}", blknum, src_rel, dst_rel);
 
-                let content = modification
-                    .tline
-                    .get_rel_page_at_lsn(src_rel, blknum, req_lsn, true)
-                    .no_ondemand_download()?;
+                let content = with_ondemand_download(|| {
+                    modification
+                        .tline
+                        .get_rel_page_at_lsn(src_rel, blknum, req_lsn, true)
+                })
+                .await?;
                 modification.put_rel_page_image(dst_rel, blknum, content)?;
                 num_blocks_copied += 1;
             }
@@ -594,7 +599,7 @@ impl<'a> WalIngest<'a> {
         &mut self,
         modification: &mut DatadirModification,
         rec: &XlSmgrCreate,
-    ) -> Result<()> {
+    ) -> anyhow::Result<()> {
         let rel = RelTag {
             spcnode: rec.rnode.spcnode,
             dbnode: rec.rnode.dbnode,
@@ -608,11 +613,11 @@ impl<'a> WalIngest<'a> {
     /// Subroutine of ingest_record(), to handle an XLOG_SMGR_TRUNCATE record.
     ///
     /// This is the same logic as in PostgreSQL's smgr_redo() function.
-    fn ingest_xlog_smgr_truncate(
+    async fn ingest_xlog_smgr_truncate(
         &mut self,
-        modification: &mut DatadirModification,
+        modification: &mut DatadirModification<'_>,
         rec: &XlSmgrTruncate,
-    ) -> Result<()> {
+    ) -> anyhow::Result<()> {
         let spcnode = rec.rnode.spcnode;
         let dbnode = rec.rnode.dbnode;
         let relnode = rec.rnode.relnode;
@@ -642,7 +647,7 @@ impl<'a> WalIngest<'a> {
                 modification.put_rel_page_image(rel, fsm_physical_page_no, ZERO_PAGE.clone())?;
                 fsm_physical_page_no += 1;
             }
-            let nblocks = self.get_relsize(rel, modification.lsn)?;
+            let nblocks = self.get_relsize(rel, modification.lsn).await?;
             if nblocks > fsm_physical_page_no {
                 // check if something to do: FSM is larger than truncate position
                 self.put_rel_truncation(modification, rel, fsm_physical_page_no)?;
@@ -663,7 +668,7 @@ impl<'a> WalIngest<'a> {
                 modification.put_rel_page_image(rel, vm_page_no, ZERO_PAGE.clone())?;
                 vm_page_no += 1;
             }
-            let nblocks = self.get_relsize(rel, modification.lsn)?;
+            let nblocks = self.get_relsize(rel, modification.lsn).await?;
             if nblocks > vm_page_no {
                 // check if something to do: VM is larger than truncate position
                 self.put_rel_truncation(modification, rel, vm_page_no)?;
@@ -674,9 +679,9 @@ impl<'a> WalIngest<'a> {
 
     /// Subroutine of ingest_record(), to handle an XLOG_XACT_* records.
     ///
-    fn ingest_xact_record(
+    async fn ingest_xact_record(
         &mut self,
-        modification: &mut DatadirModification,
+        modification: &mut DatadirModification<'_>,
         parsed: &XlXactParsedRecord,
         is_commit: bool,
     ) -> anyhow::Result<()> {
@@ -735,10 +740,8 @@ impl<'a> WalIngest<'a> {
                     relnode: xnode.relnode,
                 };
                 let last_lsn = self.timeline.get_last_record_lsn();
-                if modification
-                    .tline
-                    .get_rel_exists(rel, last_lsn, true)
-                    .no_ondemand_download()?
+                if with_ondemand_download(|| modification.tline.get_rel_exists(rel, last_lsn, true))
+                    .await?
                 {
                     self.put_rel_drop(modification, rel)?;
                 }
@@ -747,9 +750,9 @@ impl<'a> WalIngest<'a> {
         Ok(())
     }
 
-    fn ingest_clog_truncate_record(
+    async fn ingest_clog_truncate_record(
         &mut self,
-        modification: &mut DatadirModification,
+        modification: &mut DatadirModification<'_>,
         xlrec: &XlClogTruncate,
     ) -> anyhow::Result<()> {
         info!(
@@ -791,11 +794,14 @@ impl<'a> WalIngest<'a> {
         // it. So we use the previous record's LSN in the get calls
         // instead.
         let req_lsn = modification.tline.get_last_record_lsn();
-        for segno in modification
-            .tline
-            .list_slru_segments(SlruKind::Clog, req_lsn)
-            .no_ondemand_download()?
-        {
+
+        let slru_segments = with_ondemand_download(|| {
+            modification
+                .tline
+                .list_slru_segments(SlruKind::Clog, req_lsn)
+        })
+        .await?;
+        for segno in slru_segments {
             let segpage = segno * pg_constants::SLRU_PAGES_PER_SEGMENT;
             if slru_may_delete_clogsegment(segpage, xlrec.pageno) {
                 modification.drop_slru_segment(SlruKind::Clog, segno)?;
@@ -944,27 +950,26 @@ impl<'a> WalIngest<'a> {
         Ok(())
     }
 
-    fn put_rel_page_image(
+    async fn put_rel_page_image(
         &mut self,
-        modification: &mut DatadirModification,
+        modification: &mut DatadirModification<'_>,
         rel: RelTag,
         blknum: BlockNumber,
         img: Bytes,
-    ) -> PageReconstructResult<()> {
-        try_no_ondemand_download!(self.handle_rel_extend(modification, rel, blknum));
-        try_prr!(modification.put_rel_page_image(rel, blknum, img));
-        PageReconstructResult::Success(())
+    ) -> anyhow::Result<()> {
+        self.handle_rel_extend(modification, rel, blknum).await?;
+        modification.put_rel_page_image(rel, blknum, img)?;
+        Ok(())
     }
 
-    fn put_rel_wal_record(
+    async fn put_rel_wal_record(
         &mut self,
-        modification: &mut DatadirModification,
+        modification: &mut DatadirModification<'_>,
         rel: RelTag,
         blknum: BlockNumber,
         rec: NeonWalRecord,
-    ) -> Result<()> {
-        self.handle_rel_extend(modification, rel, blknum)
-            .no_ondemand_download()?;
+    ) -> anyhow::Result<()> {
+        self.handle_rel_extend(modification, rel, blknum).await?;
         modification.put_rel_wal_record(rel, blknum, rec)?;
         Ok(())
     }
@@ -984,69 +989,67 @@ impl<'a> WalIngest<'a> {
         Ok(())
     }
 
-    fn get_relsize(&mut self, rel: RelTag, lsn: Lsn) -> anyhow::Result<BlockNumber> {
-        let nblocks = if !self
-            .timeline
-            .get_rel_exists(rel, lsn, true)
-            .no_ondemand_download()?
-        {
+    async fn get_relsize(&mut self, rel: RelTag, lsn: Lsn) -> anyhow::Result<BlockNumber> {
+        let exists =
+            with_ondemand_download(|| self.timeline.get_rel_exists(rel, lsn, true)).await?;
+        let nblocks = if !exists {
             0
         } else {
-            self.timeline
-                .get_rel_size(rel, lsn, true)
-                .no_ondemand_download()?
+            with_ondemand_download(|| self.timeline.get_rel_size(rel, lsn, true)).await?
         };
         Ok(nblocks)
     }
 
-    fn handle_rel_extend(
+    async fn handle_rel_extend(
         &mut self,
-        modification: &mut DatadirModification,
+        modification: &mut DatadirModification<'_>,
         rel: RelTag,
         blknum: BlockNumber,
-    ) -> PageReconstructResult<()> {
+    ) -> anyhow::Result<()> {
         let new_nblocks = blknum + 1;
         // Check if the relation exists. We implicitly create relations on first
         // record.
         // TODO: would be nice if to be more explicit about it
         let last_lsn = modification.lsn;
         let old_nblocks =
-            if !try_no_ondemand_download!(self.timeline.get_rel_exists(rel, last_lsn, true)) {
+            if !with_ondemand_download(|| self.timeline.get_rel_exists(rel, last_lsn, true)).await?
+            {
                 // create it with 0 size initially, the logic below will extend it
-                try_prr!(modification.put_rel_creation(rel, 0));
+                modification.put_rel_creation(rel, 0)?;
                 0
             } else {
-                try_no_ondemand_download!(self.timeline.get_rel_size(rel, last_lsn, true))
+                with_ondemand_download(|| self.timeline.get_rel_size(rel, last_lsn, true)).await?
             };
 
         if new_nblocks > old_nblocks {
             //info!("extending {} {} to {}", rel, old_nblocks, new_nblocks);
-            try_prr!(modification.put_rel_extend(rel, new_nblocks));
+            modification.put_rel_extend(rel, new_nblocks)?;
 
             // fill the gap with zeros
             for gap_blknum in old_nblocks..blknum {
-                try_prr!(modification.put_rel_page_image(rel, gap_blknum, ZERO_PAGE.clone()));
+                modification.put_rel_page_image(rel, gap_blknum, ZERO_PAGE.clone())?;
             }
         }
-        PageReconstructResult::Success(())
+        Ok(())
     }
 
-    fn put_slru_page_image(
+    async fn put_slru_page_image(
         &mut self,
-        modification: &mut DatadirModification,
+        modification: &mut DatadirModification<'_>,
         kind: SlruKind,
         segno: u32,
         blknum: BlockNumber,
         img: Bytes,
-    ) -> Result<()> {
-        self.handle_slru_extend(modification, kind, segno, blknum)?;
+    ) -> anyhow::Result<()> {
+        self.handle_slru_extend(modification, kind, segno, blknum)
+            .await?;
         modification.put_slru_page_image(kind, segno, blknum, img)?;
         Ok(())
     }
 
-    fn handle_slru_extend(
+    async fn handle_slru_extend(
         &mut self,
-        modification: &mut DatadirModification,
+        modification: &mut DatadirModification<'_>,
         kind: SlruKind,
         segno: u32,
         blknum: BlockNumber,
@@ -1060,18 +1063,17 @@ impl<'a> WalIngest<'a> {
         // record.
         // TODO: would be nice if to be more explicit about it
         let last_lsn = self.timeline.get_last_record_lsn();
-        let old_nblocks = if !self
-            .timeline
-            .get_slru_segment_exists(kind, segno, last_lsn)
-            .no_ondemand_download()?
+        let old_nblocks = if !with_ondemand_download(|| {
+            self.timeline.get_slru_segment_exists(kind, segno, last_lsn)
+        })
+        .await?
         {
             // create it with 0 size initially, the logic below will extend it
             modification.put_slru_segment_creation(kind, segno, 0)?;
             0
         } else {
-            self.timeline
-                .get_slru_segment_size(kind, segno, last_lsn)
-                .no_ondemand_download()?
+            with_ondemand_download(|| self.timeline.get_slru_segment_size(kind, segno, last_lsn))
+                .await?
         };
 
         if new_nblocks > old_nblocks {
@@ -1119,12 +1121,12 @@ mod tests {
 
     static ZERO_CHECKPOINT: Bytes = Bytes::from_static(&[0u8; SIZEOF_CHECKPOINT]);
 
-    fn init_walingest_test(tline: &Timeline) -> Result<WalIngest> {
+    async fn init_walingest_test(tline: &Timeline) -> Result<WalIngest> {
         let mut m = tline.begin_modification(Lsn(0x10));
         m.put_checkpoint(ZERO_CHECKPOINT.clone())?;
         m.put_relmap_file(0, 111, Bytes::from(""))?; // dummy relmapper file
         m.commit()?;
-        let walingest = WalIngest::new(tline, Lsn(0x10)).no_ondemand_download()?;
+        let walingest = WalIngest::new(tline, Lsn(0x10)).await?;
 
         Ok(walingest)
     }
@@ -1133,28 +1135,28 @@ mod tests {
     async fn test_relsize() -> Result<()> {
         let tenant = TenantHarness::create("test_relsize")?.load().await;
         let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?;
-        let mut walingest = init_walingest_test(&tline)?;
+        let mut walingest = init_walingest_test(&tline).await?;
 
         let mut m = tline.begin_modification(Lsn(0x20));
         walingest.put_rel_creation(&mut m, TESTREL_A)?;
         walingest
             .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"))
-            .no_ondemand_download()?;
+            .await?;
         m.commit()?;
         let mut m = tline.begin_modification(Lsn(0x30));
         walingest
             .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 3"))
-            .no_ondemand_download()?;
+            .await?;
         m.commit()?;
         let mut m = tline.begin_modification(Lsn(0x40));
         walingest
             .put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1 at 4"))
-            .no_ondemand_download()?;
+            .await?;
         m.commit()?;
         let mut m = tline.begin_modification(Lsn(0x50));
         walingest
             .put_rel_page_image(&mut m, TESTREL_A, 2, TEST_IMG("foo blk 2 at 5"))
-            .no_ondemand_download()?;
+            .await?;
         m.commit()?;
 
         assert_current_logical_size(&tline, Lsn(0x50));
@@ -1292,7 +1294,7 @@ mod tests {
         let mut m = tline.begin_modification(Lsn(0x70));
         walingest
             .put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1"))
-            .no_ondemand_download()?;
+            .await?;
         m.commit()?;
         assert_eq!(
             tline
@@ -1317,7 +1319,7 @@ mod tests {
         let mut m = tline.begin_modification(Lsn(0x80));
         walingest
             .put_rel_page_image(&mut m, TESTREL_A, 1500, TEST_IMG("foo blk 1500"))
-            .no_ondemand_download()?;
+            .await?;
         m.commit()?;
         assert_eq!(
             tline
@@ -1349,12 +1351,12 @@ mod tests {
     async fn test_drop_extend() -> Result<()> {
         let tenant = TenantHarness::create("test_drop_extend")?.load().await;
         let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?;
-        let mut walingest = init_walingest_test(&tline)?;
+        let mut walingest = init_walingest_test(&tline).await?;
 
         let mut m = tline.begin_modification(Lsn(0x20));
         walingest
             .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"))
-            .no_ondemand_download()?;
+            .await?;
         m.commit()?;
 
         // Check that rel exists and size is correct
@@ -1391,7 +1393,7 @@ mod tests {
         let mut m = tline.begin_modification(Lsn(0x40));
         walingest
             .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 4"))
-            .no_ondemand_download()?;
+            .await?;
         m.commit()?;
 
         // Check that rel exists and size is correct
@@ -1418,7 +1420,7 @@ mod tests {
     async fn test_truncate_extend() -> Result<()> {
         let tenant = TenantHarness::create("test_truncate_extend")?.load().await;
         let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?;
-        let mut walingest = init_walingest_test(&tline)?;
+        let mut walingest = init_walingest_test(&tline).await?;
 
         // Create a 20 MB relation (the size is arbitrary)
         let relsize = 20 * 1024 * 1024 / 8192;
@@ -1427,7 +1429,7 @@ mod tests {
             let data = format!("foo blk {} at {}", blkno, Lsn(0x20));
             walingest
                 .put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data))
-                .no_ondemand_download()?;
+                .await?;
         }
         m.commit()?;
 
@@ -1519,7 +1521,7 @@ mod tests {
             let data = format!("foo blk {} at {}", blkno, lsn);
             walingest
                 .put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data))
-                .no_ondemand_download()?;
+                .await?;
         }
         m.commit()?;
 
@@ -1556,7 +1558,7 @@ mod tests {
     async fn test_large_rel() -> Result<()> {
         let tenant = TenantHarness::create("test_large_rel")?.load().await;
         let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?;
-        let mut walingest = init_walingest_test(&tline)?;
+        let mut walingest = init_walingest_test(&tline).await?;
 
         let mut lsn = 0x10;
         for blknum in 0..RELSEG_SIZE + 1 {
@@ -1565,7 +1567,7 @@ mod tests {
             let img = TEST_IMG(&format!("foo blk {} at {}", blknum, Lsn(lsn)));
             walingest
                 .put_rel_page_image(&mut m, TESTREL_A, blknum as BlockNumber, img)
-                .no_ondemand_download()?;
+                .await?;
             m.commit()?;
         }
 
diff --git a/pageserver/src/walreceiver/walreceiver_connection.rs b/pageserver/src/walreceiver/walreceiver_connection.rs
index 3753807327..06aa132365 100644
--- a/pageserver/src/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/walreceiver/walreceiver_connection.rs
@@ -20,9 +20,7 @@ use tokio::{pin, select, sync::watch, time};
 use tokio_postgres::{replication::ReplicationStream, Client};
 use tracing::{debug, error, info, trace, warn};
 
-use crate::{
-    metrics::LIVE_CONNECTIONS_COUNT, tenant::with_ondemand_download, walreceiver::TaskStateUpdate,
-};
+use crate::{metrics::LIVE_CONNECTIONS_COUNT, walreceiver::TaskStateUpdate};
 use crate::{
     task_mgr,
     task_mgr::TaskKind,
@@ -175,8 +173,7 @@ pub async fn handle_walreceiver_connection(
 
     let mut waldecoder = WalStreamDecoder::new(startpoint, timeline.pg_version);
 
-    let mut walingest =
-        with_ondemand_download(|| WalIngest::new(timeline.as_ref(), startpoint)).await?;
+    let mut walingest = WalIngest::new(timeline.as_ref(), startpoint).await?;
 
     while let Some(replication_message) = {
         select! {
@@ -251,16 +248,10 @@ pub async fn handle_walreceiver_connection(
                         // at risk of hitting a deadlock.
                         ensure!(lsn.is_aligned());
 
-                        with_ondemand_download(|| {
-                            walingest.ingest_record(
-                                recdata.clone(),
-                                lsn,
-                                &mut modification,
-                                &mut decoded,
-                            )
-                        })
-                        .await
-                        .with_context(|| format!("could not ingest record at {lsn}"))?;
+                        walingest
+                            .ingest_record(recdata.clone(), lsn, &mut modification, &mut decoded)
+                            .await
+                            .with_context(|| format!("could not ingest record at {lsn}"))?;
 
                         fail_point!("walreceiver-after-ingest");
 

From 0b428f7c41679876a455505c1ed2dfb4d7dc03c0 Mon Sep 17 00:00:00 2001
From: Vadim Kharitonov <vadim@neon.tech>
Date: Fri, 30 Dec 2022 11:11:28 +0100
Subject: [PATCH 104/132] Enable licenses check for 3rd-parties

---
 .github/workflows/build_and_test.yml   |  6 ++
 compute_tools/Cargo.toml               |  1 +
 control_plane/Cargo.toml               |  1 +
 deny.toml                              | 90 ++++++++++++++++++++++++++
 libs/metrics/Cargo.toml                |  1 +
 libs/pageserver_api/Cargo.toml         |  1 +
 libs/postgres_connection/Cargo.toml    |  1 +
 libs/postgres_ffi/Cargo.toml           |  1 +
 libs/postgres_ffi/wal_craft/Cargo.toml |  2 +-
 libs/pq_proto/Cargo.toml               |  1 +
 libs/remote_storage/Cargo.toml         |  1 +
 libs/safekeeper_api/Cargo.toml         |  1 +
 libs/tenant_size_model/Cargo.toml      |  1 +
 libs/utils/Cargo.toml                  |  1 +
 pageserver/Cargo.toml                  |  1 +
 proxy/Cargo.toml                       |  1 +
 safekeeper/Cargo.toml                  |  1 +
 storage_broker/Cargo.toml              |  1 +
 18 files changed, 112 insertions(+), 1 deletion(-)
 create mode 100644 deny.toml

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 17c698482c..9021ac48d9 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -111,6 +111,7 @@ jobs:
       # Some of our rust modules use FFI and need those to be checked
       - name: Get postgres headers
         run: make postgres-headers -j$(nproc)
+
       - name: Run cargo clippy
         run: ./run_clippy.sh
 
@@ -126,6 +127,11 @@ jobs:
           cargo hakari generate --diff  # workspace-hack Cargo.toml is up-to-date
           cargo hakari manage-deps --dry-run  # all workspace crates depend on workspace-hack
 
+      # https://github.com/EmbarkStudios/cargo-deny
+      - name: Check rust licenses/bans/advisories/sources
+        if: ${{ !cancelled() }}
+        run: cargo deny check
+
   build-neon:
     runs-on: [ self-hosted, dev, x64 ]
     container:
diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml
index c40d870649..4c65649610 100644
--- a/compute_tools/Cargo.toml
+++ b/compute_tools/Cargo.toml
@@ -2,6 +2,7 @@
 name = "compute_tools"
 version = "0.1.0"
 edition = "2021"
+license = "Apache-2.0"
 
 [dependencies]
 anyhow = "1.0"
diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml
index 180508a01a..1c6cd6d882 100644
--- a/control_plane/Cargo.toml
+++ b/control_plane/Cargo.toml
@@ -2,6 +2,7 @@
 name = "control_plane"
 version = "0.1.0"
 edition = "2021"
+license = "Apache-2.0"
 
 [dependencies]
 anyhow = "1.0"
diff --git a/deny.toml b/deny.toml
new file mode 100644
index 0000000000..3a0fe36f87
--- /dev/null
+++ b/deny.toml
@@ -0,0 +1,90 @@
+# This file was auto-generated using `cargo deny init`.
+# cargo-deny is a cargo plugin that lets you lint your project's
+# dependency graph to ensure all your dependencies conform
+# to your expectations and requirements.
+
+# Root options
+targets = []
+all-features = false
+no-default-features = false
+feature-depth = 1
+
+# This section is considered when running `cargo deny check advisories`
+# More documentation for the advisories section can be found here:
+# https://embarkstudios.github.io/cargo-deny/checks/advisories/cfg.html
+[advisories]
+db-urls = ["https://github.com/rustsec/advisory-db"]
+vulnerability = "deny"
+unmaintained = "warn"
+yanked = "warn"
+notice = "warn"
+ignore = []
+
+# This section is considered when running `cargo deny check licenses`
+# More documentation for the licenses section can be found here:
+# https://embarkstudios.github.io/cargo-deny/checks/licenses/cfg.html
+[licenses]
+unlicensed = "deny"
+allow = [
+    "Apache-2.0",
+    "Artistic-2.0",
+    "BSD-2-Clause",
+    "BSD-3-Clause",
+    "ISC",
+    "MIT",
+    "MPL-2.0",
+    "OpenSSL",
+    "Unicode-DFS-2016",
+]
+deny = []
+copyleft = "warn"
+allow-osi-fsf-free = "neither"
+default = "deny"
+confidence-threshold = 0.8
+exceptions = [
+    # The Zlib license has some restrictions if we decide to change something
+    { allow = ["Zlib"], name = "const_format_proc_macros", version = "*" },
+    { allow = ["Zlib"], name = "const_format", version = "*" },
+]
+
+[[licenses.clarify]]
+name = "ring"
+version = "*"
+expression = "MIT AND ISC AND OpenSSL"
+license-files = [
+    { path = "LICENSE", hash = 0xbd0eed23 },
+]
+
+[licenses.private]
+ignore = true
+registries = []
+
+# This section is considered when running `cargo deny check bans`.
+# More documentation about the 'bans' section can be found here:
+# https://embarkstudios.github.io/cargo-deny/checks/bans/cfg.html
+[bans]
+multiple-versions = "warn"
+wildcards = "allow"
+highlight = "all"
+workspace-default-features = "allow"
+external-default-features = "allow"
+allow = []
+deny = []
+skip = []
+skip-tree = []
+
+# This section is considered when running `cargo deny check sources`.
+# More documentation about the 'sources' section can be found here:
+# https://embarkstudios.github.io/cargo-deny/checks/sources/cfg.html
+[sources]
+unknown-registry = "warn"
+unknown-git = "warn"
+allow-registry = ["https://github.com/rust-lang/crates.io-index"]
+allow-git = []
+
+[sources.allow-org]
+github = [
+    "neondatabase",
+]
+gitlab = []
+bitbucket = []
diff --git a/libs/metrics/Cargo.toml b/libs/metrics/Cargo.toml
index d0cd46d2a9..d155f1e07d 100644
--- a/libs/metrics/Cargo.toml
+++ b/libs/metrics/Cargo.toml
@@ -2,6 +2,7 @@
 name = "metrics"
 version = "0.1.0"
 edition = "2021"
+license = "Apache-2.0"
 
 [dependencies]
 prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
diff --git a/libs/pageserver_api/Cargo.toml b/libs/pageserver_api/Cargo.toml
index 2102ae5373..68d4c609f0 100644
--- a/libs/pageserver_api/Cargo.toml
+++ b/libs/pageserver_api/Cargo.toml
@@ -2,6 +2,7 @@
 name = "pageserver_api"
 version = "0.1.0"
 edition = "2021"
+license = "Apache-2.0"
 
 [dependencies]
 serde = { version = "1.0", features = ["derive"] }
diff --git a/libs/postgres_connection/Cargo.toml b/libs/postgres_connection/Cargo.toml
index 1924b260fa..12b7abcc93 100644
--- a/libs/postgres_connection/Cargo.toml
+++ b/libs/postgres_connection/Cargo.toml
@@ -2,6 +2,7 @@
 name = "postgres_connection"
 version = "0.1.0"
 edition = "2021"
+license = "Apache-2.0"
 
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
diff --git a/libs/postgres_ffi/Cargo.toml b/libs/postgres_ffi/Cargo.toml
index 59eec3de32..aa076b08d3 100644
--- a/libs/postgres_ffi/Cargo.toml
+++ b/libs/postgres_ffi/Cargo.toml
@@ -2,6 +2,7 @@
 name = "postgres_ffi"
 version = "0.1.0"
 edition = "2021"
+license = "Apache-2.0"
 
 [dependencies]
 rand = "0.8.3"
diff --git a/libs/postgres_ffi/wal_craft/Cargo.toml b/libs/postgres_ffi/wal_craft/Cargo.toml
index dd9f82a87a..abfc263550 100644
--- a/libs/postgres_ffi/wal_craft/Cargo.toml
+++ b/libs/postgres_ffi/wal_craft/Cargo.toml
@@ -2,7 +2,7 @@
 name = "wal_craft"
 version = "0.1.0"
 edition = "2021"
-
+license = "Apache-2.0"
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
 [dependencies]
diff --git a/libs/pq_proto/Cargo.toml b/libs/pq_proto/Cargo.toml
index 76d8fbf28d..daa0b593be 100644
--- a/libs/pq_proto/Cargo.toml
+++ b/libs/pq_proto/Cargo.toml
@@ -2,6 +2,7 @@
 name = "pq_proto"
 version = "0.1.0"
 edition = "2021"
+license = "Apache-2.0"
 
 [dependencies]
 anyhow = "1.0"
diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml
index ebd30fc1eb..5a39f27209 100644
--- a/libs/remote_storage/Cargo.toml
+++ b/libs/remote_storage/Cargo.toml
@@ -2,6 +2,7 @@
 name = "remote_storage"
 version = "0.1.0"
 edition = "2021"
+license = "Apache-2.0"
 
 [dependencies]
 anyhow = { version = "1.0", features = ["backtrace"] }
diff --git a/libs/safekeeper_api/Cargo.toml b/libs/safekeeper_api/Cargo.toml
index 15bdecd71d..32cda78be4 100644
--- a/libs/safekeeper_api/Cargo.toml
+++ b/libs/safekeeper_api/Cargo.toml
@@ -2,6 +2,7 @@
 name = "safekeeper_api"
 version = "0.1.0"
 edition = "2021"
+license = "Apache-2.0"
 
 [dependencies]
 serde = { version = "1.0", features = ["derive"] }
diff --git a/libs/tenant_size_model/Cargo.toml b/libs/tenant_size_model/Cargo.toml
index 1aabf5a4f9..3a1a0f7915 100644
--- a/libs/tenant_size_model/Cargo.toml
+++ b/libs/tenant_size_model/Cargo.toml
@@ -3,6 +3,7 @@ name = "tenant_size_model"
 version = "0.1.0"
 edition = "2021"
 publish = false
+license = "Apache-2.0"
 
 [dependencies]
 workspace_hack = { version = "0.1", path = "../../workspace_hack" }
diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml
index 670270b63e..9c7fcafe23 100644
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -2,6 +2,7 @@
 name = "utils"
 version = "0.1.0"
 edition = "2021"
+license = "Apache-2.0"
 
 [dependencies]
 sentry = { version = "0.29.0", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] }
diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml
index c0f3c76c4e..8f112fa670 100644
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -2,6 +2,7 @@
 name = "pageserver"
 version = "0.1.0"
 edition = "2021"
+license = "Apache-2.0"
 
 [features]
 default = []
diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml
index e630b2758d..0bf47c7b88 100644
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -2,6 +2,7 @@
 name = "proxy"
 version = "0.1.0"
 edition = "2021"
+license = "Apache-2.0"
 
 [dependencies]
 anyhow = "1.0"
diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml
index fbcb3f34f7..d0c804fe4e 100644
--- a/safekeeper/Cargo.toml
+++ b/safekeeper/Cargo.toml
@@ -2,6 +2,7 @@
 name = "safekeeper"
 version = "0.1.0"
 edition = "2021"
+license = "Apache-2.0"
 
 [dependencies]
 async-stream = "0.3"
diff --git a/storage_broker/Cargo.toml b/storage_broker/Cargo.toml
index 7aa33a5234..180c506254 100644
--- a/storage_broker/Cargo.toml
+++ b/storage_broker/Cargo.toml
@@ -2,6 +2,7 @@
 name = "storage_broker"
 version = "0.1.0"
 edition = "2021"
+license = "Apache-2.0"
 
 [features]
 bench = []

From e9583db73b3a930ca2cbf9267c5e05285cc1016f Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Tue, 3 Jan 2023 20:11:32 +0200
Subject: [PATCH 105/132] Remove code and test to generate flamegraph on
 GetPage requests. (#3257)

It was nice to have and useful at the time, but unfortunately the method
used to gather the profiling data doesn't play nicely with 'async'. PR
#3228 will turn 'get_page_at_lsn' function async, which will break the
profiling support. Let's remove it, and re-introduce some kind of
profiling later, using some different method, if we feel like we need it
again.
---
 .github/workflows/build_and_test.yml         |   3 +-
 Cargo.lock                                   | 222 ++-----------------
 pageserver/Cargo.toml                        |   3 -
 pageserver/src/bin/pageserver.rs             |   9 +-
 pageserver/src/config.rs                     |  31 ---
 pageserver/src/lib.rs                        |   1 -
 pageserver/src/page_service.rs               |  11 +-
 pageserver/src/profiling.rs                  | 107 ---------
 run_clippy.sh                                |   4 +-
 test_runner/fixtures/neon_fixtures.py        |   4 -
 test_runner/fixtures/utils.py                |   2 +-
 test_runner/performance/README.md            |   8 +-
 test_runner/performance/test_perf_pgbench.py |  24 +-
 workspace_hack/Cargo.toml                    |   3 -
 14 files changed, 26 insertions(+), 406 deletions(-)
 delete mode 100644 pageserver/src/profiling.rs

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 9021ac48d9..2b0b0ba2bf 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -183,13 +183,12 @@ jobs:
       # corresponding Cargo.toml files for their descriptions.
       - name: Set env variables
         run: |
+          CARGO_FEATURES="--features testing"
           if [[ $BUILD_TYPE == "debug" ]]; then
             cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run"
-            CARGO_FEATURES="--features testing"
             CARGO_FLAGS="--locked $CARGO_FEATURES"
           elif [[ $BUILD_TYPE == "release" ]]; then
             cov_prefix=""
-            CARGO_FEATURES="--features testing,profiling"
             CARGO_FLAGS="--locked --release $CARGO_FEATURES"
           fi
           echo "cov_prefix=${cov_prefix}" >> $GITHUB_ENV
diff --git a/Cargo.lock b/Cargo.lock
index ad1fc67219..246d481ef9 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -66,12 +66,6 @@ dependencies = [
  "backtrace",
 ]
 
-[[package]]
-name = "arrayvec"
-version = "0.7.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8da52d66c7071e2e3fa2a1e5c6d088fec47b593032b254f5e980de8ea54454d6"
-
 [[package]]
 name = "asn1-rs"
 version = "0.5.1"
@@ -633,12 +627,6 @@ version = "3.11.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "572f695136211188308f16ad2ca5c851a712c464060ae6974944458eb83880ba"
 
-[[package]]
-name = "bytemuck"
-version = "1.12.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "aaa3a8d9a1ca92e282c96a32d6511b695d7d994d1d102ba85d279f9b2756947f"
-
 [[package]]
 name = "byteorder"
 version = "1.4.3"
@@ -899,7 +887,7 @@ dependencies = [
  "clap 4.0.29",
  "comfy-table",
  "git-version",
- "nix 0.25.1",
+ "nix",
  "once_cell",
  "pageserver_api",
  "postgres",
@@ -934,15 +922,6 @@ version = "0.8.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "5827cebf4670468b8772dd191856768aedcb1b0278a04f989f7766351917b9dc"
 
-[[package]]
-name = "cpp_demangle"
-version = "0.3.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "eeaa953eaad386a53111e47172c2fedba671e5684c8dd601a5f474f4f118710f"
-dependencies = [
- "cfg-if",
-]
-
 [[package]]
 name = "cpufeatures"
 version = "0.2.5"
@@ -1066,7 +1045,7 @@ dependencies = [
  "crossterm_winapi",
  "libc",
  "mio",
- "parking_lot 0.12.1",
+ "parking_lot",
  "signal-hook",
  "signal-hook-mio",
  "winapi",
@@ -1176,15 +1155,6 @@ version = "2.3.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "23d8666cb01533c39dde32bcbab8e227b4ed6679b2c925eba05feabea39508fb"
 
-[[package]]
-name = "debugid"
-version = "0.7.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d6ee87af31d84ef885378aebca32be3d682b0e0dc119d5b4860a2c5bb5046730"
-dependencies = [
- "uuid 0.8.2",
-]
-
 [[package]]
 name = "debugid"
 version = "0.8.0"
@@ -1192,7 +1162,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "bef552e6f588e446098f6ba40d89ac146c8c7b64aade83c051ee00bb5d2bc18d"
 dependencies = [
  "serde",
- "uuid 1.2.2",
+ "uuid",
 ]
 
 [[package]]
@@ -1318,18 +1288,6 @@ dependencies = [
  "windows-sys 0.42.0",
 ]
 
-[[package]]
-name = "findshlibs"
-version = "0.10.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "40b9e59cd0f7e0806cca4be089683ecb6434e602038df21fe6bf6711b2f07f64"
-dependencies = [
- "cc",
- "lazy_static",
- "libc",
- "winapi",
-]
-
 [[package]]
 name = "fixedbitset"
 version = "0.4.2"
@@ -1793,24 +1751,6 @@ dependencies = [
  "serde",
 ]
 
-[[package]]
-name = "inferno"
-version = "0.10.12"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "de3886428c6400486522cf44b8626e7b94ad794c14390290f2a274dcf728a58f"
-dependencies = [
- "ahash",
- "atty",
- "indexmap",
- "itoa",
- "lazy_static",
- "log",
- "num-format",
- "quick-xml",
- "rgb",
- "str_stack",
-]
-
 [[package]]
 name = "inotify"
 version = "0.9.6"
@@ -2037,15 +1977,6 @@ version = "2.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d"
 
-[[package]]
-name = "memmap2"
-version = "0.5.8"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4b182332558b18d807c4ce1ca8ca983b34c3ee32765e47b3f0f69b90355cc1dc"
-dependencies = [
- "libc",
-]
-
 [[package]]
 name = "memoffset"
 version = "0.6.5"
@@ -2113,19 +2044,6 @@ version = "0.8.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a"
 
-[[package]]
-name = "nix"
-version = "0.23.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8f3790c00a0150112de0f4cd161e3d7fc4b2d8a5542ffc35f099a2562aecb35c"
-dependencies = [
- "bitflags",
- "cc",
- "cfg-if",
- "libc",
- "memoffset 0.6.5",
-]
-
 [[package]]
 name = "nix"
 version = "0.25.1"
@@ -2189,16 +2107,6 @@ dependencies = [
  "num-traits",
 ]
 
-[[package]]
-name = "num-format"
-version = "0.4.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a652d9771a63711fd3c3deb670acfbe5c30a4072e664d7a3bf5a9e1056ac72c3"
-dependencies = [
- "arrayvec",
- "itoa",
-]
-
 [[package]]
 name = "num-integer"
 version = "0.1.45"
@@ -2315,7 +2223,7 @@ dependencies = [
  "hyper",
  "itertools",
  "metrics",
- "nix 0.25.1",
+ "nix",
  "num-traits",
  "once_cell",
  "pageserver_api",
@@ -2325,7 +2233,6 @@ dependencies = [
  "postgres-types",
  "postgres_connection",
  "postgres_ffi",
- "pprof",
  "pq_proto",
  "rand",
  "regex",
@@ -2369,17 +2276,6 @@ dependencies = [
  "workspace_hack",
 ]
 
-[[package]]
-name = "parking_lot"
-version = "0.11.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7d17b78036a60663b797adeaee46f5c9dfebb86948d1255007a1d6be0271ff99"
-dependencies = [
- "instant",
- "lock_api",
- "parking_lot_core 0.8.5",
-]
-
 [[package]]
 name = "parking_lot"
 version = "0.12.1"
@@ -2387,21 +2283,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f"
 dependencies = [
  "lock_api",
- "parking_lot_core 0.9.5",
-]
-
-[[package]]
-name = "parking_lot_core"
-version = "0.8.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d76e8e1493bcac0d2766c42737f34458f1c8c50c0d23bcb24ea953affb273216"
-dependencies = [
- "cfg-if",
- "instant",
- "libc",
- "redox_syscall",
- "smallvec",
- "winapi",
+ "parking_lot_core",
 ]
 
 [[package]]
@@ -2604,25 +2486,6 @@ dependencies = [
  "workspace_hack",
 ]
 
-[[package]]
-name = "pprof"
-version = "0.6.1"
-source = "git+https://github.com/neondatabase/pprof-rs.git?branch=wallclock-profiling#4e011a87d22fb4d21d15cc38bce81ff1c75e4bc9"
-dependencies = [
- "backtrace",
- "cfg-if",
- "findshlibs",
- "inferno",
- "lazy_static",
- "libc",
- "log",
- "nix 0.23.2",
- "parking_lot 0.11.2",
- "symbolic-demangle",
- "tempfile",
- "thiserror",
-]
-
 [[package]]
 name = "ppv-lite86"
 version = "0.2.17"
@@ -2717,7 +2580,7 @@ dependencies = [
  "lazy_static",
  "libc",
  "memchr",
- "parking_lot 0.12.1",
+ "parking_lot",
  "procfs",
  "thiserror",
 ]
@@ -2798,7 +2661,7 @@ dependencies = [
  "md5",
  "metrics",
  "once_cell",
- "parking_lot 0.12.1",
+ "parking_lot",
  "pin-project-lite",
  "pq_proto",
  "rand",
@@ -2822,20 +2685,11 @@ dependencies = [
  "tracing-subscriber",
  "url",
  "utils",
- "uuid 1.2.2",
+ "uuid",
  "workspace_hack",
  "x509-parser",
 ]
 
-[[package]]
-name = "quick-xml"
-version = "0.22.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8533f14c8382aaad0d592c812ac3b826162128b65662331e1127b45c3d18536b"
-dependencies = [
- "memchr",
-]
-
 [[package]]
 name = "quote"
 version = "1.0.21"
@@ -3027,15 +2881,6 @@ dependencies = [
  "winreg",
 ]
 
-[[package]]
-name = "rgb"
-version = "0.8.34"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3603b7d71ca82644f79b5a06d1220e9a58ede60bd32255f698cb1af8838b8db3"
-dependencies = [
- "bytemuck",
-]
-
 [[package]]
 name = "ring"
 version = "0.16.20"
@@ -3216,9 +3061,9 @@ dependencies = [
  "humantime",
  "hyper",
  "metrics",
- "nix 0.25.1",
+ "nix",
  "once_cell",
- "parking_lot 0.12.1",
+ "parking_lot",
  "postgres",
  "postgres-protocol",
  "postgres_ffi",
@@ -3396,7 +3241,7 @@ version = "0.29.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ccc95faa4078768a6bf8df45e2b894bbf372b3dbbfb364e9429c1c58ab7545c6"
 dependencies = [
- "debugid 0.8.0",
+ "debugid",
  "getrandom",
  "hex",
  "serde",
@@ -3404,7 +3249,7 @@ dependencies = [
  "thiserror",
  "time",
  "url",
- "uuid 1.2.2",
+ "uuid",
 ]
 
 [[package]]
@@ -3626,7 +3471,7 @@ dependencies = [
  "hyper",
  "metrics",
  "once_cell",
- "parking_lot 0.12.1",
+ "parking_lot",
  "prost",
  "tokio",
  "tokio-stream",
@@ -3637,12 +3482,6 @@ dependencies = [
  "workspace_hack",
 ]
 
-[[package]]
-name = "str_stack"
-version = "0.1.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9091b6114800a5f2141aee1d1b9d6ca3592ac062dc5decb3764ec5895a47b4eb"
-
 [[package]]
 name = "stringprep"
 version = "0.1.2"
@@ -3690,29 +3529,6 @@ version = "0.4.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "8fb1df15f412ee2e9dfc1c504260fa695c1c3f10fe9f4a6ee2d2184d7d6450e2"
 
-[[package]]
-name = "symbolic-common"
-version = "8.8.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f551f902d5642e58039aee6a9021a61037926af96e071816361644983966f540"
-dependencies = [
- "debugid 0.7.3",
- "memmap2",
- "stable_deref_trait",
- "uuid 0.8.2",
-]
-
-[[package]]
-name = "symbolic-demangle"
-version = "8.8.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4564ca7b4e6eb14105aa8bbbce26e080f6b5d9c4373e67167ab31f7b86443750"
-dependencies = [
- "cpp_demangle",
- "rustc-demangle",
- "symbolic-common",
-]
-
 [[package]]
 name = "syn"
 version = "1.0.105"
@@ -3923,7 +3739,7 @@ dependencies = [
  "futures-channel",
  "futures-util",
  "log",
- "parking_lot 0.12.1",
+ "parking_lot",
  "percent-encoding",
  "phf",
  "pin-project-lite",
@@ -4314,7 +4130,7 @@ dependencies = [
  "hyper",
  "jsonwebtoken",
  "metrics",
- "nix 0.25.1",
+ "nix",
  "once_cell",
  "pq_proto",
  "rand",
@@ -4338,12 +4154,6 @@ dependencies = [
  "workspace_hack",
 ]
 
-[[package]]
-name = "uuid"
-version = "0.8.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bc5cf98d8186244414c848017f0e2676b3fcb46807f6668a97dfe67359a3c4b7"
-
 [[package]]
 name = "uuid"
 version = "1.2.2"
@@ -4658,7 +4468,6 @@ dependencies = [
 name = "workspace_hack"
 version = "0.1.0"
 dependencies = [
- "ahash",
  "anyhow",
  "bytes",
  "chrono",
@@ -4686,7 +4495,6 @@ dependencies = [
  "serde",
  "serde_json",
  "socket2",
- "stable_deref_trait",
  "syn",
  "tokio",
  "tokio-util",
diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml
index 8f112fa670..1854b6762f 100644
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -10,8 +10,6 @@ default = []
 # which adds some runtime cost to run tests on outage conditions
 testing = ["fail/failpoints"]
 
-profiling = ["pprof"]
-
 [dependencies]
 amplify_num = { git = "https://github.com/hlinnaka/rust-amplify.git", branch = "unsigned-int-perf" }
 anyhow = { version = "1.0", features = ["backtrace"] }
@@ -40,7 +38,6 @@ pin-project-lite = "0.2.7"
 postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
 postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
 postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
-pprof = { git = "https://github.com/neondatabase/pprof-rs.git", branch = "wallclock-profiling", features = ["flamegraph"], optional = true }
 rand = "0.8.3"
 regex = "1.4.5"
 rstar = "0.9.3"
diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs
index a124bf85c2..18ec1ac68b 100644
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -13,7 +13,7 @@ use tracing::*;
 use metrics::set_build_info_metric;
 use pageserver::{
     config::{defaults::*, PageServerConf},
-    http, page_cache, page_service, profiling, task_mgr,
+    http, page_cache, page_service, task_mgr,
     task_mgr::TaskKind,
     task_mgr::{
         BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME, WALRECEIVER_RUNTIME,
@@ -40,8 +40,6 @@ const FEATURES: &[&str] = &[
     "testing",
     #[cfg(feature = "fail/failpoints")]
     "fail/failpoints",
-    #[cfg(feature = "profiling")]
-    "profiling",
 ];
 
 fn version() -> String {
@@ -247,9 +245,6 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
     // Install signal handlers
     let signals = signals::install_shutdown_handlers()?;
 
-    // Start profiler (if enabled)
-    let profiler_guard = profiling::init_profiler(conf);
-
     // Launch broker client
     WALRECEIVER_RUNTIME.block_on(pageserver::walreceiver::init_broker_client(conf))?;
 
@@ -372,7 +367,6 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
                 "Got {}. Terminating in immediate shutdown mode",
                 signal.name()
             );
-            profiling::exit_profiler(conf, &profiler_guard);
             std::process::exit(111);
         }
 
@@ -381,7 +375,6 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
                 "Got {}. Terminating gracefully in fast shutdown mode",
                 signal.name()
             );
-            profiling::exit_profiler(conf, &profiler_guard);
             BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(0));
             unreachable!()
         }
diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index deb79531a4..7b99d98581 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -138,7 +138,6 @@ pub struct PageServerConf {
     pub auth_validation_public_key_path: Option<PathBuf>,
     pub remote_storage_config: Option<RemoteStorageConfig>,
 
-    pub profiling: ProfilingConfig,
     pub default_tenant_conf: TenantConf,
 
     /// Storage broker endpoints to connect to.
@@ -165,25 +164,6 @@ pub struct PageServerConf {
 /// startup code to the connection code through a dozen layers.
 pub static SAFEKEEPER_AUTH_TOKEN: OnceCell<Arc<String>> = OnceCell::new();
 
-#[derive(Debug, Clone, PartialEq, Eq)]
-pub enum ProfilingConfig {
-    Disabled,
-    PageRequests,
-}
-
-impl FromStr for ProfilingConfig {
-    type Err = anyhow::Error;
-
-    fn from_str(s: &str) -> Result<ProfilingConfig, Self::Err> {
-        let result = match s {
-            "disabled"  => ProfilingConfig::Disabled,
-            "page_requests"  => ProfilingConfig::PageRequests,
-            _ => bail!("invalid value \"{s}\" for profiling option, valid values are \"disabled\" and \"page_requests\""),
-        };
-        Ok(result)
-    }
-}
-
 // use dedicated enum for builder to better indicate the intention
 // and avoid possible confusion with nested options
 pub enum BuilderValue<T> {
@@ -226,7 +206,6 @@ struct PageServerConfigBuilder {
 
     id: BuilderValue<NodeId>,
 
-    profiling: BuilderValue<ProfilingConfig>,
     broker_endpoint: BuilderValue<Uri>,
     broker_keepalive_interval: BuilderValue<Duration>,
 
@@ -262,7 +241,6 @@ impl Default for PageServerConfigBuilder {
             auth_validation_public_key_path: Set(None),
             remote_storage_config: Set(None),
             id: NotSet,
-            profiling: Set(ProfilingConfig::Disabled),
             broker_endpoint: Set(storage_broker::DEFAULT_ENDPOINT
                 .parse()
                 .expect("failed to parse default broker endpoint")),
@@ -348,10 +326,6 @@ impl PageServerConfigBuilder {
         self.id = BuilderValue::Set(node_id)
     }
 
-    pub fn profiling(&mut self, profiling: ProfilingConfig) {
-        self.profiling = BuilderValue::Set(profiling)
-    }
-
     pub fn log_format(&mut self, log_format: LogFormat) {
         self.log_format = BuilderValue::Set(log_format)
     }
@@ -405,7 +379,6 @@ impl PageServerConfigBuilder {
                 .remote_storage_config
                 .ok_or(anyhow!("missing remote_storage_config"))?,
             id: self.id.ok_or(anyhow!("missing id"))?,
-            profiling: self.profiling.ok_or(anyhow!("missing profiling"))?,
             // TenantConf is handled separately
             default_tenant_conf: TenantConf::default(),
             broker_endpoint: self
@@ -588,7 +561,6 @@ impl PageServerConf {
                     t_conf = Self::parse_toml_tenant_conf(item)?;
                 }
                 "id" => builder.id(NodeId(parse_toml_u64(key, item)?)),
-                "profiling" => builder.profiling(parse_toml_from_str(key, item)?),
                 "broker_endpoint" => builder.broker_endpoint(parse_toml_string(key, item)?.parse().context("failed to parse broker endpoint")?),
                 "broker_keepalive_interval" => builder.broker_keepalive_interval(parse_toml_duration(key, item)?),
                 "log_format" => builder.log_format(
@@ -722,7 +694,6 @@ impl PageServerConf {
             auth_type: AuthType::Trust,
             auth_validation_public_key_path: None,
             remote_storage_config: None,
-            profiling: ProfilingConfig::Disabled,
             default_tenant_conf: TenantConf::default(),
             broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(),
             broker_keepalive_interval: Duration::from_secs(5000),
@@ -898,7 +869,6 @@ log_format = 'json'
                 auth_type: AuthType::Trust,
                 auth_validation_public_key_path: None,
                 remote_storage_config: None,
-                profiling: ProfilingConfig::Disabled,
                 default_tenant_conf: TenantConf::default(),
                 broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(),
                 broker_keepalive_interval: humantime::parse_duration(
@@ -949,7 +919,6 @@ log_format = 'json'
                 auth_type: AuthType::Trust,
                 auth_validation_public_key_path: None,
                 remote_storage_config: None,
-                profiling: ProfilingConfig::Disabled,
                 default_tenant_conf: TenantConf::default(),
                 broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(),
                 broker_keepalive_interval: Duration::from_secs(5),
diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs
index 2f78c199b9..91cde477ad 100644
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -9,7 +9,6 @@ pub(crate) mod metrics;
 pub mod page_cache;
 pub mod page_service;
 pub mod pgdatadir_mapping;
-pub mod profiling;
 pub mod repository;
 pub mod task_mgr;
 pub mod tenant;
diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index 5393fca780..f123168211 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -39,10 +39,9 @@ use utils::{
 
 use crate::auth::check_permission;
 use crate::basebackup;
-use crate::config::{PageServerConf, ProfilingConfig};
+use crate::config::PageServerConf;
 use crate::import_datadir::import_wal_from_tar;
 use crate::metrics::{LIVE_CONNECTIONS_COUNT, SMGR_QUERY_TIME};
-use crate::profiling::profpoint_start;
 use crate::task_mgr;
 use crate::task_mgr::TaskKind;
 use crate::tenant::mgr;
@@ -250,7 +249,7 @@ impl PageRequestMetrics {
 
 #[derive(Debug)]
 struct PageServerHandler {
-    conf: &'static PageServerConf,
+    _conf: &'static PageServerConf,
     auth: Option<Arc<JwtAuth>>,
     claims: Option<Claims>,
 }
@@ -258,7 +257,7 @@ struct PageServerHandler {
 impl PageServerHandler {
     pub fn new(conf: &'static PageServerConf, auth: Option<Arc<JwtAuth>>) -> Self {
         PageServerHandler {
-            conf,
+            _conf: conf,
             auth,
             claims: None,
         }
@@ -604,10 +603,6 @@ impl PageServerHandler {
         */
 
         let page = crate::tenant::with_ondemand_download(|| {
-            // FIXME: this profiling now happens at different place than it used to. The
-            // current profiling is based on a thread-local variable, so it doesn't work
-            // across awaits
-            let _profiling_guard = profpoint_start(self.conf, ProfilingConfig::PageRequests);
             timeline.get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest)
         })
         .await?;
diff --git a/pageserver/src/profiling.rs b/pageserver/src/profiling.rs
deleted file mode 100644
index ad896cfa30..0000000000
--- a/pageserver/src/profiling.rs
+++ /dev/null
@@ -1,107 +0,0 @@
-//!
-//! Support for profiling
-//!
-//! This relies on a modified version of the 'pprof-rs' crate. That's not very
-//! nice, so to avoid a hard dependency on that, this is an optional feature.
-//!
-use crate::config::{PageServerConf, ProfilingConfig};
-
-/// The actual implementation is in the `profiling_impl` submodule. If the profiling
-/// feature is not enabled, it's just a dummy implementation that panics if you
-/// try to enabled profiling in the configuration.
-pub use profiling_impl::*;
-
-#[cfg(feature = "profiling")]
-mod profiling_impl {
-    use super::*;
-    use pprof;
-    use std::marker::PhantomData;
-
-    /// Start profiling the current thread. Returns a guard object;
-    /// the profiling continues until the guard is dropped.
-    ///
-    /// Note: profiling is not re-entrant. If you call 'profpoint_start' while
-    /// profiling is already started, nothing happens, and the profiling will be
-    /// stopped when either guard object is dropped.
-    #[inline]
-    pub fn profpoint_start(
-        conf: &crate::config::PageServerConf,
-        point: ProfilingConfig,
-    ) -> Option<ProfilingGuard> {
-        if conf.profiling == point {
-            pprof::start_profiling();
-            Some(ProfilingGuard(PhantomData))
-        } else {
-            None
-        }
-    }
-
-    /// A hack to remove Send and Sync from the ProfilingGuard. Because the
-    /// profiling is attached to current thread.
-    ////
-    /// See comments in https://github.com/rust-lang/rust/issues/68318
-    type PhantomUnsend = std::marker::PhantomData<*mut u8>;
-
-    pub struct ProfilingGuard(PhantomUnsend);
-
-    impl Drop for ProfilingGuard {
-        fn drop(&mut self) {
-            pprof::stop_profiling();
-        }
-    }
-
-    /// Initialize the profiler. This must be called before any 'profpoint_start' calls.
-    pub fn init_profiler(conf: &PageServerConf) -> Option<pprof::ProfilerGuard> {
-        if conf.profiling != ProfilingConfig::Disabled {
-            Some(pprof::ProfilerGuardBuilder::default().build().unwrap())
-        } else {
-            None
-        }
-    }
-
-    /// Exit the profiler. Writes the flamegraph to current workdir.
-    pub fn exit_profiler(_conf: &PageServerConf, profiler_guard: &Option<pprof::ProfilerGuard>) {
-        // Write out the flamegraph
-        if let Some(profiler_guard) = profiler_guard {
-            if let Ok(report) = profiler_guard.report().build() {
-                // this gets written under the workdir
-                let file = std::fs::File::create("flamegraph.svg").unwrap();
-                let mut options = pprof::flamegraph::Options::default();
-                options.image_width = Some(2500);
-                report.flamegraph_with_options(file, &mut options).unwrap();
-            }
-        }
-    }
-}
-
-/// Dummy implementation when compiling without profiling feature or for non-linux OSes.
-#[cfg(not(feature = "profiling"))]
-mod profiling_impl {
-    use super::*;
-
-    pub struct DummyProfilerGuard;
-
-    impl Drop for DummyProfilerGuard {
-        fn drop(&mut self) {
-            // do nothing, this exists to calm Clippy down
-        }
-    }
-
-    pub fn profpoint_start(
-        _conf: &PageServerConf,
-        _point: ProfilingConfig,
-    ) -> Option<DummyProfilerGuard> {
-        None
-    }
-
-    pub fn init_profiler(conf: &PageServerConf) -> Option<DummyProfilerGuard> {
-        if conf.profiling != ProfilingConfig::Disabled {
-            // shouldn't happen, we don't allow profiling in the config if the support
-            // for it is disabled.
-            panic!("profiling enabled but the binary was compiled without profiling support");
-        }
-        None
-    }
-
-    pub fn exit_profiler(_conf: &PageServerConf, _guard: &Option<DummyProfilerGuard>) {}
-}
diff --git a/run_clippy.sh b/run_clippy.sh
index bf770432d0..fe0e745d7d 100755
--- a/run_clippy.sh
+++ b/run_clippy.sh
@@ -9,8 +9,8 @@
 # In vscode, this setting is Rust-analyzer>Check On Save:Command
 
 
-# Not every feature is supported in macOS builds, e.g. `profiling`,
-# avoid running regular linting script that checks every feature.
+# Not every feature is supported in macOS builds. Avoid running regular linting
+# script that checks every feature.
 if [[ "$OSTYPE" == "darwin"* ]]; then
     # no extra features to test currently, add more here when needed
     cargo clippy --locked --all --all-targets --features testing -- -A unknown_lints -D warnings
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 5b00ebdea7..705ab70ab4 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1980,10 +1980,6 @@ class NeonPageserver(PgProtocol):
         if '"testing"' not in self.version:
             pytest.skip("pageserver was built without 'testing' feature")
 
-    def is_profiling_enabled_or_skip(self):
-        if '"profiling"' not in self.version:
-            pytest.skip("pageserver was built without 'profiling' feature")
-
     def http_client(self, auth_token: Optional[str] = None) -> PageserverHttpClient:
         return PageserverHttpClient(
             port=self.service_port.http,
diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py
index 1fb9eb72e6..df83fc6377 100644
--- a/test_runner/fixtures/utils.py
+++ b/test_runner/fixtures/utils.py
@@ -148,7 +148,7 @@ def get_scale_for_db(size_mb: int) -> int:
 
 
 ATTACHMENT_NAME_REGEX: re.Pattern = re.compile(  # type: ignore[type-arg]
-    r"flamegraph\.svg|regression\.diffs|.+\.(?:log|stderr|stdout|filediff|metrics|html)"
+    r"regression\.diffs|.+\.(?:log|stderr|stdout|filediff|metrics|html)"
 )
 
 
diff --git a/test_runner/performance/README.md b/test_runner/performance/README.md
index a32ce87c33..c1a57fb28b 100644
--- a/test_runner/performance/README.md
+++ b/test_runner/performance/README.md
@@ -1,12 +1,8 @@
 # Running locally
 
-First make a release build. The profiling flag is optional, used only for tests that
-generate flame graphs. The `-s` flag just silences a lot of output, and makes it
+First make a release build. The `-s` flag silences a lot of output, and makes it
 easier to see if you have compile errors without scrolling up.
-`BUILD_TYPE=release CARGO_BUILD_FLAGS="--features=testing,profiling" make -s -j8`
-
-NOTE: the `profiling` flag only works on linux because we use linux-specific
-libc APIs like `libc::timer_t`.
+`BUILD_TYPE=release CARGO_BUILD_FLAGS="--features=testing" make -s -j8`
 
 Then run the tests
 `NEON_BIN=./target/release poetry run pytest test_runner/performance"`
diff --git a/test_runner/performance/test_perf_pgbench.py b/test_runner/performance/test_perf_pgbench.py
index 50e5366c1e..2b8760dff2 100644
--- a/test_runner/performance/test_perf_pgbench.py
+++ b/test_runner/performance/test_perf_pgbench.py
@@ -8,7 +8,7 @@ from typing import Dict, List
 
 import pytest
 from fixtures.benchmark_fixture import MetricReport, PgBenchInitResult, PgBenchRunResult
-from fixtures.compare_fixtures import NeonCompare, PgCompare
+from fixtures.compare_fixtures import PgCompare
 from fixtures.utils import get_scale_for_db
 
 
@@ -176,28 +176,6 @@ def test_pgbench(neon_with_baseline: PgCompare, scale: int, duration: int):
     run_test_pgbench(neon_with_baseline, scale, duration, PgBenchLoadType.SELECT_ONLY)
 
 
-# Run the pgbench tests, and generate a flamegraph from it
-# This requires that the pageserver was built with the 'profiling' feature.
-#
-# TODO: If the profiling is cheap enough, there's no need to run the same test
-# twice, with and without profiling. But for now, run it separately, so that we
-# can see how much overhead the profiling adds.
-@pytest.mark.parametrize("scale", get_scales_matrix())
-@pytest.mark.parametrize("duration", get_durations_matrix())
-def test_pgbench_flamegraph(zenbenchmark, pg_bin, neon_env_builder, scale: int, duration: int):
-    neon_env_builder.pageserver_config_override = """
-profiling="page_requests"
-"""
-    env = neon_env_builder.init_start()
-    env.pageserver.is_profiling_enabled_or_skip()
-    env.neon_cli.create_branch("empty", "main")
-
-    neon_compare = NeonCompare(zenbenchmark, env, pg_bin, "pgbench")
-    run_test_pgbench(neon_compare, scale, duration, PgBenchLoadType.INIT)
-    run_test_pgbench(neon_compare, scale, duration, PgBenchLoadType.SIMPLE_UPDATE)
-    run_test_pgbench(neon_compare, scale, duration, PgBenchLoadType.SELECT_ONLY)
-
-
 # The following 3 tests run on an existing database as it was set up by previous tests,
 # and leaves the database in a state that would be used in the next tests.
 # Modifying the definition order of these functions or adding other remote tests in between will alter results.
diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml
index 4c7fbd8333..989cc9202e 100644
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -13,7 +13,6 @@ publish = false
 
 ### BEGIN HAKARI SECTION
 [dependencies]
-ahash = { version = "0.7", features = ["std"] }
 anyhow = { version = "1", features = ["backtrace", "std"] }
 bytes = { version = "1", features = ["serde", "std"] }
 chrono = { version = "0.4", default-features = false, features = ["clock", "iana-time-zone", "serde", "std", "winapi"] }
@@ -41,7 +40,6 @@ scopeguard = { version = "1", features = ["use_std"] }
 serde = { version = "1", features = ["alloc", "derive", "serde_derive", "std"] }
 serde_json = { version = "1", features = ["raw_value", "std"] }
 socket2 = { version = "0.4", default-features = false, features = ["all"] }
-stable_deref_trait = { version = "1", features = ["alloc", "std"] }
 tokio = { version = "1", features = ["bytes", "fs", "io-std", "io-util", "libc", "macros", "memchr", "mio", "net", "num_cpus", "once_cell", "process", "rt", "rt-multi-thread", "signal-hook-registry", "socket2", "sync", "time", "tokio-macros"] }
 tokio-util = { version = "0.7", features = ["codec", "io", "io-util", "tracing"] }
 tower = { version = "0.4", features = ["__common", "balance", "buffer", "discover", "futures-core", "futures-util", "indexmap", "limit", "load", "log", "make", "pin-project", "pin-project-lite", "rand", "ready-cache", "retry", "slab", "timeout", "tokio", "tokio-util", "tracing", "util"] }
@@ -50,7 +48,6 @@ tracing-core = { version = "0.1", features = ["once_cell", "std"] }
 url = { version = "2", features = ["serde"] }
 
 [build-dependencies]
-ahash = { version = "0.7", features = ["std"] }
 anyhow = { version = "1", features = ["backtrace", "std"] }
 bytes = { version = "1", features = ["serde", "std"] }
 either = { version = "1", features = ["use_std"] }

From 10dae79c6d78c8c0876dbd27bf46da8ca8b3b1ff Mon Sep 17 00:00:00 2001
From: Kirill Bulatov <kirill@neon.tech>
Date: Tue, 3 Jan 2023 22:42:04 +0200
Subject: [PATCH 106/132] Tone down safekeeper and pageserver walreceiver
 errors (#3227)

Closes https://github.com/neondatabase/neon/issues/3114

Adds more specific error types for failures that occur while handling protocol messages (`FeMessage`) and postgres and walreceiver connections.

Socket IO errors are now detected more reliably and logged at a lower level (INFO, DEBUG), without the traces that previously accompanied them when they were wrapped in an anyhow context.
---
 Cargo.lock                                    |   1 +
 libs/pq_proto/Cargo.toml                      |   1 +
 libs/pq_proto/src/lib.rs                      | 131 +++++++---
 libs/utils/src/postgres_backend.rs            | 130 +++++-----
 libs/utils/src/postgres_backend_async.rs      | 174 +++++++++----
 libs/utils/tests/ssl_test.rs                  |  11 +-
 pageserver/src/page_service.rs                | 239 ++++++++++++------
 .../src/walreceiver/walreceiver_connection.rs |  66 +++--
 proxy/src/mgmt.rs                             |  15 +-
 proxy/src/stream.rs                           |  17 +-
 safekeeper/src/bin/safekeeper.rs              |   6 +-
 safekeeper/src/handler.rs                     |  65 +++--
 safekeeper/src/json_ctrl.rs                   |  23 +-
 safekeeper/src/receive_wal.rs                 |  47 ++--
 safekeeper/src/send_wal.rs                    |  22 +-
 safekeeper/src/wal_service.rs                 |   6 +-
 test_runner/fixtures/neon_fixtures.py         |  14 +-
 test_runner/regress/test_wal_acceptor.py      |   1 -
 18 files changed, 635 insertions(+), 334 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 246d481ef9..fbf018e1c0 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2502,6 +2502,7 @@ dependencies = [
  "postgres-protocol",
  "rand",
  "serde",
+ "thiserror",
  "tokio",
  "tracing",
  "workspace_hack",
diff --git a/libs/pq_proto/Cargo.toml b/libs/pq_proto/Cargo.toml
index daa0b593be..b9c6a1eab0 100644
--- a/libs/pq_proto/Cargo.toml
+++ b/libs/pq_proto/Cargo.toml
@@ -13,5 +13,6 @@ rand = "0.8.3"
 serde = { version = "1.0", features = ["derive"] }
 tokio = { version = "1.17", features = ["macros"] }
 tracing = "0.1"
+thiserror = "1.0"
 
 workspace_hack = { version = "0.1", path = "../../workspace_hack" }
diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs
index d31a2d51f2..c5e4dbd1f0 100644
--- a/libs/pq_proto/src/lib.rs
+++ b/libs/pq_proto/src/lib.rs
@@ -5,7 +5,7 @@
 // Tools for calling certain async methods in sync contexts.
 pub mod sync;
 
-use anyhow::{bail, ensure, Context, Result};
+use anyhow::{ensure, Context, Result};
 use bytes::{Buf, BufMut, Bytes, BytesMut};
 use postgres_protocol::PG_EPOCH;
 use serde::{Deserialize, Serialize};
@@ -194,6 +194,35 @@ macro_rules! retry_read {
     };
 }
 
+/// An error occured during connection being open.
+#[derive(thiserror::Error, Debug)]
+pub enum ConnectionError {
+    /// IO error during writing to or reading from the connection socket.
+    #[error("Socket IO error: {0}")]
+    Socket(std::io::Error),
+    /// Invalid packet was received from client
+    #[error("Protocol error: {0}")]
+    Protocol(String),
+    /// Failed to parse a protocol mesage
+    #[error("Message parse error: {0}")]
+    MessageParse(anyhow::Error),
+}
+
+impl From<anyhow::Error> for ConnectionError {
+    fn from(e: anyhow::Error) -> Self {
+        Self::MessageParse(e)
+    }
+}
+
+impl ConnectionError {
+    pub fn into_io_error(self) -> io::Error {
+        match self {
+            ConnectionError::Socket(io) => io,
+            other => io::Error::new(io::ErrorKind::Other, other.to_string()),
+        }
+    }
+}
+
 impl FeMessage {
     /// Read one message from the stream.
     /// This function returns `Ok(None)` in case of EOF.
@@ -216,7 +245,9 @@ impl FeMessage {
     /// }
     /// ```
     #[inline(never)]
-    pub fn read(stream: &mut (impl io::Read + Unpin)) -> anyhow::Result<Option<FeMessage>> {
+    pub fn read(
+        stream: &mut (impl io::Read + Unpin),
+    ) -> Result<Option<FeMessage>, ConnectionError> {
         Self::read_fut(&mut AsyncishRead(stream)).wait()
     }
 
@@ -224,7 +255,7 @@ impl FeMessage {
     /// See documentation for `Self::read`.
     pub fn read_fut<Reader>(
         stream: &mut Reader,
-    ) -> SyncFuture<Reader, impl Future<Output = anyhow::Result<Option<FeMessage>>> + '_>
+    ) -> SyncFuture<Reader, impl Future<Output = Result<Option<FeMessage>, ConnectionError>> + '_>
     where
         Reader: tokio::io::AsyncRead + Unpin,
     {
@@ -238,17 +269,21 @@ impl FeMessage {
             let tag = match retry_read!(stream.read_u8().await) {
                 Ok(b) => b,
                 Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => return Ok(None),
-                Err(e) => return Err(e.into()),
+                Err(e) => return Err(ConnectionError::Socket(e)),
             };
 
             // The message length includes itself, so it better be at least 4.
-            let len = retry_read!(stream.read_u32().await)?
+            let len = retry_read!(stream.read_u32().await)
+                .map_err(ConnectionError::Socket)?
                 .checked_sub(4)
-                .context("invalid message length")?;
+                .ok_or_else(|| ConnectionError::Protocol("invalid message length".to_string()))?;
 
             let body = {
                 let mut buffer = vec![0u8; len as usize];
-                stream.read_exact(&mut buffer).await?;
+                stream
+                    .read_exact(&mut buffer)
+                    .await
+                    .map_err(ConnectionError::Socket)?;
                 Bytes::from(buffer)
             };
 
@@ -265,7 +300,11 @@ impl FeMessage {
                 b'c' => Ok(Some(FeMessage::CopyDone)),
                 b'f' => Ok(Some(FeMessage::CopyFail)),
                 b'p' => Ok(Some(FeMessage::PasswordMessage(body))),
-                tag => bail!("unknown message tag: {},'{:?}'", tag, body),
+                tag => {
+                    return Err(ConnectionError::Protocol(format!(
+                        "unknown message tag: {tag},'{body:?}'"
+                    )))
+                }
             }
         })
     }
@@ -275,7 +314,9 @@ impl FeStartupPacket {
     /// Read startup message from the stream.
     // XXX: It's tempting yet undesirable to accept `stream` by value,
     // since such a change will cause user-supplied &mut references to be consumed
-    pub fn read(stream: &mut (impl io::Read + Unpin)) -> anyhow::Result<Option<FeMessage>> {
+    pub fn read(
+        stream: &mut (impl io::Read + Unpin),
+    ) -> Result<Option<FeMessage>, ConnectionError> {
         Self::read_fut(&mut AsyncishRead(stream)).wait()
     }
 
@@ -284,7 +325,7 @@ impl FeStartupPacket {
     // since such a change will cause user-supplied &mut references to be consumed
     pub fn read_fut<Reader>(
         stream: &mut Reader,
-    ) -> SyncFuture<Reader, impl Future<Output = anyhow::Result<Option<FeMessage>>> + '_>
+    ) -> SyncFuture<Reader, impl Future<Output = Result<Option<FeMessage>, ConnectionError>> + '_>
     where
         Reader: tokio::io::AsyncRead + Unpin,
     {
@@ -302,31 +343,41 @@ impl FeStartupPacket {
             let len = match retry_read!(stream.read_u32().await) {
                 Ok(len) => len as usize,
                 Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => return Ok(None),
-                Err(e) => return Err(e.into()),
+                Err(e) => return Err(ConnectionError::Socket(e)),
             };
 
             #[allow(clippy::manual_range_contains)]
             if len < 4 || len > MAX_STARTUP_PACKET_LENGTH {
-                bail!("invalid message length");
+                return Err(ConnectionError::Protocol(format!(
+                    "invalid message length {len}"
+                )));
             }
 
-            let request_code = retry_read!(stream.read_u32().await)?;
+            let request_code =
+                retry_read!(stream.read_u32().await).map_err(ConnectionError::Socket)?;
 
             // the rest of startup packet are params
             let params_len = len - 8;
             let mut params_bytes = vec![0u8; params_len];
-            stream.read_exact(params_bytes.as_mut()).await?;
+            stream
+                .read_exact(params_bytes.as_mut())
+                .await
+                .map_err(ConnectionError::Socket)?;
 
             // Parse params depending on request code
             let req_hi = request_code >> 16;
             let req_lo = request_code & ((1 << 16) - 1);
             let message = match (req_hi, req_lo) {
                 (RESERVED_INVALID_MAJOR_VERSION, CANCEL_REQUEST_CODE) => {
-                    ensure!(params_len == 8, "expected 8 bytes for CancelRequest params");
+                    if params_len != 8 {
+                        return Err(ConnectionError::Protocol(
+                            "expected 8 bytes for CancelRequest params".to_string(),
+                        ));
+                    }
                     let mut cursor = Cursor::new(params_bytes);
                     FeStartupPacket::CancelRequest(CancelKeyData {
-                        backend_pid: cursor.read_i32().await?,
-                        cancel_key: cursor.read_i32().await?,
+                        backend_pid: cursor.read_i32().await.map_err(ConnectionError::Socket)?,
+                        cancel_key: cursor.read_i32().await.map_err(ConnectionError::Socket)?,
                     })
                 }
                 (RESERVED_INVALID_MAJOR_VERSION, NEGOTIATE_SSL_CODE) => {
@@ -338,7 +389,9 @@ impl FeStartupPacket {
                     FeStartupPacket::GssEncRequest
                 }
                 (RESERVED_INVALID_MAJOR_VERSION, unrecognized_code) => {
-                    bail!("Unrecognized request code {}", unrecognized_code)
+                    return Err(ConnectionError::Protocol(format!(
+                        "Unrecognized request code {unrecognized_code}"
+                    )));
                 }
                 // TODO bail if protocol major_version is not 3?
                 (major_version, minor_version) => {
@@ -346,15 +399,21 @@ impl FeStartupPacket {
                     // See `postgres: ProcessStartupPacket, build_startup_packet`.
                     let mut tokens = str::from_utf8(&params_bytes)
                         .context("StartupMessage params: invalid utf-8")?
-                        .strip_suffix('\0') // drop packet's own null terminator
-                        .context("StartupMessage params: missing null terminator")?
+                        .strip_suffix('\0') // drop packet's own null
+                        .ok_or_else(|| {
+                            ConnectionError::Protocol(
+                                "StartupMessage params: missing null terminator".to_string(),
+                            )
+                        })?
                         .split_terminator('\0');
 
                     let mut params = HashMap::new();
                     while let Some(name) = tokens.next() {
-                        let value = tokens
-                            .next()
-                            .context("StartupMessage params: key without value")?;
+                        let value = tokens.next().ok_or_else(|| {
+                            ConnectionError::Protocol(
+                                "StartupMessage params: key without value".to_string(),
+                            )
+                        })?;
 
                         params.insert(name.to_owned(), value.to_owned());
                     }
@@ -458,7 +517,7 @@ pub enum BeMessage<'a> {
     CloseComplete,
     // None means column is NULL
     DataRow(&'a [Option<&'a [u8]>]),
-    ErrorResponse(&'a str),
+    ErrorResponse(&'a str, Option<&'a [u8; 5]>),
     /// Single byte - used in response to SSLRequest/GSSENCRequest.
     EncryptionResponse(bool),
     NoData,
@@ -606,7 +665,7 @@ fn write_body<R>(buf: &mut BytesMut, f: impl FnOnce(&mut BytesMut) -> R) -> R {
 }
 
 /// Safe write of s into buf as cstring (String in the protocol).
-fn write_cstr(s: impl AsRef<[u8]>, buf: &mut BytesMut) -> Result<(), io::Error> {
+fn write_cstr(s: impl AsRef<[u8]>, buf: &mut BytesMut) -> io::Result<()> {
     let bytes = s.as_ref();
     if bytes.contains(&0) {
         return Err(io::Error::new(
@@ -626,7 +685,7 @@ fn read_cstr(buf: &mut Bytes) -> anyhow::Result<Bytes> {
     Ok(result)
 }
 
-const SQLSTATE_INTERNAL_ERROR: &str = "XX000\0";
+pub const SQLSTATE_INTERNAL_ERROR: &[u8; 5] = b"XX000";
 
 impl<'a> BeMessage<'a> {
     /// Write message to the given buf.
@@ -767,10 +826,7 @@ impl<'a> BeMessage<'a> {
             // First byte of each field represents type of this field. Set just enough fields
             // to satisfy rust-postgres client: 'S' -- severity, 'C' -- error, 'M' -- error
             // message text.
-            BeMessage::ErrorResponse(error_msg) => {
-                // For all the errors set Severity to Error and error code to
-                // 'internal error'.
-
+            BeMessage::ErrorResponse(error_msg, pg_error_code) => {
                 // 'E' signalizes ErrorResponse messages
                 buf.put_u8(b'E');
                 write_body(buf, |buf| {
@@ -778,7 +834,9 @@ impl<'a> BeMessage<'a> {
                     buf.put_slice(b"ERROR\0");
 
                     buf.put_u8(b'C'); // SQLSTATE error code
-                    buf.put_slice(SQLSTATE_INTERNAL_ERROR.as_bytes());
+                    buf.put_slice(&terminate_code(
+                        pg_error_code.unwrap_or(SQLSTATE_INTERNAL_ERROR),
+                    ));
 
                     buf.put_u8(b'M'); // the message
                     write_cstr(error_msg, buf)?;
@@ -801,7 +859,7 @@ impl<'a> BeMessage<'a> {
                     buf.put_slice(b"NOTICE\0");
 
                     buf.put_u8(b'C'); // SQLSTATE error code
-                    buf.put_slice(SQLSTATE_INTERNAL_ERROR.as_bytes());
+                    buf.put_slice(&terminate_code(SQLSTATE_INTERNAL_ERROR));
 
                     buf.put_u8(b'M'); // the message
                     write_cstr(error_msg.as_bytes(), buf)?;
@@ -1089,3 +1147,12 @@ mod tests {
         let _ = FeStartupPacket::read_fut(stream).await;
     }
 }
+
+fn terminate_code(code: &[u8; 5]) -> [u8; 6] {
+    let mut terminated = [0; 6];
+    for (i, &elem) in code.iter().enumerate() {
+        terminated[i] = elem;
+    }
+
+    terminated
+}
diff --git a/libs/utils/src/postgres_backend.rs b/libs/utils/src/postgres_backend.rs
index bac6f861c3..f3e3835bda 100644
--- a/libs/utils/src/postgres_backend.rs
+++ b/libs/utils/src/postgres_backend.rs
@@ -3,8 +3,9 @@
 //! implementation determining how to process the queries. Currently its API
 //! is rather narrow, but we can extend it once required.
 
+use crate::postgres_backend_async::{log_query_error, short_error, QueryError};
 use crate::sock_split::{BidiStream, ReadStream, WriteStream};
-use anyhow::{bail, ensure, Context, Result};
+use anyhow::Context;
 use bytes::{Bytes, BytesMut};
 use pq_proto::{BeMessage, FeMessage, FeStartupPacket};
 use serde::{Deserialize, Serialize};
@@ -21,20 +22,32 @@ pub trait Handler {
     /// postgres_backend will issue ReadyForQuery after calling this (this
     /// might be not what we want after CopyData streaming, but currently we don't
     /// care).
-    fn process_query(&mut self, pgb: &mut PostgresBackend, query_string: &str) -> Result<()>;
+    fn process_query(
+        &mut self,
+        pgb: &mut PostgresBackend,
+        query_string: &str,
+    ) -> Result<(), QueryError>;
 
     /// Called on startup packet receival, allows to process params.
     ///
     /// If Ok(false) is returned postgres_backend will skip auth -- that is needed for new users
     /// creation is the proxy code. That is quite hacky and ad-hoc solution, may be we could allow
     /// to override whole init logic in implementations.
-    fn startup(&mut self, _pgb: &mut PostgresBackend, _sm: &FeStartupPacket) -> Result<()> {
+    fn startup(
+        &mut self,
+        _pgb: &mut PostgresBackend,
+        _sm: &FeStartupPacket,
+    ) -> Result<(), QueryError> {
         Ok(())
     }
 
     /// Check auth jwt
-    fn check_auth_jwt(&mut self, _pgb: &mut PostgresBackend, _jwt_response: &[u8]) -> Result<()> {
-        bail!("JWT auth failed")
+    fn check_auth_jwt(
+        &mut self,
+        _pgb: &mut PostgresBackend,
+        _jwt_response: &[u8],
+    ) -> Result<(), QueryError> {
+        Err(QueryError::Other(anyhow::anyhow!("JWT auth failed")))
     }
 
     fn is_shutdown_requested(&self) -> bool {
@@ -66,7 +79,7 @@ impl FromStr for AuthType {
         match s {
             "Trust" => Ok(Self::Trust),
             "NeonJWT" => Ok(Self::NeonJWT),
-            _ => bail!("invalid value \"{s}\" for auth type"),
+            _ => anyhow::bail!("invalid value \"{s}\" for auth type"),
         }
     }
 }
@@ -154,7 +167,7 @@ pub fn is_socket_read_timed_out(error: &anyhow::Error) -> bool {
 }
 
 // Cast a byte slice to a string slice, dropping null terminator if there's one.
-fn cstr_to_str(bytes: &[u8]) -> Result<&str> {
+fn cstr_to_str(bytes: &[u8]) -> anyhow::Result<&str> {
     let without_null = bytes.strip_suffix(&[0]).unwrap_or(bytes);
     std::str::from_utf8(without_null).map_err(|e| e.into())
 }
@@ -188,10 +201,10 @@ impl PostgresBackend {
     }
 
     /// Get direct reference (into the Option) to the read stream.
-    fn get_stream_in(&mut self) -> Result<&mut BidiStream> {
+    fn get_stream_in(&mut self) -> anyhow::Result<&mut BidiStream> {
         match &mut self.stream {
             Some(Stream::Bidirectional(stream)) => Ok(stream),
-            _ => bail!("reader taken"),
+            _ => anyhow::bail!("reader taken"),
         }
     }
 
@@ -215,7 +228,7 @@ impl PostgresBackend {
     }
 
     /// Read full message or return None if connection is closed.
-    pub fn read_message(&mut self) -> Result<Option<FeMessage>> {
+    pub fn read_message(&mut self) -> Result<Option<FeMessage>, QueryError> {
         let (state, stream) = (self.state, self.get_stream_in()?);
 
         use ProtoState::*;
@@ -223,6 +236,7 @@ impl PostgresBackend {
             Initialization | Encrypted => FeStartupPacket::read(stream),
             Authentication | Established => FeMessage::read(stream),
         }
+        .map_err(QueryError::from)
     }
 
     /// Write message into internal output buffer.
@@ -246,7 +260,7 @@ impl PostgresBackend {
     }
 
     // Wrapper for run_message_loop() that shuts down socket when we are done
-    pub fn run(mut self, handler: &mut impl Handler) -> Result<()> {
+    pub fn run(mut self, handler: &mut impl Handler) -> Result<(), QueryError> {
         let ret = self.run_message_loop(handler);
         if let Some(stream) = self.stream.as_mut() {
             let _ = stream.shutdown(Shutdown::Both);
@@ -254,7 +268,7 @@ impl PostgresBackend {
         ret
     }
 
-    fn run_message_loop(&mut self, handler: &mut impl Handler) -> Result<()> {
+    fn run_message_loop(&mut self, handler: &mut impl Handler) -> Result<(), QueryError> {
         trace!("postgres backend to {:?} started", self.peer_addr);
 
         let mut unnamed_query_string = Bytes::new();
@@ -263,7 +277,7 @@ impl PostgresBackend {
             match self.read_message() {
                 Ok(message) => {
                     if let Some(msg) = message {
-                        trace!("got message {:?}", msg);
+                        trace!("got message {msg:?}");
 
                         match self.process_message(handler, msg, &mut unnamed_query_string)? {
                             ProcessMsgResult::Continue => continue,
@@ -274,10 +288,12 @@ impl PostgresBackend {
                     }
                 }
                 Err(e) => {
-                    // If it is a timeout error, continue the loop
-                    if !is_socket_read_timed_out(&e) {
-                        return Err(e);
+                    if let QueryError::Other(e) = &e {
+                        if is_socket_read_timed_out(e) {
+                            continue;
+                        }
                     }
+                    return Err(e);
                 }
             }
         }
@@ -295,7 +311,7 @@ impl PostgresBackend {
             }
             stream => {
                 self.stream = stream;
-                bail!("can't start TLs without bidi stream");
+                anyhow::bail!("can't start TLs without bidi stream");
             }
         }
     }
@@ -305,17 +321,16 @@ impl PostgresBackend {
         handler: &mut impl Handler,
         msg: FeMessage,
         unnamed_query_string: &mut Bytes,
-    ) -> Result<ProcessMsgResult> {
+    ) -> Result<ProcessMsgResult, QueryError> {
         // Allow only startup and password messages during auth. Otherwise client would be able to bypass auth
         // TODO: change that to proper top-level match of protocol state with separate message handling for each state
-        if self.state < ProtoState::Established {
-            ensure!(
-                matches!(
-                    msg,
-                    FeMessage::PasswordMessage(_) | FeMessage::StartupPacket(_)
-                ),
-                "protocol violation"
-            );
+        if self.state < ProtoState::Established
+            && !matches!(
+                msg,
+                FeMessage::PasswordMessage(_) | FeMessage::StartupPacket(_)
+            )
+        {
+            return Err(QueryError::Other(anyhow::anyhow!("protocol violation")));
         }
 
         let have_tls = self.tls_config.is_some();
@@ -339,8 +354,13 @@ impl PostgresBackend {
                     }
                     FeStartupPacket::StartupMessage { .. } => {
                         if have_tls && !matches!(self.state, ProtoState::Encrypted) {
-                            self.write_message(&BeMessage::ErrorResponse("must connect with TLS"))?;
-                            bail!("client did not connect with TLS");
+                            self.write_message(&BeMessage::ErrorResponse(
+                                "must connect with TLS",
+                                None,
+                            ))?;
+                            return Err(QueryError::Other(anyhow::anyhow!(
+                                "client did not connect with TLS"
+                            )));
                         }
 
                         // NB: startup() may change self.auth_type -- we are using that in proxy code
@@ -379,8 +399,11 @@ impl PostgresBackend {
                         let (_, jwt_response) = m.split_last().context("protocol violation")?;
 
                         if let Err(e) = handler.check_auth_jwt(self, jwt_response) {
-                            self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?;
-                            bail!("auth failed: {}", e);
+                            self.write_message(&BeMessage::ErrorResponse(
+                                &e.to_string(),
+                                Some(e.pg_error_code()),
+                            ))?;
+                            return Err(e);
                         }
                     }
                 }
@@ -394,33 +417,14 @@ impl PostgresBackend {
                 // remove null terminator
                 let query_string = cstr_to_str(&body)?;
 
-                trace!("got query {:?}", query_string);
-                // xxx distinguish fatal and recoverable errors?
+                trace!("got query {query_string:?}");
                 if let Err(e) = handler.process_query(self, query_string) {
-                    // ":?" uses the alternate formatting style, which makes anyhow display the
-                    // full cause of the error, not just the top-level context + its trace.
-                    // We don't want to send that in the ErrorResponse though,
-                    // because it's not relevant to the compute node logs.
-                    //
-                    // We also don't want to log full stacktrace when the error is primitive,
-                    // such as usual connection closed.
-                    let short_error = format!("{:#}", e);
-                    let root_cause = e.root_cause().to_string();
-                    if root_cause.contains("connection closed unexpectedly")
-                        || root_cause.contains("Broken pipe (os error 32)")
-                    {
-                        error!(
-                            "query handler for '{}' failed: {}",
-                            query_string, short_error
-                        );
-                    } else {
-                        error!("query handler for '{}' failed: {:?}", query_string, e);
-                    }
-                    self.write_message_noflush(&BeMessage::ErrorResponse(&short_error))?;
-                    // TODO: untangle convoluted control flow
-                    if e.to_string().contains("failed to run") {
-                        return Ok(ProcessMsgResult::Break);
-                    }
+                    log_query_error(query_string, &e);
+                    let short_error = short_error(&e);
+                    self.write_message_noflush(&BeMessage::ErrorResponse(
+                        &short_error,
+                        Some(e.pg_error_code()),
+                    ))?;
                 }
                 self.write_message(&BeMessage::ReadyForQuery)?;
             }
@@ -445,11 +449,13 @@ impl PostgresBackend {
 
             FeMessage::Execute(_) => {
                 let query_string = cstr_to_str(unnamed_query_string)?;
-                trace!("got execute {:?}", query_string);
-                // xxx distinguish fatal and recoverable errors?
+                trace!("got execute {query_string:?}");
                 if let Err(e) = handler.process_query(self, query_string) {
-                    error!("query handler for '{}' failed: {:?}", query_string, e);
-                    self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?;
+                    log_query_error(query_string, &e);
+                    self.write_message(&BeMessage::ErrorResponse(
+                        &e.to_string(),
+                        Some(e.pg_error_code()),
+                    ))?;
                 }
                 // NOTE there is no ReadyForQuery message. This handler is used
                 // for basebackup and it uses CopyOut which doesn't require
@@ -468,7 +474,9 @@ impl PostgresBackend {
             // We prefer explicit pattern matching to wildcards, because
             // this helps us spot the places where new variants are missing
             FeMessage::CopyData(_) | FeMessage::CopyDone | FeMessage::CopyFail => {
-                bail!("unexpected message type: {:?}", msg);
+                return Err(QueryError::Other(anyhow::anyhow!(
+                    "unexpected message type: {msg:?}"
+                )));
             }
         }
 
diff --git a/libs/utils/src/postgres_backend_async.rs b/libs/utils/src/postgres_backend_async.rs
index de547c3242..a4f523da04 100644
--- a/libs/utils/src/postgres_backend_async.rs
+++ b/libs/utils/src/postgres_backend_async.rs
@@ -4,39 +4,84 @@
 //! is rather narrow, but we can extend it once required.
 
 use crate::postgres_backend::AuthType;
-use anyhow::{bail, Context, Result};
+use anyhow::Context;
 use bytes::{Buf, Bytes, BytesMut};
-use pq_proto::{BeMessage, FeMessage, FeStartupPacket};
+use pq_proto::{BeMessage, ConnectionError, FeMessage, FeStartupPacket, SQLSTATE_INTERNAL_ERROR};
 use std::future::Future;
+use std::io;
 use std::net::SocketAddr;
 use std::pin::Pin;
 use std::sync::Arc;
 use std::task::Poll;
-use tracing::{debug, error, trace};
+use tracing::{debug, error, info, trace};
 
 use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt, BufReader};
 use tokio_rustls::TlsAcceptor;
 
+pub fn is_expected_io_error(e: &io::Error) -> bool {
+    use io::ErrorKind::*; // kinds that only mean the peer went away; callers log them quietly
+    matches!(e.kind(), ConnectionRefused | ConnectionAborted | ConnectionReset | BrokenPipe)
+}
+
+/// An error that occurred while processing a query: either the connection
+/// itself failed ([`ConnectionError`]) or something else went wrong.
+#[derive(thiserror::Error, Debug)]
+pub enum QueryError {
+    /// The connection was lost while processing the query.
+    #[error(transparent)]
+    Disconnected(#[from] ConnectionError),
+    /// Any other failure, carried as an opaque [`anyhow::Error`].
+    #[error(transparent)]
+    Other(#[from] anyhow::Error),
+}
+
+impl From<io::Error> for QueryError {
+    fn from(io_error: io::Error) -> Self {
+        QueryError::Disconnected(ConnectionError::Socket(io_error))
+    }
+}
+
+impl QueryError {
+    pub fn pg_error_code(&self) -> &'static [u8; 5] {
+        match self {
+            Self::Disconnected(_) => b"08006",         // SQLSTATE 08006 (connection_failure)
+            Self::Other(_) => SQLSTATE_INTERNAL_ERROR, // internal error (constant from pq_proto)
+        }
+    }
+}
+
 #[async_trait::async_trait]
 pub trait Handler {
     /// Handle single query.
     /// postgres_backend will issue ReadyForQuery after calling this (this
     /// might be not what we want after CopyData streaming, but currently we don't
     /// care).
-    async fn process_query(&mut self, pgb: &mut PostgresBackend, query_string: &str) -> Result<()>;
+    async fn process_query(
+        &mut self,
+        pgb: &mut PostgresBackend,
+        query_string: &str,
+    ) -> Result<(), QueryError>;
 
     /// Called on startup packet receival, allows to process params.
     ///
     /// If Ok(false) is returned postgres_backend will skip auth -- that is needed for new users
     /// creation is the proxy code. That is quite hacky and ad-hoc solution, may be we could allow
     /// to override whole init logic in implementations.
-    fn startup(&mut self, _pgb: &mut PostgresBackend, _sm: &FeStartupPacket) -> Result<()> {
+    fn startup(
+        &mut self,
+        _pgb: &mut PostgresBackend,
+        _sm: &FeStartupPacket,
+    ) -> Result<(), QueryError> {
         Ok(())
     }
 
     /// Check auth jwt
-    fn check_auth_jwt(&mut self, _pgb: &mut PostgresBackend, _jwt_response: &[u8]) -> Result<()> {
-        bail!("JWT auth failed")
+    fn check_auth_jwt(
+        &mut self,
+        _pgb: &mut PostgresBackend,
+        _jwt_response: &[u8],
+    ) -> Result<(), QueryError> {
+        Err(QueryError::Other(anyhow::anyhow!("JWT auth failed")))
     }
 }
 
@@ -70,17 +115,14 @@ impl AsyncWrite for Stream {
         self: Pin<&mut Self>,
         cx: &mut std::task::Context<'_>,
         buf: &[u8],
-    ) -> Poll<Result<usize, std::io::Error>> {
+    ) -> Poll<io::Result<usize>> {
         match self.get_mut() {
             Self::Unencrypted(stream) => Pin::new(stream).poll_write(cx, buf),
             Self::Tls(stream) => Pin::new(stream).poll_write(cx, buf),
             Self::Broken => unreachable!(),
         }
     }
-    fn poll_flush(
-        self: Pin<&mut Self>,
-        cx: &mut std::task::Context<'_>,
-    ) -> Poll<Result<(), std::io::Error>> {
+    fn poll_flush(self: Pin<&mut Self>, cx: &mut std::task::Context<'_>) -> Poll<io::Result<()>> {
         match self.get_mut() {
             Self::Unencrypted(stream) => Pin::new(stream).poll_flush(cx),
             Self::Tls(stream) => Pin::new(stream).poll_flush(cx),
@@ -90,7 +132,7 @@ impl AsyncWrite for Stream {
     fn poll_shutdown(
         self: Pin<&mut Self>,
         cx: &mut std::task::Context<'_>,
-    ) -> Poll<Result<(), std::io::Error>> {
+    ) -> Poll<io::Result<()>> {
         match self.get_mut() {
             Self::Unencrypted(stream) => Pin::new(stream).poll_shutdown(cx),
             Self::Tls(stream) => Pin::new(stream).poll_shutdown(cx),
@@ -103,7 +145,7 @@ impl AsyncRead for Stream {
         self: Pin<&mut Self>,
         cx: &mut std::task::Context<'_>,
         buf: &mut tokio::io::ReadBuf<'_>,
-    ) -> Poll<Result<(), std::io::Error>> {
+    ) -> Poll<io::Result<()>> {
         match self.get_mut() {
             Self::Unencrypted(stream) => Pin::new(stream).poll_read(cx, buf),
             Self::Tls(stream) => Pin::new(stream).poll_read(cx, buf),
@@ -139,7 +181,7 @@ pub fn query_from_cstring(query_string: Bytes) -> Vec<u8> {
 }
 
 // Cast a byte slice to a string slice, dropping null terminator if there's one.
-fn cstr_to_str(bytes: &[u8]) -> Result<&str> {
+fn cstr_to_str(bytes: &[u8]) -> anyhow::Result<&str> {
     let without_null = bytes.strip_suffix(&[0]).unwrap_or(bytes);
     std::str::from_utf8(without_null).map_err(|e| e.into())
 }
@@ -149,7 +191,7 @@ impl PostgresBackend {
         socket: tokio::net::TcpStream,
         auth_type: AuthType,
         tls_config: Option<Arc<rustls::ServerConfig>>,
-    ) -> std::io::Result<Self> {
+    ) -> io::Result<Self> {
         let peer_addr = socket.peer_addr()?;
 
         Ok(Self {
@@ -167,17 +209,18 @@ impl PostgresBackend {
     }
 
     /// Read full message or return None if connection is closed.
-    pub async fn read_message(&mut self) -> Result<Option<FeMessage>> {
+    pub async fn read_message(&mut self) -> Result<Option<FeMessage>, QueryError> {
         use ProtoState::*;
         match self.state {
             Initialization | Encrypted => FeStartupPacket::read_fut(&mut self.stream).await,
             Authentication | Established => FeMessage::read_fut(&mut self.stream).await,
             Closed => Ok(None),
         }
+        .map_err(QueryError::from)
     }
 
     /// Flush output buffer into the socket.
-    pub async fn flush(&mut self) -> std::io::Result<()> {
+    pub async fn flush(&mut self) -> io::Result<()> {
         while self.buf_out.has_remaining() {
             let bytes_written = self.stream.write(self.buf_out.chunk()).await?;
             self.buf_out.advance(bytes_written);
@@ -187,7 +230,7 @@ impl PostgresBackend {
     }
 
     /// Write message into internal output buffer.
-    pub fn write_message(&mut self, message: &BeMessage<'_>) -> Result<&mut Self, std::io::Error> {
+    pub fn write_message(&mut self, message: &BeMessage<'_>) -> io::Result<&mut Self> {
         BeMessage::write(&mut self.buf_out, message)?;
         Ok(self)
     }
@@ -223,7 +266,11 @@ impl PostgresBackend {
     }
 
     // Wrapper for run_message_loop() that shuts down socket when we are done
-    pub async fn run<F, S>(mut self, handler: &mut impl Handler, shutdown_watcher: F) -> Result<()>
+    pub async fn run<F, S>(
+        mut self,
+        handler: &mut impl Handler,
+        shutdown_watcher: F,
+    ) -> Result<(), QueryError>
     where
         F: Fn() -> S,
         S: Future,
@@ -237,7 +284,7 @@ impl PostgresBackend {
         &mut self,
         handler: &mut impl Handler,
         shutdown_watcher: F,
-    ) -> Result<()>
+    ) -> Result<(), QueryError>
     where
         F: Fn() -> S,
         S: Future,
@@ -273,7 +320,7 @@ impl PostgresBackend {
                         return Ok(());
                     }
                 }
-                Ok::<(), anyhow::Error>(())
+                Ok::<(), QueryError>(())
             } => {
                 // Handshake complete.
                 result?;
@@ -318,14 +365,14 @@ impl PostgresBackend {
             self.stream = Stream::Tls(Box::new(tls_stream));
             return Ok(());
         };
-        bail!("TLS already started");
+        anyhow::bail!("TLS already started");
     }
 
     async fn process_handshake_message(
         &mut self,
         handler: &mut impl Handler,
         msg: FeMessage,
-    ) -> Result<ProcessMsgResult> {
+    ) -> Result<ProcessMsgResult, QueryError> {
         assert!(self.state < ProtoState::Established);
         let have_tls = self.tls_config.is_some();
         match msg {
@@ -348,8 +395,13 @@ impl PostgresBackend {
                     }
                     FeStartupPacket::StartupMessage { .. } => {
                         if have_tls && !matches!(self.state, ProtoState::Encrypted) {
-                            self.write_message(&BeMessage::ErrorResponse("must connect with TLS"))?;
-                            bail!("client did not connect with TLS");
+                            self.write_message(&BeMessage::ErrorResponse(
+                                "must connect with TLS",
+                                None,
+                            ))?;
+                            return Err(QueryError::Other(anyhow::anyhow!(
+                                "client did not connect with TLS"
+                            )));
                         }
 
                         // NB: startup() may change self.auth_type -- we are using that in proxy code
@@ -389,8 +441,11 @@ impl PostgresBackend {
                         let (_, jwt_response) = m.split_last().context("protocol violation")?;
 
                         if let Err(e) = handler.check_auth_jwt(self, jwt_response) {
-                            self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?;
-                            bail!("auth failed: {}", e);
+                            self.write_message(&BeMessage::ErrorResponse(
+                                &e.to_string(),
+                                Some(e.pg_error_code()),
+                            ))?;
+                            return Err(e);
                         }
                     }
                 }
@@ -413,33 +468,28 @@ impl PostgresBackend {
         handler: &mut impl Handler,
         msg: FeMessage,
         unnamed_query_string: &mut Bytes,
-    ) -> Result<ProcessMsgResult> {
+    ) -> Result<ProcessMsgResult, QueryError> {
         // Allow only startup and password messages during auth. Otherwise client would be able to bypass auth
         // TODO: change that to proper top-level match of protocol state with separate message handling for each state
         assert!(self.state == ProtoState::Established);
 
         match msg {
             FeMessage::StartupPacket(_) | FeMessage::PasswordMessage(_) => {
-                bail!("protocol violation");
+                return Err(QueryError::Other(anyhow::anyhow!("protocol violation")));
             }
 
             FeMessage::Query(body) => {
                 // remove null terminator
                 let query_string = cstr_to_str(&body)?;
 
-                trace!("got query {:?}", query_string);
-                // xxx distinguish fatal and recoverable errors?
+                trace!("got query {query_string:?}");
                 if let Err(e) = handler.process_query(self, query_string).await {
-                    // ":?" uses the alternate formatting style, which makes anyhow display the
-                    // full cause of the error, not just the top-level context + its trace.
-                    // We don't want to send that in the ErrorResponse though,
-                    // because it's not relevant to the compute node logs.
-                    error!("query handler for '{}' failed: {:?}", query_string, e);
-                    self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?;
-                    // TODO: untangle convoluted control flow
-                    if e.to_string().contains("failed to run") {
-                        return Ok(ProcessMsgResult::Break);
-                    }
+                    log_query_error(query_string, &e);
+                    let short_error = short_error(&e);
+                    self.write_message(&BeMessage::ErrorResponse(
+                        &short_error,
+                        Some(e.pg_error_code()),
+                    ))?;
                 }
                 self.write_message(&BeMessage::ReadyForQuery)?;
             }
@@ -464,11 +514,13 @@ impl PostgresBackend {
 
             FeMessage::Execute(_) => {
                 let query_string = cstr_to_str(unnamed_query_string)?;
-                trace!("got execute {:?}", query_string);
-                // xxx distinguish fatal and recoverable errors?
+                trace!("got execute {query_string:?}");
                 if let Err(e) = handler.process_query(self, query_string).await {
-                    error!("query handler for '{}' failed: {:?}", query_string, e);
-                    self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?;
+                    log_query_error(query_string, &e);
+                    self.write_message(&BeMessage::ErrorResponse(
+                        &e.to_string(),
+                        Some(e.pg_error_code()),
+                    ))?;
                 }
                 // NOTE there is no ReadyForQuery message. This handler is used
                 // for basebackup and it uses CopyOut which doesn't require
@@ -487,7 +539,10 @@ impl PostgresBackend {
             // We prefer explicit pattern matching to wildcards, because
             // this helps us spot the places where new variants are missing
             FeMessage::CopyData(_) | FeMessage::CopyDone | FeMessage::CopyFail => {
-                bail!("unexpected message type: {:?}", msg);
+                return Err(QueryError::Other(anyhow::anyhow!(
+                    "unexpected message type: {:?}",
+                    msg
+                )));
             }
         }
 
@@ -555,3 +610,28 @@ impl<'a> AsyncWrite for CopyDataWriter<'a> {
         this.pgb.poll_flush(cx)
     }
 }
+
+pub fn short_error(e: &QueryError) -> String {
+    match e {
+        QueryError::Other(err) => format!("{err:#}"),
+        QueryError::Disconnected(conn_err) => conn_err.to_string(),
+    }
+}
+
+pub(super) fn log_query_error(query: &str, e: &QueryError) {
+    // Expected socket errors are only worth an `info`; everything else is logged as an error.
+    match e {
+        QueryError::Disconnected(ConnectionError::Socket(err)) if is_expected_io_error(err) => {
+            info!("query handler for '{query}' failed with expected io error: {err}");
+        }
+        QueryError::Disconnected(ConnectionError::Socket(err)) => {
+            error!("query handler for '{query}' failed with io error: {err}");
+        }
+        QueryError::Disconnected(conn_err) => {
+            error!("query handler for '{query}' failed with connection error: {conn_err:?}")
+        }
+        QueryError::Other(err) => {
+            error!("query handler for '{query}' failed: {err:?}");
+        }
+    }
+}
diff --git a/libs/utils/tests/ssl_test.rs b/libs/utils/tests/ssl_test.rs
index 248400c2c1..fae707f049 100644
--- a/libs/utils/tests/ssl_test.rs
+++ b/libs/utils/tests/ssl_test.rs
@@ -9,7 +9,10 @@ use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt};
 use bytes::{Buf, BufMut, Bytes, BytesMut};
 use once_cell::sync::Lazy;
 
-use utils::postgres_backend::{AuthType, Handler, PostgresBackend};
+use utils::{
+    postgres_backend::{AuthType, Handler, PostgresBackend},
+    postgres_backend_async::QueryError,
+};
 
 fn make_tcp_pair() -> (TcpStream, TcpStream) {
     let listener = TcpListener::bind("127.0.0.1:0").unwrap();
@@ -105,7 +108,7 @@ fn ssl() {
             &mut self,
             _pgb: &mut PostgresBackend,
             query_string: &str,
-        ) -> anyhow::Result<()> {
+        ) -> Result<(), QueryError> {
             self.got_query = query_string == QUERY;
             Ok(())
         }
@@ -152,7 +155,7 @@ fn no_ssl() {
             &mut self,
             _pgb: &mut PostgresBackend,
             _query_string: &str,
-        ) -> anyhow::Result<()> {
+        ) -> Result<(), QueryError> {
             panic!()
         }
     }
@@ -212,7 +215,7 @@ fn server_forces_ssl() {
             &mut self,
             _pgb: &mut PostgresBackend,
             _query_string: &str,
-        ) -> anyhow::Result<()> {
+        ) -> Result<(), QueryError> {
             panic!()
         }
     }
diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index f123168211..4087a8f90c 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -9,7 +9,7 @@
 //  custom protocol.
 //
 
-use anyhow::{bail, ensure, Context, Result};
+use anyhow::Context;
 use bytes::Buf;
 use bytes::Bytes;
 use futures::{Stream, StreamExt};
@@ -19,6 +19,8 @@ use pageserver_api::models::{
     PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetPageResponse,
     PagestreamNblocksRequest, PagestreamNblocksResponse,
 };
+use pq_proto::ConnectionError;
+use pq_proto::FeStartupPacket;
 use pq_proto::{BeMessage, FeMessage, RowDescriptor};
 use std::io;
 use std::net::TcpListener;
@@ -28,6 +30,7 @@ use std::sync::Arc;
 use std::time::Duration;
 use tracing::*;
 use utils::id::ConnectionId;
+use utils::postgres_backend_async::QueryError;
 use utils::{
     auth::{Claims, JwtAuth, Scope},
     id::{TenantId, TimelineId},
@@ -60,8 +63,8 @@ fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream<Item = io::Result<Byt
                 _ = task_mgr::shutdown_watcher() => {
                     // We were requested to shut down.
                     let msg = format!("pageserver is shutting down");
-                    let _ = pgb.write_message(&BeMessage::ErrorResponse(&msg));
-                    Err(anyhow::anyhow!(msg))
+                    let _ = pgb.write_message(&BeMessage::ErrorResponse(&msg, None));
+                    Err(QueryError::Other(anyhow::anyhow!(msg)))
                 }
 
                 msg = pgb.read_message() => { msg }
@@ -74,14 +77,15 @@ fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream<Item = io::Result<Byt
                         FeMessage::CopyDone => { break },
                         FeMessage::Sync => continue,
                         FeMessage::Terminate => {
-                            let msg = format!("client terminated connection with Terminate message during COPY");
-                            pgb.write_message(&BeMessage::ErrorResponse(&msg))?;
+                            let msg = "client terminated connection with Terminate message during COPY";
+                            let query_error_error = QueryError::Disconnected(ConnectionError::Socket(io::Error::new(io::ErrorKind::ConnectionReset, msg)));
+                            pgb.write_message(&BeMessage::ErrorResponse(msg, Some(query_error_error.pg_error_code())))?;
                             Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
                             break;
                         }
                         m => {
-                            let msg = format!("unexpected message {:?}", m);
-                            pgb.write_message(&BeMessage::ErrorResponse(&msg))?;
+                            let msg = format!("unexpected message {m:?}");
+                            pgb.write_message(&BeMessage::ErrorResponse(&msg, None))?;
                             Err(io::Error::new(io::ErrorKind::Other, msg))?;
                             break;
                         }
@@ -91,12 +95,16 @@ fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream<Item = io::Result<Byt
                 }
                 Ok(None) => {
                     let msg = "client closed connection during COPY";
-                    pgb.write_message(&BeMessage::ErrorResponse(msg))?;
+                    let query_error_error = QueryError::Disconnected(ConnectionError::Socket(io::Error::new(io::ErrorKind::ConnectionReset, msg)));
+                    pgb.write_message(&BeMessage::ErrorResponse(msg, Some(query_error_error.pg_error_code())))?;
                     pgb.flush().await?;
                     Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
                 }
-                Err(e) => {
-                    Err(io::Error::new(io::ErrorKind::Other, e))?;
+                Err(QueryError::Disconnected(ConnectionError::Socket(io_error))) => {
+                    Err(io_error)?;
+                }
+                Err(other) => {
+                    Err(io::Error::new(io::ErrorKind::Other, other))?;
                 }
             };
         }
@@ -194,23 +202,19 @@ async fn page_service_conn_main(
             // we've been requested to shut down
             Ok(())
         }
-        Err(err) => {
-            let root_cause_io_err_kind = err
-                .root_cause()
-                .downcast_ref::<io::Error>()
-                .map(|e| e.kind());
-
+        Err(QueryError::Disconnected(ConnectionError::Socket(io_error))) => {
             // `ConnectionReset` error happens when the Postgres client closes the connection.
             // As this disconnection happens quite often and is expected,
             // we decided to downgrade the logging level to `INFO`.
             // See: https://github.com/neondatabase/neon/issues/1683.
-            if root_cause_io_err_kind == Some(io::ErrorKind::ConnectionReset) {
+            if io_error.kind() == io::ErrorKind::ConnectionReset {
                 info!("Postgres client disconnected");
                 Ok(())
             } else {
-                Err(err)
+                Err(io_error).context("Postgres connection error")
             }
         }
+        other => other.context("Postgres query error"),
     }
 }
 
@@ -312,7 +316,7 @@ impl PageServerHandler {
                 Some(FeMessage::CopyData(bytes)) => bytes,
                 Some(FeMessage::Terminate) => break,
                 Some(m) => {
-                    bail!("unexpected message: {m:?} during COPY");
+                    anyhow::bail!("unexpected message: {m:?} during COPY");
                 }
                 None => break, // client disconnected
             };
@@ -369,7 +373,7 @@ impl PageServerHandler {
         base_lsn: Lsn,
         _end_lsn: Lsn,
         pg_version: u32,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), QueryError> {
         task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
         // Create empty timeline
         info!("creating new timeline");
@@ -423,11 +427,16 @@ impl PageServerHandler {
         timeline_id: TimelineId,
         start_lsn: Lsn,
         end_lsn: Lsn,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), QueryError> {
         task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
 
         let timeline = get_active_timeline_with_timeout(tenant_id, timeline_id).await?;
-        ensure!(timeline.get_last_record_lsn() == start_lsn);
+        let last_record_lsn = timeline.get_last_record_lsn();
+        if last_record_lsn != start_lsn {
+            return Err(QueryError::Other(
+                anyhow::anyhow!("Cannot import WAL from Lsn {start_lsn} because timeline does not start from the same lsn: {last_record_lsn}"))
+            );
+        }
 
         // TODO leave clean state on error. For now you can use detach to clean
         // up broken state from a failed import.
@@ -451,7 +460,11 @@ impl PageServerHandler {
         }
 
         // TODO Does it make sense to overshoot?
-        ensure!(timeline.get_last_record_lsn() >= end_lsn);
+        let last_record_lsn = timeline.get_last_record_lsn();
+        if last_record_lsn < end_lsn {
+            // Report the LSN actually reached, not the value read before the import started.
+            return Err(QueryError::Other(anyhow::anyhow!("WAL import stopped at Lsn {last_record_lsn}, before the requested end Lsn {end_lsn}")));
+        }
 
         // Flush data to disk, then upload to s3. No need for a forced checkpoint.
         // We only want to persist the data, and it doesn't matter if it's in the
@@ -480,7 +493,7 @@ impl PageServerHandler {
         mut lsn: Lsn,
         latest: bool,
         latest_gc_cutoff_lsn: &RcuReadGuard<Lsn>,
-    ) -> Result<Lsn> {
+    ) -> anyhow::Result<Lsn> {
         if latest {
             // Latest page version was requested. If LSN is given, it is a hint
             // to the page server that there have been no modifications to the
@@ -511,11 +524,11 @@ impl PageServerHandler {
             }
         } else {
             if lsn == Lsn(0) {
-                bail!("invalid LSN(0) in request");
+                anyhow::bail!("invalid LSN(0) in request");
             }
             timeline.wait_lsn(lsn).await?;
         }
-        ensure!(
+        anyhow::ensure!(
             lsn >= **latest_gc_cutoff_lsn,
             "tried to request a page version that was garbage collected. requested at {} gc cutoff {}",
             lsn, **latest_gc_cutoff_lsn
@@ -528,7 +541,7 @@ impl PageServerHandler {
         &self,
         timeline: &Timeline,
         req: &PagestreamExistsRequest,
-    ) -> Result<PagestreamBeMessage> {
+    ) -> anyhow::Result<PagestreamBeMessage> {
         let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
         let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)
             .await?;
@@ -548,7 +561,7 @@ impl PageServerHandler {
         &self,
         timeline: &Timeline,
         req: &PagestreamNblocksRequest,
-    ) -> Result<PagestreamBeMessage> {
+    ) -> anyhow::Result<PagestreamBeMessage> {
         let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
         let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)
             .await?;
@@ -568,7 +581,7 @@ impl PageServerHandler {
         &self,
         timeline: &Timeline,
         req: &PagestreamDbSizeRequest,
-    ) -> Result<PagestreamBeMessage> {
+    ) -> anyhow::Result<PagestreamBeMessage> {
         let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
         let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)
             .await?;
@@ -589,7 +602,7 @@ impl PageServerHandler {
         &self,
         timeline: &Timeline,
         req: &PagestreamGetPageRequest,
-    ) -> Result<PagestreamBeMessage> {
+    ) -> anyhow::Result<PagestreamBeMessage> {
         let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
         let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)
             .await?;
@@ -656,7 +669,7 @@ impl PageServerHandler {
 
     // when accessing management api supply None as an argument
     // when using to authorize tenant pass corresponding tenant id
-    fn check_permission(&self, tenant_id: Option<TenantId>) -> Result<()> {
+    fn check_permission(&self, tenant_id: Option<TenantId>) -> anyhow::Result<()> {
         if self.auth.is_none() {
             // auth is set to Trust, nothing to check so just return ok
             return Ok(());
@@ -678,20 +691,19 @@ impl postgres_backend_async::Handler for PageServerHandler {
         &mut self,
         _pgb: &mut PostgresBackend,
         jwt_response: &[u8],
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), QueryError> {
         // this unwrap is never triggered, because check_auth_jwt only called when auth_type is NeonJWT
         // which requires auth to be present
         let data = self
             .auth
             .as_ref()
             .unwrap()
-            .decode(str::from_utf8(jwt_response)?)?;
+            .decode(str::from_utf8(jwt_response).context("jwt response is not UTF-8")?)?;
 
-        if matches!(data.claims.scope, Scope::Tenant) {
-            ensure!(
-                data.claims.tenant_id.is_some(),
+        if matches!(data.claims.scope, Scope::Tenant) && data.claims.tenant_id.is_none() {
+            return Err(QueryError::Other(anyhow::anyhow!(
                 "jwt token scope is Tenant, but tenant id is missing"
-            )
+            )));
         }
 
         info!(
@@ -703,22 +715,33 @@ impl postgres_backend_async::Handler for PageServerHandler {
         Ok(())
     }
 
+    fn startup(
+        &mut self,
+        _pgb: &mut PostgresBackend,
+        _sm: &FeStartupPacket,
+    ) -> Result<(), QueryError> {
+        Ok(())
+    }
+
     async fn process_query(
         &mut self,
         pgb: &mut PostgresBackend,
         query_string: &str,
-    ) -> anyhow::Result<()> {
-        debug!("process query {:?}", query_string);
+    ) -> Result<(), QueryError> {
+        debug!("process query {query_string:?}");
 
         if query_string.starts_with("pagestream ") {
             let (_, params_raw) = query_string.split_at("pagestream ".len());
             let params = params_raw.split(' ').collect::<Vec<_>>();
-            ensure!(
-                params.len() == 2,
-                "invalid param number for pagestream command"
-            );
-            let tenant_id = TenantId::from_str(params[0])?;
-            let timeline_id = TimelineId::from_str(params[1])?;
+            if params.len() != 2 {
+                return Err(QueryError::Other(anyhow::anyhow!(
+                    "invalid param number for pagestream command"
+                )));
+            }
+            let tenant_id = TenantId::from_str(params[0])
+                .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
+            let timeline_id = TimelineId::from_str(params[1])
+                .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
 
             self.check_permission(Some(tenant_id))?;
 
@@ -728,18 +751,24 @@ impl postgres_backend_async::Handler for PageServerHandler {
             let (_, params_raw) = query_string.split_at("basebackup ".len());
             let params = params_raw.split_whitespace().collect::<Vec<_>>();
 
-            ensure!(
-                params.len() >= 2,
-                "invalid param number for basebackup command"
-            );
+            if params.len() < 2 {
+                return Err(QueryError::Other(anyhow::anyhow!(
+                    "invalid param number for basebackup command"
+                )));
+            }
 
-            let tenant_id = TenantId::from_str(params[0])?;
-            let timeline_id = TimelineId::from_str(params[1])?;
+            let tenant_id = TenantId::from_str(params[0])
+                .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
+            let timeline_id = TimelineId::from_str(params[1])
+                .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
 
             self.check_permission(Some(tenant_id))?;
 
             let lsn = if params.len() == 3 {
-                Some(Lsn::from_str(params[2])?)
+                Some(
+                    Lsn::from_str(params[2])
+                        .with_context(|| format!("Failed to parse Lsn from {}", params[2]))?,
+                )
             } else {
                 None
             };
@@ -754,13 +783,16 @@ impl postgres_backend_async::Handler for PageServerHandler {
             let (_, params_raw) = query_string.split_at("get_last_record_rlsn ".len());
             let params = params_raw.split_whitespace().collect::<Vec<_>>();
 
-            ensure!(
-                params.len() == 2,
-                "invalid param number for get_last_record_rlsn command"
-            );
+            if params.len() != 2 {
+                return Err(QueryError::Other(anyhow::anyhow!(
+                    "invalid param number for get_last_record_rlsn command"
+                )));
+            }
 
-            let tenant_id = TenantId::from_str(params[0])?;
-            let timeline_id = TimelineId::from_str(params[1])?;
+            let tenant_id = TenantId::from_str(params[0])
+                .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
+            let timeline_id = TimelineId::from_str(params[1])
+                .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
 
             self.check_permission(Some(tenant_id))?;
             let timeline = get_active_timeline_with_timeout(tenant_id, timeline_id).await?;
@@ -782,22 +814,31 @@ impl postgres_backend_async::Handler for PageServerHandler {
             let (_, params_raw) = query_string.split_at("fullbackup ".len());
             let params = params_raw.split_whitespace().collect::<Vec<_>>();
 
-            ensure!(
-                params.len() >= 2,
-                "invalid param number for fullbackup command"
-            );
+            if params.len() < 2 {
+                return Err(QueryError::Other(anyhow::anyhow!(
+                    "invalid param number for fullbackup command"
+                )));
+            }
 
-            let tenant_id = TenantId::from_str(params[0])?;
-            let timeline_id = TimelineId::from_str(params[1])?;
+            let tenant_id = TenantId::from_str(params[0])
+                .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
+            let timeline_id = TimelineId::from_str(params[1])
+                .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
 
             // The caller is responsible for providing correct lsn and prev_lsn.
             let lsn = if params.len() > 2 {
-                Some(Lsn::from_str(params[2])?)
+                Some(
+                    Lsn::from_str(params[2])
+                        .with_context(|| format!("Failed to parse Lsn from {}", params[2]))?,
+                )
             } else {
                 None
             };
             let prev_lsn = if params.len() > 3 {
-                Some(Lsn::from_str(params[3])?)
+                Some(
+                    Lsn::from_str(params[3])
+                        .with_context(|| format!("Failed to parse Lsn from {}", params[3]))?,
+                )
             } else {
                 None
             };
@@ -822,12 +863,21 @@ impl postgres_backend_async::Handler for PageServerHandler {
             //     -c "import basebackup $TENANT $TIMELINE $START_LSN $END_LSN $PG_VERSION"
             let (_, params_raw) = query_string.split_at("import basebackup ".len());
             let params = params_raw.split_whitespace().collect::<Vec<_>>();
-            ensure!(params.len() == 5);
-            let tenant_id = TenantId::from_str(params[0])?;
-            let timeline_id = TimelineId::from_str(params[1])?;
-            let base_lsn = Lsn::from_str(params[2])?;
-            let end_lsn = Lsn::from_str(params[3])?;
-            let pg_version = u32::from_str(params[4])?;
+            if params.len() != 5 {
+                return Err(QueryError::Other(anyhow::anyhow!(
+                    "invalid param number for import basebackup command"
+                )));
+            }
+            let tenant_id = TenantId::from_str(params[0])
+                .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
+            let timeline_id = TimelineId::from_str(params[1])
+                .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
+            let base_lsn = Lsn::from_str(params[2])
+                .with_context(|| format!("Failed to parse Lsn from {}", params[2]))?;
+            let end_lsn = Lsn::from_str(params[3])
+                .with_context(|| format!("Failed to parse Lsn from {}", params[3]))?;
+            let pg_version = u32::from_str(params[4])
+                .with_context(|| format!("Failed to parse pg_version from {}", params[4]))?;
 
             self.check_permission(Some(tenant_id))?;
 
@@ -845,7 +895,10 @@ impl postgres_backend_async::Handler for PageServerHandler {
                 Ok(()) => pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?,
                 Err(e) => {
                     error!("error importing base backup between {base_lsn} and {end_lsn}: {e:?}");
-                    pgb.write_message(&BeMessage::ErrorResponse(&e.to_string()))?
+                    pgb.write_message(&BeMessage::ErrorResponse(
+                        &e.to_string(),
+                        Some(e.pg_error_code()),
+                    ))?
                 }
             };
         } else if query_string.starts_with("import wal ") {
@@ -855,11 +908,19 @@ impl postgres_backend_async::Handler for PageServerHandler {
             // caller should poll the http api to check when that is done.
             let (_, params_raw) = query_string.split_at("import wal ".len());
             let params = params_raw.split_whitespace().collect::<Vec<_>>();
-            ensure!(params.len() == 4);
-            let tenant_id = TenantId::from_str(params[0])?;
-            let timeline_id = TimelineId::from_str(params[1])?;
-            let start_lsn = Lsn::from_str(params[2])?;
-            let end_lsn = Lsn::from_str(params[3])?;
+            if params.len() != 4 {
+                return Err(QueryError::Other(anyhow::anyhow!(
+                    "invalid param number for import wal command"
+                )));
+            }
+            let tenant_id = TenantId::from_str(params[0])
+                .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
+            let timeline_id = TimelineId::from_str(params[1])
+                .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
+            let start_lsn = Lsn::from_str(params[2])
+                .with_context(|| format!("Failed to parse Lsn from {}", params[2]))?;
+            let end_lsn = Lsn::from_str(params[3])
+                .with_context(|| format!("Failed to parse Lsn from {}", params[3]))?;
 
             self.check_permission(Some(tenant_id))?;
 
@@ -870,7 +931,10 @@ impl postgres_backend_async::Handler for PageServerHandler {
                 Ok(()) => pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?,
                 Err(e) => {
                     error!("error importing WAL between {start_lsn} and {end_lsn}: {e:?}");
-                    pgb.write_message(&BeMessage::ErrorResponse(&e.to_string()))?
+                    pgb.write_message(&BeMessage::ErrorResponse(
+                        &e.to_string(),
+                        Some(e.pg_error_code()),
+                    ))?
                 }
             };
         } else if query_string.to_ascii_lowercase().starts_with("set ") {
@@ -881,8 +945,13 @@ impl postgres_backend_async::Handler for PageServerHandler {
             // show <tenant_id>
             let (_, params_raw) = query_string.split_at("show ".len());
             let params = params_raw.split(' ').collect::<Vec<_>>();
-            ensure!(params.len() == 1, "invalid param number for config command");
-            let tenant_id = TenantId::from_str(params[0])?;
+            if params.len() != 1 {
+                return Err(QueryError::Other(anyhow::anyhow!(
+                    "invalid param number for config command"
+                )));
+            }
+            let tenant_id = TenantId::from_str(params[0])
+                .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
 
             self.check_permission(Some(tenant_id))?;
 
@@ -923,7 +992,9 @@ impl postgres_backend_async::Handler for PageServerHandler {
             ]))?
             .write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
         } else {
-            bail!("unknown command");
+            return Err(QueryError::Other(anyhow::anyhow!(
+                "unknown command {query_string}"
+            )));
         }
 
         Ok(())
@@ -935,7 +1006,7 @@ impl postgres_backend_async::Handler for PageServerHandler {
 /// If the tenant is Loading, waits for it to become Active, for up to 30 s. That
 /// ensures that queries don't fail immediately after pageserver startup, because
 /// all tenants are still loading.
-async fn get_active_tenant_with_timeout(tenant_id: TenantId) -> Result<Arc<Tenant>> {
+async fn get_active_tenant_with_timeout(tenant_id: TenantId) -> anyhow::Result<Arc<Tenant>> {
     let tenant = mgr::get_tenant(tenant_id, false).await?;
     match tokio::time::timeout(Duration::from_secs(30), tenant.wait_to_become_active()).await {
         Ok(wait_result) => wait_result
@@ -949,7 +1020,7 @@ async fn get_active_tenant_with_timeout(tenant_id: TenantId) -> Result<Arc<Tenan
 async fn get_active_timeline_with_timeout(
     tenant_id: TenantId,
     timeline_id: TimelineId,
-) -> Result<Arc<Timeline>> {
+) -> anyhow::Result<Arc<Timeline>> {
     get_active_tenant_with_timeout(tenant_id)
         .await
         .and_then(|tenant| tenant.get_timeline(timeline_id, true))
diff --git a/pageserver/src/walreceiver/walreceiver_connection.rs b/pageserver/src/walreceiver/walreceiver_connection.rs
index 06aa132365..aca5e8e019 100644
--- a/pageserver/src/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/walreceiver/walreceiver_connection.rs
@@ -1,6 +1,7 @@
 //! Actual Postgres connection handler to stream WAL to the server.
 
 use std::{
+    error::Error,
     str::FromStr,
     sync::Arc,
     time::{Duration, SystemTime},
@@ -11,7 +12,7 @@ use bytes::BytesMut;
 use chrono::{NaiveDateTime, Utc};
 use fail::fail_point;
 use futures::StreamExt;
-use postgres::{SimpleQueryMessage, SimpleQueryRow};
+use postgres::{error::SqlState, SimpleQueryMessage, SimpleQueryRow};
 use postgres_ffi::v14::xlog_utils::normalize_lsn;
 use postgres_ffi::WAL_SEGMENT_SIZE;
 use postgres_protocol::message::backend::ReplicationMessage;
@@ -32,7 +33,7 @@ use crate::{
 use postgres_connection::PgConnectionConfig;
 use postgres_ffi::waldecoder::WalStreamDecoder;
 use pq_proto::ReplicationFeedback;
-use utils::lsn::Lsn;
+use utils::{lsn::Lsn, postgres_backend_async::is_expected_io_error};
 
 /// Status of the connection.
 #[derive(Debug, Clone, Copy)]
@@ -68,10 +69,17 @@ pub async fn handle_walreceiver_connection(
         let mut config = wal_source_connconf.to_tokio_postgres_config();
         config.application_name("pageserver");
         config.replication_mode(tokio_postgres::config::ReplicationMode::Physical);
-        time::timeout(connect_timeout, config.connect(postgres::NoTls))
-            .await
-            .context("Timed out while waiting for walreceiver connection to open")?
-            .context("Failed to open walreceiver connection")?
+        match time::timeout(connect_timeout, config.connect(postgres::NoTls)).await {
+            Ok(Ok(client_and_conn)) => client_and_conn,
+            Ok(Err(conn_err)) => {
+                let expected_error = ignore_expected_errors(conn_err)?;
+                info!("DB connection stream finished: {expected_error}");
+                return Ok(());
+            }
+            Err(elapsed) => anyhow::bail!(
+                "Timed out after {connect_timeout:?} while waiting for walreceiver connection to open: {elapsed}"
+            ),
+        }
     };
 
     info!("connected!");
@@ -103,10 +111,8 @@ pub async fn handle_walreceiver_connection(
                 connection_result = connection => match connection_result{
                     Ok(()) => info!("Walreceiver db connection closed"),
                     Err(connection_error) => {
-                        if connection_error.is_closed() {
-                            info!("Connection closed regularly: {connection_error}")
-                        } else {
-                            warn!("Connection aborted: {connection_error}")
+                        if let Err(e) = ignore_expected_errors(connection_error) {
+                            warn!("Connection aborted: {e:#}")
                         }
                     }
                 },
@@ -187,14 +193,9 @@ pub async fn handle_walreceiver_connection(
         let replication_message = match replication_message {
             Ok(message) => message,
             Err(replication_error) => {
-                if replication_error.is_closed() {
-                    info!("Replication stream got closed");
-                    return Ok(());
-                } else {
-                    return Err(
-                        anyhow::Error::new(replication_error).context("replication stream error")
-                    );
-                }
+                let expected_error = ignore_expected_errors(replication_error)?;
+                info!("Replication stream finished: {expected_error}");
+                return Ok(());
             }
         };
 
@@ -400,3 +401,32 @@ async fn identify_system(client: &mut Client) -> anyhow::Result<IdentifySystem>
         Err(IdentifyError.into())
     }
 }
+
+/// Decides whether a walreceiver connection error is an expected one.
+///
+/// Connectivity problems are not reported to the connection manager as real errors because
+/// 1. they happen often enough to make the server logs hard to read and
+/// 2. the connection manager can retry with another safekeeper.
+///
+/// `Ok(pg_error)` means the error is an expected one: the caller should log it at info level
+/// and report to the connection manager that this connection is done; it handles reconnects.
+///
+/// `Err(..)` can be bubbled up with `?`; the connection manager logs it at ERROR level.
+fn ignore_expected_errors(pg_error: postgres::Error) -> anyhow::Result<postgres::Error> {
+    // A `std::io::Error` in the source of the postgres error that `is_expected_io_error` accepts.
+    let expected_io_error = pg_error
+        .source()
+        .and_then(|source| source.downcast_ref::<std::io::Error>())
+        .map_or(false, is_expected_io_error);
+    // A CONNECTION_FAILURE database error whose message mentions "end streaming".
+    let end_of_streaming = pg_error.as_db_error().map_or(false, |db_error| {
+        db_error.code() == &SqlState::CONNECTION_FAILURE
+            && db_error.message().contains("end streaming")
+    });
+
+    if pg_error.is_closed() || expected_io_error || end_of_streaming {
+        Ok(pg_error)
+    } else {
+        Err(pg_error).context("connection error")
+    }
+}
diff --git a/proxy/src/mgmt.rs b/proxy/src/mgmt.rs
index 2e0a502e7f..cf83b48ae0 100644
--- a/proxy/src/mgmt.rs
+++ b/proxy/src/mgmt.rs
@@ -9,7 +9,10 @@ use std::{
     thread,
 };
 use tracing::{error, info, info_span};
-use utils::postgres_backend::{self, AuthType, PostgresBackend};
+use utils::{
+    postgres_backend::{self, AuthType, PostgresBackend},
+    postgres_backend_async::QueryError,
+};
 
 /// Console management API listener thread.
 /// It spawns console response handlers needed for the link auth.
@@ -47,7 +50,7 @@ pub fn thread_main(listener: TcpListener) -> anyhow::Result<()> {
     }
 }
 
-fn handle_connection(socket: TcpStream) -> anyhow::Result<()> {
+fn handle_connection(socket: TcpStream) -> Result<(), QueryError> {
     let pgbackend = PostgresBackend::new(socket, AuthType::Trust, None, true)?;
     pgbackend.run(&mut MgmtHandler)
 }
@@ -58,7 +61,7 @@ pub type ComputeReady = Result<DatabaseInfo, String>;
 // TODO: replace with an http-based protocol.
 struct MgmtHandler;
 impl postgres_backend::Handler for MgmtHandler {
-    fn process_query(&mut self, pgb: &mut PostgresBackend, query: &str) -> anyhow::Result<()> {
+    fn process_query(&mut self, pgb: &mut PostgresBackend, query: &str) -> Result<(), QueryError> {
         try_process_query(pgb, query).map_err(|e| {
             error!("failed to process response: {e:?}");
             e
@@ -66,8 +69,8 @@ impl postgres_backend::Handler for MgmtHandler {
     }
 }
 
-fn try_process_query(pgb: &mut PostgresBackend, query: &str) -> anyhow::Result<()> {
-    let resp: KickSession = serde_json::from_str(query)?;
+fn try_process_query(pgb: &mut PostgresBackend, query: &str) -> Result<(), QueryError> {
+    let resp: KickSession = serde_json::from_str(query).context("Failed to parse query as json")?;
 
     let span = info_span!("event", session_id = resp.session_id);
     let _enter = span.enter();
@@ -81,7 +84,7 @@ fn try_process_query(pgb: &mut PostgresBackend, query: &str) -> anyhow::Result<(
         }
         Err(e) => {
             error!("failed to deliver response to per-client task");
-            pgb.write_message(&BeMessage::ErrorResponse(&e.to_string()))?;
+            pgb.write_message(&BeMessage::ErrorResponse(&e.to_string(), None))?;
         }
     }
 
diff --git a/proxy/src/stream.rs b/proxy/src/stream.rs
index 19e1479068..02a0fabe9a 100644
--- a/proxy/src/stream.rs
+++ b/proxy/src/stream.rs
@@ -2,7 +2,7 @@ use crate::error::UserFacingError;
 use anyhow::bail;
 use bytes::BytesMut;
 use pin_project_lite::pin_project;
-use pq_proto::{BeMessage, FeMessage, FeStartupPacket};
+use pq_proto::{BeMessage, ConnectionError, FeMessage, FeStartupPacket};
 use rustls::ServerConfig;
 use std::pin::Pin;
 use std::sync::Arc;
@@ -47,18 +47,13 @@ fn err_connection() -> io::Error {
     io::Error::new(io::ErrorKind::ConnectionAborted, "connection is lost")
 }
 
-// TODO: change error type of `FeMessage::read_fut`
-fn from_anyhow(e: anyhow::Error) -> io::Error {
-    io::Error::new(io::ErrorKind::Other, e.to_string())
-}
-
 impl<S: AsyncRead + Unpin> PqStream<S> {
     /// Receive [`FeStartupPacket`], which is a first packet sent by a client.
     pub async fn read_startup_packet(&mut self) -> io::Result<FeStartupPacket> {
         // TODO: `FeStartupPacket::read_fut` should return `FeStartupPacket`
         let msg = FeStartupPacket::read_fut(&mut self.stream)
             .await
-            .map_err(from_anyhow)?
+            .map_err(ConnectionError::into_io_error)?
             .ok_or_else(err_connection)?;
 
         match msg {
@@ -80,7 +75,7 @@ impl<S: AsyncRead + Unpin> PqStream<S> {
     async fn read_message(&mut self) -> io::Result<FeMessage> {
         FeMessage::read_fut(&mut self.stream)
             .await
-            .map_err(from_anyhow)?
+            .map_err(ConnectionError::into_io_error)?
             .ok_or_else(err_connection)
     }
 }
@@ -112,7 +107,8 @@ impl<S: AsyncWrite + Unpin> PqStream<S> {
     /// This method exists due to `&str` not implementing `Into<anyhow::Error>`.
     pub async fn throw_error_str<T>(&mut self, error: &'static str) -> anyhow::Result<T> {
         tracing::info!("forwarding error to user: {error}");
-        self.write_message(&BeMessage::ErrorResponse(error)).await?;
+        self.write_message(&BeMessage::ErrorResponse(error, None))
+            .await?;
         bail!(error)
     }
 
@@ -124,7 +120,8 @@ impl<S: AsyncWrite + Unpin> PqStream<S> {
     {
         let msg = error.to_string_client();
         tracing::info!("forwarding error to user: {msg}");
-        self.write_message(&BeMessage::ErrorResponse(&msg)).await?;
+        self.write_message(&BeMessage::ErrorResponse(&msg, None))
+            .await?;
         bail!(error)
     }
 }
diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs
index 394a4815bb..b130ea86bd 100644
--- a/safekeeper/src/bin/safekeeper.rs
+++ b/safekeeper/src/bin/safekeeper.rs
@@ -229,11 +229,7 @@ fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
     let conf_cloned = conf.clone();
     let safekeeper_thread = thread::Builder::new()
         .name("safekeeper thread".into())
-        .spawn(|| {
-            if let Err(e) = wal_service::thread_main(conf_cloned, pg_listener) {
-                info!("safekeeper thread terminated: {e}");
-            }
-        })
+        .spawn(|| wal_service::thread_main(conf_cloned, pg_listener))
         .unwrap();
 
     threads.push(safekeeper_thread);
diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs
index c692e9fc12..60df5dd372 100644
--- a/safekeeper/src/handler.rs
+++ b/safekeeper/src/handler.rs
@@ -8,7 +8,7 @@ use crate::receive_wal::ReceiveWalConn;
 use crate::send_wal::ReplicationConn;
 
 use crate::{GlobalTimelines, SafeKeeperConf};
-use anyhow::{bail, ensure, Context, Result};
+use anyhow::Context;
 
 use postgres_ffi::PG_TLI;
 use regex::Regex;
@@ -17,6 +17,7 @@ use pq_proto::{BeMessage, FeStartupPacket, RowDescriptor, INT4_OID, TEXT_OID};
 use std::str;
 use tracing::info;
 use utils::auth::{Claims, Scope};
+use utils::postgres_backend_async::QueryError;
 use utils::{
     id::{TenantId, TenantTimelineId, TimelineId},
     lsn::Lsn,
@@ -42,7 +43,7 @@ enum SafekeeperPostgresCommand {
     JSONCtrl { cmd: AppendLogicalMessage },
 }
 
-fn parse_cmd(cmd: &str) -> Result<SafekeeperPostgresCommand> {
+fn parse_cmd(cmd: &str) -> anyhow::Result<SafekeeperPostgresCommand> {
     if cmd.starts_with("START_WAL_PUSH") {
         Ok(SafekeeperPostgresCommand::StartWalPush)
     } else if cmd.starts_with("START_REPLICATION") {
@@ -62,13 +63,17 @@ fn parse_cmd(cmd: &str) -> Result<SafekeeperPostgresCommand> {
             cmd: serde_json::from_str(cmd)?,
         })
     } else {
-        bail!("unsupported command {}", cmd);
+        anyhow::bail!("unsupported command {cmd}");
     }
 }
 
 impl postgres_backend::Handler for SafekeeperPostgresHandler {
     // tenant_id and timeline_id are passed in connection string params
-    fn startup(&mut self, _pgb: &mut PostgresBackend, sm: &FeStartupPacket) -> Result<()> {
+    fn startup(
+        &mut self,
+        _pgb: &mut PostgresBackend,
+        sm: &FeStartupPacket,
+    ) -> Result<(), QueryError> {
         if let FeStartupPacket::StartupMessage { params, .. } = sm {
             if let Some(options) = params.options_raw() {
                 for opt in options {
@@ -77,10 +82,14 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler {
                     // https://github.com/neondatabase/neon/pull/2433#discussion_r970005064
                     match opt.split_once('=') {
                         Some(("ztenantid", value)) | Some(("tenant_id", value)) => {
-                            self.tenant_id = Some(value.parse()?);
+                            self.tenant_id = Some(value.parse().with_context(|| {
+                                format!("Failed to parse {value} as tenant id")
+                            })?);
                         }
                         Some(("ztimelineid", value)) | Some(("timeline_id", value)) => {
-                            self.timeline_id = Some(value.parse()?);
+                            self.timeline_id = Some(value.parse().with_context(|| {
+                                format!("Failed to parse {value} as timeline id")
+                            })?);
                         }
                         _ => continue,
                     }
@@ -93,7 +102,9 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler {
 
             Ok(())
         } else {
-            bail!("Safekeeper received unexpected initial message: {:?}", sm);
+            Err(QueryError::Other(anyhow::anyhow!(
+                "Safekeeper received unexpected initial message: {sm:?}"
+            )))
         }
     }
 
@@ -101,7 +112,7 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler {
         &mut self,
         _pgb: &mut PostgresBackend,
         jwt_response: &[u8],
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), QueryError> {
         // this unwrap is never triggered, because check_auth_jwt only called when auth_type is NeonJWT
         // which requires auth to be present
         let data = self
@@ -109,13 +120,12 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler {
             .auth
             .as_ref()
             .unwrap()
-            .decode(str::from_utf8(jwt_response)?)?;
+            .decode(str::from_utf8(jwt_response).context("jwt response is not UTF-8")?)?;
 
-        if matches!(data.claims.scope, Scope::Tenant) {
-            ensure!(
-                data.claims.tenant_id.is_some(),
+        if matches!(data.claims.scope, Scope::Tenant) && data.claims.tenant_id.is_none() {
+            return Err(QueryError::Other(anyhow::anyhow!(
                 "jwt token scope is Tenant, but tenant id is missing"
-            )
+            )));
         }
 
         info!(
@@ -127,7 +137,11 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler {
         Ok(())
     }
 
-    fn process_query(&mut self, pgb: &mut PostgresBackend, query_string: &str) -> Result<()> {
+    fn process_query(
+        &mut self,
+        pgb: &mut PostgresBackend,
+        query_string: &str,
+    ) -> Result<(), QueryError> {
         if query_string
             .to_ascii_lowercase()
             .starts_with("set datestyle to ")
@@ -148,19 +162,26 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler {
         self.check_permission(Some(tenant_id))?;
         self.ttid = TenantTimelineId::new(tenant_id, timeline_id);
 
-        match cmd {
+        let res = match cmd {
             SafekeeperPostgresCommand::StartWalPush => ReceiveWalConn::new(pgb).run(self),
             SafekeeperPostgresCommand::StartReplication { start_lsn } => {
                 ReplicationConn::new(pgb).run(self, pgb, start_lsn)
             }
             SafekeeperPostgresCommand::IdentifySystem => self.handle_identify_system(pgb),
             SafekeeperPostgresCommand::JSONCtrl { ref cmd } => handle_json_ctrl(self, pgb, cmd),
-        }
-        .context(format!(
-            "Failed to process query for timeline {timeline_id}"
-        ))?;
+        };
 
-        Ok(())
+        match res {
+            Ok(()) => Ok(()),
+            Err(QueryError::Disconnected(connection_error)) => {
+                info!("Timeline {tenant_id}/{timeline_id} query failed with connection error: {connection_error}");
+                Err(QueryError::Disconnected(connection_error))
+            }
+            Err(QueryError::Other(e)) => Err(QueryError::Other(e.context(format!(
+                "Failed to process query for timeline {}",
+                self.ttid
+            )))),
+        }
     }
 }
 
@@ -178,7 +199,7 @@ impl SafekeeperPostgresHandler {
 
     // when accessing management api supply None as an argument
     // when using to authorize tenant pass corresponding tenant id
-    fn check_permission(&self, tenant_id: Option<TenantId>) -> Result<()> {
+    fn check_permission(&self, tenant_id: Option<TenantId>) -> anyhow::Result<()> {
         if self.conf.auth.is_none() {
             // auth is set to Trust, nothing to check so just return ok
             return Ok(());
@@ -196,7 +217,7 @@ impl SafekeeperPostgresHandler {
     ///
     /// Handle IDENTIFY_SYSTEM replication command
     ///
-    fn handle_identify_system(&mut self, pgb: &mut PostgresBackend) -> Result<()> {
+    fn handle_identify_system(&mut self, pgb: &mut PostgresBackend) -> Result<(), QueryError> {
         let tli = GlobalTimelines::get(self.ttid)?;
 
         let lsn = if self.is_walproposer_recovery() {
diff --git a/safekeeper/src/json_ctrl.rs b/safekeeper/src/json_ctrl.rs
index 746b4461b7..32a24a4978 100644
--- a/safekeeper/src/json_ctrl.rs
+++ b/safekeeper/src/json_ctrl.rs
@@ -8,11 +8,12 @@
 
 use std::sync::Arc;
 
-use anyhow::Result;
+use anyhow::Context;
 use bytes::Bytes;
 use serde::{Deserialize, Serialize};
 use tracing::*;
 use utils::id::TenantTimelineId;
+use utils::postgres_backend_async::QueryError;
 
 use crate::handler::SafekeeperPostgresHandler;
 use crate::safekeeper::{AcceptorProposerMessage, AppendResponse, ServerInfo};
@@ -47,7 +48,7 @@ pub struct AppendLogicalMessage {
     pg_version: u32,
 }
 
-#[derive(Serialize, Deserialize)]
+#[derive(Debug, Serialize, Deserialize)]
 struct AppendResult {
     // safekeeper state after append
     state: SafeKeeperState,
@@ -62,8 +63,8 @@ pub fn handle_json_ctrl(
     spg: &SafekeeperPostgresHandler,
     pgb: &mut PostgresBackend,
     append_request: &AppendLogicalMessage,
-) -> Result<()> {
-    info!("JSON_CTRL request: {:?}", append_request);
+) -> Result<(), QueryError> {
+    info!("JSON_CTRL request: {append_request:?}");
 
     // need to init safekeeper state before AppendRequest
     let tli = prepare_safekeeper(spg.ttid, append_request.pg_version)?;
@@ -78,7 +79,8 @@ pub fn handle_json_ctrl(
         state: tli.get_state().1,
         inserted_wal,
     };
-    let response_data = serde_json::to_vec(&response)?;
+    let response_data = serde_json::to_vec(&response)
+        .with_context(|| format!("Response {response:?} is not a json array"))?;
 
     pgb.write_message_noflush(&BeMessage::RowDescription(&[RowDescriptor {
         name: b"json",
@@ -93,7 +95,7 @@ pub fn handle_json_ctrl(
 
 /// Prepare safekeeper to process append requests without crashes,
 /// by sending ProposerGreeting with default server.wal_seg_size.
-fn prepare_safekeeper(ttid: TenantTimelineId, pg_version: u32) -> Result<Arc<Timeline>> {
+fn prepare_safekeeper(ttid: TenantTimelineId, pg_version: u32) -> anyhow::Result<Arc<Timeline>> {
     GlobalTimelines::create(
         ttid,
         ServerInfo {
@@ -106,7 +108,7 @@ fn prepare_safekeeper(ttid: TenantTimelineId, pg_version: u32) -> Result<Arc<Tim
     )
 }
 
-fn send_proposer_elected(tli: &Arc<Timeline>, term: Term, lsn: Lsn) -> Result<()> {
+fn send_proposer_elected(tli: &Arc<Timeline>, term: Term, lsn: Lsn) -> anyhow::Result<()> {
     // add new term to existing history
     let history = tli.get_state().1.acceptor_state.term_history;
     let history = history.up_to(lsn.checked_sub(1u64).unwrap());
@@ -125,7 +127,7 @@ fn send_proposer_elected(tli: &Arc<Timeline>, term: Term, lsn: Lsn) -> Result<()
     Ok(())
 }
 
-#[derive(Serialize, Deserialize)]
+#[derive(Debug, Serialize, Deserialize)]
 struct InsertedWAL {
     begin_lsn: Lsn,
     end_lsn: Lsn,
@@ -134,7 +136,10 @@ struct InsertedWAL {
 
 /// Extend local WAL with new LogicalMessage record. To do that,
 /// create AppendRequest with new WAL and pass it to safekeeper.
-fn append_logical_message(tli: &Arc<Timeline>, msg: &AppendLogicalMessage) -> Result<InsertedWAL> {
+fn append_logical_message(
+    tli: &Arc<Timeline>,
+    msg: &AppendLogicalMessage,
+) -> anyhow::Result<InsertedWAL> {
     let wal_data = encode_logical_message(&msg.lm_prefix, &msg.lm_message);
     let sk_state = tli.get_state().1;
 
diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs
index be7f071abb..671e5470a0 100644
--- a/safekeeper/src/receive_wal.rs
+++ b/safekeeper/src/receive_wal.rs
@@ -2,11 +2,13 @@
 //! Gets messages from the network, passes them down to consensus module and
 //! sends replies back.
 
-use anyhow::{anyhow, bail, Result};
+use anyhow::anyhow;
+use anyhow::Context;
 
 use bytes::BytesMut;
 use tracing::*;
 use utils::lsn::Lsn;
+use utils::postgres_backend_async::QueryError;
 
 use crate::safekeeper::ServerInfo;
 use crate::timeline::Timeline;
@@ -43,7 +45,7 @@ impl<'pg> ReceiveWalConn<'pg> {
     }
 
     // Send message to the postgres
-    fn write_msg(&mut self, msg: &AcceptorProposerMessage) -> Result<()> {
+    fn write_msg(&mut self, msg: &AcceptorProposerMessage) -> anyhow::Result<()> {
         let mut buf = BytesMut::with_capacity(128);
         msg.serialize(&mut buf)?;
         self.pg_backend.write_message(&BeMessage::CopyData(&buf))?;
@@ -51,7 +53,7 @@ impl<'pg> ReceiveWalConn<'pg> {
     }
 
     /// Receive WAL from wal_proposer
-    pub fn run(&mut self, spg: &mut SafekeeperPostgresHandler) -> Result<()> {
+    pub fn run(&mut self, spg: &mut SafekeeperPostgresHandler) -> Result<(), QueryError> {
         let _enter = info_span!("WAL acceptor", ttid = %spg.ttid).entered();
 
         // Notify the libpq client that it's allowed to send `CopyData` messages
@@ -79,7 +81,11 @@ impl<'pg> ReceiveWalConn<'pg> {
                 };
                 GlobalTimelines::create(spg.ttid, server_info, Lsn::INVALID, Lsn::INVALID)?
             }
-            _ => bail!("unexpected message {:?} instead of greeting", next_msg),
+            _ => {
+                return Err(QueryError::Other(anyhow::anyhow!(
+                    "unexpected message {next_msg:?} instead of greeting"
+                )))
+            }
         };
 
         let mut next_msg = Some(next_msg);
@@ -134,25 +140,32 @@ impl<'pg> ReceiveWalConn<'pg> {
 
 struct ProposerPollStream {
     msg_rx: Receiver<ProposerAcceptorMessage>,
-    read_thread: Option<thread::JoinHandle<Result<()>>>,
+    read_thread: Option<thread::JoinHandle<Result<(), QueryError>>>,
 }
 
 impl ProposerPollStream {
-    fn new(mut r: ReadStream) -> Result<Self> {
+    fn new(mut r: ReadStream) -> anyhow::Result<Self> {
         let (msg_tx, msg_rx) = channel();
 
         let read_thread = thread::Builder::new()
             .name("Read WAL thread".into())
-            .spawn(move || -> Result<()> {
+            .spawn(move || -> Result<(), QueryError> {
                 loop {
                     let copy_data = match FeMessage::read(&mut r)? {
-                        Some(FeMessage::CopyData(bytes)) => bytes,
-                        Some(msg) => bail!("expected `CopyData` message, found {:?}", msg),
-                        None => bail!("connection closed unexpectedly"),
-                    };
+                        Some(FeMessage::CopyData(bytes)) => Ok(bytes),
+                        Some(msg) => Err(QueryError::Other(anyhow::anyhow!(
+                            "expected `CopyData` message, found {msg:?}"
+                        ))),
+                        None => Err(QueryError::from(std::io::Error::new(
+                            std::io::ErrorKind::ConnectionAborted,
+                            "walproposer closed the connection",
+                        ))),
+                    }?;
 
                     let msg = ProposerAcceptorMessage::parse(copy_data)?;
-                    msg_tx.send(msg)?;
+                    msg_tx
+                        .send(msg)
+                        .context("Failed to send the proposer message")?;
                 }
                 // msg_tx will be dropped here, this will also close msg_rx
             })?;
@@ -163,17 +176,19 @@ impl ProposerPollStream {
         })
     }
 
-    fn recv_msg(&mut self) -> Result<ProposerAcceptorMessage> {
+    fn recv_msg(&mut self) -> Result<ProposerAcceptorMessage, QueryError> {
         self.msg_rx.recv().map_err(|_| {
             // return error from the read thread
             let res = match self.read_thread.take() {
                 Some(thread) => thread.join(),
-                None => return anyhow!("read thread is gone"),
+                None => return QueryError::Other(anyhow::anyhow!("read thread is gone")),
             };
 
             match res {
-                Ok(Ok(())) => anyhow!("unexpected result from read thread"),
-                Err(err) => anyhow!("read thread panicked: {:?}", err),
+                Ok(Ok(())) => {
+                    QueryError::Other(anyhow::anyhow!("unexpected result from read thread"))
+                }
+                Err(err) => QueryError::Other(anyhow::anyhow!("read thread panicked: {err:?}")),
                 Ok(Err(err)) => err,
             }
         })
diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs
index a054b8fe14..20600ab694 100644
--- a/safekeeper/src/send_wal.rs
+++ b/safekeeper/src/send_wal.rs
@@ -5,7 +5,7 @@ use crate::handler::SafekeeperPostgresHandler;
 use crate::timeline::{ReplicaState, Timeline};
 use crate::wal_storage::WalReader;
 use crate::GlobalTimelines;
-use anyhow::{bail, Context, Result};
+use anyhow::Context;
 
 use bytes::Bytes;
 use postgres_ffi::get_current_timestamp;
@@ -15,7 +15,8 @@ use std::cmp::min;
 use std::net::Shutdown;
 use std::sync::Arc;
 use std::time::Duration;
-use std::{str, thread};
+use std::{io, str, thread};
+use utils::postgres_backend_async::QueryError;
 
 use pq_proto::{BeMessage, FeMessage, ReplicationFeedback, WalSndKeepAlive, XLogDataBody};
 use tokio::sync::watch::Receiver;
@@ -91,7 +92,7 @@ impl ReplicationConn {
     fn background_thread(
         mut stream_in: ReadStream,
         replica_guard: Arc<ReplicationConnGuard>,
-    ) -> Result<()> {
+    ) -> anyhow::Result<()> {
         let replica_id = replica_guard.replica;
         let timeline = &replica_guard.timeline;
 
@@ -140,7 +141,7 @@ impl ReplicationConn {
                     // Shutdown the connection, because rust-postgres client cannot be dropped
                     // when connection is alive.
                     let _ = stream_in.shutdown(Shutdown::Both);
-                    bail!("Copy failed");
+                    anyhow::bail!("Copy failed");
                 }
                 _ => {
                     // We only handle `CopyData`, 'Sync', 'CopyFail' messages. Anything else is ignored.
@@ -160,7 +161,7 @@ impl ReplicationConn {
         spg: &mut SafekeeperPostgresHandler,
         pgb: &mut PostgresBackend,
         mut start_pos: Lsn,
-    ) -> Result<()> {
+    ) -> Result<(), QueryError> {
         let _enter = info_span!("WAL sender", ttid = %spg.ttid).entered();
 
         let tli = GlobalTimelines::get(spg.ttid)?;
@@ -256,8 +257,10 @@ impl ReplicationConn {
                         // to right pageserver.
                         if tli.should_walsender_stop(replica_id) {
                             // Shut down, timeline is suspended.
-                            // TODO create proper error type for this
-                            bail!("end streaming to {:?}", spg.appname);
+                            return Err(QueryError::from(io::Error::new(
+                                io::ErrorKind::ConnectionAborted,
+                                format!("end streaming to {:?}", spg.appname),
+                            )));
                         }
 
                         // timeout expired: request pageserver status
@@ -265,8 +268,7 @@ impl ReplicationConn {
                             sent_ptr: end_pos.0,
                             timestamp: get_current_timestamp(),
                             request_reply: true,
-                        }))
-                        .context("Failed to send KeepAlive message")?;
+                        }))?;
                         continue;
                     }
                 }
@@ -301,7 +303,7 @@ impl ReplicationConn {
 const POLL_STATE_TIMEOUT: Duration = Duration::from_secs(1);
 
 // Wait until we have commit_lsn > lsn or timeout expires. Returns latest commit_lsn.
-async fn wait_for_lsn(rx: &mut Receiver<Lsn>, lsn: Lsn) -> Result<Option<Lsn>> {
+async fn wait_for_lsn(rx: &mut Receiver<Lsn>, lsn: Lsn) -> anyhow::Result<Option<Lsn>> {
     let commit_lsn: Lsn = *rx.borrow();
     if commit_lsn > lsn {
         return Ok(Some(commit_lsn));
diff --git a/safekeeper/src/wal_service.rs b/safekeeper/src/wal_service.rs
index 0fea00fe1b..3ca651d060 100644
--- a/safekeeper/src/wal_service.rs
+++ b/safekeeper/src/wal_service.rs
@@ -2,18 +2,18 @@
 //!   WAL service listens for client connections and
 //!   receive WAL from wal_proposer and send it to WAL receivers
 //!
-use anyhow::Result;
 use regex::Regex;
 use std::net::{TcpListener, TcpStream};
 use std::thread;
 use tracing::*;
+use utils::postgres_backend_async::QueryError;
 
 use crate::handler::SafekeeperPostgresHandler;
 use crate::SafeKeeperConf;
 use utils::postgres_backend::{AuthType, PostgresBackend};
 
 /// Accept incoming TCP connections and spawn them into a background thread.
-pub fn thread_main(conf: SafeKeeperConf, listener: TcpListener) -> Result<()> {
+pub fn thread_main(conf: SafeKeeperConf, listener: TcpListener) -> ! {
     loop {
         match listener.accept() {
             Ok((socket, peer_addr)) => {
@@ -44,7 +44,7 @@ fn get_tid() -> u64 {
 
 /// This is run by `thread_main` above, inside a background thread.
 ///
-fn handle_socket(socket: TcpStream, conf: SafeKeeperConf) -> Result<()> {
+fn handle_socket(socket: TcpStream, conf: SafeKeeperConf) -> Result<(), QueryError> {
     let _enter = info_span!("", tid = ?get_tid()).entered();
 
     socket.set_nodelay(true)?;
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 705ab70ab4..eb15278ba7 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1903,13 +1903,15 @@ class NeonPageserver(PgProtocol):
             ".*wal receiver task finished with an error: walreceiver connection handling failure.*",
             ".*Shutdown task error: walreceiver connection handling failure.*",
             ".*wal_connection_manager.*tcp connect error: Connection refused.*",
-            ".*query handler for .* failed: Connection reset by peer.*",
-            ".*serving compute connection task.*exited with error: Broken pipe.*",
-            ".*Connection aborted: error communicating with the server: Broken pipe.*",
-            ".*Connection aborted: error communicating with the server: Transport endpoint is not connected.*",
-            ".*Connection aborted: error communicating with the server: Connection reset by peer.*",
+            ".*query handler for .* failed: Socket IO error: Connection reset by peer.*",
+            ".*serving compute connection task.*exited with error: Postgres connection error.*",
+            ".*serving compute connection task.*exited with error: Connection reset by peer.*",
+            ".*serving compute connection task.*exited with error: Postgres query error.*",
+            ".*Connection aborted: connection error: error communicating with the server: Broken pipe.*",
+            ".*Connection aborted: connection error: error communicating with the server: Transport endpoint is not connected.*",
+            ".*Connection aborted: connection error: error communicating with the server: Connection reset by peer.*",
             ".*kill_and_wait_impl.*: wait successful.*",
-            ".*end streaming to Some.*",
+            ".*Replication stream finished: db error: ERROR: Socket IO error: end streaming to Some.*",
             ".*query handler for 'pagestream.*failed: Broken pipe.*",  # pageserver notices compute shut down
             # safekeeper connection can fail with this, in the window between timeline creation
             # and streaming start
diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py
index 77ec33f8b0..72d27c3aba 100644
--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -1105,7 +1105,6 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
     env.pageserver.allowed_errors.extend(
         [
             ".*Failed to process query for timeline .*: Timeline .* was not found in global map.*",
-            ".*end streaming to Some.*",
         ]
     )
 

From efad64bc7feec51c23ec6bf3a6ea19797ebdb6a0 Mon Sep 17 00:00:00 2001
From: Kirill Bulatov <kirill@neon.tech>
Date: Wed, 4 Jan 2023 12:45:11 +0200
Subject: [PATCH 107/132] Expect compute shutdown test log error (#3262)

https://neon-github-public-dev.s3.amazonaws.com/reports/pr-3261/debug/3833043374/index.html#suites/ffbb7f9930a77115316b58ff32b7c719/1f6ebaedc0a113a1/

Spotted a flaky test that appeared after
https://github.com/neondatabase/neon/pull/3227 changes
---
 test_runner/fixtures/neon_fixtures.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index eb15278ba7..ba2cce3022 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1913,6 +1913,7 @@ class NeonPageserver(PgProtocol):
             ".*kill_and_wait_impl.*: wait successful.*",
             ".*Replication stream finished: db error: ERROR: Socket IO error: end streaming to Some.*",
             ".*query handler for 'pagestream.*failed: Broken pipe.*",  # pageserver notices compute shut down
+            ".*query handler for 'pagestream.*failed: Connection reset by peer (os error 104).*",  # pageserver notices compute shut down
             # safekeeper connection can fail with this, in the window between timeline creation
             # and streaming start
             ".*Failed to process query for timeline .*: state uninitialized, no data to read.*",

From 8932d14d505c1ecc04eeec32243397ffd03ffc1c Mon Sep 17 00:00:00 2001
From: Kirill Bulatov <kirill@neon.tech>
Date: Wed, 4 Jan 2023 17:31:51 +0200
Subject: [PATCH 108/132] Revert "Run Python tests in 8 threads (#3206)"
 (#3264)

This reverts commit 56a4466d0a85a9498bfd2a78a4ad3a2facb58167.

Seems that flakiness increased after this commit, while the time
decrease was a couple of seconds.
With every regular Python test spawning 1 etcd, 3 safekeepers, 1
pageserver, a few CLI commands and post-run cleanup hooks, it might be
hard to run many such tests in parallel.

We could return to this later, after we consider alternative test
structure and/or CI runner structure.
---
 .github/actions/run-python-test-set/action.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml
index 95167ecf6c..990c7e25a9 100644
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -123,8 +123,8 @@ runs:
           exit 1
         fi
         if [[ "${{ inputs.run_in_parallel }}" == "true" ]]; then
-          # -n8 uses eight processes to run tests via pytest-xdist
-          EXTRA_PARAMS="-n8 $EXTRA_PARAMS"
+          # -n4 uses four processes to run tests via pytest-xdist
+          EXTRA_PARAMS="-n4 $EXTRA_PARAMS"
 
           # --dist=loadgroup points tests marked with @pytest.mark.xdist_group
           # to the same worker to make @pytest.mark.order work with xdist

From f436fb2dfb91292ae59bae7de3a6d41db100683e Mon Sep 17 00:00:00 2001
From: Vadim Kharitonov <vadim@neon.tech>
Date: Tue, 3 Jan 2023 11:42:06 +0100
Subject: [PATCH 109/132] Fix panics at compute_ctl:monitor

---
 compute_tools/src/monitor.rs | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/compute_tools/src/monitor.rs b/compute_tools/src/monitor.rs
index 1588f5d62e..c871422e78 100644
--- a/compute_tools/src/monitor.rs
+++ b/compute_tools/src/monitor.rs
@@ -52,10 +52,16 @@ fn watch_compute_activity(compute: &ComputeNode) {
                     let mut idle_backs: Vec<DateTime<Utc>> = vec![];
 
                     for b in backs.into_iter() {
-                        let state: String = b.get("state");
-                        let change: String = b.get("state_change");
+                        let state: String = match b.try_get("state") {
+                            Ok(state) => state,
+                            Err(_) => continue,
+                        };
 
                         if state == "idle" {
+                            let change: String = match b.try_get("state_change") {
+                                Ok(state_change) => state_change,
+                                Err(_) => continue,
+                            };
                             let change = DateTime::parse_from_rfc3339(&change);
                             match change {
                                 Ok(t) => idle_backs.push(t.with_timezone(&Utc)),

From 8c6e607327d17b98dd6635d5dc6036d28d1efc04 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Wed, 4 Jan 2023 23:03:16 +0200
Subject: [PATCH 110/132] Refactor send_tarball() (#3259)

The Basebackup struct is really just a convenient place to carry the
various parameters around in send_tarball and its subroutines. Make it
internal to the send_tarball function.
---
 pageserver/src/basebackup.rs   | 179 ++++++++++++++++++---------------
 pageserver/src/page_service.rs |   6 +-
 2 files changed, 99 insertions(+), 86 deletions(-)

diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs
index 4052f13875..1978becf83 100644
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -10,11 +10,10 @@
 //! This module is responsible for creation of such tarball
 //! from data stored in object storage.
 //!
-use anyhow::{anyhow, bail, ensure, Context, Result};
+use anyhow::{anyhow, bail, ensure, Context};
 use bytes::{BufMut, BytesMut};
 use fail::fail_point;
 use std::fmt::Write as FmtWrite;
-use std::sync::Arc;
 use std::time::SystemTime;
 use tokio::io;
 use tokio::io::AsyncWrite;
@@ -39,114 +38,130 @@ use postgres_ffi::PG_TLI;
 use postgres_ffi::{BLCKSZ, RELSEG_SIZE, WAL_SEGMENT_SIZE};
 use utils::lsn::Lsn;
 
+/// Create basebackup with non-rel data in it.
+/// Only include relational data if 'full_backup' is true.
+///
+/// Currently we use empty 'req_lsn' in two cases:
+///  * During the basebackup right after timeline creation
+///  * When working without safekeepers. In this situation it is important to match the lsn
+///    we are taking basebackup on with the lsn that is used in pageserver's walreceiver
+///    to start the replication.
+pub async fn send_basebackup_tarball<'a, W>(
+    write: &'a mut W,
+    timeline: &'a Timeline,
+    req_lsn: Option<Lsn>,
+    prev_lsn: Option<Lsn>,
+    full_backup: bool,
+) -> anyhow::Result<()>
+where
+    W: AsyncWrite + Send + Sync + Unpin,
+{
+    // Compute postgres doesn't have any previous WAL files, but the first
+    // record that it's going to write needs to include the LSN of the
+    // previous record (xl_prev). We include prev_record_lsn in the
+    // "zenith.signal" file, so that postgres can read it during startup.
+    //
+    // We don't keep full history of record boundaries in the page server,
+    // however, only the predecessor of the latest record on each
+    // timeline. So we can only provide prev_record_lsn when you take a
+    // base backup at the end of the timeline, i.e. at last_record_lsn.
+    // Even at the end of the timeline, we sometimes don't have a valid
+    // prev_lsn value; that happens if the timeline was just branched from
+    // an old LSN and it doesn't have any WAL of its own yet. We will set
+    // prev_lsn to Lsn(0) if we cannot provide the correct value.
+    let (backup_prev, backup_lsn) = if let Some(req_lsn) = req_lsn {
+        // Backup was requested at a particular LSN. The caller should've
+        // already checked that it's a valid LSN.
+
+        // If the requested point is the end of the timeline, we can
+        // provide prev_lsn. (get_last_record_rlsn() might return it as
+        // zero, though, if no WAL has been generated on this timeline
+        // yet.)
+        let end_of_timeline = timeline.get_last_record_rlsn();
+        if req_lsn == end_of_timeline.last {
+            (end_of_timeline.prev, req_lsn)
+        } else {
+            (Lsn(0), req_lsn)
+        }
+    } else {
+        // Backup was requested at end of the timeline.
+        let end_of_timeline = timeline.get_last_record_rlsn();
+        (end_of_timeline.prev, end_of_timeline.last)
+    };
+
+    // Consolidate the derived and the provided prev_lsn values
+    let prev_lsn = if let Some(provided_prev_lsn) = prev_lsn {
+        if backup_prev != Lsn(0) {
+            ensure!(backup_prev == provided_prev_lsn);
+        }
+        provided_prev_lsn
+    } else {
+        backup_prev
+    };
+
+    info!(
+        "taking basebackup lsn={}, prev_lsn={} (full_backup={})",
+        backup_lsn, prev_lsn, full_backup
+    );
+
+    let basebackup = Basebackup {
+        ar: Builder::new_non_terminated(write),
+        timeline,
+        lsn: backup_lsn,
+        prev_record_lsn: prev_lsn,
+        full_backup,
+    };
+    basebackup
+        .send_tarball()
+        .instrument(info_span!("send_tarball", backup_lsn=%backup_lsn))
+        .await
+}
+
 /// This is short-living object only for the time of tarball creation,
 /// created mostly to avoid passing a lot of parameters between various functions
 /// used for constructing tarball.
-pub struct Basebackup<'a, W>
+struct Basebackup<'a, W>
 where
     W: AsyncWrite + Send + Sync + Unpin,
 {
     ar: Builder<&'a mut W>,
-    timeline: &'a Arc<Timeline>,
-    pub lsn: Lsn,
+    timeline: &'a Timeline,
+    lsn: Lsn,
     prev_record_lsn: Lsn,
     full_backup: bool,
 }
 
-// Create basebackup with non-rel data in it.
-// Only include relational data if 'full_backup' is true.
-//
-// Currently we use empty lsn in two cases:
-//  * During the basebackup right after timeline creation
-//  * When working without safekeepers. In this situation it is important to match the lsn
-//    we are taking basebackup on with the lsn that is used in pageserver's walreceiver
-//    to start the replication.
 impl<'a, W> Basebackup<'a, W>
 where
     W: AsyncWrite + Send + Sync + Unpin,
 {
-    pub fn new(
-        write: &'a mut W,
-        timeline: &'a Arc<Timeline>,
-        req_lsn: Option<Lsn>,
-        prev_lsn: Option<Lsn>,
-        full_backup: bool,
-    ) -> Result<Basebackup<'a, W>> {
-        // Compute postgres doesn't have any previous WAL files, but the first
-        // record that it's going to write needs to include the LSN of the
-        // previous record (xl_prev). We include prev_record_lsn in the
-        // "zenith.signal" file, so that postgres can read it during startup.
-        //
-        // We don't keep full history of record boundaries in the page server,
-        // however, only the predecessor of the latest record on each
-        // timeline. So we can only provide prev_record_lsn when you take a
-        // base backup at the end of the timeline, i.e. at last_record_lsn.
-        // Even at the end of the timeline, we sometimes don't have a valid
-        // prev_lsn value; that happens if the timeline was just branched from
-        // an old LSN and it doesn't have any WAL of its own yet. We will set
-        // prev_lsn to Lsn(0) if we cannot provide the correct value.
-        let (backup_prev, backup_lsn) = if let Some(req_lsn) = req_lsn {
-            // Backup was requested at a particular LSN. The caller should've
-            // already checked that it's a valid LSN.
-
-            // If the requested point is the end of the timeline, we can
-            // provide prev_lsn. (get_last_record_rlsn() might return it as
-            // zero, though, if no WAL has been generated on this timeline
-            // yet.)
-            let end_of_timeline = timeline.get_last_record_rlsn();
-            if req_lsn == end_of_timeline.last {
-                (end_of_timeline.prev, req_lsn)
-            } else {
-                (Lsn(0), req_lsn)
-            }
-        } else {
-            // Backup was requested at end of the timeline.
-            let end_of_timeline = timeline.get_last_record_rlsn();
-            (end_of_timeline.prev, end_of_timeline.last)
-        };
-
-        // Consolidate the derived and the provided prev_lsn values
-        let prev_lsn = if let Some(provided_prev_lsn) = prev_lsn {
-            if backup_prev != Lsn(0) {
-                ensure!(backup_prev == provided_prev_lsn)
-            }
-            provided_prev_lsn
-        } else {
-            backup_prev
-        };
-
-        info!(
-            "taking basebackup lsn={}, prev_lsn={} (full_backup={})",
-            backup_lsn, prev_lsn, full_backup
-        );
-
-        Ok(Basebackup {
-            ar: Builder::new_non_terminated(write),
-            timeline,
-            lsn: backup_lsn,
-            prev_record_lsn: prev_lsn,
-            full_backup,
-        })
-    }
-
-    pub async fn send_tarball(mut self) -> anyhow::Result<()> {
+    async fn send_tarball(mut self) -> anyhow::Result<()> {
         // TODO include checksum
 
         // Create pgdata subdirs structure
         for dir in PGDATA_SUBDIRS.iter() {
             let header = new_tar_header_dir(dir)?;
-            self.ar.append(&header, &mut io::empty()).await?;
+            self.ar
+                .append(&header, &mut io::empty())
+                .await
+                .context("could not add directory to basebackup tarball")?;
         }
 
-        // Send empty config files.
+        // Send config files.
         for filepath in PGDATA_SPECIAL_FILES.iter() {
             if *filepath == "pg_hba.conf" {
                 let data = PG_HBA.as_bytes();
                 let header = new_tar_header(filepath, data.len() as u64)?;
-                self.ar.append(&header, data).await?;
+                self.ar
+                    .append(&header, data)
+                    .await
+                    .context("could not add config file to basebackup tarball")?;
             } else {
                 let header = new_tar_header(filepath, 0)?;
-                self.ar.append(&header, &mut io::empty()).await?;
+                self.ar
+                    .append(&header, &mut io::empty())
+                    .await
+                    .context("could not add config file to basebackup tarball")?;
             }
         }
 
diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index 4087a8f90c..b266a07337 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -654,10 +654,8 @@ impl PageServerHandler {
         /* Send a tarball of the latest layer on the timeline */
         {
             let mut writer = pgb.copyout_writer();
-            let basebackup =
-                basebackup::Basebackup::new(&mut writer, &timeline, lsn, prev_lsn, full_backup)?;
-            tracing::Span::current().record("lsn", basebackup.lsn.to_string().as_str());
-            basebackup.send_tarball().await?;
+            basebackup::send_basebackup_tarball(&mut writer, &timeline, lsn, prev_lsn, full_backup)
+                .await?;
         }
 
         pgb.write_message(&BeMessage::CopyDone)?;

From 6a9d1030a687d6c4ebd415f702441f09d679fab4 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Tue, 3 Jan 2023 17:43:54 +0100
Subject: [PATCH 111/132] use RemoteTimelineClient for downloading index part
 during tenant_attach

Before this change, we would not .measure_remote_op for index part
downloads.

And more generally, it's good to pass not just uploads but also
downloads through RemoteTimelineClient, e.g., if we ever want to
implement some timeline-scoped policies there.

Found this while working on https://github.com/neondatabase/neon/pull/3250
where I add a metric to measure the degree of concurrent downloads.
Layer download was missing in a test that I added there.
---
 pageserver/src/tenant.rs                      | 91 +++++++++++++------
 .../src/tenant/remote_timeline_client.rs      |  6 +-
 .../tenant/remote_timeline_client/download.rs | 43 +++------
 3 files changed, 78 insertions(+), 62 deletions(-)

diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index dcaa8ea268..72404e98cd 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -13,11 +13,13 @@
 
 use anyhow::{bail, Context};
 use bytes::Bytes;
+use futures::FutureExt;
 use futures::Stream;
 use pageserver_api::models::TimelineState;
 use remote_storage::DownloadError;
 use remote_storage::GenericRemoteStorage;
 use tokio::sync::watch;
+use tokio::task::JoinSet;
 use tracing::*;
 use utils::crashsafe::path_with_suffix_extension;
 
@@ -639,26 +641,62 @@ impl Tenant {
             .as_ref()
             .ok_or_else(|| anyhow::anyhow!("cannot attach without remote storage"))?;
 
-        let remote_timelines = remote_timeline_client::list_remote_timelines(
+        let remote_timeline_ids = remote_timeline_client::list_remote_timelines(
             remote_storage,
             self.conf,
             self.tenant_id,
         )
         .await?;
 
-        info!("found {} timelines", remote_timelines.len());
+        info!("found {} timelines", remote_timeline_ids.len());
 
-        let mut timeline_ancestors: HashMap<TimelineId, TimelineMetadata> = HashMap::new();
-        let mut index_parts: HashMap<TimelineId, IndexPart> = HashMap::new();
-        for (timeline_id, index_part) in remote_timelines {
-            let remote_metadata = index_part.parse_metadata().with_context(|| {
-                format!(
-                    "Failed to parse metadata file from remote storage for tenant {} timeline {}",
-                    self.tenant_id, timeline_id
-                )
-            })?;
+        // Download & parse index parts
+        let mut part_downloads = JoinSet::new();
+        for timeline_id in remote_timeline_ids {
+            let client = RemoteTimelineClient::new(
+                remote_storage.clone(),
+                self.conf,
+                self.tenant_id,
+                timeline_id,
+            );
+            part_downloads.spawn(
+                async move {
+                    debug!("starting index part download");
+
+                    let index_part = client
+                        .download_index_file()
+                        .await
+                        .context("download index file")?;
+
+                    let remote_metadata = index_part.parse_metadata().context("parse metadata")?;
+
+                    debug!("finished index part download");
+
+                    Result::<_, anyhow::Error>::Ok((
+                        timeline_id,
+                        client,
+                        index_part,
+                        remote_metadata,
+                    ))
+                }
+                .map(move |res| {
+                    res.with_context(|| format!("download index part for timeline {timeline_id}"))
+                })
+                .instrument(info_span!("download_index_part", timeline=%timeline_id)),
+            );
+        }
+        // Wait for all the download tasks to complete & collect results.
+        let mut remote_clients = HashMap::new();
+        let mut index_parts = HashMap::new();
+        let mut timeline_ancestors = HashMap::new();
+        while let Some(result) = part_downloads.join_next().await {
+            // NB: we already added timeline_id as context to the error
+            let result: Result<_, anyhow::Error> = result.context("joinset task join")?;
+            let (timeline_id, client, index_part, remote_metadata) = result?;
+            debug!("successfully downloaded index part for timeline {timeline_id}");
             timeline_ancestors.insert(timeline_id, remote_metadata);
             index_parts.insert(timeline_id, index_part);
+            remote_clients.insert(timeline_id, client);
         }
 
         // For every timeline, download the metadata file, scan the local directory,
@@ -671,7 +709,7 @@ impl Tenant {
                 timeline_id,
                 index_parts.remove(&timeline_id).unwrap(),
                 remote_metadata,
-                remote_storage.clone(),
+                remote_clients.remove(&timeline_id).unwrap(),
             )
             .await
             .with_context(|| {
@@ -714,22 +752,19 @@ impl Tenant {
         Ok(size)
     }
 
-    #[instrument(skip(self, index_part, remote_metadata, remote_storage), fields(timeline_id=%timeline_id))]
+    #[instrument(skip_all, fields(timeline_id=%timeline_id))]
     async fn load_remote_timeline(
         &self,
         timeline_id: TimelineId,
         index_part: IndexPart,
         remote_metadata: TimelineMetadata,
-        remote_storage: GenericRemoteStorage,
+        remote_client: RemoteTimelineClient,
     ) -> anyhow::Result<()> {
         info!("downloading index file for timeline {}", timeline_id);
         tokio::fs::create_dir_all(self.conf.timeline_path(&timeline_id, &self.tenant_id))
             .await
             .context("Failed to create new timeline directory")?;
 
-        let remote_client =
-            RemoteTimelineClient::new(remote_storage, self.conf, self.tenant_id, timeline_id)?;
-
         let ancestor = if let Some(ancestor_id) = remote_metadata.ancestor_timeline() {
             let timelines = self.timelines.lock().unwrap();
             Some(Arc::clone(timelines.get(&ancestor_id).ok_or_else(
@@ -986,18 +1021,14 @@ impl Tenant {
             None
         };
 
-        let remote_client = self
-            .remote_storage
-            .as_ref()
-            .map(|remote_storage| {
-                RemoteTimelineClient::new(
-                    remote_storage.clone(),
-                    self.conf,
-                    self.tenant_id,
-                    timeline_id,
-                )
-            })
-            .transpose()?;
+        let remote_client = self.remote_storage.as_ref().map(|remote_storage| {
+            RemoteTimelineClient::new(
+                remote_storage.clone(),
+                self.conf,
+                self.tenant_id,
+                timeline_id,
+            )
+        });
 
         let remote_startup_data = match &remote_client {
             Some(remote_client) => match remote_client.download_index_file().await {
@@ -2191,7 +2222,7 @@ impl Tenant {
                 self.conf,
                 tenant_id,
                 new_timeline_id,
-            )?;
+            );
             remote_client.init_upload_queue_for_empty_remote(&new_metadata)?;
             Some(remote_client)
         } else {
diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index 45988ff47a..a9f19a4e1d 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -298,8 +298,8 @@ impl RemoteTimelineClient {
         conf: &'static PageServerConf,
         tenant_id: TenantId,
         timeline_id: TimelineId,
-    ) -> anyhow::Result<RemoteTimelineClient> {
-        Ok(RemoteTimelineClient {
+    ) -> RemoteTimelineClient {
+        RemoteTimelineClient {
             conf,
             runtime: &BACKGROUND_RUNTIME,
             tenant_id,
@@ -307,7 +307,7 @@ impl RemoteTimelineClient {
             storage_impl: remote_storage,
             upload_queue: Mutex::new(UploadQueue::Uninitialized),
             metrics: Arc::new(RemoteTimelineClientMetrics::new(&tenant_id, &timeline_id)),
-        })
+        }
     }
 
     /// Initialize the upload queue for a remote storage that already received
diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs
index 422728d1f3..2e79698087 100644
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -8,10 +8,9 @@ use std::future::Future;
 use std::path::Path;
 
 use anyhow::{anyhow, Context};
-use futures::stream::{FuturesUnordered, StreamExt};
 use tokio::fs;
 use tokio::io::AsyncWriteExt;
-use tracing::{debug, error, info, info_span, warn, Instrument};
+use tracing::{error, info, warn};
 
 use crate::config::PageServerConf;
 use crate::tenant::storage_layer::LayerFileName;
@@ -175,7 +174,7 @@ pub async fn list_remote_timelines<'a>(
     storage: &'a GenericRemoteStorage,
     conf: &'static PageServerConf,
     tenant_id: TenantId,
-) -> anyhow::Result<Vec<(TimelineId, IndexPart)>> {
+) -> anyhow::Result<HashSet<TimelineId>> {
     let tenant_path = conf.timelines_path(&tenant_id);
     let tenant_storage_path = conf.remote_path(&tenant_path)?;
 
@@ -194,7 +193,6 @@ pub async fn list_remote_timelines<'a>(
     }
 
     let mut timeline_ids = HashSet::new();
-    let mut part_downloads = FuturesUnordered::new();
 
     for timeline_remote_storage_key in timelines {
         let object_name = timeline_remote_storage_key.object_name().ok_or_else(|| {
@@ -205,35 +203,22 @@ pub async fn list_remote_timelines<'a>(
             format!("failed to parse object name into timeline id '{object_name}'")
         })?;
 
-        // list_prefixes returns all files with the prefix. If we haven't seen this timeline ID
-        // yet, launch a download task for it.
-        if !timeline_ids.contains(&timeline_id) {
-            timeline_ids.insert(timeline_id);
-            let storage_clone = storage.clone();
-            part_downloads.push(async move {
-                (
-                    timeline_id,
-                    download_index_part(conf, &storage_clone, tenant_id, timeline_id)
-                        .instrument(info_span!("download_index_part", timeline=%timeline_id))
-                        .await,
-                )
-            });
-        }
+        // list_prefixes is assumed to return unique names. Ensure this here.
+        // NB: it's safer to bail out than warn-log this because the pageserver
+        //     needs to absolutely know about _all_ timelines that exist, so that
+        //     GC knows all the branchpoints. If we skipped over a timeline instead,
+        //     GC could delete a layer that's still needed by that timeline.
+        anyhow::ensure!(
+            !timeline_ids.contains(&timeline_id),
+            "list_prefixes contains duplicate timeline id {timeline_id}"
+        );
+        timeline_ids.insert(timeline_id);
     }
 
-    // Wait for all the download tasks to complete.
-    let mut timeline_parts = Vec::new();
-    while let Some((timeline_id, part_upload_result)) = part_downloads.next().await {
-        let index_part = part_upload_result
-            .with_context(|| format!("Failed to fetch index part for timeline {timeline_id}"))?;
-
-        debug!("Successfully fetched index part for timeline {timeline_id}");
-        timeline_parts.push((timeline_id, index_part));
-    }
-    Ok(timeline_parts)
+    Ok(timeline_ids)
 }
 
-pub async fn download_index_part(
+pub(super) async fn download_index_part(
     conf: &'static PageServerConf,
     storage: &GenericRemoteStorage,
     tenant_id: TenantId,

From d7f1e301122f7c9f611165baa53b553318c52fcc Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Thu, 5 Jan 2023 11:50:17 +0100
Subject: [PATCH 112/132] remote_timeline_client: more metrics &
 metrics-related cleanups

- Clean up redundant metric removal in TimelineMetrics::drop.
RemoteTimelineClientMetrics is responsible for cleaning up
REMOTE_OPERATION_TIME and REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS.

- Rename `pageserver_remote_upload_queue_unfinished_tasks` to
`pageserver_remote_timeline_client_calls_unfinished`. The new name
reflects that the metric is with respect to the entire call to remote
timeline client. This includes wait time in the upload queue and hence
it's a longer span than what `pageserver_remote_OPERATION_seconds`
measures.

- Add the `pageserver_remote_timeline_client_calls_started` histogram.
See the metric description for why we need it.

- Add helper functions `call_begin` etc to `RemoteTimelineClientMetrics`
to centralize the logic for updating the metrics above (they relate to
each other, see comments in code).

- Use these constructs to track ongoing downloads in
`pageserver_remote_timeline_client_calls_unfinished`

refs https://github.com/neondatabase/neon/issues/2029
fixes https://github.com/neondatabase/neon/issues/3249
closes https://github.com/neondatabase/neon/pull/3250
---
 pageserver/src/metrics.rs                     | 147 ++++++++++---
 .../src/tenant/remote_timeline_client.rs      |  83 +++++---
 test_runner/fixtures/metrics.py               |   7 +-
 test_runner/fixtures/neon_fixtures.py         |  28 +++
 test_runner/regress/test_remote_storage.py    | 194 ++++++++++++++++--
 5 files changed, 380 insertions(+), 79 deletions(-)

diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index 205ee0ffad..b61e64048b 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -209,15 +209,34 @@ pub static NUM_ONDISK_LAYERS: Lazy<IntGauge> = Lazy::new(|| {
 
 // remote storage metrics
 
-static REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS: Lazy<IntGaugeVec> = Lazy::new(|| {
+/// NB: increment _after_ recording the current value into [`REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST`].
+static REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE: Lazy<IntGaugeVec> = Lazy::new(|| {
     register_int_gauge_vec!(
-        "pageserver_remote_upload_queue_unfinished_tasks",
-        "Number of tasks in the upload queue that are not finished yet.",
+        "pageserver_remote_timeline_client_calls_unfinished",
+        "Number of ongoing calls to remote timeline client. \
+         Used to populate pageserver_remote_timeline_client_calls_started. \
+         This metric is not useful for sampling from Prometheus, but useful in tests.",
         &["tenant_id", "timeline_id", "file_kind", "op_kind"],
     )
     .expect("failed to define a metric")
 });
 
+static REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST: Lazy<HistogramVec> = Lazy::new(|| {
+    register_histogram_vec!(
+        "pageserver_remote_timeline_client_calls_started",
+        "When calling a remote timeline client method, we record the current value \
+         of the calls_unfinished gauge in this histogram. Plot the histogram \
+         over time in a heatmap to visualize how many operations were ongoing \
+         at a given instant. It gives you a better idea of the queue depth \
+         than plotting the gauge directly, since operations may complete faster \
+         than the sampling interval.",
+        &["tenant_id", "timeline_id", "file_kind", "op_kind"],
+        // The calls_unfinished gauge is an integer gauge, hence we have integer buckets.
+        vec![0.0, 1.0, 2.0, 4.0, 6.0, 8.0, 10.0, 15.0, 20.0, 40.0, 60.0, 80.0, 100.0, 500.0],
+    )
+    .expect("failed to define a metric")
+});
+
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 pub enum RemoteOpKind {
     Upload,
@@ -248,15 +267,12 @@ impl RemoteOpFileKind {
     }
 }
 
-pub static REMOTE_OPERATION_KINDS: &[&str] = &["upload", "download", "delete"];
-pub static REMOTE_OPERATION_FILE_KINDS: &[&str] = &["layer", "index"];
-pub static REMOTE_OPERATION_STATUSES: &[&str] = &["success", "failure"];
-
 pub static REMOTE_OPERATION_TIME: Lazy<HistogramVec> = Lazy::new(|| {
     register_histogram_vec!(
         "pageserver_remote_operation_seconds",
         "Time spent on remote storage operations. \
-        Grouped by tenant, timeline, operation_kind and status",
+        Grouped by tenant, timeline, operation_kind and status. \
+        Does not account for time spent waiting in remote timeline client's queues.",
         &["tenant_id", "timeline_id", "file_kind", "op_kind", "status"]
     )
     .expect("failed to define a metric")
@@ -475,21 +491,6 @@ impl Drop for TimelineMetrics {
         for op in SMGR_QUERY_TIME_OPERATIONS {
             let _ = SMGR_QUERY_TIME.remove_label_values(&[op, tenant_id, timeline_id]);
         }
-
-        let _ = REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS.remove_label_values(&[tenant_id, timeline_id]);
-        for file_kind in REMOTE_OPERATION_FILE_KINDS {
-            for op in REMOTE_OPERATION_KINDS {
-                for status in REMOTE_OPERATION_STATUSES {
-                    let _ = REMOTE_OPERATION_TIME.remove_label_values(&[
-                        tenant_id,
-                        timeline_id,
-                        file_kind,
-                        op,
-                        status,
-                    ]);
-                }
-            }
-        }
     }
 }
 
@@ -510,7 +511,8 @@ pub struct RemoteTimelineClientMetrics {
     timeline_id: String,
     remote_physical_size_gauge: Mutex<Option<UIntGauge>>,
     remote_operation_time: Mutex<HashMap<(&'static str, &'static str, &'static str), Histogram>>,
-    unfinished_tasks: Mutex<HashMap<(&'static str, &'static str), IntGauge>>,
+    calls_unfinished_gauge: Mutex<HashMap<(&'static str, &'static str), IntGauge>>,
+    calls_started_hist: Mutex<HashMap<(&'static str, &'static str), Histogram>>,
 }
 
 impl RemoteTimelineClientMetrics {
@@ -519,7 +521,8 @@ impl RemoteTimelineClientMetrics {
             tenant_id: tenant_id.to_string(),
             timeline_id: timeline_id.to_string(),
             remote_operation_time: Mutex::new(HashMap::default()),
-            unfinished_tasks: Mutex::new(HashMap::default()),
+            calls_unfinished_gauge: Mutex::new(HashMap::default()),
+            calls_started_hist: Mutex::new(HashMap::default()),
             remote_physical_size_gauge: Mutex::new(None),
         }
     }
@@ -558,16 +561,37 @@ impl RemoteTimelineClientMetrics {
         });
         metric.clone()
     }
-    pub fn unfinished_tasks(
+    fn calls_unfinished_gauge(
         &self,
         file_kind: &RemoteOpFileKind,
         op_kind: &RemoteOpKind,
     ) -> IntGauge {
         // XXX would be nice to have an upgradable RwLock
-        let mut guard = self.unfinished_tasks.lock().unwrap();
+        let mut guard = self.calls_unfinished_gauge.lock().unwrap();
         let key = (file_kind.as_str(), op_kind.as_str());
         let metric = guard.entry(key).or_insert_with(move || {
-            REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS
+            REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE
+                .get_metric_with_label_values(&[
+                    &self.tenant_id.to_string(),
+                    &self.timeline_id.to_string(),
+                    key.0,
+                    key.1,
+                ])
+                .unwrap()
+        });
+        metric.clone()
+    }
+
+    fn calls_started_hist(
+        &self,
+        file_kind: &RemoteOpFileKind,
+        op_kind: &RemoteOpKind,
+    ) -> Histogram {
+        // XXX would be nice to have an upgradable RwLock
+        let mut guard = self.calls_started_hist.lock().unwrap();
+        let key = (file_kind.as_str(), op_kind.as_str());
+        let metric = guard.entry(key).or_insert_with(move || {
+            REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST
                 .get_metric_with_label_values(&[
                     &self.tenant_id.to_string(),
                     &self.timeline_id.to_string(),
@@ -580,6 +604,58 @@ impl RemoteTimelineClientMetrics {
     }
 }
 
+/// See [`RemoteTimelineClientMetrics::call_begin`].
+#[must_use]
+pub(crate) struct RemoteTimelineClientCallMetricGuard(Option<IntGauge>);
+
+impl RemoteTimelineClientCallMetricGuard {
+    /// Consume this guard object without decrementing the metric.
+    /// The caller vouches to do this manually, so that the prior increment of the gauge will cancel out.
+    pub fn will_decrement_manually(mut self) {
+        self.0 = None; // prevent drop() from decrementing
+    }
+}
+
+impl Drop for RemoteTimelineClientCallMetricGuard {
+    fn drop(&mut self) {
+        if let RemoteTimelineClientCallMetricGuard(Some(guard)) = self {
+            guard.dec();
+        }
+    }
+}
+
+impl RemoteTimelineClientMetrics {
+    /// Increment the metrics that track ongoing calls to the remote timeline client instance.
+    ///
+    /// Drop the returned guard object once the operation is finished to decrement the values.
+    /// Or, use [`RemoteTimelineClientCallMetricGuard::will_decrement_manually`] and [`Self::call_end`] if that
+    /// is more suitable.
+    /// Never do both.
+    pub(crate) fn call_begin(
+        &self,
+        file_kind: &RemoteOpFileKind,
+        op_kind: &RemoteOpKind,
+    ) -> RemoteTimelineClientCallMetricGuard {
+        let unfinished_metric = self.calls_unfinished_gauge(file_kind, op_kind);
+        self.calls_started_hist(file_kind, op_kind)
+            .observe(unfinished_metric.get() as f64);
+        unfinished_metric.inc();
+        RemoteTimelineClientCallMetricGuard(Some(unfinished_metric))
+    }
+
+    /// Manually decrement the metric instead of using the guard object.
+    /// Using the guard object is generally preferable.
+    /// See [`Self::call_begin`] for more context.
+    pub(crate) fn call_end(&self, file_kind: &RemoteOpFileKind, op_kind: &RemoteOpKind) {
+        let unfinished_metric = self.calls_unfinished_gauge(file_kind, op_kind);
+        debug_assert!(
+            unfinished_metric.get() > 0,
+            "begin and end should cancel out"
+        );
+        unfinished_metric.dec();
+    }
+}
+
 impl Drop for RemoteTimelineClientMetrics {
     fn drop(&mut self) {
         let RemoteTimelineClientMetrics {
@@ -587,13 +663,22 @@ impl Drop for RemoteTimelineClientMetrics {
             timeline_id,
             remote_physical_size_gauge,
             remote_operation_time,
-            unfinished_tasks,
+            calls_unfinished_gauge,
+            calls_started_hist,
         } = self;
         for ((a, b, c), _) in remote_operation_time.get_mut().unwrap().drain() {
             let _ = REMOTE_OPERATION_TIME.remove_label_values(&[tenant_id, timeline_id, a, b, c]);
         }
-        for ((a, b), _) in unfinished_tasks.get_mut().unwrap().drain() {
-            let _ = REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS.remove_label_values(&[
+        for ((a, b), _) in calls_unfinished_gauge.get_mut().unwrap().drain() {
+            let _ = REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE.remove_label_values(&[
+                tenant_id,
+                timeline_id,
+                a,
+                b,
+            ]);
+        }
+        for ((a, b), _) in calls_started_hist.get_mut().unwrap().drain() {
+            let _ = REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST.remove_label_values(&[
                 tenant_id,
                 timeline_id,
                 a,
diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index a9f19a4e1d..1db69d8b73 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -367,6 +367,10 @@ impl RemoteTimelineClient {
 
     /// Download index file
     pub async fn download_index_file(&self) -> Result<IndexPart, DownloadError> {
+        let _unfinished_gauge_guard = self
+            .metrics
+            .call_begin(&RemoteOpFileKind::Index, &RemoteOpKind::Download);
+
         download::download_index_part(
             self.conf,
             &self.storage_impl,
@@ -393,22 +397,27 @@ impl RemoteTimelineClient {
         layer_file_name: &LayerFileName,
         layer_metadata: &LayerFileMetadata,
     ) -> anyhow::Result<u64> {
-        let downloaded_size = download::download_layer_file(
-            self.conf,
-            &self.storage_impl,
-            self.tenant_id,
-            self.timeline_id,
-            layer_file_name,
-            layer_metadata,
-        )
-        .measure_remote_op(
-            self.tenant_id,
-            self.timeline_id,
-            RemoteOpFileKind::Layer,
-            RemoteOpKind::Download,
-            Arc::clone(&self.metrics),
-        )
-        .await?;
+        let downloaded_size = {
+            let _unfinished_gauge_guard = self
+                .metrics
+                .call_begin(&RemoteOpFileKind::Layer, &RemoteOpKind::Download);
+            download::download_layer_file(
+                self.conf,
+                &self.storage_impl,
+                self.tenant_id,
+                self.timeline_id,
+                layer_file_name,
+                layer_metadata,
+            )
+            .measure_remote_op(
+                self.tenant_id,
+                self.timeline_id,
+                RemoteOpFileKind::Layer,
+                RemoteOpKind::Download,
+                Arc::clone(&self.metrics),
+            )
+            .await?
+        };
 
         // Update the metadata for given layer file. The remote index file
         // might be missing some information for the file; this allows us
@@ -517,7 +526,7 @@ impl RemoteTimelineClient {
             metadata_bytes,
         );
         let op = UploadOp::UploadMetadata(index_part, disk_consistent_lsn);
-        self.update_upload_queue_unfinished_metric(1, &op);
+        self.calls_unfinished_metric_begin(&op);
         upload_queue.queued_operations.push_back(op);
         upload_queue.latest_files_changes_since_metadata_upload_scheduled = 0;
 
@@ -549,7 +558,7 @@ impl RemoteTimelineClient {
         upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
 
         let op = UploadOp::UploadLayer(layer_file_name.clone(), layer_metadata.clone());
-        self.update_upload_queue_unfinished_metric(1, &op);
+        self.calls_unfinished_metric_begin(&op);
         upload_queue.queued_operations.push_back(op);
 
         info!(
@@ -601,7 +610,7 @@ impl RemoteTimelineClient {
             // schedule the actual deletions
             for name in names {
                 let op = UploadOp::Delete(RemoteOpFileKind::Layer, name.clone());
-                self.update_upload_queue_unfinished_metric(1, &op);
+                self.calls_unfinished_metric_begin(&op);
                 upload_queue.queued_operations.push_back(op);
                 info!("scheduled layer file deletion {}", name.file_name());
             }
@@ -753,7 +762,7 @@ impl RemoteTimelineClient {
             // upload finishes or times out soon enough.
             if task_mgr::is_shutdown_requested() {
                 info!("upload task cancelled by shutdown request");
-                self.update_upload_queue_unfinished_metric(-1, &task.op);
+                self.calls_unfinished_metric_end(&task.op);
                 self.stop();
                 return;
             }
@@ -901,22 +910,40 @@ impl RemoteTimelineClient {
             // Launch any queued tasks that were unblocked by this one.
             self.launch_queued_tasks(upload_queue);
         }
-        self.update_upload_queue_unfinished_metric(-1, &task.op);
+        self.calls_unfinished_metric_end(&task.op);
     }
 
-    fn update_upload_queue_unfinished_metric(&self, delta: i64, op: &UploadOp) {
-        let (file_kind, op_kind) = match op {
+    fn calls_unfinished_metric_impl(
+        &self,
+        op: &UploadOp,
+    ) -> Option<(RemoteOpFileKind, RemoteOpKind)> {
+        let res = match op {
             UploadOp::UploadLayer(_, _) => (RemoteOpFileKind::Layer, RemoteOpKind::Upload),
             UploadOp::UploadMetadata(_, _) => (RemoteOpFileKind::Index, RemoteOpKind::Upload),
             UploadOp::Delete(file_kind, _) => (*file_kind, RemoteOpKind::Delete),
             UploadOp::Barrier(_) => {
                 // we do not account these
-                return;
+                return None;
             }
         };
-        self.metrics
-            .unfinished_tasks(&file_kind, &op_kind)
-            .add(delta)
+        Some(res)
+    }
+
+    fn calls_unfinished_metric_begin(&self, op: &UploadOp) {
+        let (file_kind, op_kind) = match self.calls_unfinished_metric_impl(op) {
+            Some(x) => x,
+            None => return,
+        };
+        let guard = self.metrics.call_begin(&file_kind, &op_kind);
+        guard.will_decrement_manually(); // in calls_unfinished_metric_end()
+    }
+
+    fn calls_unfinished_metric_end(&self, op: &UploadOp) {
+        let (file_kind, op_kind) = match self.calls_unfinished_metric_impl(op) {
+            Some(x) => x,
+            None => return,
+        };
+        self.metrics.call_end(&file_kind, &op_kind);
     }
 
     fn stop(&self) {
@@ -967,7 +994,7 @@ impl RemoteTimelineClient {
 
                 // Tear down queued ops
                 for op in qi.queued_operations.into_iter() {
-                    self.update_upload_queue_unfinished_metric(-1, &op);
+                    self.calls_unfinished_metric_end(&op);
                     // Dropping UploadOp::Barrier() here will make wait_completion() return with an Err()
                     // which is exactly what we want to happen.
                     drop(op);
diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py
index 9236137d19..8b78e06c22 100644
--- a/test_runner/fixtures/metrics.py
+++ b/test_runner/fixtures/metrics.py
@@ -40,10 +40,9 @@ def parse_metrics(text: str, name: str = "") -> Metrics:
 
 
 PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS: Tuple[str, ...] = (
-    "pageserver_remote_upload_queue_unfinished_tasks",
-    "pageserver_remote_operation_seconds_bucket",
-    "pageserver_remote_operation_seconds_count",
-    "pageserver_remote_operation_seconds_sum",
+    "pageserver_remote_timeline_client_calls_unfinished",
+    *[f"pageserver_remote_timeline_client_calls_started_{x}" for x in ["bucket", "count", "sum"]],
+    *[f"pageserver_remote_operation_seconds_{x}" for x in ["bucket", "count", "sum"]],
     "pageserver_remote_physical_size",
 )
 
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index ba2cce3022..481f46ff55 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -34,6 +34,7 @@ from _pytest.config import Config
 from _pytest.config.argparsing import Parser
 from _pytest.fixtures import FixtureRequest
 from fixtures.log_helper import log
+from fixtures.metrics import parse_metrics
 from fixtures.types import Lsn, TenantId, TimelineId
 from fixtures.utils import (
     ATTACHMENT_NAME_REGEX,
@@ -1409,6 +1410,33 @@ class PageserverHttpClient(requests.Session):
         ]
         return sample.value
 
+    def get_remote_timeline_client_metric(
+        self,
+        metric_name: str,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        file_kind: str,
+        op_kind: str,
+    ) -> Optional[float]:
+        metrics = parse_metrics(self.get_metrics(), "pageserver")
+        matches = metrics.query_all(
+            name=metric_name,
+            filter={
+                "tenant_id": str(tenant_id),
+                "timeline_id": str(timeline_id),
+                "file_kind": str(file_kind),
+                "op_kind": str(op_kind),
+            },
+        )
+        if len(matches) == 0:
+            value = None
+        elif len(matches) == 1:
+            value = matches[0].value
+            assert value is not None
+        else:
+            assert len(matches) < 2, "above filter should uniquely identify metric"
+        return value
+
     def get_metric_value(self, name: str) -> Optional[str]:
         metrics = self.get_metrics()
         relevant = [line for line in metrics.splitlines() if line.startswith(name)]
diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py
index 32c25b2e8c..82bf741a8f 100644
--- a/test_runner/regress/test_remote_storage.py
+++ b/test_runner/regress/test_remote_storage.py
@@ -2,11 +2,11 @@
 # env NEON_PAGESERVER_OVERRIDES="remote_storage={local_path='/tmp/neon_zzz/'}" poetry ......
 
 import os
-import re
 import shutil
 import threading
 import time
 from pathlib import Path
+from typing import Dict, List, Tuple
 
 import pytest
 from fixtures.log_helper import log
@@ -271,14 +271,15 @@ def test_remote_storage_upload_queue_retries(
         wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id)
 
     def get_queued_count(file_kind, op_kind):
-        metrics = client.get_metrics()
-        matches = re.search(
-            f'^pageserver_remote_upload_queue_unfinished_tasks{{file_kind="{file_kind}",op_kind="{op_kind}",tenant_id="{tenant_id}",timeline_id="{timeline_id}"}} (\\S+)$',
-            metrics,
-            re.MULTILINE,
+        val = client.get_remote_timeline_client_metric(
+            "pageserver_remote_timeline_client_calls_unfinished",
+            tenant_id,
+            timeline_id,
+            file_kind,
+            op_kind,
         )
-        assert matches
-        return int(matches[1])
+        assert val is not None, "expecting metric to be present"
+        return int(val)
 
     # create some layers & wait for uploads to finish
     overwrite_data_and_wait_for_it_to_arrive_at_pageserver("a")
@@ -368,6 +369,168 @@ def test_remote_storage_upload_queue_retries(
         assert query_scalar(cur, "SELECT COUNT(*) FROM foo WHERE val = 'd'") == 10000
 
 
+@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
+def test_remote_timeline_client_calls_started_metric(
+    neon_env_builder: NeonEnvBuilder,
+    remote_storage_kind: RemoteStorageKind,
+):
+
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=remote_storage_kind,
+        test_name="test_remote_timeline_client_metrics",
+    )
+
+    env = neon_env_builder.init_start()
+
+    # create tenant with config that will deterministically allow
+    # compaction and gc
+    tenant_id, timeline_id = env.neon_cli.create_tenant(
+        conf={
+            # small checkpointing and compaction targets to ensure we generate many upload operations
+            "checkpoint_distance": f"{128 * 1024}",
+            "compaction_threshold": "1",
+            "compaction_target_size": f"{128 * 1024}",
+            # no PITR horizon, we specify the horizon when we request on-demand GC
+            "pitr_interval": "0s",
+            # disable background compaction and GC. We invoke it manually when we want it to happen.
+            "gc_period": "0s",
+            "compaction_period": "0s",
+            # don't create image layers, that causes just noise
+            "image_creation_threshold": "10000",
+        }
+    )
+
+    client = env.pageserver.http_client()
+
+    pg = env.postgres.create_start("main", tenant_id=tenant_id)
+
+    pg.safe_psql("CREATE TABLE foo (id INTEGER PRIMARY KEY, val text)")
+
+    def overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data):
+        # create initial set of layers & upload them with failpoints configured
+        pg.safe_psql_many(
+            [
+                f"""
+               INSERT INTO foo (id, val)
+               SELECT g, '{data}'
+               FROM generate_series(1, 10000) g
+               ON CONFLICT (id) DO UPDATE
+               SET val = EXCLUDED.val
+               """,
+                # to ensure that GC can actually remove some layers
+                "VACUUM foo",
+            ]
+        )
+        wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id)
+
+    def get_queued_count(file_kind, op_kind):
+        val = client.get_remote_timeline_client_metric(
+            "pageserver_remote_timeline_client_calls_unfinished",
+            tenant_id,
+            timeline_id,
+            file_kind,
+            op_kind,
+        )
+        if val is None:
+            return val
+        return int(val)
+
+    def wait_upload_queue_empty():
+        wait_until(2, 1, lambda: get_queued_count(file_kind="layer", op_kind="upload") == 0)
+        wait_until(2, 1, lambda: get_queued_count(file_kind="index", op_kind="upload") == 0)
+        wait_until(2, 1, lambda: get_queued_count(file_kind="layer", op_kind="delete") == 0)
+
+    calls_started: Dict[Tuple[str, str], List[int]] = {
+        ("layer", "upload"): [0],
+        ("index", "upload"): [0],
+        ("layer", "delete"): [0],
+    }
+
+    def fetch_calls_started():
+        for (file_kind, op_kind), observations in calls_started.items():
+            val = client.get_remote_timeline_client_metric(
+                "pageserver_remote_timeline_client_calls_started_count",
+                tenant_id,
+                timeline_id,
+                file_kind,
+                op_kind,
+            )
+            assert val is not None, f"expecting metric to be present: {file_kind} {op_kind}"
+            val = int(val)
+            observations.append(val)
+
+    def ensure_calls_started_grew():
+        for (file_kind, op_kind), observations in calls_started.items():
+            log.info(f"ensure_calls_started_grew: {file_kind} {op_kind}: {observations}")
+            assert all(
+                x < y for x, y in zip(observations, observations[1:])
+            ), f"observations for {file_kind} {op_kind} did not grow monotonically: {observations}"
+
+    def churn(data_pass1, data_pass2):
+        overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data_pass1)
+        client.timeline_checkpoint(tenant_id, timeline_id)
+        client.timeline_compact(tenant_id, timeline_id)
+        overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data_pass2)
+        client.timeline_checkpoint(tenant_id, timeline_id)
+        client.timeline_compact(tenant_id, timeline_id)
+        gc_result = client.timeline_gc(tenant_id, timeline_id, 0)
+        print_gc_result(gc_result)
+        assert gc_result["layers_removed"] > 0
+
+    # create some layers & wait for uploads to finish
+    churn("a", "b")
+
+    wait_upload_queue_empty()
+
+    # ensure that we updated the calls_started metric
+    fetch_calls_started()
+    ensure_calls_started_grew()
+
+    # more churn to cause more operations
+    churn("c", "d")
+
+    # ensure that the calls_started metric continued to be updated
+    fetch_calls_started()
+    ensure_calls_started_grew()
+
+    ### now we exercise the download path
+    calls_started.clear()
+    calls_started.update(
+        {
+            ("index", "download"): [0],
+            ("layer", "download"): [0],
+        }
+    )
+
+    env.pageserver.stop(immediate=True)
+    env.postgres.stop_all()
+
+    dir_to_clear = Path(env.repo_dir) / "tenants"
+    shutil.rmtree(dir_to_clear)
+    os.mkdir(dir_to_clear)
+
+    env.pageserver.start()
+    client = env.pageserver.http_client()
+
+    client.tenant_attach(tenant_id)
+
+    def tenant_active():
+        all_states = client.tenant_list()
+        [tenant] = [t for t in all_states if TenantId(t["id"]) == tenant_id]
+        assert tenant["state"] == "Active"
+
+    wait_until(30, 1, tenant_active)
+
+    log.info("restarting postgres to validate")
+    pg = env.postgres.create_start("main", tenant_id=tenant_id)
+    with pg.cursor() as cur:
+        assert query_scalar(cur, "SELECT COUNT(*) FROM foo WHERE val = 'd'") == 10000
+
+    # ensure that we updated the calls_started download metric
+    fetch_calls_started()
+    ensure_calls_started_grew()
+
+
 # Test that we correctly handle timeline with layers stuck in upload queue
 @pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
 def test_timeline_deletion_with_files_stuck_in_upload_queue(
@@ -401,15 +564,14 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue(
     client = env.pageserver.http_client()
 
     def get_queued_count(file_kind, op_kind):
-        metrics = client.get_metrics()
-        matches = re.search(
-            f'^pageserver_remote_upload_queue_unfinished_tasks{{file_kind="{file_kind}",op_kind="{op_kind}",tenant_id="{tenant_id}",timeline_id="{timeline_id}"}} (\\S+)$',
-            metrics,
-            re.MULTILINE,
+        val = client.get_remote_timeline_client_metric(
+            "pageserver_remote_timeline_client_calls_unfinished",
+            tenant_id,
+            timeline_id,
+            file_kind,
+            op_kind,
         )
-        if matches is None:
-            return None
-        return int(matches[1])
+        return int(val) if val is not None else val
 
     pg = env.postgres.create_start("main", tenant_id=tenant_id)
 

From 8712e1899e89ab0fd91296371bf4c3ad8c2bf8e8 Mon Sep 17 00:00:00 2001
From: Kirill Bulatov <kirill@neon.tech>
Date: Thu, 5 Jan 2023 17:48:27 +0200
Subject: [PATCH 113/132] Move initial timeline creation into pytest (#3270)

For every Python test, we start the storage first, and expect that
later, in the test, when we start a compute, it will work without
specific timeline and tenant creation or their IDs specified.

For that, we have a concept of "default" branch that was created on the
control plane level first, but that's not needed at all, given that it's
only Python tests that need it: let them create the initial timeline
during set-up.

Before, control plane started and stopped pageserver for timeline
creation, now Python harness runs an extra tenant creation request on
test env init.

I had to adjust the metrics test, turns out it registered the metrics
from the default tenant after an extra pageserver restart.
The new model does not send the metrics before the collection time happens,
and that was 30s before.
---
 control_plane/src/background_process.rs       | 16 ----
 control_plane/src/bin/neon_local.rs           | 23 +-----
 control_plane/src/pageserver.rs               | 76 +------------------
 libs/utils/src/postgres_backend_async.rs      |  5 +-
 test_runner/fixtures/neon_fixtures.py         | 26 ++++---
 test_runner/regress/test_metric_collection.py | 71 ++++++++---------
 test_runner/regress/test_neon_local_cli.py    |  2 +-
 test_runner/regress/test_recovery.py          |  4 +-
 8 files changed, 66 insertions(+), 157 deletions(-)

diff --git a/control_plane/src/background_process.rs b/control_plane/src/background_process.rs
index 8909e27c94..1f3f8f45ea 100644
--- a/control_plane/src/background_process.rs
+++ b/control_plane/src/background_process.rs
@@ -136,22 +136,6 @@ where
     anyhow::bail!("{process_name} did not start in {RETRY_UNTIL_SECS} seconds");
 }
 
-/// Send SIGTERM to child process
-pub fn send_stop_child_process(child: &std::process::Child) -> anyhow::Result<()> {
-    let pid = child.id();
-    match kill(
-        nix::unistd::Pid::from_raw(pid.try_into().unwrap()),
-        Signal::SIGTERM,
-    ) {
-        Ok(()) => Ok(()),
-        Err(Errno::ESRCH) => {
-            println!("child process with pid {pid} does not exist");
-            Ok(())
-        }
-        Err(e) => anyhow::bail!("Failed to send signal to child process with pid {pid}: {e}"),
-    }
-}
-
 /// Stops the process, using the pid file given. Returns Ok also if the process is already not running.
 pub fn stop_process(immediate: bool, process_name: &str, pid_file: &Path) -> anyhow::Result<()> {
     let pid = match pid_file::read(pid_file)
diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs
index 71de741640..e4d0680c9e 100644
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -284,8 +284,6 @@ fn parse_timeline_id(sub_match: &ArgMatches) -> anyhow::Result<Option<TimelineId
 }
 
 fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
-    let initial_timeline_id_arg = parse_timeline_id(init_match)?;
-
     // Create config file
     let toml_file: String = if let Some(config_path) = init_match.get_one::<PathBuf>("config") {
         // load and parse the file
@@ -309,30 +307,16 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
         LocalEnv::parse_config(&toml_file).context("Failed to create neon configuration")?;
     env.init(pg_version)
         .context("Failed to initialize neon repository")?;
-    let initial_tenant_id = env
-        .default_tenant_id
-        .expect("default_tenant_id should be generated by the `env.init()` call above");
 
     // Initialize pageserver, create initial tenant and timeline.
     let pageserver = PageServerNode::from_env(&env);
-    let initial_timeline_id = pageserver
-        .initialize(
-            Some(initial_tenant_id),
-            initial_timeline_id_arg,
-            &pageserver_config_overrides(init_match),
-            pg_version,
-        )
+    pageserver
+        .initialize(&pageserver_config_overrides(init_match))
         .unwrap_or_else(|e| {
             eprintln!("pageserver init failed: {e:?}");
             exit(1);
         });
 
-    env.register_branch_mapping(
-        DEFAULT_BRANCH_NAME.to_owned(),
-        initial_tenant_id,
-        initial_timeline_id,
-    )?;
-
     Ok(env)
 }
 
@@ -928,9 +912,8 @@ fn cli() -> Command {
         .version(GIT_VERSION)
         .subcommand(
             Command::new("init")
-                .about("Initialize a new Neon repository")
+                .about("Initialize a new Neon repository, preparing configs for services to start with")
                 .arg(pageserver_config_args.clone())
-                .arg(timeline_id_arg.clone().help("Use a specific timeline id when creating a tenant and its initial timeline"))
                 .arg(
                     Arg::new("config")
                         .long("config")
diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs
index 68e94b2fdc..9cebe028e4 100644
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -7,7 +7,7 @@ use std::path::PathBuf;
 use std::process::{Child, Command};
 use std::{io, result};
 
-use anyhow::{bail, ensure, Context};
+use anyhow::{bail, Context};
 use pageserver_api::models::{
     TenantConfigRequest, TenantCreateRequest, TenantInfo, TimelineCreateRequest, TimelineInfo,
 };
@@ -130,83 +130,15 @@ impl PageServerNode {
         overrides
     }
 
-    /// Initializes a pageserver node by creating its config with the overrides provided,
-    /// and creating an initial tenant and timeline afterwards.
-    pub fn initialize(
-        &self,
-        create_tenant: Option<TenantId>,
-        initial_timeline_id: Option<TimelineId>,
-        config_overrides: &[&str],
-        pg_version: u32,
-    ) -> anyhow::Result<TimelineId> {
+    /// Initializes a pageserver node by creating its config with the overrides provided.
+    pub fn initialize(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
         // First, run `pageserver --init` and wait for it to write a config into FS and exit.
         self.pageserver_init(config_overrides).with_context(|| {
             format!(
                 "Failed to run init for pageserver node {}",
                 self.env.pageserver.id,
             )
-        })?;
-
-        // Then, briefly start it fully to run HTTP commands on it,
-        // to create initial tenant and timeline.
-        // We disable the remote storage, since we stop pageserver right after the timeline creation,
-        // hence most of the uploads will either aborted or not started: no point to start them at all.
-        let disabled_remote_storage_override = "remote_storage={}";
-        let mut pageserver_process = self
-            .start_node(
-                &[disabled_remote_storage_override],
-                // Previous overrides will be taken from the config created before, don't overwrite them.
-                false,
-            )
-            .with_context(|| {
-                format!(
-                    "Failed to start a process for pageserver node {}",
-                    self.env.pageserver.id,
-                )
-            })?;
-
-        let init_result = self
-            .try_init_timeline(create_tenant, initial_timeline_id, pg_version)
-            .context("Failed to create initial tenant and timeline for pageserver");
-        match &init_result {
-            Ok(initial_timeline_id) => {
-                println!("Successfully initialized timeline {initial_timeline_id}")
-            }
-            Err(e) => eprintln!("{e:#}"),
-        }
-        background_process::send_stop_child_process(&pageserver_process)?;
-
-        let exit_code = pageserver_process.wait()?;
-        ensure!(
-            exit_code.success(),
-            format!(
-                "pageserver init failed with exit code {:?}",
-                exit_code.code()
-            )
-        );
-        println!(
-            "Stopped pageserver {} process with pid {}",
-            self.env.pageserver.id,
-            pageserver_process.id(),
-        );
-        init_result
-    }
-
-    fn try_init_timeline(
-        &self,
-        new_tenant_id: Option<TenantId>,
-        new_timeline_id: Option<TimelineId>,
-        pg_version: u32,
-    ) -> anyhow::Result<TimelineId> {
-        let initial_tenant_id = self.tenant_create(new_tenant_id, HashMap::new())?;
-        let initial_timeline_info = self.timeline_create(
-            initial_tenant_id,
-            new_timeline_id,
-            None,
-            None,
-            Some(pg_version),
-        )?;
-        Ok(initial_timeline_info.timeline_id)
+        })
     }
 
     pub fn repo_path(&self) -> PathBuf {
diff --git a/libs/utils/src/postgres_backend_async.rs b/libs/utils/src/postgres_backend_async.rs
index a4f523da04..95b7b3fd15 100644
--- a/libs/utils/src/postgres_backend_async.rs
+++ b/libs/utils/src/postgres_backend_async.rs
@@ -20,7 +20,10 @@ use tokio_rustls::TlsAcceptor;
 
 pub fn is_expected_io_error(e: &io::Error) -> bool {
     use io::ErrorKind::*;
-    matches!(e.kind(), ConnectionRefused | ConnectionAborted)
+    matches!(
+        e.kind(),
+        ConnectionRefused | ConnectionAborted | ConnectionReset
+    )
 }
 
 /// An error, occurred during query processing:
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 481f46ff55..97bc694543 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -596,6 +596,7 @@ class NeonEnvBuilder:
         rust_log_override: Optional[str] = None,
         default_branch_name: str = DEFAULT_BRANCH_NAME,
         preserve_database_files: bool = False,
+        initial_tenant: Optional[TenantId] = None,
     ):
         self.repo_dir = repo_dir
         self.rust_log_override = rust_log_override
@@ -618,8 +619,9 @@ class NeonEnvBuilder:
         self.pg_distrib_dir = pg_distrib_dir
         self.pg_version = pg_version
         self.preserve_database_files = preserve_database_files
+        self.initial_tenant = initial_tenant or TenantId.generate()
 
-    def init(self) -> NeonEnv:
+    def init_configs(self) -> NeonEnv:
         # Cannot create more than one environment from one builder
         assert self.env is None, "environment already initialized"
         self.env = NeonEnv(self)
@@ -630,8 +632,17 @@ class NeonEnvBuilder:
         self.env.start()
 
     def init_start(self) -> NeonEnv:
-        env = self.init()
+        env = self.init_configs()
         self.start()
+
+        # Prepare the default branch to start the postgres on later.
+        # Pageserver itself does not create tenants and timelines, until started first and asked via HTTP API.
+        log.info(
+            f"Services started, creating initial tenant {env.initial_tenant} and its initial timeline"
+        )
+        initial_tenant, initial_timeline = env.neon_cli.create_tenant(tenant_id=env.initial_tenant)
+        log.info(f"Initial timeline {initial_tenant}/{initial_timeline} created successfully")
+
         return env
 
     def enable_remote_storage(
@@ -890,12 +901,12 @@ class NeonEnv:
 
         # generate initial tenant ID here instead of letting 'neon init' generate it,
         # so that we don't need to dig it out of the config file afterwards.
-        self.initial_tenant = TenantId.generate()
+        self.initial_tenant = config.initial_tenant
 
         # Create a config file corresponding to the options
         toml = textwrap.dedent(
             f"""
-            default_tenant_id = '{self.initial_tenant}'
+            default_tenant_id = '{config.initial_tenant}'
         """
         )
 
@@ -1724,17 +1735,12 @@ class NeonCli(AbstractNeonCli):
     def init(
         self,
         config_toml: str,
-        initial_timeline_id: Optional[TimelineId] = None,
     ) -> "subprocess.CompletedProcess[str]":
         with tempfile.NamedTemporaryFile(mode="w+") as tmp:
             tmp.write(config_toml)
             tmp.flush()
 
-            cmd = ["init", f"--config={tmp.name}"]
-            if initial_timeline_id:
-                cmd.extend(["--timeline-id", str(initial_timeline_id)])
-
-            cmd.extend(["--pg-version", self.env.pg_version])
+            cmd = ["init", f"--config={tmp.name}", "--pg-version", self.env.pg_version]
 
             append_pageserver_param_overrides(
                 params_to_update=cmd,
diff --git a/test_runner/regress/test_metric_collection.py b/test_runner/regress/test_metric_collection.py
index 0fff86f268..d1fcab7a62 100644
--- a/test_runner/regress/test_metric_collection.py
+++ b/test_runner/regress/test_metric_collection.py
@@ -1,3 +1,5 @@
+import time
+
 import pytest
 from fixtures.log_helper import log
 from fixtures.metrics import parse_metrics
@@ -20,9 +22,19 @@ def httpserver_listen_address(port_distributor: PortDistributor):
     return ("localhost", port)
 
 
-num_metrics_received = 0
+initial_tenant = TenantId.generate()
 remote_uploaded = 0
-first_request = True
+checks = {
+    "written_size": lambda value: value > 0,
+    "resident_size": lambda value: value >= 0,
+    # >= 0 check here is to avoid race condition when we receive metrics before
+    # remote_uploaded is updated
+    "remote_storage_size": lambda value: value > 0 if remote_uploaded > 0 else value >= 0,
+    # logical size may lag behind the actual size, so allow 0 here
+    "timeline_logical_size": lambda value: value >= 0,
+}
+
+metric_kinds_checked = set([])
 
 
 #
@@ -36,38 +48,19 @@ def metrics_handler(request: Request) -> Response:
     log.info("received events:")
     log.info(events)
 
-    checks = {
-        "written_size": lambda value: value > 0,
-        "resident_size": lambda value: value >= 0,
-        # >= 0 check here is to avoid race condition when we receive metrics before
-        # remote_uploaded is updated
-        "remote_storage_size": lambda value: value > 0 if remote_uploaded > 0 else value >= 0,
-        # logical size may lag behind the actual size, so allow 0 here
-        "timeline_logical_size": lambda value: value >= 0,
-    }
-
-    events_received = 0
     for event in events:
-        check = checks.get(event["metric"])
+        assert event["tenant_id"] == str(
+            initial_tenant
+        ), "Expecting metrics only from the initial tenant"
+        metric_name = event["metric"]
+
+        check = checks.get(metric_name)
         # calm down mypy
         if check is not None:
-            assert check(event["value"]), f"{event['metric']} isn't valid"
-            events_received += 1
+            assert check(event["value"]), f"{metric_name} isn't valid"
+            global metric_kinds_checked
+            metric_kinds_checked.add(metric_name)
 
-    global first_request
-    # check that all checks were sent
-    # but only on the first request, because we don't send non-changed metrics
-    if first_request:
-        # we may receive more metrics than we check,
-        # because there are two timelines
-        # and we may receive per-timeline metrics from both
-        # if the test was slow enough for these metrics to be collected
-        # -1 because that is ok to not receive timeline_logical_size
-        assert events_received >= len(checks) - 1
-        first_request = False
-
-    global num_metrics_received
-    num_metrics_received += 1
     return Response(status=200)
 
 
@@ -83,11 +76,14 @@ def test_metric_collection(
     (host, port) = httpserver_listen_address
     metric_collection_endpoint = f"http://{host}:{port}/billing/api/v1/usage_events"
 
+    # Require collecting metrics frequently, since we change
+    # the timeline and want something to be logged about it.
+    #
     # Disable time-based pitr, we will use the manual GC calls
     # to trigger remote storage operations in a controlled way
     neon_env_builder.pageserver_config_override = (
         f"""
-        metric_collection_interval="60s"
+        metric_collection_interval="1s"
         metric_collection_endpoint="{metric_collection_endpoint}"
     """
         + "tenant_config={pitr_interval = '0 sec'}"
@@ -100,6 +96,9 @@ def test_metric_collection(
 
     log.info(f"test_metric_collection endpoint is {metric_collection_endpoint}")
 
+    # Set initial tenant of the test, that we expect the logs from
+    global initial_tenant
+    initial_tenant = neon_env_builder.initial_tenant
     # mock http server that returns OK for the metrics
     httpserver.expect_request("/billing/api/v1/usage_events", method="POST").respond_with_handler(
         metrics_handler
@@ -154,7 +153,11 @@ def test_metric_collection(
         remote_uploaded = get_num_remote_ops("index", "upload")
         assert remote_uploaded > 0
 
-    # check that all requests are served
+    # wait longer than collecting interval and check that all requests are served
+    time.sleep(3)
     httpserver.check()
-    global num_metrics_received
-    assert num_metrics_received > 0, "no metrics were received"
+    global metric_kinds_checked, checks
+    expected_checks = set(checks.keys())
+    assert len(metric_kinds_checked) == len(
+        checks
+    ), f"Expected to receive and check all kind of metrics, but {expected_checks - metric_kinds_checked} got uncovered"
diff --git a/test_runner/regress/test_neon_local_cli.py b/test_runner/regress/test_neon_local_cli.py
index 6c7cdb6f7f..e8f01ccf55 100644
--- a/test_runner/regress/test_neon_local_cli.py
+++ b/test_runner/regress/test_neon_local_cli.py
@@ -4,7 +4,7 @@ from fixtures.neon_fixtures import NeonEnvBuilder
 # Test that neon cli is able to start and stop all processes with the user defaults.
 # def test_neon_cli_basics(neon_simple_env: NeonEnv):
 def test_neon_cli_basics(neon_env_builder: NeonEnvBuilder):
-    env = neon_env_builder.init()
+    env = neon_env_builder.init_configs()
 
     env.neon_cli.start()
     env.neon_cli.stop()
diff --git a/test_runner/regress/test_recovery.py b/test_runner/regress/test_recovery.py
index 1e93958e98..09644eaaa1 100644
--- a/test_runner/regress/test_recovery.py
+++ b/test_runner/regress/test_recovery.py
@@ -12,11 +12,9 @@ def test_pageserver_recovery(neon_env_builder: NeonEnvBuilder):
     # Override default checkpointer settings to run it more often
     neon_env_builder.pageserver_config_override = "tenant_config={checkpoint_distance = 1048576}"
 
-    env = neon_env_builder.init()
+    env = neon_env_builder.init_start()
     env.pageserver.is_testing_enabled_or_skip()
 
-    neon_env_builder.start()
-
     # These warnings are expected, when the pageserver is restarted abruptly
     env.pageserver.allowed_errors.append(".*found future delta layer.*")
     env.pageserver.allowed_errors.append(".*found future image layer.*")

From c187de1101744c57af041427199eab9630643951 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Thu, 5 Jan 2023 14:00:27 +0200
Subject: [PATCH 114/132] Copy error message before it's freed.

pageserver_disconnect() call invalidates 'pageserver_conn', including
the error message pointer we got from PQerrorMessage(pageserver_conn).
Copy the message to a temporary variable before disconnecting, like
we do in a few other places.

In passing, clear the 'pageserver_conn_wes' variable in a few places
where it was free'd. I didn't see any live bug from this, but since
pageserver_disconnect() checks if it's NULL, let's not leave it
dangling to already-free'd memory.
---
 pgxn/neon/libpagestore.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c
index c6199dddc0..0760842627 100644
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -111,6 +111,7 @@ pageserver_connect()
 				PQfinish(pageserver_conn);
 				pageserver_conn = NULL;
 				FreeWaitEventSet(pageserver_conn_wes);
+				pageserver_conn_wes = NULL;
 
 				neon_log(ERROR, "could not complete handshake with pageserver: %s",
 						 msg);
@@ -179,7 +180,10 @@ pageserver_disconnect(void)
 		prefetch_on_ps_disconnect();
 	}
 	if (pageserver_conn_wes != NULL)
+	{
 		FreeWaitEventSet(pageserver_conn_wes);
+		pageserver_conn_wes = NULL;
+	}
 }
 
 static void
@@ -206,7 +210,7 @@ pageserver_send(NeonRequest * request)
 	 */
 	if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0)
 	{
-		char	   *msg = PQerrorMessage(pageserver_conn);
+		char	   *msg = pchomp(PQerrorMessage(pageserver_conn));
 
 		pageserver_disconnect();
 		neon_log(ERROR, "failed to send page request: %s", msg);

From 8b710b9753846515bc1d2dddd0154dcfcf1beaf9 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Thu, 5 Jan 2023 14:45:28 +0200
Subject: [PATCH 115/132] Fix segfault if pageserver connection is lost during
 backend startup.

It's not OK to return early from within a PG_TRY-CATCH block. The
PG_TRY macro sets the global PG_exception_stack variable, and
PG_END_TRY restores it. If we jump out in between with "return NULL",
the PG_exception_stack is left to point to garbage. (I'm surprised the
comments in PG_TRY_CATCH don't warn about this.)

Add test that re-attaches tenant in pageserver while Postgres is
running. If the tenant is detached while compute is connected and
busy running queries, those queries will fail if they try to fetch any
pages. But when the tenant is re-attached, things should start working
again, without disconnecting the client <-> postgres connections.
Without this fix, this reproduced the segfault.

Fixes issue #3231
---
 pgxn/neon/libpagestore.c                  |  36 +++---
 test_runner/regress/test_tenant_detach.py | 149 ++++++++++++++++++++++
 2 files changed, 169 insertions(+), 16 deletions(-)

diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c
index 0760842627..88e3a12d96 100644
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -243,29 +243,33 @@ pageserver_receive(void)
 	PG_TRY();
 	{
 		/* read response */
-		resp_buff.len = call_PQgetCopyData(&resp_buff.data);
-		resp_buff.cursor = 0;
+		int			rc;
 
-		if (resp_buff.len < 0)
+		rc = call_PQgetCopyData(&resp_buff.data);
+		if (rc >= 0)
 		{
-			if (resp_buff.len == -1)
+			resp_buff.len = rc;
+			resp_buff.cursor = 0;
+			resp = nm_unpack_response(&resp_buff);
+			PQfreemem(resp_buff.data);
+
+			if (message_level_is_interesting(PageStoreTrace))
 			{
-				pageserver_disconnect();
-				return NULL;
+				char	   *msg = nm_to_string((NeonMessage *) resp);
+
+				neon_log(PageStoreTrace, "got response: %s", msg);
+				pfree(msg);
 			}
-			else if (resp_buff.len == -2)
-				neon_log(ERROR, "could not read COPY data: %s", PQerrorMessage(pageserver_conn));
 		}
-		resp = nm_unpack_response(&resp_buff);
-		PQfreemem(resp_buff.data);
-
-		if (message_level_is_interesting(PageStoreTrace))
+		else if (rc == -1)
 		{
-			char	   *msg = nm_to_string((NeonMessage *) resp);
-
-			neon_log(PageStoreTrace, "got response: %s", msg);
-			pfree(msg);
+			pageserver_disconnect();
+			resp = NULL;
 		}
+		else if (rc == -2)
+			neon_log(ERROR, "could not read COPY data: %s", PQerrorMessage(pageserver_conn));
+		else
+			neon_log(ERROR, "unexpected PQgetCopyData return value: %d", rc);
 	}
 	PG_CATCH();
 	{
diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py
index 6963a57542..db5bb679f2 100644
--- a/test_runner/regress/test_tenant_detach.py
+++ b/test_runner/regress/test_tenant_detach.py
@@ -1,9 +1,13 @@
+import asyncio
+import random
 import time
 from threading import Thread
 
+import asyncpg
 import pytest
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
+    NeonEnv,
     NeonEnvBuilder,
     PageserverApiException,
     PageserverHttpClient,
@@ -12,6 +16,7 @@ from fixtures.neon_fixtures import (
     available_remote_storages,
     wait_for_last_record_lsn,
     wait_for_upload,
+    wait_until,
     wait_until_tenant_state,
 )
 from fixtures.types import Lsn, TenantId, TimelineId
@@ -84,6 +89,150 @@ def test_tenant_reattach(
     assert env.pageserver.log_contains(".*download.*failed, will retry.*")
 
 
+num_connections = 10
+num_rows = 100000
+updates_to_perform = 0
+
+updates_started = 0
+updates_finished = 0
+
+
+# Run random UPDATEs on test table. On failure, try again.
+async def update_table(pg_conn: asyncpg.Connection):
+    global updates_started, updates_finished, updates_to_perform
+
+    while updates_started < updates_to_perform or updates_to_perform == 0:
+        updates_started += 1
+        id = random.randrange(1, num_rows)
+
+        # Loop to retry until the UPDATE succeeds
+        while True:
+            try:
+                await pg_conn.fetchrow(f"UPDATE t SET counter = counter + 1 WHERE id = {id}")
+                updates_finished += 1
+                if updates_finished % 1000 == 0:
+                    log.info(f"update {updates_finished} / {updates_to_perform}")
+                break
+            except asyncpg.PostgresError as e:
+                # Received error from Postgres. Log it, sleep a little, and continue
+                log.info(f"UPDATE error: {e}")
+                await asyncio.sleep(0.1)
+
+
+async def sleep_and_reattach(pageserver_http: PageserverHttpClient, tenant_id: TenantId):
+    global updates_started, updates_finished, updates_to_perform
+
+    # Wait until we have performed some updates
+    wait_until(20, 0.5, lambda: updates_finished > 500)
+
+    log.info("Detaching tenant")
+    pageserver_http.tenant_detach(tenant_id)
+    await asyncio.sleep(1)
+    log.info("Re-attaching tenant")
+    pageserver_http.tenant_attach(tenant_id)
+    log.info("Re-attach finished")
+
+    # Continue with 5000 more updates
+    updates_to_perform = updates_started + 5000
+
+
+# async guts of test_tenant_reattach_while_busy test
+async def reattach_while_busy(
+    env: NeonEnv, pg: Postgres, pageserver_http: PageserverHttpClient, tenant_id: TenantId
+):
+    workers = []
+    for worker_id in range(num_connections):
+        pg_conn = await pg.connect_async()
+        workers.append(asyncio.create_task(update_table(pg_conn)))
+
+    workers.append(asyncio.create_task(sleep_and_reattach(pageserver_http, tenant_id)))
+    await asyncio.gather(*workers)
+
+    assert updates_finished == updates_to_perform
+
+
+# Detach and re-attach tenant, while compute is busy running queries.
+#
+# Some of the queries may fail, in the window that the tenant has been
+# detached but not yet re-attached. But Postgres itself should keep
+# running, and when we retry the queries, they should start working
+# after the attach has finished.
+
+# FIXME:
+#
+# This is pretty unstable at the moment. I've seen it fail with a warning like this:
+#
+# AssertionError: assert not ['2023-01-05T13:09:40.708303Z  WARN remote_upload{tenant=c3fc41f6cf29a7626b90316e3518cd4b timeline=7978246f85faa71ab03...1282b/000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001716699-0000000001736681"\n']
+#
+# (https://neon-github-public-dev.s3.amazonaws.com/reports/pr-3232/debug/3846817847/index.html#suites/f9eba3cfdb71aa6e2b54f6466222829b/470fc62b5db7d7d7/)
+# I believe that failure happened because there is a race condition
+# between detach and starting remote upload tasks:
+#
+# 1. detach_timeline calls task_mgr::shutdown_tasks(), sending shutdown
+#    signal to all in-progress tasks associated with the tenant.
+# 2. Just after shutdown_tasks() has collected the list of tasks,
+#    a new remote-upload task is spawned.
+#
+# See https://github.com/neondatabase/neon/issues/3273
+#
+#
+# I also saw this failure:
+#
+# test_runner/regress/test_tenant_detach.py:194: in test_tenant_reattach_while_busy
+#     asyncio.run(reattach_while_busy(env, pg, pageserver_http, tenant_id))
+# /home/nonroot/.pyenv/versions/3.9.2/lib/python3.9/asyncio/runners.py:44: in run
+#     return loop.run_until_complete(main)
+# /home/nonroot/.pyenv/versions/3.9.2/lib/python3.9/asyncio/base_events.py:642: in run_until_complete
+#     return future.result()
+# test_runner/regress/test_tenant_detach.py:151: in reattach_while_busy
+#     assert updates_finished == updates_to_perform
+# E   assert 5010 == 10010
+# E     +5010
+# E     -10010
+#
+# I don't know what's causing that...
+@pytest.mark.skip(reason="fixme")
+@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
+def test_tenant_reattach_while_busy(
+    neon_env_builder: NeonEnvBuilder,
+    remote_storage_kind: RemoteStorageKind,
+):
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=remote_storage_kind,
+        test_name="test_tenant_reattach_while_busy",
+    )
+    env = neon_env_builder.init_start()
+
+    # Attempts to connect from compute to pageserver while the tenant is
+    # temporarily detached produces these errors in the pageserver log.
+    env.pageserver.allowed_errors.append(".*Tenant .* not found in the local state.*")
+    env.pageserver.allowed_errors.append(
+        ".*Tenant .* will not become active\\. Current state: Stopping.*"
+    )
+
+    pageserver_http = env.pageserver.http_client()
+
+    # create new tenant
+    tenant_id, timeline_id = env.neon_cli.create_tenant(
+        # Create layers aggressively
+        conf={"checkpoint_distance": "100000"}
+    )
+
+    pg = env.postgres.create_start("main", tenant_id=tenant_id)
+
+    cur = pg.connect().cursor()
+
+    cur.execute("CREATE TABLE t(id int primary key, counter int)")
+    cur.execute(f"INSERT INTO t SELECT generate_series(1,{num_rows}), 0")
+
+    # Run the test
+    asyncio.run(reattach_while_busy(env, pg, pageserver_http, tenant_id))
+
+    # Verify table contents
+    assert query_scalar(cur, "SELECT count(*) FROM t") == num_rows
+    assert query_scalar(cur, "SELECT sum(counter) FROM t") == updates_to_perform
+
+
 def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
     env = neon_env_builder.init_start()
     pageserver_http = env.pageserver.http_client()

From b6237474d245f8c633752ca809732aef87fb3944 Mon Sep 17 00:00:00 2001
From: Kirill Bulatov <kirill@neon.tech>
Date: Fri, 6 Jan 2023 12:26:14 +0200
Subject: [PATCH 116/132] Fix README and basic startup example (#3275)

Follow-up to https://github.com/neondatabase/neon/pull/3270, which broke
an example from the main README.md.

Fixes that by adding a way to specify a default tenant, and modifies
the basic neon_local test to start postgres and check branching.
Not all neon_local commands are implemented, so not all README.md
content is tested yet.
---
 README.md                                  |  11 ++-
 control_plane/src/bin/neon_local.rs        |  20 ++++-
 control_plane/src/local_env.rs             |   5 --
 test_runner/fixtures/neon_fixtures.py      | 100 ++++++++-------------
 test_runner/regress/test_neon_local_cli.py |  13 ++-
 5 files changed, 72 insertions(+), 77 deletions(-)

diff --git a/README.md b/README.md
index fa5c1626e4..7b629e71a5 100644
--- a/README.md
+++ b/README.md
@@ -118,11 +118,8 @@ Python (3.9 or higher), and install python3 packages using `./scripts/pysync` (r
 # Later that would be responsibility of a package install script
 > ./target/debug/neon_local init
 Starting pageserver at '127.0.0.1:64000' in '.neon'.
-pageserver started, pid: 2545906
-Successfully initialized timeline de200bd42b49cc1814412c7e592dd6e9
-Stopped pageserver 1 process with pid 2545906
 
-# start pageserver and safekeeper
+# start pageserver, safekeeper, and broker for their intercommunication
 > ./target/debug/neon_local start
 Starting neon broker at 127.0.0.1:50051
 storage_broker started, pid: 2918372
@@ -131,6 +128,12 @@ pageserver started, pid: 2918386
 Starting safekeeper at '127.0.0.1:5454' in '.neon/safekeepers/sk1'.
 safekeeper 1 started, pid: 2918437
 
+# create initial tenant and use it as a default for every future neon_local invocation
+> ./target/debug/neon_local tenant create --set-default
+tenant 9ef87a5bf0d92544f6fafeeb3239695c successfully created on the pageserver
+Created an initial timeline 'de200bd42b49cc1814412c7e592dd6e9' at Lsn 0/16B5A50 for tenant: 9ef87a5bf0d92544f6fafeeb3239695c
+Setting tenant 9ef87a5bf0d92544f6fafeeb3239695c as a default one
+
 # start postgres compute node
 > ./target/debug/neon_local pg start main
 Starting new postgres (v14) main on timeline de200bd42b49cc1814412c7e592dd6e9 ...
diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs
index e4d0680c9e..4b2aa3c957 100644
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -263,7 +263,7 @@ fn get_tenant_id(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::R
     } else if let Some(default_id) = env.default_tenant_id {
         Ok(default_id)
     } else {
-        bail!("No tenant id. Use --tenant-id, or set 'default_tenant_id' in the config file");
+        anyhow::bail!("No tenant id. Use --tenant-id, or set a default tenant");
     }
 }
 
@@ -372,6 +372,17 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
             println!(
                 "Created an initial timeline '{new_timeline_id}' at Lsn {last_record_lsn} for tenant: {new_tenant_id}",
             );
+
+            if create_match.get_flag("set-default") {
+                println!("Setting tenant {new_tenant_id} as a default one");
+                env.default_tenant_id = Some(new_tenant_id);
+            }
+        }
+        Some(("set-default", set_default_match)) => {
+            let tenant_id =
+                parse_tenant_id(set_default_match)?.context("No tenant id specified")?;
+            println!("Setting tenant {tenant_id} as a default one");
+            env.default_tenant_id = Some(tenant_id);
         }
         Some(("config", create_match)) => {
             let tenant_id = get_tenant_id(create_match, env)?;
@@ -975,11 +986,14 @@ fn cli() -> Command {
                 .arg(timeline_id_arg.clone().help("Use a specific timeline id when creating a tenant and its initial timeline"))
                 .arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false))
                 .arg(pg_version_arg.clone())
+                .arg(Arg::new("set-default").long("set-default").action(ArgAction::SetTrue).required(false)
+                    .help("Use this tenant in future CLI commands where tenant_id is needed, but not specified"))
                 )
+            .subcommand(Command::new("set-default").arg(tenant_id_arg.clone().required(true))
+                .about("Set a particular tenant as default in future CLI commands where tenant_id is needed, but not specified"))
             .subcommand(Command::new("config")
                 .arg(tenant_id_arg.clone())
-                .arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false))
-                )
+                .arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false)))
         )
         .subcommand(
             Command::new("pageserver")
diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs
index ea936640ec..003152c578 100644
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -296,11 +296,6 @@ impl LocalEnv {
             env.neon_distrib_dir = env::current_exe()?.parent().unwrap().to_owned();
         }
 
-        // If no initial tenant ID was given, generate it.
-        if env.default_tenant_id.is_none() {
-            env.default_tenant_id = Some(TenantId::generate());
-        }
-
         env.base_data_dir = base_path();
 
         Ok(env)
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 97bc694543..bdd3dc004e 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -18,6 +18,7 @@ from contextlib import closing, contextmanager
 from dataclasses import dataclass, field
 from enum import Flag, auto
 from functools import cached_property
+from itertools import chain, product
 from pathlib import Path
 from types import TracebackType
 from typing import Any, Dict, Iterator, List, Optional, Tuple, Type, Union, cast
@@ -1567,6 +1568,7 @@ class NeonCli(AbstractNeonCli):
         tenant_id: Optional[TenantId] = None,
         timeline_id: Optional[TimelineId] = None,
         conf: Optional[Dict[str, str]] = None,
+        set_default: bool = False,
     ) -> Tuple[TenantId, TimelineId]:
         """
         Creates a new tenant, returns its id and its initial timeline's id.
@@ -1575,47 +1577,51 @@ class NeonCli(AbstractNeonCli):
             tenant_id = TenantId.generate()
         if timeline_id is None:
             timeline_id = TimelineId.generate()
-        if conf is None:
-            res = self.raw_cli(
-                [
-                    "tenant",
-                    "create",
-                    "--tenant-id",
-                    str(tenant_id),
-                    "--timeline-id",
-                    str(timeline_id),
-                    "--pg-version",
-                    self.env.pg_version,
-                ]
-            )
-        else:
-            res = self.raw_cli(
-                [
-                    "tenant",
-                    "create",
-                    "--tenant-id",
-                    str(tenant_id),
-                    "--timeline-id",
-                    str(timeline_id),
-                    "--pg-version",
-                    self.env.pg_version,
-                ]
-                + sum(list(map(lambda kv: (["-c", kv[0] + ":" + kv[1]]), conf.items())), [])
+
+        args = [
+            "tenant",
+            "create",
+            "--tenant-id",
+            str(tenant_id),
+            "--timeline-id",
+            str(timeline_id),
+            "--pg-version",
+            self.env.pg_version,
+        ]
+        if conf is not None:
+            args.extend(
+                chain.from_iterable(
+                    product(["-c"], (f"{key}:{value}" for key, value in conf.items()))
+                )
             )
+        if set_default:
+            args.append("--set-default")
+
+        res = self.raw_cli(args)
         res.check_returncode()
         return tenant_id, timeline_id
 
+    def set_default(self, tenant_id: TenantId):
+        """
+        Update default tenant for future operations that require tenant_id.
+        """
+        res = self.raw_cli(["tenant", "set-default", "--tenant-id", str(tenant_id)])
+        res.check_returncode()
+
     def config_tenant(self, tenant_id: TenantId, conf: Dict[str, str]):
         """
         Update tenant config.
         """
-        if conf is None:
-            res = self.raw_cli(["tenant", "config", "--tenant-id", str(tenant_id)])
-        else:
-            res = self.raw_cli(
-                ["tenant", "config", "--tenant-id", str(tenant_id)]
-                + sum(list(map(lambda kv: (["-c", kv[0] + ":" + kv[1]]), conf.items())), [])
+
+        args = ["tenant", "config", "--tenant-id", str(tenant_id)]
+        if conf is not None:
+            args.extend(
+                chain.from_iterable(
+                    product(["-c"], (f"{key}:{value}" for key, value in conf.items()))
+                )
             )
+
+        res = self.raw_cli(args)
         res.check_returncode()
 
     def list_tenants(self) -> "subprocess.CompletedProcess[str]":
@@ -1650,36 +1656,6 @@ class NeonCli(AbstractNeonCli):
 
         return TimelineId(str(created_timeline_id))
 
-    def create_root_branch(
-        self,
-        branch_name: str,
-        tenant_id: Optional[TenantId] = None,
-    ):
-        cmd = [
-            "timeline",
-            "create",
-            "--branch-name",
-            branch_name,
-            "--tenant-id",
-            str(tenant_id or self.env.initial_tenant),
-            "--pg-version",
-            self.env.pg_version,
-        ]
-
-        res = self.raw_cli(cmd)
-        res.check_returncode()
-
-        matches = CREATE_TIMELINE_ID_EXTRACTOR.search(res.stdout)
-
-        created_timeline_id = None
-        if matches is not None:
-            created_timeline_id = matches.group("timeline_id")
-
-        if created_timeline_id is None:
-            raise Exception("could not find timeline id after `neon timeline create` invocation")
-        else:
-            return TimelineId(created_timeline_id)
-
     def create_branch(
         self,
         new_branch_name: str = DEFAULT_BRANCH_NAME,
diff --git a/test_runner/regress/test_neon_local_cli.py b/test_runner/regress/test_neon_local_cli.py
index e8f01ccf55..49c063ce44 100644
--- a/test_runner/regress/test_neon_local_cli.py
+++ b/test_runner/regress/test_neon_local_cli.py
@@ -2,9 +2,16 @@ from fixtures.neon_fixtures import NeonEnvBuilder
 
 
 # Test that neon cli is able to start and stop all processes with the user defaults.
-# def test_neon_cli_basics(neon_simple_env: NeonEnv):
+# Repeats the example from README.md as closely as it can
 def test_neon_cli_basics(neon_env_builder: NeonEnvBuilder):
     env = neon_env_builder.init_configs()
+    # Skipping the init step that creates a local tenant in Pytest tests
+    try:
+        env.neon_cli.start()
+        env.neon_cli.create_tenant(tenant_id=env.initial_tenant, set_default=True)
+        env.neon_cli.pg_start(node_name="main")
 
-    env.neon_cli.start()
-    env.neon_cli.stop()
+        env.neon_cli.create_branch(new_branch_name="migration_check")
+        env.neon_cli.pg_start(node_name="migration_check")
+    finally:
+        env.neon_cli.stop()

From df42213dbb5958a50cb6a80743b69d9b50836507 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Fri, 6 Jan 2023 14:52:20 +0200
Subject: [PATCH 117/132] Fix missing COMMIT in handle_role_deletions.

There was no COMMIT, so the DROP ROLE commands were always implicitly
rolled back.

Fixes issue #3279.
---
 compute_tools/src/spec.rs | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs
index 58c94d74ae..ce396f4527 100644
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -197,22 +197,18 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
 
 /// Reassign all dependent objects and delete requested roles.
 pub fn handle_role_deletions(node: &ComputeNode, client: &mut Client) -> Result<()> {
-    let spec = &node.spec;
-
-    // First, reassign all dependent objects to db owners.
-    if let Some(ops) = &spec.delta_operations {
+    if let Some(ops) = &node.spec.delta_operations {
+        // First, reassign all dependent objects to db owners.
         info!("reassigning dependent objects of to-be-deleted roles");
         for op in ops {
             if op.action == "delete_role" {
                 reassign_owned_objects(node, &op.name)?;
             }
         }
-    }
 
-    // Second, proceed with role deletions.
-    let mut xact = client.transaction()?;
-    if let Some(ops) = &spec.delta_operations {
+        // Second, proceed with role deletions.
         info!("processing role deletions");
+        let mut xact = client.transaction()?;
         for op in ops {
             // We do not check either role exists or not,
             // Postgres will take care of it for us
@@ -223,6 +219,7 @@ pub fn handle_role_deletions(node: &ComputeNode, client: &mut Client) -> Result<
                 xact.execute(query.as_str(), &[])?;
             }
         }
+        xact.commit()?;
     }
 
     Ok(())

From debd134b15083ebd0587b760232724e0a644af31 Mon Sep 17 00:00:00 2001
From: Arthur Petukhovsky <petuhovskiy@yandex.ru>
Date: Fri, 6 Jan 2023 19:34:18 +0400
Subject: [PATCH 118/132] Implement wss support in proxy (#3247)

This is a hacky implementation of a WebSocket server, embedded into our
postgres proxy. The server is used to allow https://github.com/neondatabase/serverless
to connect to our postgres from the browser and serverless javascript functions.

How it will work (general schema):
- browser opens a websocket connection to
`wss://ep-abc-xyz-123.xx-central-1.aws.neon.tech/`
- proxy accepts this connection and terminates TLS (https)
- inside encrypted tunnel (HTTPS), browser initiates plain
(non-encrypted) postgres connection
- proxy performs auth as in usual plain pg connection and forwards
connection to the compute

Related issue: #3225
---
 Cargo.lock                    |  79 ++++++++++
 proxy/Cargo.toml              |   4 +
 proxy/src/auth/backend.rs     |  32 ++++-
 proxy/src/auth/credentials.rs |  23 ++-
 proxy/src/auth/flow.rs        |  23 +++
 proxy/src/http.rs             |   1 +
 proxy/src/http/websocket.rs   | 263 ++++++++++++++++++++++++++++++++++
 proxy/src/main.rs             |  22 ++-
 proxy/src/proxy.rs            |  43 +++++-
 9 files changed, 476 insertions(+), 14 deletions(-)
 create mode 100644 proxy/src/http/websocket.rs

diff --git a/Cargo.lock b/Cargo.lock
index fbf018e1c0..284a111ba7 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1700,6 +1700,19 @@ dependencies = [
  "tokio-io-timeout",
 ]
 
+[[package]]
+name = "hyper-tungstenite"
+version = "0.8.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d62004bcd4f6f85d9e2aa4206f1466ee67031f5ededcb6c6e62d48f9306ad879"
+dependencies = [
+ "hyper",
+ "pin-project",
+ "tokio",
+ "tokio-tungstenite",
+ "tungstenite",
+]
+
 [[package]]
 name = "iana-time-zone"
 version = "0.1.53"
@@ -2658,6 +2671,7 @@ dependencies = [
  "hex",
  "hmac",
  "hyper",
+ "hyper-tungstenite",
  "itertools",
  "md5",
  "metrics",
@@ -2667,6 +2681,7 @@ dependencies = [
  "pq_proto",
  "rand",
  "rcgen",
+ "regex",
  "reqwest",
  "routerify",
  "rstest",
@@ -2678,6 +2693,7 @@ dependencies = [
  "sha2",
  "socket2",
  "thiserror",
+ "tls-listener",
  "tokio",
  "tokio-postgres",
  "tokio-postgres-rustls",
@@ -2687,6 +2703,7 @@ dependencies = [
  "url",
  "utils",
  "uuid",
+ "webpki-roots",
  "workspace_hack",
  "x509-parser",
 ]
@@ -3324,6 +3341,17 @@ dependencies = [
  "syn",
 ]
 
+[[package]]
+name = "sha-1"
+version = "0.10.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f5058ada175748e33390e40e872bd0fe59a19f265d0158daa551c5a88a76009c"
+dependencies = [
+ "cfg-if",
+ "cpufeatures",
+ "digest",
+]
+
 [[package]]
 name = "sha1"
 version = "0.10.5"
@@ -3687,6 +3715,20 @@ version = "0.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c"
 
+[[package]]
+name = "tls-listener"
+version = "0.5.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c9d4ff21187d434ac7709bfc7441ca88f63681247e5ad99f0f08c8c91ddc103d"
+dependencies = [
+ "futures-util",
+ "hyper",
+ "pin-project-lite",
+ "thiserror",
+ "tokio",
+ "tokio-rustls",
+]
+
 [[package]]
 name = "tokio"
 version = "1.21.1"
@@ -3801,6 +3843,18 @@ dependencies = [
  "xattr",
 ]
 
+[[package]]
+name = "tokio-tungstenite"
+version = "0.17.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f714dd15bead90401d77e04243611caec13726c2408afd5b31901dfcdcb3b181"
+dependencies = [
+ "futures-util",
+ "log",
+ "tokio",
+ "tungstenite",
+]
+
 [[package]]
 name = "tokio-util"
 version = "0.7.4"
@@ -4027,6 +4081,25 @@ version = "0.2.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "59547bce71d9c38b83d9c0e92b6066c4253371f15005def0c30d9657f50c7642"
 
+[[package]]
+name = "tungstenite"
+version = "0.17.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e27992fd6a8c29ee7eef28fc78349aa244134e10ad447ce3b9f0ac0ed0fa4ce0"
+dependencies = [
+ "base64 0.13.1",
+ "byteorder",
+ "bytes",
+ "http",
+ "httparse",
+ "log",
+ "rand",
+ "sha-1",
+ "thiserror",
+ "url",
+ "utf-8",
+]
+
 [[package]]
 name = "typenum"
 version = "1.16.0"
@@ -4115,6 +4188,12 @@ version = "2.1.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e8db7427f936968176eaa7cdf81b7f98b980b18495ec28f1b5791ac3bfe3eea9"
 
+[[package]]
+name = "utf-8"
+version = "0.7.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9"
+
 [[package]]
 name = "utils"
 version = "0.1.0"
diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml
index 0bf47c7b88..cbc067093e 100644
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -17,12 +17,14 @@ hashbrown = "0.12"
 hex = "0.4.3"
 hmac = "0.12.1"
 hyper = "0.14"
+hyper-tungstenite = "0.8.1"
 itertools = "0.10.3"
 md5 = "0.7.0"
 once_cell = "1.13.0"
 parking_lot = "0.12"
 pin-project-lite = "0.2.7"
 rand = "0.8.3"
+regex = "1.4.5"
 reqwest = { version = "0.11", default-features = false, features = [ "json", "rustls-tls" ] }
 routerify = "3"
 rustls = "0.20.0"
@@ -36,10 +38,12 @@ thiserror = "1.0.30"
 tokio = { version = "1.17", features = ["macros"] }
 tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
 tokio-rustls = "0.23.0"
+tls-listener = { version = "0.5.1", features = ["rustls", "hyper-h1"] }
 tracing = "0.1.36"
 tracing-subscriber = { version = "0.3", features = ["env-filter"] }
 url = "2.2.2"
 uuid = { version = "1.2", features = ["v4", "serde"] }
+webpki-roots = "0.22.5"
 x509-parser = "0.14"
 
 metrics = { path = "../libs/metrics" }
diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs
index 4adf0ed940..e6a179a040 100644
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -149,7 +149,7 @@ impl BackendType<'_, ClientCredentials<'_>> {
         // If there's no project so far, that entails that client doesn't
         // support SNI or other means of passing the project name.
         // We now expect to see a very specific payload in the place of password.
-        let fetch_magic_payload = async {
+        let fetch_magic_payload = |client| async {
             warn!("project name not specified, resorting to the password hack auth flow");
             let payload = AuthFlow::new(client)
                 .begin(auth::PasswordHack)
@@ -161,10 +161,26 @@ impl BackendType<'_, ClientCredentials<'_>> {
             auth::Result::Ok(payload)
         };
 
+        // If we want to use cleartext password flow, we can read the password
+        // from the client and pretend that it's a magic payload (PasswordHack hack).
+        let fetch_plaintext_password = |client| async {
+            info!("using cleartext password flow");
+            let payload = AuthFlow::new(client)
+                .begin(auth::CleartextPassword)
+                .await?
+                .authenticate()
+                .await?;
+
+            auth::Result::Ok(auth::password_hack::PasswordHackPayload {
+                project: String::new(),
+                password: payload,
+            })
+        };
+
         // TODO: find a proper way to merge those very similar blocks.
         let (mut node, payload) = match self {
             Console(endpoint, creds) if creds.project.is_none() => {
-                let payload = fetch_magic_payload.await?;
+                let payload = fetch_magic_payload(client).await?;
 
                 let mut creds = creds.as_ref();
                 creds.project = Some(payload.project.as_str().into());
@@ -174,8 +190,18 @@ impl BackendType<'_, ClientCredentials<'_>> {
 
                 (node, payload)
             }
+            Console(endpoint, creds) if creds.use_cleartext_password_flow => {
+                // This is a hack to allow cleartext password in secure connections (wss).
+                let payload = fetch_plaintext_password(client).await?;
+                let creds = creds.as_ref();
+                let node = console::Api::new(endpoint, extra, &creds)
+                    .wake_compute()
+                    .await?;
+
+                (node, payload)
+            }
             Postgres(endpoint, creds) if creds.project.is_none() => {
-                let payload = fetch_magic_payload.await?;
+                let payload = fetch_magic_payload(client).await?;
 
                 let mut creds = creds.as_ref();
                 creds.project = Some(payload.project.as_str().into());
diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs
index 0a3b84bb52..3b71bef9aa 100644
--- a/proxy/src/auth/credentials.rs
+++ b/proxy/src/auth/credentials.rs
@@ -34,6 +34,9 @@ pub struct ClientCredentials<'a> {
     pub user: &'a str,
     pub dbname: &'a str,
     pub project: Option<Cow<'a, str>>,
+    /// If `true`, we'll use the old cleartext password flow. This is used for
+    /// websocket connections, which want to minimize the number of round trips.
+    pub use_cleartext_password_flow: bool,
 }
 
 impl ClientCredentials<'_> {
@@ -50,6 +53,7 @@ impl<'a> ClientCredentials<'a> {
             user: self.user,
             dbname: self.dbname,
             project: self.project().map(Cow::Borrowed),
+            use_cleartext_password_flow: self.use_cleartext_password_flow,
         }
     }
 }
@@ -59,6 +63,7 @@ impl<'a> ClientCredentials<'a> {
         params: &'a StartupMessageParams,
         sni: Option<&str>,
         common_name: Option<&str>,
+        use_cleartext_password_flow: bool,
     ) -> Result<Self, ClientCredsParseError> {
         use ClientCredsParseError::*;
 
@@ -108,6 +113,7 @@ impl<'a> ClientCredentials<'a> {
             user = user,
             dbname = dbname,
             project = project.as_deref(),
+            use_cleartext_password_flow = use_cleartext_password_flow,
             "credentials"
         );
 
@@ -115,6 +121,7 @@ impl<'a> ClientCredentials<'a> {
             user,
             dbname,
             project,
+            use_cleartext_password_flow,
         })
     }
 }
@@ -141,7 +148,7 @@ mod tests {
         let options = StartupMessageParams::new([("user", "john_doe")]);
 
         // TODO: check that `creds.dbname` is None.
-        let creds = ClientCredentials::parse(&options, None, None)?;
+        let creds = ClientCredentials::parse(&options, None, None, false)?;
         assert_eq!(creds.user, "john_doe");
 
         Ok(())
@@ -151,7 +158,7 @@ mod tests {
     fn parse_missing_project() -> anyhow::Result<()> {
         let options = StartupMessageParams::new([("user", "john_doe"), ("database", "world")]);
 
-        let creds = ClientCredentials::parse(&options, None, None)?;
+        let creds = ClientCredentials::parse(&options, None, None, false)?;
         assert_eq!(creds.user, "john_doe");
         assert_eq!(creds.dbname, "world");
         assert_eq!(creds.project, None);
@@ -166,7 +173,7 @@ mod tests {
         let sni = Some("foo.localhost");
         let common_name = Some("localhost");
 
-        let creds = ClientCredentials::parse(&options, sni, common_name)?;
+        let creds = ClientCredentials::parse(&options, sni, common_name, false)?;
         assert_eq!(creds.user, "john_doe");
         assert_eq!(creds.dbname, "world");
         assert_eq!(creds.project.as_deref(), Some("foo"));
@@ -182,7 +189,7 @@ mod tests {
             ("options", "-ckey=1 project=bar -c geqo=off"),
         ]);
 
-        let creds = ClientCredentials::parse(&options, None, None)?;
+        let creds = ClientCredentials::parse(&options, None, None, false)?;
         assert_eq!(creds.user, "john_doe");
         assert_eq!(creds.dbname, "world");
         assert_eq!(creds.project.as_deref(), Some("bar"));
@@ -201,7 +208,7 @@ mod tests {
         let sni = Some("baz.localhost");
         let common_name = Some("localhost");
 
-        let creds = ClientCredentials::parse(&options, sni, common_name)?;
+        let creds = ClientCredentials::parse(&options, sni, common_name, false)?;
         assert_eq!(creds.user, "john_doe");
         assert_eq!(creds.dbname, "world");
         assert_eq!(creds.project.as_deref(), Some("baz"));
@@ -220,7 +227,8 @@ mod tests {
         let sni = Some("second.localhost");
         let common_name = Some("localhost");
 
-        let err = ClientCredentials::parse(&options, sni, common_name).expect_err("should fail");
+        let err =
+            ClientCredentials::parse(&options, sni, common_name, false).expect_err("should fail");
         match err {
             InconsistentProjectNames { domain, option } => {
                 assert_eq!(option, "first");
@@ -237,7 +245,8 @@ mod tests {
         let sni = Some("project.localhost");
         let common_name = Some("example.com");
 
-        let err = ClientCredentials::parse(&options, sni, common_name).expect_err("should fail");
+        let err =
+            ClientCredentials::parse(&options, sni, common_name, false).expect_err("should fail");
         match err {
             InconsistentSni { sni, cn } => {
                 assert_eq!(sni, "project.localhost");
diff --git a/proxy/src/auth/flow.rs b/proxy/src/auth/flow.rs
index d9ee50894d..4b982c0c5e 100644
--- a/proxy/src/auth/flow.rs
+++ b/proxy/src/auth/flow.rs
@@ -37,6 +37,17 @@ impl AuthMethod for PasswordHack {
     }
 }
 
+/// Use clear-text password auth called `password` in docs
+/// <https://www.postgresql.org/docs/current/auth-password.html>
+pub struct CleartextPassword;
+
+impl AuthMethod for CleartextPassword {
+    #[inline(always)]
+    fn first_message(&self) -> BeMessage<'_> {
+        Be::AuthenticationCleartextPassword
+    }
+}
+
 /// This wrapper for [`PqStream`] performs client authentication.
 #[must_use]
 pub struct AuthFlow<'a, Stream, State> {
@@ -86,6 +97,18 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, PasswordHack> {
     }
 }
 
+impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, CleartextPassword> {
+    /// Perform user authentication. Raise an error in case authentication failed.
+    pub async fn authenticate(self) -> super::Result<Vec<u8>> {
+        let msg = self.stream.read_password_message().await?;
+        let password = msg
+            .strip_suffix(&[0])
+            .ok_or(AuthErrorImpl::MalformedPassword("missing terminator"))?;
+
+        Ok(password.to_vec())
+    }
+}
+
 /// Stream wrapper for handling [SCRAM](crate::scram) auth.
 impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, Scram<'_>> {
     /// Perform user authentication. Raise an error in case authentication failed.
diff --git a/proxy/src/http.rs b/proxy/src/http.rs
index 096a33d73d..e847edc8bd 100644
--- a/proxy/src/http.rs
+++ b/proxy/src/http.rs
@@ -1,4 +1,5 @@
 pub mod server;
+pub mod websocket;
 
 use crate::url::ApiUrl;
 
diff --git a/proxy/src/http/websocket.rs b/proxy/src/http/websocket.rs
new file mode 100644
index 0000000000..33c2752307
--- /dev/null
+++ b/proxy/src/http/websocket.rs
@@ -0,0 +1,263 @@
+use bytes::{Buf, Bytes};
+use futures::{Sink, Stream, StreamExt};
+use hyper::server::accept::{self};
+use hyper::server::conn::AddrIncoming;
+use hyper::upgrade::Upgraded;
+use hyper::{Body, Request, Response, StatusCode};
+use hyper_tungstenite::{tungstenite, WebSocketStream};
+use hyper_tungstenite::{tungstenite::Message, HyperWebsocket};
+use pin_project_lite::pin_project;
+use tokio::net::TcpListener;
+
+use std::convert::Infallible;
+use std::future::ready;
+use std::pin::Pin;
+use std::sync::Arc;
+use std::task::{Context, Poll};
+use tls_listener::TlsListener;
+
+use tokio::io::{self, AsyncBufRead, AsyncRead, AsyncWrite, ReadBuf};
+
+use tracing::{error, info, info_span, warn, Instrument};
+use utils::http::{error::ApiError, json::json_response};
+
+use crate::cancellation::CancelMap;
+use crate::config::ProxyConfig;
+use crate::proxy::handle_ws_client;
+
+pin_project! {
+    /// This is a wrapper around a WebSocketStream that implements AsyncRead and AsyncWrite.
+    pub struct WebSocketRW {
+        #[pin]
+        stream: WebSocketStream<Upgraded>,
+        chunk: Option<bytes::Bytes>,
+    }
+}
+
+// FIXME: explain why this is safe or try to remove `unsafe impl`.
+unsafe impl Sync for WebSocketRW {}
+
+impl WebSocketRW {
+    pub fn new(stream: WebSocketStream<Upgraded>) -> Self {
+        Self {
+            stream,
+            chunk: None,
+        }
+    }
+
+    fn has_chunk(&self) -> bool {
+        if let Some(ref chunk) = self.chunk {
+            chunk.remaining() > 0
+        } else {
+            false
+        }
+    }
+}
+
+fn ws_err_into(e: tungstenite::Error) -> io::Error {
+    io::Error::new(io::ErrorKind::Other, e.to_string())
+}
+
+impl AsyncWrite for WebSocketRW {
+    fn poll_write(
+        self: Pin<&mut Self>,
+        cx: &mut Context<'_>,
+        buf: &[u8],
+    ) -> Poll<Result<usize, io::Error>> {
+        let mut this = self.project();
+        match this.stream.as_mut().poll_ready(cx) {
+            Poll::Ready(Ok(())) => {
+                if let Err(e) = this
+                    .stream
+                    .as_mut()
+                    .start_send(Message::Binary(buf.to_vec()))
+                {
+                    Poll::Ready(Err(ws_err_into(e)))
+                } else {
+                    Poll::Ready(Ok(buf.len()))
+                }
+            }
+            Poll::Ready(Err(e)) => Poll::Ready(Err(ws_err_into(e))),
+            Poll::Pending => {
+                cx.waker().wake_by_ref();
+                Poll::Pending
+            }
+        }
+    }
+
+    fn poll_flush(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Result<(), io::Error>> {
+        self.project().stream.poll_flush(cx).map_err(ws_err_into)
+    }
+
+    fn poll_shutdown(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Result<(), io::Error>> {
+        self.project().stream.poll_close(cx).map_err(ws_err_into)
+    }
+}
+
+impl AsyncRead for WebSocketRW {
+    fn poll_read(
+        mut self: Pin<&mut Self>,
+        cx: &mut Context<'_>,
+        buf: &mut ReadBuf<'_>,
+    ) -> Poll<io::Result<()>> {
+        if buf.remaining() == 0 {
+            return Poll::Ready(Ok(()));
+        }
+
+        let inner_buf = match self.as_mut().poll_fill_buf(cx) {
+            Poll::Ready(Ok(buf)) => buf,
+            Poll::Ready(Err(err)) => return Poll::Ready(Err(err)),
+            Poll::Pending => return Poll::Pending,
+        };
+        let len = std::cmp::min(inner_buf.len(), buf.remaining());
+        buf.put_slice(&inner_buf[..len]);
+
+        self.consume(len);
+        Poll::Ready(Ok(()))
+    }
+}
+
+impl AsyncBufRead for WebSocketRW {
+    fn poll_fill_buf(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<io::Result<&[u8]>> {
+        loop {
+            if self.as_mut().has_chunk() {
+                let buf = self.project().chunk.as_ref().unwrap().chunk();
+                return Poll::Ready(Ok(buf));
+            } else {
+                match self.as_mut().project().stream.poll_next(cx) {
+                    Poll::Ready(Some(Ok(message))) => match message {
+                        Message::Text(_) => {}
+                        Message::Binary(chunk) => {
+                            *self.as_mut().project().chunk = Some(Bytes::from(chunk));
+                        }
+                        Message::Ping(_) => {
+                            // No need to send a reply: tungstenite takes care of this for you.
+                        }
+                        Message::Pong(_) => {}
+                        Message::Close(_) => {
+                            // No need to send a reply: tungstenite takes care of this for you.
+                            return Poll::Ready(Ok(&[]));
+                        }
+                        Message::Frame(_) => {
+                            unreachable!();
+                        }
+                    },
+                    Poll::Ready(Some(Err(err))) => return Poll::Ready(Err(ws_err_into(err))),
+                    Poll::Ready(None) => return Poll::Ready(Ok(&[])),
+                    Poll::Pending => return Poll::Pending,
+                }
+            }
+        }
+    }
+
+    fn consume(self: Pin<&mut Self>, amt: usize) {
+        if amt > 0 {
+            self.project()
+                .chunk
+                .as_mut()
+                .expect("No chunk present")
+                .advance(amt);
+        }
+    }
+}
+
+async fn serve_websocket(
+    websocket: HyperWebsocket,
+    config: &ProxyConfig,
+    cancel_map: &CancelMap,
+    session_id: uuid::Uuid,
+    hostname: Option<String>,
+) -> anyhow::Result<()> {
+    let websocket = websocket.await?;
+    handle_ws_client(
+        config,
+        cancel_map,
+        session_id,
+        WebSocketRW::new(websocket),
+        hostname,
+    )
+    .await?;
+    Ok(())
+}
+
+async fn ws_handler(
+    mut request: Request<Body>,
+    config: &'static ProxyConfig,
+    cancel_map: Arc<CancelMap>,
+    session_id: uuid::Uuid,
+) -> Result<Response<Body>, ApiError> {
+    let host = request
+        .headers()
+        .get("host")
+        .and_then(|h| h.to_str().ok())
+        .and_then(|h| h.split(':').next())
+        .map(|s| s.to_string());
+
+    // Check if the request is a websocket upgrade request.
+    if hyper_tungstenite::is_upgrade_request(&request) {
+        let (response, websocket) = hyper_tungstenite::upgrade(&mut request, None)
+            .map_err(|e| ApiError::BadRequest(e.into()))?;
+
+        tokio::spawn(async move {
+            if let Err(e) = serve_websocket(websocket, config, &cancel_map, session_id, host).await
+            {
+                error!("error in websocket connection: {:?}", e);
+            }
+        });
+
+        // Return the response so the spawned future can continue.
+        Ok(response)
+    } else {
+        json_response(StatusCode::OK, "Connect with a websocket client")
+    }
+}
+
+pub async fn task_main(
+    ws_listener: TcpListener,
+    config: &'static ProxyConfig,
+) -> anyhow::Result<()> {
+    scopeguard::defer! {
+        info!("websocket server has shut down");
+    }
+
+    let tls_config = config.tls_config.as_ref().map(|cfg| cfg.to_server_config());
+    let tls_acceptor: tokio_rustls::TlsAcceptor = match tls_config {
+        Some(config) => config.into(),
+        None => {
+            warn!("TLS config is missing, WebSocket Secure server will not be started");
+            return Ok(());
+        }
+    };
+
+    let addr_incoming = AddrIncoming::from_listener(ws_listener)?;
+
+    let tls_listener = TlsListener::new(tls_acceptor, addr_incoming).filter(|conn| {
+        if let Err(err) = conn {
+            error!("failed to accept TLS connection for websockets: {:?}", err);
+            ready(false)
+        } else {
+            ready(true)
+        }
+    });
+
+    let make_svc = hyper::service::make_service_fn(|_stream| async move {
+        Ok::<_, Infallible>(hyper::service::service_fn(
+            move |req: Request<Body>| async move {
+                let cancel_map = Arc::new(CancelMap::default());
+                let session_id = uuid::Uuid::new_v4();
+                ws_handler(req, config, cancel_map, session_id)
+                    .instrument(info_span!(
+                        "ws-client",
+                        session = format_args!("{session_id}")
+                    ))
+                    .await
+            },
+        ))
+    });
+
+    hyper::Server::builder(accept::from_stream(tls_listener))
+        .serve(make_svc)
+        .await?;
+
+    Ok(())
+}
diff --git a/proxy/src/main.rs b/proxy/src/main.rs
index 89ea9142a9..aa6766c102 100644
--- a/proxy/src/main.rs
+++ b/proxy/src/main.rs
@@ -110,12 +110,23 @@ async fn main() -> anyhow::Result<()> {
     info!("Starting proxy on {proxy_address}");
     let proxy_listener = TcpListener::bind(proxy_address).await?;
 
-    let tasks = [
+    let mut tasks = vec![
         tokio::spawn(http::server::task_main(http_listener)),
         tokio::spawn(proxy::task_main(config, proxy_listener)),
         tokio::task::spawn_blocking(move || mgmt::thread_main(mgmt_listener)),
-    ]
-    .map(flatten_err);
+    ];
+
+    if let Some(wss_address) = arg_matches.get_one::<String>("wss") {
+        let wss_address: SocketAddr = wss_address.parse()?;
+        info!("Starting wss on {}", wss_address);
+        let wss_listener = TcpListener::bind(wss_address).await?;
+        tasks.push(tokio::spawn(http::websocket::task_main(
+            wss_listener,
+            config,
+        )));
+    }
+
+    let tasks = tasks.into_iter().map(flatten_err);
 
     set_build_info_metric(GIT_VERSION);
     // This will block until all tasks have completed.
@@ -155,6 +166,11 @@ fn cli() -> clap::Command {
                 .help("listen for incoming http connections (metrics, etc) on ip:port")
                 .default_value("127.0.0.1:7001"),
         )
+        .arg(
+            Arg::new("wss")
+                .long("wss")
+                .help("listen for incoming wss connections on ip:port"),
+        )
         .arg(
             Arg::new("uri")
                 .short('u')
diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs
index 382f7cd918..63573d49c0 100644
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -82,6 +82,47 @@ pub async fn task_main(
     }
 }
 
+pub async fn handle_ws_client(
+    config: &ProxyConfig,
+    cancel_map: &CancelMap,
+    session_id: uuid::Uuid,
+    stream: impl AsyncRead + AsyncWrite + Unpin + Send,
+    hostname: Option<String>,
+) -> anyhow::Result<()> {
+    // The `closed` counter will increase when this future is destroyed.
+    NUM_CONNECTIONS_ACCEPTED_COUNTER.inc();
+    scopeguard::defer! {
+        NUM_CONNECTIONS_CLOSED_COUNTER.inc();
+    }
+
+    let tls = config.tls_config.as_ref();
+    let hostname = hostname.as_deref();
+
+    // TLS is None here, because the connection is already encrypted.
+    let do_handshake = handshake(stream, None, cancel_map).instrument(info_span!("handshake"));
+    let (mut stream, params) = match do_handshake.await? {
+        Some(x) => x,
+        None => return Ok(()), // it's a cancellation request
+    };
+
+    // Extract credentials which we're going to use for auth.
+    let creds = {
+        let common_name = tls.and_then(|tls| tls.common_name.as_deref());
+        let result = config
+            .auth_backend
+            .as_ref()
+            .map(|_| auth::ClientCredentials::parse(&params, hostname, common_name, true))
+            .transpose();
+
+        async { result }.or_else(|e| stream.throw_error(e)).await?
+    };
+
+    let client = Client::new(stream, creds, &params, session_id);
+    cancel_map
+        .with_session(|session| client.connect_to_db(session))
+        .await
+}
+
 async fn handle_client(
     config: &ProxyConfig,
     cancel_map: &CancelMap,
@@ -108,7 +149,7 @@ async fn handle_client(
         let result = config
             .auth_backend
             .as_ref()
-            .map(|_| auth::ClientCredentials::parse(&params, sni, common_name))
+            .map(|_| auth::ClientCredentials::parse(&params, sni, common_name, false))
             .transpose();
 
         async { result }.or_else(|e| stream.throw_error(e)).await?

From af9425394ffe0b0ee17908c9840525d43312ca31 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Fri, 6 Jan 2023 17:15:56 +0200
Subject: [PATCH 119/132] Print time taken by CREATE/ALTER DATABASE at compute
 start.

Trying to investigate why the "apply_config" stage is taking longer
than expected. The printed timings should prove or disprove that the
CREATE DATABASE statement is the cause.
---
 compute_tools/src/spec.rs | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs
index ce396f4527..81e01fe555 100644
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -1,5 +1,6 @@
 use std::path::Path;
 use std::str::FromStr;
+use std::time::Instant;
 
 use anyhow::Result;
 use log::{info, log_enabled, warn, Level};
@@ -314,6 +315,7 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
         // XXX: with a limited number of databases it is fine, but consider making it a HashMap
         let pg_db = existing_dbs.iter().find(|r| r.name == *name);
 
+        let start_time = Instant::now();
         if let Some(r) = pg_db {
             // XXX: db owner name is returned as quoted string from Postgres,
             // when quoting is needed.
@@ -332,6 +334,8 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
                 info_print!(" -> update");
 
                 client.execute(query.as_str(), &[])?;
+                let elapsed = start_time.elapsed().as_millis();
+                info_print!(" ({} ms)", elapsed);
             }
         } else {
             let mut query: String = format!("CREATE DATABASE {} ", name.pg_quote());
@@ -339,6 +343,9 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
 
             query.push_str(&db.to_pg_options());
             client.execute(query.as_str(), &[])?;
+
+            let elapsed = start_time.elapsed().as_millis();
+            info_print!(" ({} ms)", elapsed);
         }
 
         info_print!("\n");

From 3526323bc470d763d70e85fe2a87f4269f17e5e0 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Fri, 6 Jan 2023 18:42:25 +0100
Subject: [PATCH 120/132] prepare Timeline::get_reconstruct_data for becoming
 async  (#3271)

This patch restructures the code so that PR
https://github.com/neondatabase/neon/pull/3228 can seamlessly
replace the return PageReconstructResult::NeedsDownload with
a download_remote_layer().await.

Background:

PR https://github.com/neondatabase/neon/pull/3228 will turn
get_reconstruct_data() async and do the on-demand
download right in place, instead of returning a
PageReconstructResult::NeedsDownload.

Current rustc requires that the layers lock guard not be in scope
across an await point.

For on-demand download inside get_reconstruct_data(), we need
to do download_remote_layer().await.

Supersedes https://github.com/neondatabase/neon/pull/3260

See my comment there:
https://github.com/neondatabase/neon/pull/3260#issuecomment-1370752407

Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
---
 pageserver/src/tenant/layer_map.rs |   4 +-
 pageserver/src/tenant/timeline.rs  | 210 ++++++++++++++++++-----------
 2 files changed, 134 insertions(+), 80 deletions(-)

diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs
index 4ff2d4b0d8..44bed5959f 100644
--- a/pageserver/src/tenant/layer_map.rs
+++ b/pageserver/src/tenant/layer_map.rs
@@ -260,8 +260,10 @@ where
     /// contain the version, even if it's missing from the returned
     /// layer.
     ///
+    /// NOTE: This only searches the 'historic' layers, *not* the
+    /// 'open' and 'frozen' layers!
+    ///
     pub fn search(&self, key: Key, end_lsn: Lsn) -> Option<SearchResult<L>> {
-        // linear search
         // Find the latest image layer that covers the given key
         let mut latest_img: Option<Arc<L>> = None;
         let mut latest_img_lsn: Option<Lsn> = None;
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 2c22c6694d..477108ec4c 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -1642,8 +1642,7 @@ impl Timeline {
 
         // For debugging purposes, collect the path of layers that we traversed
         // through. It's included in the error message if we fail to find the key.
-        let mut traversal_path =
-            Vec::<(ValueReconstructResult, Lsn, Box<dyn TraversalLayerExt>)>::new();
+        let mut traversal_path = Vec::<TraversalPathItem>::new();
 
         let cached_lsn = if let Some((cached_lsn, _)) = &reconstruct_state.img {
             *cached_lsn
@@ -1708,82 +1707,132 @@ impl Timeline {
                 timeline_owned = ancestor;
                 timeline = &*timeline_owned;
                 prev_lsn = Lsn(u64::MAX);
-                continue;
+                continue 'outer;
             }
 
-            let layers = timeline.layers.read().unwrap();
+            #[allow(unused_labels, clippy::never_loop)] // see comment at bottom of this loop
+            'layer_map_search: loop {
+                let remote_layer = {
+                    let layers = timeline.layers.read().unwrap();
 
-            // Check the open and frozen in-memory layers first, in order from newest
-            // to oldest.
-            if let Some(open_layer) = &layers.open_layer {
-                let start_lsn = open_layer.get_lsn_range().start;
-                if cont_lsn > start_lsn {
-                    //info!("CHECKING for {} at {} on open layer {}", key, cont_lsn, open_layer.filename().display());
-                    // Get all the data needed to reconstruct the page version from this layer.
-                    // But if we have an older cached page image, no need to go past that.
-                    let lsn_floor = max(cached_lsn + 1, start_lsn);
-                    result = match open_layer.get_value_reconstruct_data(
-                        key,
-                        lsn_floor..cont_lsn,
-                        reconstruct_state,
-                    ) {
-                        Ok(result) => result,
-                        Err(e) => return PageReconstructResult::from(e),
-                    };
-                    cont_lsn = lsn_floor;
-                    traversal_path.push((result, cont_lsn, Box::new(open_layer.clone())));
-                    continue;
-                }
-            }
-            for frozen_layer in layers.frozen_layers.iter().rev() {
-                let start_lsn = frozen_layer.get_lsn_range().start;
-                if cont_lsn > start_lsn {
-                    //info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.filename().display());
-                    let lsn_floor = max(cached_lsn + 1, start_lsn);
-                    result = match frozen_layer.get_value_reconstruct_data(
-                        key,
-                        lsn_floor..cont_lsn,
-                        reconstruct_state,
-                    ) {
-                        Ok(result) => result,
-                        Err(e) => return PageReconstructResult::from(e),
-                    };
-                    cont_lsn = lsn_floor;
-                    traversal_path.push((result, cont_lsn, Box::new(frozen_layer.clone())));
-                    continue 'outer;
-                }
-            }
+                    // Check the open and frozen in-memory layers first, in order from newest
+                    // to oldest.
+                    if let Some(open_layer) = &layers.open_layer {
+                        let start_lsn = open_layer.get_lsn_range().start;
+                        if cont_lsn > start_lsn {
+                            //info!("CHECKING for {} at {} on open layer {}", key, cont_lsn, open_layer.filename().display());
+                            // Get all the data needed to reconstruct the page version from this layer.
+                            // But if we have an older cached page image, no need to go past that.
+                            let lsn_floor = max(cached_lsn + 1, start_lsn);
+                            result = match open_layer.get_value_reconstruct_data(
+                                key,
+                                lsn_floor..cont_lsn,
+                                reconstruct_state,
+                            ) {
+                                Ok(result) => result,
+                                Err(e) => return PageReconstructResult::from(e),
+                            };
+                            cont_lsn = lsn_floor;
+                            traversal_path.push((
+                                result,
+                                cont_lsn,
+                                Box::new({
+                                    let open_layer = Arc::clone(open_layer);
+                                    move || open_layer.traversal_id()
+                                }),
+                            ));
+                            continue 'outer;
+                        }
+                    }
+                    for frozen_layer in layers.frozen_layers.iter().rev() {
+                        let start_lsn = frozen_layer.get_lsn_range().start;
+                        if cont_lsn > start_lsn {
+                            //info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.filename().display());
+                            let lsn_floor = max(cached_lsn + 1, start_lsn);
+                            result = match frozen_layer.get_value_reconstruct_data(
+                                key,
+                                lsn_floor..cont_lsn,
+                                reconstruct_state,
+                            ) {
+                                Ok(result) => result,
+                                Err(e) => return PageReconstructResult::from(e),
+                            };
+                            cont_lsn = lsn_floor;
+                            traversal_path.push((
+                                result,
+                                cont_lsn,
+                                Box::new({
+                                    let frozen_layer = Arc::clone(frozen_layer);
+                                    move || frozen_layer.traversal_id()
+                                }),
+                            ));
+                            continue 'outer;
+                        }
+                    }
 
-            if let Some(SearchResult { lsn_floor, layer }) = layers.search(key, cont_lsn) {
-                //info!("CHECKING for {} at {} on historic layer {}", key, cont_lsn, layer.filename().display());
-
-                // If it's a remote layer, the caller can do the download and retry.
-                if let Some(remote_layer) = super::storage_layer::downcast_remote_layer(&layer) {
-                    info!("need remote layer {}", layer.traversal_id());
-                    return PageReconstructResult::NeedsDownload(
-                        Weak::clone(&timeline.myself),
-                        Arc::downgrade(&remote_layer),
-                    );
-                }
-
-                let lsn_floor = max(cached_lsn + 1, lsn_floor);
-                result = match layer.get_value_reconstruct_data(
-                    key,
-                    lsn_floor..cont_lsn,
-                    reconstruct_state,
-                ) {
-                    Ok(result) => result,
-                    Err(e) => return PageReconstructResult::from(e),
+                    if let Some(SearchResult { lsn_floor, layer }) = layers.search(key, cont_lsn) {
+                        // If it's a remote layer, download it and retry.
+                        if let Some(remote_layer) =
+                            super::storage_layer::downcast_remote_layer(&layer)
+                        {
+                            // TODO: push a breadcrumb to 'traversal_path' to record the fact that
+                            // we downloaded / would need to download this layer.
+                            remote_layer // download happens outside the scope of `layers` guard object
+                        } else {
+                            // Get all the data needed to reconstruct the page version from this layer.
+                            // But if we have an older cached page image, no need to go past that.
+                            let lsn_floor = max(cached_lsn + 1, lsn_floor);
+                            result = match layer.get_value_reconstruct_data(
+                                key,
+                                lsn_floor..cont_lsn,
+                                reconstruct_state,
+                            ) {
+                                Ok(result) => result,
+                                Err(e) => return PageReconstructResult::from(e),
+                            };
+                            cont_lsn = lsn_floor;
+                            traversal_path.push((
+                                result,
+                                cont_lsn,
+                                Box::new({
+                                    let layer = Arc::clone(&layer);
+                                    move || layer.traversal_id()
+                                }),
+                            ));
+                            continue 'outer;
+                        }
+                    } else if timeline.ancestor_timeline.is_some() {
+                        // Nothing on this timeline. Traverse to parent
+                        result = ValueReconstructResult::Continue;
+                        cont_lsn = Lsn(timeline.ancestor_lsn.0 + 1);
+                        continue 'outer;
+                    } else {
+                        // Nothing found
+                        result = ValueReconstructResult::Missing;
+                        continue 'outer;
+                    }
                 };
-                cont_lsn = lsn_floor;
-                traversal_path.push((result, cont_lsn, Box::new(layer.clone())));
-            } else if timeline.ancestor_timeline.is_some() {
-                // Nothing on this timeline. Traverse to parent
-                result = ValueReconstructResult::Continue;
-                cont_lsn = Lsn(timeline.ancestor_lsn.0 + 1);
-            } else {
-                // Nothing found
-                result = ValueReconstructResult::Missing;
+                // Indicate to the caller that we need remote_layer replaced with a downloaded
+                // layer in the layer map. The control flow could be a lot simpler, but the point
+                // of this commit is to prepare this function to
+                // 1. become async
+                // 2. do the download right here, using
+                //    ```
+                //    download_remote_layer().await?;
+                //    continue 'layer_map_search;
+                //    ```
+                // For (2), current rustc requires that the layers lock guard is not in scope.
+                // Hence, the complicated control flow.
+                let remote_layer_as_persistent: Arc<dyn PersistentLayer> =
+                    Arc::clone(&remote_layer) as Arc<dyn PersistentLayer>;
+                info!(
+                    "need remote layer {}",
+                    remote_layer_as_persistent.traversal_id()
+                );
+                return PageReconstructResult::NeedsDownload(
+                    Weak::clone(&timeline.myself),
+                    Arc::downgrade(&remote_layer),
+                );
             }
         }
     }
@@ -3358,22 +3407,25 @@ where
     }
 }
 
+type TraversalPathItem = (
+    ValueReconstructResult,
+    Lsn,
+    Box<dyn FnOnce() -> TraversalId>,
+);
+
 /// Helper function for get_reconstruct_data() to add the path of layers traversed
 /// to an error, as anyhow context information.
-fn layer_traversal_error(
-    msg: String,
-    path: Vec<(ValueReconstructResult, Lsn, Box<dyn TraversalLayerExt>)>,
-) -> PageReconstructResult<()> {
+fn layer_traversal_error(msg: String, path: Vec<TraversalPathItem>) -> PageReconstructResult<()> {
     // We want the original 'msg' to be the outermost context. The outermost context
     // is the most high-level information, which also gets propagated to the client.
     let mut msg_iter = path
-        .iter()
+        .into_iter()
         .map(|(r, c, l)| {
             format!(
                 "layer traversal: result {:?}, cont_lsn {}, layer: {}",
                 r,
                 c,
-                l.traversal_id(),
+                l(),
             )
         })
         .chain(std::iter::once(msg));

From 23d5e2bdaa6ca7142e54be0fd223ec2313d224bf Mon Sep 17 00:00:00 2001
From: Kirill Bulatov <kirill@neon.tech>
Date: Sat, 7 Jan 2023 00:46:42 +0200
Subject: [PATCH 121/132] Fix common pg port in the CLI basics test (#3283)

Closes https://github.com/neondatabase/neon/issues/3282
---
 test_runner/regress/test_neon_local_cli.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/test_runner/regress/test_neon_local_cli.py b/test_runner/regress/test_neon_local_cli.py
index 49c063ce44..bd0f550ba5 100644
--- a/test_runner/regress/test_neon_local_cli.py
+++ b/test_runner/regress/test_neon_local_cli.py
@@ -1,17 +1,17 @@
-from fixtures.neon_fixtures import NeonEnvBuilder
+from fixtures.neon_fixtures import NeonEnvBuilder, PortDistributor
 
 
 # Test that neon cli is able to start and stop all processes with the user defaults.
 # Repeats the example from README.md as close as it can
-def test_neon_cli_basics(neon_env_builder: NeonEnvBuilder):
+def test_neon_cli_basics(neon_env_builder: NeonEnvBuilder, port_distributor: PortDistributor):
     env = neon_env_builder.init_configs()
     # Skipping the init step that creates a local tenant in Pytest tests
     try:
         env.neon_cli.start()
         env.neon_cli.create_tenant(tenant_id=env.initial_tenant, set_default=True)
-        env.neon_cli.pg_start(node_name="main")
+        env.neon_cli.pg_start(node_name="main", port=port_distributor.get_port())
 
         env.neon_cli.create_branch(new_branch_name="migration_check")
-        env.neon_cli.pg_start(node_name="migration_check")
+        env.neon_cli.pg_start(node_name="migration_check", port=port_distributor.get_port())
     finally:
         env.neon_cli.stop()

From 7920b39a275a7bd13155d8726c92cd417bf7e2f9 Mon Sep 17 00:00:00 2001
From: Shany Pozin <shany@neon.tech>
Date: Mon, 9 Jan 2023 10:24:50 +0200
Subject: [PATCH 122/132] Adding transition reason to the log when a tenant is
 moved to Broken state  (#3289)

#3160
---
 pageserver/src/tenant.rs     | 16 ++++++++++------
 pageserver/src/tenant/mgr.rs |  2 +-
 2 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 72404e98cd..71cdc6cf1c 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -596,7 +596,7 @@ impl Tenant {
                 match tenant_clone.attach().await {
                     Ok(_) => {}
                     Err(e) => {
-                        tenant_clone.set_broken();
+                        tenant_clone.set_broken(&e.to_string());
                         error!("error attaching tenant: {:?}", e);
                     }
                 }
@@ -860,7 +860,7 @@ impl Tenant {
                 match tenant_clone.load().await {
                     Ok(()) => {}
                     Err(err) => {
-                        tenant_clone.set_broken();
+                        tenant_clone.set_broken(&err.to_string());
                         error!("could not load tenant {tenant_id}: {err:?}");
                     }
                 }
@@ -1496,7 +1496,7 @@ impl Tenant {
         });
     }
 
-    pub fn set_broken(&self) {
+    pub fn set_broken(&self, reason: &str) {
         self.state.send_modify(|current_state| {
             match *current_state {
                 TenantState::Active => {
@@ -1505,18 +1505,22 @@ impl Tenant {
                     // activated should never be marked as broken. We cope with it the best
                     // we can, but it shouldn't happen.
                     *current_state = TenantState::Broken;
-                    warn!("Changing Active tenant to Broken state");
+                    warn!("Changing Active tenant to Broken state, reason: {}", reason);
                 }
                 TenantState::Broken => {
                     // This shouldn't happen either
-                    warn!("Tenant is already broken");
+                    warn!("Tenant is already in Broken state");
                 }
                 TenantState::Stopping => {
                     // This shouldn't happen either
                     *current_state = TenantState::Broken;
-                    warn!("Marking Stopping tenant as Broken");
+                    warn!(
+                        "Marking Stopping tenant as Broken state, reason: {}",
+                        reason
+                    );
                 }
                 TenantState::Loading | TenantState::Attaching => {
+                    info!("Setting tenant as Broken state, reason: {}", reason);
                     *current_state = TenantState::Broken;
                 }
             }
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index 44849de735..af7794490a 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -430,7 +430,7 @@ where
         Err(e) => {
             let tenants_accessor = TENANTS.read().await;
             match tenants_accessor.get(&tenant_id) {
-                Some(tenant) => tenant.set_broken(),
+                Some(tenant) => tenant.set_broken(&e.to_string()),
                 None => warn!("Tenant {tenant_id} got removed from memory"),
             }
             Err(e)

From 93c77b0383add7f4faa1c8cb71490bc08ccc8526 Mon Sep 17 00:00:00 2001
From: Sergey Melnikov <sergey@neon.tech>
Date: Mon, 9 Jan 2023 15:40:14 +0400
Subject: [PATCH 123/132] Use GHA environment for per-region deploy approvals
 on staging (#3293)

Each main deploy will wait for manual approval for each region
---
 .github/workflows/build_and_test.yml | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 2b0b0ba2bf..8123e3cbd4 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -839,7 +839,9 @@ jobs:
         shell: bash
     strategy:
       matrix:
-        target_region: [ us-east-2 ]
+        target_region: [ eu-west-1, us-east-2 ]
+    environment:
+      name: dev-${{ matrix.target_region }}
     steps:
       - name: Checkout
         uses: actions/checkout@v3
@@ -1041,6 +1043,8 @@ jobs:
             target_cluster: dev-eu-west-1-zeta
             deploy_link_proxy: false
             deploy_legacy_scram_proxy: false
+    environment:
+      name: dev-${{ matrix.target_region }}
     steps:
       - name: Checkout
         uses: actions/checkout@v3
@@ -1088,6 +1092,8 @@ jobs:
             target_cluster: dev-us-east-2-beta
           - target_region:  eu-west-1
             target_cluster: dev-eu-west-1-zeta
+    environment:
+      name: dev-${{ matrix.target_region }}
     steps:
       - name: Checkout
         uses: actions/checkout@v3

From 3a22e1335d1a49f79a9d30d323ee2808246ad131 Mon Sep 17 00:00:00 2001
From: Shany Pozin <shany@neon.tech>
Date: Mon, 9 Jan 2023 14:15:53 +0200
Subject: [PATCH 124/132] Adding a PR template (#3288)

## Describe your changes
Added a PR template
## Issue ticket number and link
#3162
## Checklist before requesting a review
- [ ] I have performed a self-review of my code
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.
---
 .github/PULL_REQUEST_TEMPLATE/pull_request_template.md | 10 ++++++++++
 1 file changed, 10 insertions(+)
 create mode 100644 .github/PULL_REQUEST_TEMPLATE/pull_request_template.md

diff --git a/.github/PULL_REQUEST_TEMPLATE/pull_request_template.md b/.github/PULL_REQUEST_TEMPLATE/pull_request_template.md
new file mode 100644
index 0000000000..3f32b80ca8
--- /dev/null
+++ b/.github/PULL_REQUEST_TEMPLATE/pull_request_template.md
@@ -0,0 +1,10 @@
+## Describe your changes
+
+## Issue ticket number and link
+
+## Checklist before requesting a review
+- [ ] I have performed a self-review of my code.
+- [ ] If it is a core feature, I have added thorough tests.
+- [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard?
+- [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section.
+

From a457256fef5819f288b3cf660b04f26d36587b36 Mon Sep 17 00:00:00 2001
From: Kirill Bulatov <kirill@neon.tech>
Date: Mon, 9 Jan 2023 14:25:12 +0200
Subject: [PATCH 125/132] Fix log message matching (#3291)

Spotted
https://neon-github-public-dev.s3.amazonaws.com/reports/main/debug/3871991071/index.html#suites/158be07438eb5188d40b466b6acfaeb3/22966d740e33b677/
failing on `main`, fixes that by using a proper regex match string.

Also removes one clippy lint suppression.
---
 pageserver/src/tenant/timeline.rs     | 4 ++--
 test_runner/fixtures/neon_fixtures.py | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 477108ec4c..0d8a5fc800 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -1710,8 +1710,8 @@ impl Timeline {
                 continue 'outer;
             }
 
-            #[allow(unused_labels, clippy::never_loop)] // see comment at bottom of this loop
-            'layer_map_search: loop {
+            #[allow(clippy::never_loop)] // see comment at bottom of this loop
+            '_layer_map_search: loop {
                 let remote_layer = {
                     let layers = timeline.layers.read().unwrap();
 
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index bdd3dc004e..f284be8753 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1923,7 +1923,7 @@ class NeonPageserver(PgProtocol):
             ".*kill_and_wait_impl.*: wait successful.*",
             ".*Replication stream finished: db error: ERROR: Socket IO error: end streaming to Some.*",
             ".*query handler for 'pagestream.*failed: Broken pipe.*",  # pageserver notices compute shut down
-            ".*query handler for 'pagestream.*failed: Connection reset by peer (os error 104).*",  # pageserver notices compute shut down
+            ".*query handler for 'pagestream.*failed: Connection reset by peer.*",  # pageserver notices compute shut down
             # safekeeper connection can fail with this, in the window between timeline creation
             # and streaming start
             ".*Failed to process query for timeline .*: state uninitialized, no data to read.*",

From d4d0aa6ed6c9f0408723c923df01f7718d42b1b0 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Mon, 2 Jan 2023 18:44:51 +0100
Subject: [PATCH 126/132] gc_iteration_internal: better log message & debug log
 level if nothing to do

fixes https://github.com/neondatabase/neon/issues/3107
---
 pageserver/src/tenant.rs | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 71cdc6cf1c..d74f263f08 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -1874,7 +1874,12 @@ impl Tenant {
 
         utils::failpoint_sleep_millis_async!("gc_iteration_internal_after_getting_gc_timelines");
 
-        info!("starting on {} timelines", gc_timelines.len());
+        // If there is nothing to GC, we don't want any messages in the INFO log.
+        if !gc_timelines.is_empty() {
+            info!("{} timelines need GC", gc_timelines.len());
+        } else {
+            debug!("{} timelines need GC", gc_timelines.len());
+        }
 
         // Perform GC for each timeline.
         //

From 14df37c108f2accc11f6f0cd2c588a64ed48cdfa Mon Sep 17 00:00:00 2001
From: Sergey Melnikov <sergey@neon.tech>
Date: Mon, 9 Jan 2023 20:18:16 +0400
Subject: [PATCH 127/132] Use GHA environments for gradual prod rollout (#3295)

Each release will wait for manual approval for each region
---
 .github/workflows/build_and_test.yml | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 8123e3cbd4..1512c7b9aa 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -794,6 +794,8 @@ jobs:
     strategy:
       matrix:
         include: ${{fromJSON(needs.calculate-deploy-targets.outputs.matrix-include)}}
+    environment:
+      name: prod-old
     steps:
       - name: Checkout
         uses: actions/checkout@v3
@@ -913,6 +915,8 @@ jobs:
     strategy:
       matrix:
         target_region: [ us-east-2, us-west-2, eu-central-1, ap-southeast-1 ]
+    environment:
+      name: prod-${{ matrix.target_region }}
     steps:
       - name: Checkout
         uses: actions/checkout@v3
@@ -952,6 +956,8 @@ jobs:
     strategy:
       matrix:
         include: ${{fromJSON(needs.calculate-deploy-targets.outputs.matrix-include)}}
+    environment:
+      name: prod-old
     env:
       KUBECONFIG: .kubeconfig
     steps:
@@ -995,6 +1001,8 @@ jobs:
     strategy:
       matrix:
         include: ${{fromJSON(needs.calculate-deploy-targets.outputs.matrix-include)}}
+    environment:
+      name: prod-old
     env:
       KUBECONFIG: .kubeconfig
     steps:
@@ -1132,6 +1140,8 @@ jobs:
             target_cluster: prod-eu-central-1-gamma
           - target_region: ap-southeast-1
             target_cluster: prod-ap-southeast-1-epsilon
+    environment:
+      name: prod-${{ matrix.target_region }}
     steps:
       - name: Checkout
         uses: actions/checkout@v3
@@ -1171,6 +1181,8 @@ jobs:
             target_cluster: prod-eu-central-1-gamma
           - target_region: ap-southeast-1
             target_cluster: prod-ap-southeast-1-epsilon
+    environment:
+      name: prod-${{ matrix.target_region }}
     steps:
       - name: Checkout
         uses: actions/checkout@v3

From 8c07ef413ddac2f1bdfd37a078c343b5c4183c73 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Fri, 6 Jan 2023 14:54:30 +0200
Subject: [PATCH 128/132] Minor cleanup of test_ondemand_download_timetravel
 test.

- Fix and improve comments
- Rename 'physical_size' local variable to 'resident_size' for clarity.
- Remove one unnecessary 'wait_for_upload' call. The
  'wait_for_sk_commit_lsn_to_reach_remote_storage' call after shutting
  down compute is sufficient.
---
 test_runner/regress/test_ondemand_download.py | 29 ++++++++++---------
 1 file changed, 16 insertions(+), 13 deletions(-)

diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py
index 352ae4b95c..184dc13888 100644
--- a/test_runner/regress/test_ondemand_download.py
+++ b/test_runner/regress/test_ondemand_download.py
@@ -120,7 +120,7 @@ def test_ondemand_download_large_rel(
 
 
 #
-# If you have a relation with a long history of updates,the pageserver downloads the layer
+# If you have a relation with a long history of updates, the pageserver downloads the layer
 # files containing the history as needed by timetravel queries.
 #
 @pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
@@ -189,13 +189,10 @@ def test_ondemand_download_timetravel(
         # run checkpoint manually to be sure that data landed in remote storage
         client.timeline_checkpoint(tenant_id, timeline_id)
 
-    # wait until pageserver successfully uploaded a checkpoint to remote storage
-    wait_for_upload(client, tenant_id, timeline_id, current_lsn)
-    log.info("uploads have finished")
-
     ##### Stop the first pageserver instance, erase all its data
     env.postgres.stop_all()
 
+    # wait until pageserver has successfully uploaded all the data to remote storage
     wait_for_sk_commit_lsn_to_reach_remote_storage(
         tenant_id, timeline_id, env.safekeepers, env.pageserver
     )
@@ -227,11 +224,15 @@ def test_ondemand_download_timetravel(
 
     wait_until(10, 0.2, lambda: assert_tenant_status(client, tenant_id, "Active"))
 
-    # current_physical_size reports sum of layer file sizes, regardless of local or remote
+    # The current_physical_size reports the sum of layers loaded in the layer
+    # map, regardless of where the layer files are located. So even though we
+    # just removed the local files, they still count towards
+    # current_physical_size because they are loaded as `RemoteLayer`s.
     assert filled_current_physical == get_api_current_physical_size()
 
+    # Run queries at different points in time
     num_layers_downloaded = [0]
-    physical_size = [get_resident_physical_size()]
+    resident_size = [get_resident_physical_size()]
     for (checkpoint_number, lsn) in lsns:
         pg_old = env.postgres.create_start(
             branch_name="main", node_name=f"test_old_lsn_{checkpoint_number}", lsn=lsn
@@ -268,13 +269,15 @@ def test_ondemand_download_timetravel(
         if len(num_layers_downloaded) > 4:
             assert after_downloads > num_layers_downloaded[len(num_layers_downloaded) - 4]
 
-        # Likewise, assert that the physical_size metric grows as layers are downloaded
-        physical_size.append(get_resident_physical_size())
-        log.info(f"physical_size[-1]={physical_size[-1]}")
-        if len(physical_size) > 4:
-            assert physical_size[-1] > physical_size[len(physical_size) - 4]
+        # Likewise, assert that the resident_physical_size metric grows as layers are downloaded
+        resident_size.append(get_resident_physical_size())
+        log.info(f"resident_size[-1]={resident_size[-1]}")
+        if len(resident_size) > 4:
+            assert resident_size[-1] > resident_size[len(resident_size) - 4]
 
-        # current_physical_size reports sum of layer file sizes, regardless of local or remote
+        # current_physical_size reports the total size of all layer files, whether
+        # they are present only in the remote storage, only locally, or both.
+        # It should not change.
         assert filled_current_physical == get_api_current_physical_size()
 
 

From 8eebd5f039f8bf59216f4830aa5a5178eb855e22 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Wed, 4 Jan 2023 14:50:13 +0100
Subject: [PATCH 129/132] run on-demand compaction in a task_mgr task

With this patch, tenant_detach and timeline_delete's
task_mgr::shutdown_tasks() call will wait for on-demand
compaction to finish.
Before this patch, the on-demand compaction would grab the
layer_removal_cs after tenant_detach / timeline_delete had
removed the timeline directory.
This resulted in error

  No such file or directory (os error 2)

NB: I already implemented this pattern for ondemand GC a while back.

fixes https://github.com/neondatabase/neon/issues/3136
---
 pageserver/src/http/routes.rs | 16 +++++------
 pageserver/src/tenant/mgr.rs  | 50 +++++++++++++++++++++++++++++++++++
 2 files changed, 58 insertions(+), 8 deletions(-)

diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 4f4c397abe..1c5eacd362 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -738,17 +738,17 @@ async fn timeline_compact_handler(request: Request<Body>) -> Result<Response<Bod
     let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
     check_permission(&request, Some(tenant_id))?;
 
-    let tenant = mgr::get_tenant(tenant_id, true)
-        .await
-        .map_err(ApiError::NotFound)?;
-    let timeline = tenant
-        .get_timeline(timeline_id, true)
-        .map_err(ApiError::NotFound)?;
-    timeline
-        .compact()
+    let result_receiver = mgr::immediate_compact(tenant_id, timeline_id)
         .await
+        .context("spawn compaction task")
         .map_err(ApiError::InternalServerError)?;
 
+    let result: anyhow::Result<()> = result_receiver
+        .await
+        .context("receive compaction result")
+        .map_err(ApiError::InternalServerError)?;
+    result.map_err(ApiError::InternalServerError)?;
+
     json_response(StatusCode::OK, ())
 }
 
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index af7794490a..dce7cd8bae 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -492,3 +492,53 @@ pub async fn immediate_gc(
 
     Ok(wait_task_done)
 }
+
+#[cfg(feature = "testing")]
+pub async fn immediate_compact(
+    tenant_id: TenantId,
+    timeline_id: TimelineId,
+) -> Result<tokio::sync::oneshot::Receiver<anyhow::Result<()>>, ApiError> {
+    let guard = TENANTS.read().await;
+
+    let tenant = guard
+        .get(&tenant_id)
+        .map(Arc::clone)
+        .with_context(|| format!("Tenant {tenant_id} not found"))
+        .map_err(ApiError::NotFound)?;
+
+    let timeline = tenant
+        .get_timeline(timeline_id, true)
+        .map_err(ApiError::NotFound)?;
+
+    // Run in task_mgr to avoid race with detach operation
+    let (task_done, wait_task_done) = tokio::sync::oneshot::channel();
+    task_mgr::spawn(
+        &tokio::runtime::Handle::current(),
+        TaskKind::Compaction,
+        Some(tenant_id),
+        Some(timeline_id),
+        &format!(
+            "timeline_compact_handler compaction run for tenant {tenant_id} timeline {timeline_id}"
+        ),
+        false,
+        async move {
+            let result = timeline
+                .compact()
+                .instrument(
+                    info_span!("manual_compact", tenant = %tenant_id, timeline = %timeline_id),
+                )
+                .await;
+
+            match task_done.send(result) {
+                Ok(_) => (),
+                Err(result) => error!("failed to send compaction result: {result:?}"),
+            }
+            Ok(())
+        },
+    );
+
+    // only drop the guard now, after we've spawned the task, so that timeline shutdown will wait for the task
+    drop(guard);
+
+    Ok(wait_task_done)
+}

From 0807522a6433e9697ba9311bbe3a5a22f4ed1b59 Mon Sep 17 00:00:00 2001
From: Arthur Petukhovsky <petuhovskiy@yandex.ru>
Date: Mon, 9 Jan 2023 23:56:12 +0400
Subject: [PATCH 130/132] Enable wss proxy in all regions (#3292)

Follow-up to https://github.com/neondatabase/helm-charts/pull/24 and
#3247
---
 .github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml    | 2 ++
 .../helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml | 2 ++
 .github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml    | 2 ++
 .../prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml           | 2 ++
 .../helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml   | 2 ++
 .github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml  | 2 ++
 .github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml    | 2 ++
 .github/helm-values/production.proxy-scram.yaml                 | 2 ++
 8 files changed, 16 insertions(+)

diff --git a/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml b/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml
index ae9c1f2e40..08304503c5 100644
--- a/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml
+++ b/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml
@@ -9,6 +9,7 @@ settings:
   authEndpoint: "http://console-staging.local/management/api/v2"
   domain: "*.eu-west-1.aws.neon.build"
   sentryEnvironment: "development"
+  wssPort: 8443
 
 # -- Additional labels for neon-proxy pods
 podLabels:
@@ -23,6 +24,7 @@ exposedService:
     service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
     service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
     external-dns.alpha.kubernetes.io/hostname: eu-west-1.aws.neon.build
+  httpsPort: 443
 
 #metrics:
 #  enabled: true
diff --git a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml
index a2f932e4fb..be0fc329c9 100644
--- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml
+++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml
@@ -9,6 +9,7 @@ settings:
   authEndpoint: "http://console-staging.local/management/api/v2"
   domain: "*.cloud.stage.neon.tech"
   sentryEnvironment: "development"
+  wssPort: 8443
 
 # -- Additional labels for neon-proxy pods
 podLabels:
@@ -23,6 +24,7 @@ exposedService:
     service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
     service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
     external-dns.alpha.kubernetes.io/hostname: neon-proxy-scram-legacy.beta.us-east-2.aws.neon.build
+  httpsPort: 443
 
 #metrics:
 #  enabled: true
diff --git a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml
index 1138536e94..b7f712585b 100644
--- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml
+++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml
@@ -9,6 +9,7 @@ settings:
   authEndpoint: "http://console-staging.local/management/api/v2"
   domain: "*.us-east-2.aws.neon.build"
   sentryEnvironment: "development"
+  wssPort: 8443
 
 # -- Additional labels for neon-proxy pods
 podLabels:
@@ -23,6 +24,7 @@ exposedService:
     service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
     service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
     external-dns.alpha.kubernetes.io/hostname: us-east-2.aws.neon.build
+  httpsPort: 443
 
 #metrics:
 #  enabled: true
diff --git a/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml b/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml
index 4e4aff1f9e..e9e89aff7c 100644
--- a/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml
+++ b/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml
@@ -9,6 +9,7 @@ settings:
   authEndpoint: "http://console-release.local/management/api/v2"
   domain: "*.ap-southeast-1.aws.neon.tech"
   sentryEnvironment: "production"
+  wssPort: 8443
 
 # -- Additional labels for neon-proxy pods
 podLabels:
@@ -23,6 +24,7 @@ exposedService:
     service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
     service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
     external-dns.alpha.kubernetes.io/hostname: ap-southeast-1.aws.neon.tech
+  httpsPort: 443
 
 #metrics:
 #  enabled: true
diff --git a/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml b/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml
index 94290a87e1..5366ba4ae5 100644
--- a/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml
+++ b/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml
@@ -9,6 +9,7 @@ settings:
   authEndpoint: "http://console-release.local/management/api/v2"
   domain: "*.eu-central-1.aws.neon.tech"
   sentryEnvironment: "production"
+  wssPort: 8443
 
 # -- Additional labels for neon-proxy pods
 podLabels:
@@ -23,6 +24,7 @@ exposedService:
     service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
     service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
     external-dns.alpha.kubernetes.io/hostname: eu-central-1.aws.neon.tech
+  httpsPort: 443
 
 #metrics:
 #  enabled: true
diff --git a/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml b/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml
index 1a4023708b..e71e457f13 100644
--- a/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml
+++ b/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml
@@ -9,6 +9,7 @@ settings:
   authEndpoint: "http://console-release.local/management/api/v2"
   domain: "*.us-east-2.aws.neon.tech"
   sentryEnvironment: "production"
+  wssPort: 8443
 
 # -- Additional labels for neon-proxy pods
 podLabels:
@@ -23,6 +24,7 @@ exposedService:
     service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
     service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
     external-dns.alpha.kubernetes.io/hostname: us-east-2.aws.neon.tech
+  httpsPort: 443
 
 #metrics:
 #  enabled: true
diff --git a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml
index 2942d6a2aa..9afe94edd1 100644
--- a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml
+++ b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml
@@ -9,6 +9,7 @@ settings:
   authEndpoint: "http://console-release.local/management/api/v2"
   domain: "*.us-west-2.aws.neon.tech"
   sentryEnvironment: "production"
+  wssPort: 8443
 
 # -- Additional labels for neon-proxy pods
 podLabels:
@@ -23,6 +24,7 @@ exposedService:
     service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
     service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
     external-dns.alpha.kubernetes.io/hostname: us-west-2.aws.neon.tech
+  httpsPort: 443
 
 #metrics:
 #  enabled: true
diff --git a/.github/helm-values/production.proxy-scram.yaml b/.github/helm-values/production.proxy-scram.yaml
index c7143cd61a..8143f7e575 100644
--- a/.github/helm-values/production.proxy-scram.yaml
+++ b/.github/helm-values/production.proxy-scram.yaml
@@ -3,6 +3,7 @@ settings:
   authEndpoint: "http://console-release.local/management/api/v2"
   domain: "*.cloud.neon.tech"
   sentryEnvironment: "production"
+  wssPort: 8443
 
 podLabels:
   zenith_service: proxy-scram
@@ -16,6 +17,7 @@ exposedService:
     service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
     service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
     external-dns.alpha.kubernetes.io/hostname: '*.cloud.neon.tech'
+  httpsPort: 443
 
 metrics:
   enabled: true

From 80d4afab0c78883a77bb927cc867677fc93b5a44 Mon Sep 17 00:00:00 2001
From: Vadim Kharitonov <vadim@neon.tech>
Date: Mon, 9 Jan 2023 22:28:23 +0100
Subject: [PATCH 131/132] Update tokio version (RUSTSEC-2023-0001)

---
 Cargo.lock                | 7 +++----
 workspace_hack/Cargo.toml | 2 +-
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 284a111ba7..1649e28faa 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3731,9 +3731,9 @@ dependencies = [
 
 [[package]]
 name = "tokio"
-version = "1.21.1"
+version = "1.24.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0020c875007ad96677dcc890298f4b942882c5d4eb7cc8f439fc3bf813dc9c95"
+checksum = "1d9f76183f91ecfb55e1d7d5602bd1d979e38a3a522fe900241cf195624d67ae"
 dependencies = [
  "autocfg",
  "bytes",
@@ -3741,12 +3741,11 @@ dependencies = [
  "memchr",
  "mio",
  "num_cpus",
- "once_cell",
  "pin-project-lite",
  "signal-hook-registry",
  "socket2",
  "tokio-macros",
- "winapi",
+ "windows-sys 0.42.0",
 ]
 
 [[package]]
diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml
index 989cc9202e..3aff839b81 100644
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -40,7 +40,7 @@ scopeguard = { version = "1", features = ["use_std"] }
 serde = { version = "1", features = ["alloc", "derive", "serde_derive", "std"] }
 serde_json = { version = "1", features = ["raw_value", "std"] }
 socket2 = { version = "0.4", default-features = false, features = ["all"] }
-tokio = { version = "1", features = ["bytes", "fs", "io-std", "io-util", "libc", "macros", "memchr", "mio", "net", "num_cpus", "once_cell", "process", "rt", "rt-multi-thread", "signal-hook-registry", "socket2", "sync", "time", "tokio-macros"] }
+tokio = { version = "1", features = ["bytes", "fs", "io-std", "io-util", "libc", "macros", "memchr", "mio", "net", "num_cpus", "process", "rt", "rt-multi-thread", "signal-hook-registry", "socket2", "sync", "time", "tokio-macros"] }
 tokio-util = { version = "0.7", features = ["codec", "io", "io-util", "tracing"] }
 tower = { version = "0.4", features = ["__common", "balance", "buffer", "discover", "futures-core", "futures-util", "indexmap", "limit", "load", "log", "make", "pin-project", "pin-project-lite", "rand", "ready-cache", "retry", "slab", "timeout", "tokio", "tokio-util", "tracing", "util"] }
 tracing = { version = "0.1", features = ["attributes", "log", "std", "tracing-attributes"] }

From 95bf19b85a06b27a7fc3118dee03d48648efab15 Mon Sep 17 00:00:00 2001
From: Sergey Melnikov <sergey@neon.tech>
Date: Tue, 10 Jan 2023 14:05:27 +0400
Subject: [PATCH 132/132] Add --atomic to all helm upgrade operations (#3299)

When the number of GitHub Actions workers is changed, some jobs get killed.
When helm is killed during the upgrade, the release gets stuck in the
pending-upgrade state. --atomic should initiate an automatic rollback in
this case.
---
 .github/workflows/build_and_test.yml | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 1512c7b9aa..1bbba8e3fd 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -983,8 +983,8 @@ jobs:
       - name: Re-deploy proxy
         run: |
           DOCKER_TAG=${{needs.tag.outputs.build-tag}}
-          helm upgrade ${{ matrix.proxy_job }}       neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}.yaml       --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
-          helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
+          helm upgrade ${{ matrix.proxy_job }}       neondatabase/neon-proxy --namespace neon-proxy --install --atomic -f .github/helm-values/${{ matrix.proxy_config }}.yaml       --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
+          helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace neon-proxy --install --atomic -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
 
   deploy-storage-broker:
     name: deploy storage broker on old staging and old prod
@@ -1068,19 +1068,19 @@ jobs:
       - name: Re-deploy scram proxy
         run: |
           DOCKER_TAG=${{needs.tag.outputs.build-tag}}
-          helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
+          helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
 
       - name: Re-deploy link proxy
         if: matrix.deploy_link_proxy
         run: |
           DOCKER_TAG=${{needs.tag.outputs.build-tag}}
-          helm upgrade neon-proxy-link neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-link.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
+          helm upgrade neon-proxy-link neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-link.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
 
       - name: Re-deploy legacy scram proxy
         if: matrix.deploy_legacy_scram_proxy
         run: |
           DOCKER_TAG=${{needs.tag.outputs.build-tag}}
-          helm upgrade neon-proxy-scram-legacy neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram-legacy.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
+          helm upgrade neon-proxy-scram-legacy neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram-legacy.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
 
   deploy-storage-broker-dev-new:
     runs-on: [ self-hosted, dev, x64 ]
@@ -1157,7 +1157,7 @@ jobs:
       - name: Re-deploy proxy
         run: |
           DOCKER_TAG=${{needs.tag.outputs.build-tag}}
-          helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
+          helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
 
   deploy-storage-broker-prod-new:
     runs-on: prod