From 7ba5c286b7c023e39162c6c6bcdad9353a3b5194 Mon Sep 17 00:00:00 2001
From: Alexey Kondratov <kondratov.aleksey@gmail.com>
Date: Fri, 21 Apr 2023 11:10:48 +0200
Subject: [PATCH 01/31] [compute_ctl] Improve 'empty' compute startup sequence
 (#4034)

Do several attempts to get spec from the control-plane and retry network
errors and all reasonable HTTP response codes. Do not hang waiting for
spec without confirmation from the control-plane that compute is known
and is in the `Empty` state.

Adjust the way we track `total_startup_ms` metric, it should be
calculated since the moment we received spec, not from the moment
`compute_ctl` started. Also introduce a new `wait_for_spec_ms` metric
to track the time spent sleeping and waiting for spec to be delivered
from control-plane.

Part of neondatabase/cloud#3533
---
 compute_tools/src/bin/compute_ctl.rs     |  24 ++++-
 compute_tools/src/compute.rs             |   5 +-
 compute_tools/src/http/api.rs            |   1 +
 compute_tools/src/http/openapi_spec.yaml |  10 ++
 compute_tools/src/spec.rs                | 113 +++++++++++++++++++----
 libs/compute_api/src/responses.rs        |  14 +++
 6 files changed, 141 insertions(+), 26 deletions(-)
diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs
index 309310407d..36dbc382b5 100644
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -73,7 +73,7 @@ fn main() -> Result<()> {
     // Try to use just 'postgres' if no path is provided
     let pgbin = matches.get_one::<String>("pgbin").unwrap();
 
-    let mut spec = None;
+    let spec;
     let mut live_config_allowed = false;
     match spec_json {
         // First, try to get cluster spec from the cli argument
@@ -89,9 +89,13 @@ fn main() -> Result<()> {
             } else if let Some(id) = compute_id {
                 if let Some(cp_base) = control_plane_uri {
                     live_config_allowed = true;
-                    if let Ok(s) = get_spec_from_control_plane(cp_base, id) {
-                        spec = Some(s);
-                    }
+                    spec = match get_spec_from_control_plane(cp_base, id) {
+                        Ok(s) => s,
+                        Err(e) => {
+                            error!("cannot get response from control plane: {}", e);
+                            panic!("neither spec nor confirmation that compute is in the Empty state was received");
+                        }
+                    };
                 } else {
                     panic!("must specify both --control-plane-uri and --compute-id or none");
                 }
@@ -114,7 +118,6 @@ fn main() -> Result<()> {
         spec_set = false;
     }
     let compute_node = ComputeNode {
-        start_time: Utc::now(),
         connstr: Url::parse(connstr).context("cannot parse connstr as a URL")?,
         pgdata: pgdata.to_string(),
         pgbin: pgbin.to_string(),
@@ -147,6 +150,17 @@ fn main() -> Result<()> {
     let mut state = compute.state.lock().unwrap();
     let pspec = state.pspec.as_ref().expect("spec must be set");
     let startup_tracing_context = pspec.spec.startup_tracing_context.clone();
+
+    // Record for how long we slept waiting for the spec.
+    state.metrics.wait_for_spec_ms = Utc::now()
+        .signed_duration_since(state.start_time)
+        .to_std()
+        .unwrap()
+        .as_millis() as u64;
+    // Reset start time to the actual start of the configuration, so that
+    // total startup time was properly measured at the end.
+    state.start_time = Utc::now();
+
     state.status = ComputeStatus::Init;
     compute.state_changed.notify_all();
     drop(state);
diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs
index 51de2b6e0a..507dac9c0d 100644
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -38,7 +38,6 @@ use crate::spec::*;
 
 /// Compute node info shared across several `compute_ctl` threads.
 pub struct ComputeNode {
-    pub start_time: DateTime<Utc>,
     // Url type maintains proper escaping
     pub connstr: url::Url,
     pub pgdata: String,
@@ -66,6 +65,7 @@ pub struct ComputeNode {
 
 #[derive(Clone, Debug)]
 pub struct ComputeState {
+    pub start_time: DateTime<Utc>,
     pub status: ComputeStatus,
     /// Timestamp of the last Postgres activity
     pub last_active: DateTime<Utc>,
@@ -77,6 +77,7 @@ pub struct ComputeState {
 impl ComputeState {
     pub fn new() -> Self {
         Self {
+            start_time: Utc::now(),
             status: ComputeStatus::Empty,
             last_active: Utc::now(),
             error: None,
@@ -425,7 +426,7 @@ impl ComputeNode {
                 .unwrap()
                 .as_millis() as u64;
             state.metrics.total_startup_ms = startup_end_time
-                .signed_duration_since(self.start_time)
+                .signed_duration_since(compute_state.start_time)
                 .to_std()
                 .unwrap()
                 .as_millis() as u64;
diff --git a/compute_tools/src/http/api.rs b/compute_tools/src/http/api.rs
index 3ca688de69..4468f6f5e4 100644
--- a/compute_tools/src/http/api.rs
+++ b/compute_tools/src/http/api.rs
@@ -18,6 +18,7 @@ use tracing_utils::http::OtelName;
 
 fn status_response_from_state(state: &ComputeState) -> ComputeStatusResponse {
     ComputeStatusResponse {
+        start_time: state.start_time,
         tenant: state
             .pspec
             .as_ref()
diff --git a/compute_tools/src/http/openapi_spec.yaml b/compute_tools/src/http/openapi_spec.yaml
index bdb09d4a6b..cc8f074a50 100644
--- a/compute_tools/src/http/openapi_spec.yaml
+++ b/compute_tools/src/http/openapi_spec.yaml
@@ -152,11 +152,14 @@ components:
       type: object
       description: Compute startup metrics.
       required:
+        - wait_for_spec_ms
         - sync_safekeepers_ms
         - basebackup_ms
         - config_ms
         - total_startup_ms
       properties:
+        wait_for_spec_ms:
+          type: integer
         sync_safekeepers_ms:
           type: integer
         basebackup_ms:
@@ -181,6 +184,13 @@ components:
         - status
         - last_active
       properties:
+        start_time:
+          type: string
+          description: |
+            Time when compute was started. If initially compute was started in the `empty`
+            state and then provided with valid spec, `start_time` will be reset to the
+            moment, when spec was received.
+          example: "2022-10-12T07:20:50.52Z"
         status:
           $ref: '#/components/schemas/ComputeStatus'
         last_active:
diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs
index 088f74335a..28e0ef41b7 100644
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -4,42 +4,117 @@ use std::str::FromStr;
 use anyhow::{anyhow, bail, Result};
 use postgres::config::Config;
 use postgres::{Client, NoTls};
-use tracing::{info, info_span, instrument, span_enabled, warn, Level};
+use reqwest::StatusCode;
+use tracing::{error, info, info_span, instrument, span_enabled, warn, Level};
 
 use crate::config;
 use crate::params::PG_HBA_ALL_MD5;
 use crate::pg_helpers::*;
 
-use compute_api::responses::ControlPlaneSpecResponse;
+use compute_api::responses::{ControlPlaneComputeStatus, ControlPlaneSpecResponse};
 use compute_api::spec::{ComputeSpec, Database, PgIdent, Role};
 
+// Do control plane request and return response if any. In case of error it
+// returns a bool flag indicating whether it makes sense to retry the request
+// and a string with error message.
+fn do_control_plane_request(
+    uri: &str,
+    jwt: &str,
+) -> Result<ControlPlaneSpecResponse, (bool, String)> {
+    let resp = reqwest::blocking::Client::new()
+        .get(uri)
+        .header("Authorization", jwt)
+        .send()
+        .map_err(|e| {
+            (
+                true,
+                format!("could not perform spec request to control plane: {}", e),
+            )
+        })?;
+
+    match resp.status() {
+        StatusCode::OK => match resp.json::<ControlPlaneSpecResponse>() {
+            Ok(spec_resp) => Ok(spec_resp),
+            Err(e) => Err((
+                true,
+                format!("could not deserialize control plane response: {}", e),
+            )),
+        },
+        StatusCode::SERVICE_UNAVAILABLE => {
+            Err((true, "control plane is temporarily unavailable".to_string()))
+        }
+        StatusCode::BAD_GATEWAY => {
+            // We have a problem with intermittent 502 errors now
+            // https://github.com/neondatabase/cloud/issues/2353
+            // It's fine to retry GET request in this case.
+            Err((true, "control plane request failed with 502".to_string()))
+        }
+        // Another code, likely 500 or 404, means that compute is unknown to the control plane
+        // or some internal failure happened. Doesn't make much sense to retry in this case.
+        _ => Err((
+            false,
+            format!(
+                "unexpected control plane response status code: {}",
+                resp.status()
+            ),
+        )),
+    }
+}
+
 /// Request spec from the control-plane by compute_id. If `NEON_CONSOLE_JWT`
 /// env variable is set, it will be used for authorization.
-pub fn get_spec_from_control_plane(base_uri: &str, compute_id: &str) -> Result<ComputeSpec> {
+pub fn get_spec_from_control_plane(
+    base_uri: &str,
+    compute_id: &str,
+) -> Result<Option<ComputeSpec>> {
     let cp_uri = format!("{base_uri}/management/api/v2/computes/{compute_id}/spec");
-    let jwt: String = match std::env::var("NEON_CONSOLE_JWT") {
+    let jwt: String = match std::env::var("NEON_CONTROL_PLANE_TOKEN") {
         Ok(v) => v,
         Err(_) => "".to_string(),
     };
+    let mut attempt = 1;
+    let mut spec: Result<Option<ComputeSpec>> = Ok(None);
+
     info!("getting spec from control plane: {}", cp_uri);
 
-    // TODO: check the response. We should distinguish cases when it's
-    // - network error, then retry
-    // - no spec for compute yet, then wait
-    // - compute id is unknown or any other error, then bail out
-    let resp: ControlPlaneSpecResponse = reqwest::blocking::Client::new()
-        .get(cp_uri)
-        .header("Authorization", jwt)
-        .send()
-        .map_err(|e| anyhow!("could not send spec request to control plane: {}", e))?
-        .json()
-        .map_err(|e| anyhow!("could not get compute spec from control plane: {}", e))?;
+    // Do 3 attempts to get spec from the control plane using the following logic:
+    // - network error -> then retry
+    // - compute id is unknown or any other error -> bail out
+    // - no spec for compute yet (Empty state) -> return Ok(None)
+    // - got spec -> return Ok(Some(spec))
+    while attempt < 4 {
+        spec = match do_control_plane_request(&cp_uri, &jwt) {
+            Ok(spec_resp) => match spec_resp.status {
+                ControlPlaneComputeStatus::Empty => Ok(None),
+                ControlPlaneComputeStatus::Attached => {
+                    if let Some(spec) = spec_resp.spec {
+                        Ok(Some(spec))
+                    } else {
+                        bail!("compute is attached, but spec is empty")
+                    }
+                }
+            },
+            Err((retry, msg)) => {
+                if retry {
+                    Err(anyhow!(msg))
+                } else {
+                    bail!(msg);
+                }
+            }
+        };
 
-    if let Some(spec) = resp.spec {
-        Ok(spec)
-    } else {
-        bail!("could not get compute spec from control plane")
+        if let Err(e) = &spec {
+            error!("attempt {} to get spec failed with: {}", attempt, e);
+        } else {
+            return spec;
+        }
+
+        attempt += 1;
+        std::thread::sleep(std::time::Duration::from_millis(100));
     }
+
+    // All attempts failed, return error.
+    spec
 }
 
 /// It takes cluster specification and does the following:
diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs
index 370b2c5626..c409563b56 100644
--- a/libs/compute_api/src/responses.rs
+++ b/libs/compute_api/src/responses.rs
@@ -14,6 +14,7 @@ pub struct GenericAPIError {
 #[derive(Serialize, Debug)]
 #[serde(rename_all = "snake_case")]
 pub struct ComputeStatusResponse {
+    pub start_time: DateTime<Utc>,
     pub tenant: Option<String>,
     pub timeline: Option<String>,
     pub status: ComputeStatus,
@@ -63,6 +64,7 @@ where
 /// Response of the /metrics.json API
 #[derive(Clone, Debug, Default, Serialize)]
 pub struct ComputeMetrics {
+    pub wait_for_spec_ms: u64,
     pub sync_safekeepers_ms: u64,
     pub basebackup_ms: u64,
     pub config_ms: u64,
@@ -75,4 +77,16 @@ pub struct ComputeMetrics {
 #[derive(Deserialize, Debug)]
 pub struct ControlPlaneSpecResponse {
     pub spec: Option<ComputeSpec>,
+    pub status: ControlPlaneComputeStatus,
+}
+
+#[derive(Deserialize, Clone, Copy, Debug, PartialEq, Eq)]
+#[serde(rename_all = "snake_case")]
+pub enum ControlPlaneComputeStatus {
+    // Compute is known to control-plane, but it's not
+    // yet attached to any timeline / endpoint.
+    Empty,
+    // Compute is attached to some timeline / endpoint and
+    // should be able to start with provided spec.
+    Attached,
 }

From afbbc6103612819058db63dc24829ec5eccccef7 Mon Sep 17 00:00:00 2001
From: Eduard Dyckman <bird.duskpoet@gmail.com>
Date: Mon, 24 Apr 2023 22:19:25 +0900
Subject: [PATCH 02/31] Adding synthetic size to pageserver swagger (#4049)

## Describe your changes

I added synthetic size response to the console swagger. Now I am syncing
it back to neon
---
 pageserver/src/http/openapi_spec.yml | 115 +++++++++++++++++++++++++++
 1 file changed, 115 insertions(+)

diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml
index b0e4e1ca85..95f6e96a5b 100644
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -520,6 +520,43 @@ paths:
               schema:
                 $ref: "#/components/schemas/Error"
 
+  /v1/tenant/{tenant_id}/synthetic_size:
+    parameters:
+      - name: tenant_id
+        in: path
+        required: true
+        schema:
+          type: string
+          format: hex
+    get:
+      description: |
+        Calculate tenant's synthetic size
+      responses:
+        "200":
+          description: Tenant's synthetic size
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/SyntheticSizeResponse"
+        "401":
+          description: Unauthorized Error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/UnauthorizedError"
+        "403":
+          description: Forbidden Error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ForbiddenError"
+        "500":
+          description: Generic operation error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/Error"
+
   /v1/tenant/{tenant_id}/size:
     parameters:
       - name: tenant_id
@@ -948,6 +985,84 @@ components:
         latest_gc_cutoff_lsn:
           type: string
           format: hex
+
+    SyntheticSizeResponse:
+      type: object
+      required:
+        - id
+        - size
+        - segment_sizes
+        - inputs
+      properties:
+        id:
+          type: string
+          format: hex
+        size:
+          type: integer
+        segment_sizes:
+          type: array
+          items:
+            $ref: "#/components/schemas/SegmentSize"
+        inputs:
+          type: object
+          properties:
+            segments:
+              type: array
+              items:
+                $ref: "#/components/schemas/SegmentData"
+            timeline_inputs:
+              type: array
+              items:
+                $ref: "#/components/schemas/TimelineInput"
+
+    SegmentSize:
+      type: object
+      required:
+        - method
+        - accum_size
+      properties:
+        method:
+          type: string
+        accum_size:
+          type: integer
+
+    SegmentData:
+      type: object
+      required:
+        - segment
+      properties:
+        segment:
+          type: object
+          required:
+            - lsn
+          properties:
+            parent:
+              type: integer
+            lsn:
+              type: integer
+            size:
+              type: integer
+            needed:
+              type: boolean
+        timeline_id:
+          type: string
+          format: hex
+        kind:
+          type: string
+
+    TimelineInput:
+      type: object
+      required:
+        - timeline_id
+      properties:
+        ancestor_id:
+          type: string
+        ancestor_lsn:
+          type: string
+        timeline_id:
+          type: string
+          format: hex
+
     Error:
       type: object
       required:

From e83684b8683847a5f467809cd7dd8e2ccdc9bffa Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Tue, 25 Apr 2023 14:10:18 +0200
Subject: [PATCH 03/31] add libmetric metric for each logged log message
 (#4055)

This patch extends the libmetrics logging setup functionality with a
`tracing` layer that increments a Prometheus counter each time we log a
log message. We have the counter per tracing event level. This allows
for monitoring WARN and ERR log volume without parsing the log. Also, it
would allow cross-checking whether logs got dropped on the way into
Loki.

It would be nicer if we could hook deeper into the tracing logging
layer, to avoid evaluating the filter twice.
But I don't know how to do it.
---
 libs/utils/src/logging.rs               | 100 ++++++++++++++++++++----
 pageserver/src/http/routes.rs           |  34 ++++++++
 test_runner/fixtures/metrics.py         |   1 +
 test_runner/fixtures/pageserver/http.py |  10 +++
 test_runner/regress/test_logging.py     |  49 ++++++++++++
 5 files changed, 179 insertions(+), 15 deletions(-)
 create mode 100644 test_runner/regress/test_logging.py

diff --git a/libs/utils/src/logging.rs b/libs/utils/src/logging.rs
index f770622a60..ed856b6804 100644
--- a/libs/utils/src/logging.rs
+++ b/libs/utils/src/logging.rs
@@ -1,6 +1,7 @@
 use std::str::FromStr;
 
 use anyhow::Context;
+use once_cell::sync::Lazy;
 use strum_macros::{EnumString, EnumVariantNames};
 
 #[derive(EnumString, EnumVariantNames, Eq, PartialEq, Debug, Clone, Copy)]
@@ -23,25 +24,64 @@ impl LogFormat {
     }
 }
 
-pub fn init(log_format: LogFormat) -> anyhow::Result<()> {
-    let default_filter_str = "info";
+static TRACING_EVENT_COUNT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
+    metrics::register_int_counter_vec!(
+        "libmetrics_tracing_event_count",
+        "Number of tracing events, by level",
+        &["level"]
+    )
+    .expect("failed to define metric")
+});
 
+struct TracingEventCountLayer(&'static metrics::IntCounterVec);
+
+impl<S> tracing_subscriber::layer::Layer<S> for TracingEventCountLayer
+where
+    S: tracing::Subscriber,
+{
+    fn on_event(
+        &self,
+        event: &tracing::Event<'_>,
+        _ctx: tracing_subscriber::layer::Context<'_, S>,
+    ) {
+        let level = event.metadata().level();
+        let level = match *level {
+            tracing::Level::ERROR => "error",
+            tracing::Level::WARN => "warn",
+            tracing::Level::INFO => "info",
+            tracing::Level::DEBUG => "debug",
+            tracing::Level::TRACE => "trace",
+        };
+        self.0.with_label_values(&[level]).inc();
+    }
+}
+
+pub fn init(log_format: LogFormat) -> anyhow::Result<()> {
     // We fall back to printing all spans at info-level or above if
     // the RUST_LOG environment variable is not set.
-    let env_filter = tracing_subscriber::EnvFilter::try_from_default_env()
-        .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new(default_filter_str));
+    let rust_log_env_filter = || {
+        tracing_subscriber::EnvFilter::try_from_default_env()
+            .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new("info"))
+    };
 
-    let base_logger = tracing_subscriber::fmt()
-        .with_env_filter(env_filter)
-        .with_target(false)
-        .with_ansi(atty::is(atty::Stream::Stdout))
-        .with_writer(std::io::stdout);
-
-    match log_format {
-        LogFormat::Json => base_logger.json().init(),
-        LogFormat::Plain => base_logger.init(),
-        LogFormat::Test => base_logger.with_test_writer().init(),
-    }
+    // NB: the order of the with() calls does not matter.
+    // See https://docs.rs/tracing-subscriber/0.3.16/tracing_subscriber/layer/index.html#per-layer-filtering
+    use tracing_subscriber::prelude::*;
+    tracing_subscriber::registry()
+        .with({
+            let log_layer = tracing_subscriber::fmt::layer()
+                .with_target(false)
+                .with_ansi(atty::is(atty::Stream::Stdout))
+                .with_writer(std::io::stdout);
+            let log_layer = match log_format {
+                LogFormat::Json => log_layer.json().boxed(),
+                LogFormat::Plain => log_layer.boxed(),
+                LogFormat::Test => log_layer.with_test_writer().boxed(),
+            };
+            log_layer.with_filter(rust_log_env_filter())
+        })
+        .with(TracingEventCountLayer(&TRACING_EVENT_COUNT).with_filter(rust_log_env_filter()))
+        .init();
 
     Ok(())
 }
@@ -157,3 +197,33 @@ impl std::fmt::Debug for PrettyLocation<'_, '_> {
         <Self as std::fmt::Display>::fmt(self, f)
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use metrics::{core::Opts, IntCounterVec};
+
+    use super::TracingEventCountLayer;
+
+    #[test]
+    fn tracing_event_count_metric() {
+        let counter_vec =
+            IntCounterVec::new(Opts::new("testmetric", "testhelp"), &["level"]).unwrap();
+        let counter_vec = Box::leak(Box::new(counter_vec)); // make it 'static
+        let layer = TracingEventCountLayer(counter_vec);
+        use tracing_subscriber::prelude::*;
+
+        tracing::subscriber::with_default(tracing_subscriber::registry().with(layer), || {
+            tracing::trace!("foo");
+            tracing::debug!("foo");
+            tracing::info!("foo");
+            tracing::warn!("foo");
+            tracing::error!("foo");
+        });
+
+        assert_eq!(counter_vec.with_label_values(&["trace"]).get(), 1);
+        assert_eq!(counter_vec.with_label_values(&["debug"]).get(), 1);
+        assert_eq!(counter_vec.with_label_values(&["info"]).get(), 1);
+        assert_eq!(counter_vec.with_label_values(&["warn"]).get(), 1);
+        assert_eq!(counter_vec.with_label_values(&["error"]).get(), 1);
+    }
+}
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 06a97f6dff..3318e5263c 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -1201,6 +1201,36 @@ async fn handler_404(_: Request<Body>) -> Result<Response<Body>, ApiError> {
     )
 }
 
+async fn post_tracing_event_handler(mut r: Request<Body>) -> Result<Response<Body>, ApiError> {
+    #[derive(Debug, serde::Deserialize)]
+    #[serde(rename_all = "lowercase")]
+    enum Level {
+        Error,
+        Warn,
+        Info,
+        Debug,
+        Trace,
+    }
+    #[derive(Debug, serde::Deserialize)]
+    struct Request {
+        level: Level,
+        message: String,
+    }
+    let body: Request = json_request(&mut r)
+        .await
+        .map_err(|_| ApiError::BadRequest(anyhow::anyhow!("invalid JSON body")))?;
+
+    match body.level {
+        Level::Error => tracing::error!(?body.message),
+        Level::Warn => tracing::warn!(?body.message),
+        Level::Info => tracing::info!(?body.message),
+        Level::Debug => tracing::debug!(?body.message),
+        Level::Trace => tracing::trace!(?body.message),
+    }
+
+    json_response(StatusCode::OK, ())
+}
+
 pub fn make_router(
     conf: &'static PageServerConf,
     launch_ts: &'static LaunchTimestamp,
@@ -1341,5 +1371,9 @@ pub fn make_router(
             testing_api!("set tenant state to broken", handle_tenant_break),
         )
         .get("/v1/panic", |r| RequestSpan(always_panic_handler).handle(r))
+        .post(
+            "/v1/tracing/event",
+            testing_api!("emit a tracing event", post_tracing_event_handler),
+        )
         .any(handler_404))
 }
diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py
index 2984f2c7d3..c88b985c8e 100644
--- a/test_runner/fixtures/metrics.py
+++ b/test_runner/fixtures/metrics.py
@@ -53,6 +53,7 @@ PAGESERVER_GLOBAL_METRICS: Tuple[str, ...] = (
     "pageserver_storage_operations_seconds_global_bucket",
     "libmetrics_launch_timestamp",
     "libmetrics_build_info",
+    "libmetrics_tracing_event_count_total",
 )
 
 PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py
index 69042478c7..cf92aeb6c0 100644
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -550,3 +550,13 @@ class PageserverHttpClient(requests.Session):
     def tenant_break(self, tenant_id: TenantId):
         res = self.put(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/break")
         self.verbose_error(res)
+
+    def post_tracing_event(self, level: str, message: str):
+        res = self.post(
+            f"http://localhost:{self.port}/v1/tracing/event",
+            json={
+                "level": level,
+                "message": message,
+            },
+        )
+        self.verbose_error(res)
diff --git a/test_runner/regress/test_logging.py b/test_runner/regress/test_logging.py
new file mode 100644
index 0000000000..d559be0a8f
--- /dev/null
+++ b/test_runner/regress/test_logging.py
@@ -0,0 +1,49 @@
+import uuid
+
+import pytest
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import NeonEnvBuilder
+from fixtures.utils import wait_until
+
+
+@pytest.mark.parametrize("level", ["trace", "debug", "info", "warn", "error"])
+def test_logging_event_count(neon_env_builder: NeonEnvBuilder, level: str):
+    # self-test: make sure the event is logged (i.e., our testing endpoint works)
+    log_expected = {
+        "trace": False,
+        "debug": False,
+        "info": True,
+        "warn": True,
+        "error": True,
+    }[level]
+
+    env = neon_env_builder.init_start()
+    ps_http = env.pageserver.http_client()
+    msg_id = uuid.uuid4().hex
+
+    # NB: the _total suffix is added by our prometheus client
+    before = ps_http.get_metric_value("libmetrics_tracing_event_count_total", {"level": level})
+
+    # post the event
+    ps_http.post_tracing_event(level, msg_id)
+    if log_expected:
+        env.pageserver.allowed_errors.append(f".*{msg_id}.*")
+
+    def assert_logged():
+        if not log_expected:
+            return
+        assert env.pageserver.log_contains(f".*{msg_id}.*")
+
+    wait_until(10, 0.5, assert_logged)
+
+    # make sure it's counted
+    def assert_metric_value():
+        if not log_expected:
+            return
+        # NB: the _total suffix is added by our prometheus client
+        val = ps_http.get_metric_value("libmetrics_tracing_event_count_total", {"level": level})
+        val = val or 0.0
+        log.info("libmetrics_tracing_event_count: %s", val)
+        assert val > (before or 0.0)
+
+    wait_until(10, 1, assert_metric_value)

From 4911d7ce6f6ab1f89ac1b026add8514e8e84979d Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Tue, 25 Apr 2023 15:22:23 +0300
Subject: [PATCH 04/31] feat: warn when requests get cancelled (#4064)

Add a simple disarmable dropguard to log if request is cancelled before
it is completed. We currently don't have this, and it makes for
difficult to know when the request was dropped.
---
 libs/utils/src/http/endpoint.rs | 39 ++++++++++++++++++++++++++++++++-
 1 file changed, 38 insertions(+), 1 deletion(-)

diff --git a/libs/utils/src/http/endpoint.rs b/libs/utils/src/http/endpoint.rs
index 616f2b8468..b11aef9892 100644
--- a/libs/utils/src/http/endpoint.rs
+++ b/libs/utils/src/http/endpoint.rs
@@ -76,6 +76,7 @@ where
 
         let log_quietly = method == Method::GET;
         async move {
+            let cancellation_guard = RequestCancelled::warn_when_dropped_without_responding();
             if log_quietly {
                 debug!("Handling request");
             } else {
@@ -87,7 +88,11 @@ where
             // Usage of the error handler also means that we expect only the `ApiError` errors to be raised in this call.
             //
             // Panics are not handled separately, there's a `tracing_panic_hook` from another module to do that globally.
-            match (self.0)(request).await {
+            let res = (self.0)(request).await;
+
+            cancellation_guard.disarm();
+
+            match res {
                 Ok(response) => {
                     let response_status = response.status();
                     if log_quietly && response_status.is_success() {
@@ -105,6 +110,38 @@ where
     }
 }
 
+/// Drop guard to WARN in case the request was dropped before completion.
+struct RequestCancelled {
+    warn: Option<tracing::Span>,
+}
+
+impl RequestCancelled {
+    /// Create the drop guard using the [`tracing::Span::current`] as the span.
+    fn warn_when_dropped_without_responding() -> Self {
+        RequestCancelled {
+            warn: Some(tracing::Span::current()),
+        }
+    }
+
+    /// Consume the drop guard without logging anything.
+    fn disarm(mut self) {
+        self.warn = None;
+    }
+}
+
+impl Drop for RequestCancelled {
+    fn drop(&mut self) {
+        if let Some(span) = self.warn.take() {
+            // the span has all of the info already, but the outer `.instrument(span)` has already
+            // been dropped, so we need to manually re-enter it for this message.
+            //
+            // this is what the instrument would do before polling so it is fine.
+            let _g = span.entered();
+            warn!("request was dropped before completing");
+        }
+    }
+}
+
 async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
     SERVE_METRICS_COUNT.inc();
 

From fa20e3757432a0b900f33a89441f7fee02fc06c9 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Tue, 25 Apr 2023 14:22:48 +0200
Subject: [PATCH 05/31] add gauge for in-flight layer uploads (#3951)

For the "worst-case /storage usage panel", we need to compute
```
remote size + local-only size
```

We currently don't have a metric for local-only layers.

The number of in-flight layers in the upload queue is just that, so, let
Prometheus scrape it.

The metric is two counters (started and finished).
The delta is the amount of in-flight uploads in the queue.

The metrics are incremented in the respective `call_unfinished_metric_*`
functions.
These track ongoing operations by file_kind and op_kind.
We only need this metric for layer uploads, so, there's the new
RemoteTimelineClientMetricsCallTrackSize type that forces all call sites
to decide whether they want the size tracked or not.
If we find that other file_kinds or op_kinds are interesting (metadata
uploads, layer downloads, layer deletes) are interesting, we can just
enable them, and they'll be just another label combination within the
metrics that this PR adds.

fixes https://github.com/neondatabase/neon/issues/3922
---
 pageserver/src/metrics.rs                     | 195 +++++++++++-
 .../src/tenant/remote_timeline_client.rs      | 285 ++++++++++++++----
 test_runner/fixtures/metrics.py               |   2 +
 3 files changed, 405 insertions(+), 77 deletions(-)

diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index c075315683..cf60a1a404 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -385,6 +385,26 @@ static REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST: Lazy<HistogramVec> = Lazy::new
     .expect("failed to define a metric")
 });
 
+static REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "pageserver_remote_timeline_client_bytes_started",
+        "Incremented by the number of bytes associated with a remote timeline client operation. \
+         The increment happens when the operation is scheduled.",
+        &["tenant_id", "timeline_id", "file_kind", "op_kind"],
+    )
+    .expect("failed to define a metric")
+});
+
+static REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "pageserver_remote_timeline_client_bytes_finished",
+        "Incremented by the number of bytes associated with a remote timeline client operation. \
+         The increment happens when the operation finishes (regardless of success/failure/shutdown).",
+        &["tenant_id", "timeline_id", "file_kind", "op_kind"],
+    )
+    .expect("failed to define a metric")
+});
+
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 pub enum RemoteOpKind {
     Upload,
@@ -739,6 +759,8 @@ pub struct RemoteTimelineClientMetrics {
     remote_operation_time: Mutex<HashMap<(&'static str, &'static str, &'static str), Histogram>>,
     calls_unfinished_gauge: Mutex<HashMap<(&'static str, &'static str), IntGauge>>,
     calls_started_hist: Mutex<HashMap<(&'static str, &'static str), Histogram>>,
+    bytes_started_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
+    bytes_finished_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
 }
 
 impl RemoteTimelineClientMetrics {
@@ -749,6 +771,8 @@ impl RemoteTimelineClientMetrics {
             remote_operation_time: Mutex::new(HashMap::default()),
             calls_unfinished_gauge: Mutex::new(HashMap::default()),
             calls_started_hist: Mutex::new(HashMap::default()),
+            bytes_started_counter: Mutex::new(HashMap::default()),
+            bytes_finished_counter: Mutex::new(HashMap::default()),
             remote_physical_size_gauge: Mutex::new(None),
         }
     }
@@ -787,6 +811,7 @@ impl RemoteTimelineClientMetrics {
         });
         metric.clone()
     }
+
     fn calls_unfinished_gauge(
         &self,
         file_kind: &RemoteOpFileKind,
@@ -828,32 +853,125 @@ impl RemoteTimelineClientMetrics {
         });
         metric.clone()
     }
+
+    fn bytes_started_counter(
+        &self,
+        file_kind: &RemoteOpFileKind,
+        op_kind: &RemoteOpKind,
+    ) -> IntCounter {
+        // XXX would be nice to have an upgradable RwLock
+        let mut guard = self.bytes_started_counter.lock().unwrap();
+        let key = (file_kind.as_str(), op_kind.as_str());
+        let metric = guard.entry(key).or_insert_with(move || {
+            REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER
+                .get_metric_with_label_values(&[
+                    &self.tenant_id.to_string(),
+                    &self.timeline_id.to_string(),
+                    key.0,
+                    key.1,
+                ])
+                .unwrap()
+        });
+        metric.clone()
+    }
+
+    fn bytes_finished_counter(
+        &self,
+        file_kind: &RemoteOpFileKind,
+        op_kind: &RemoteOpKind,
+    ) -> IntCounter {
+        // XXX would be nice to have an upgradable RwLock
+        let mut guard = self.bytes_finished_counter.lock().unwrap();
+        let key = (file_kind.as_str(), op_kind.as_str());
+        let metric = guard.entry(key).or_insert_with(move || {
+            REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER
+                .get_metric_with_label_values(&[
+                    &self.tenant_id.to_string(),
+                    &self.timeline_id.to_string(),
+                    key.0,
+                    key.1,
+                ])
+                .unwrap()
+        });
+        metric.clone()
+    }
+}
+
+#[cfg(test)]
+impl RemoteTimelineClientMetrics {
+    pub fn get_bytes_started_counter_value(
+        &self,
+        file_kind: &RemoteOpFileKind,
+        op_kind: &RemoteOpKind,
+    ) -> Option<u64> {
+        let guard = self.bytes_started_counter.lock().unwrap();
+        let key = (file_kind.as_str(), op_kind.as_str());
+        guard.get(&key).map(|counter| counter.get())
+    }
+
+    pub fn get_bytes_finished_counter_value(
+        &self,
+        file_kind: &RemoteOpFileKind,
+        op_kind: &RemoteOpKind,
+    ) -> Option<u64> {
+        let guard = self.bytes_finished_counter.lock().unwrap();
+        let key = (file_kind.as_str(), op_kind.as_str());
+        guard.get(&key).map(|counter| counter.get())
+    }
 }
 
 /// See [`RemoteTimelineClientMetrics::call_begin`].
 #[must_use]
-pub(crate) struct RemoteTimelineClientCallMetricGuard(Option<IntGauge>);
+pub(crate) struct RemoteTimelineClientCallMetricGuard {
+    /// Decremented on drop.
+    calls_unfinished_metric: Option<IntGauge>,
+    /// If Some(), this references the bytes_finished metric, and we increment it by the given `u64` on drop.
+    bytes_finished: Option<(IntCounter, u64)>,
+}
 
 impl RemoteTimelineClientCallMetricGuard {
-    /// Consume this guard object without decrementing the metric.
-    /// The caller vouches to do this manually, so that the prior increment of the gauge will cancel out.
+    /// Consume this guard object without performing the metric updates it would do on `drop()`.
+    /// The caller vouches to do the metric updates manually.
     pub fn will_decrement_manually(mut self) {
-        self.0 = None; // prevent drop() from decrementing
+        let RemoteTimelineClientCallMetricGuard {
+            calls_unfinished_metric,
+            bytes_finished,
+        } = &mut self;
+        calls_unfinished_metric.take();
+        bytes_finished.take();
     }
 }
 
 impl Drop for RemoteTimelineClientCallMetricGuard {
     fn drop(&mut self) {
-        if let RemoteTimelineClientCallMetricGuard(Some(guard)) = self {
+        let RemoteTimelineClientCallMetricGuard {
+            calls_unfinished_metric,
+            bytes_finished,
+        } = self;
+        if let Some(guard) = calls_unfinished_metric.take() {
             guard.dec();
         }
+        if let Some((bytes_finished_metric, value)) = bytes_finished {
+            bytes_finished_metric.inc_by(*value);
+        }
     }
 }
 
+/// The enum variants communicate to the [`RemoteTimelineClientMetrics`] whether to
+/// track the byte size of this call in applicable metric(s).
+pub(crate) enum RemoteTimelineClientMetricsCallTrackSize {
+    /// Do not account for this call's byte size in any metrics.
+    /// The `reason` field is there to make the call sites self-documenting
+    /// about why they don't need the metric.
+    DontTrackSize { reason: &'static str },
+    /// Track the byte size of the call in applicable metric(s).
+    Bytes(u64),
+}
+
 impl RemoteTimelineClientMetrics {
-    /// Increment the metrics that track ongoing calls to the remote timeline client instance.
+    /// Update the metrics that change when a call to the remote timeline client instance starts.
     ///
-    /// Drop the returned guard object once the operation is finished to decrement the values.
+    /// Drop the returned guard object once the operation is finished to updates corresponding metrics that track completions.
     /// Or, use [`RemoteTimelineClientCallMetricGuard::will_decrement_manually`] and [`call_end`] if that
     /// is more suitable.
     /// Never do both.
@@ -861,24 +979,51 @@ impl RemoteTimelineClientMetrics {
         &self,
         file_kind: &RemoteOpFileKind,
         op_kind: &RemoteOpKind,
+        size: RemoteTimelineClientMetricsCallTrackSize,
     ) -> RemoteTimelineClientCallMetricGuard {
-        let unfinished_metric = self.calls_unfinished_gauge(file_kind, op_kind);
+        let calls_unfinished_metric = self.calls_unfinished_gauge(file_kind, op_kind);
         self.calls_started_hist(file_kind, op_kind)
-            .observe(unfinished_metric.get() as f64);
-        unfinished_metric.inc();
-        RemoteTimelineClientCallMetricGuard(Some(unfinished_metric))
+            .observe(calls_unfinished_metric.get() as f64);
+        calls_unfinished_metric.inc(); // NB: inc after the histogram, see comment on underlying metric
+
+        let bytes_finished = match size {
+            RemoteTimelineClientMetricsCallTrackSize::DontTrackSize { reason: _reason } => {
+                // nothing to do
+                None
+            }
+            RemoteTimelineClientMetricsCallTrackSize::Bytes(size) => {
+                self.bytes_started_counter(file_kind, op_kind).inc_by(size);
+                let finished_counter = self.bytes_finished_counter(file_kind, op_kind);
+                Some((finished_counter, size))
+            }
+        };
+        RemoteTimelineClientCallMetricGuard {
+            calls_unfinished_metric: Some(calls_unfinished_metric),
+            bytes_finished,
+        }
     }
 
-    /// Manually decrement the metric instead of using the guard object.
+    /// Manually udpate the metrics that track completions, instead of using the guard object.
     /// Using the guard object is generally preferable.
     /// See [`call_begin`] for more context.
-    pub(crate) fn call_end(&self, file_kind: &RemoteOpFileKind, op_kind: &RemoteOpKind) {
-        let unfinished_metric = self.calls_unfinished_gauge(file_kind, op_kind);
+    pub(crate) fn call_end(
+        &self,
+        file_kind: &RemoteOpFileKind,
+        op_kind: &RemoteOpKind,
+        size: RemoteTimelineClientMetricsCallTrackSize,
+    ) {
+        let calls_unfinished_metric = self.calls_unfinished_gauge(file_kind, op_kind);
         debug_assert!(
-            unfinished_metric.get() > 0,
+            calls_unfinished_metric.get() > 0,
             "begin and end should cancel out"
         );
-        unfinished_metric.dec();
+        calls_unfinished_metric.dec();
+        match size {
+            RemoteTimelineClientMetricsCallTrackSize::DontTrackSize { reason: _reason } => {}
+            RemoteTimelineClientMetricsCallTrackSize::Bytes(size) => {
+                self.bytes_finished_counter(file_kind, op_kind).inc_by(size);
+            }
+        }
     }
 }
 
@@ -891,6 +1036,8 @@ impl Drop for RemoteTimelineClientMetrics {
             remote_operation_time,
             calls_unfinished_gauge,
             calls_started_hist,
+            bytes_started_counter,
+            bytes_finished_counter,
         } = self;
         for ((a, b, c), _) in remote_operation_time.get_mut().unwrap().drain() {
             let _ = REMOTE_OPERATION_TIME.remove_label_values(&[tenant_id, timeline_id, a, b, c]);
@@ -911,6 +1058,22 @@ impl Drop for RemoteTimelineClientMetrics {
                 b,
             ]);
         }
+        for ((a, b), _) in bytes_started_counter.get_mut().unwrap().drain() {
+            let _ = REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER.remove_label_values(&[
+                tenant_id,
+                timeline_id,
+                a,
+                b,
+            ]);
+        }
+        for ((a, b), _) in bytes_finished_counter.get_mut().unwrap().drain() {
+            let _ = REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER.remove_label_values(&[
+                tenant_id,
+                timeline_id,
+                a,
+                b,
+            ]);
+        }
         {
             let _ = remote_physical_size_gauge; // use to avoid 'unused' warning in desctructuring above
             let _ = REMOTE_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index 28c4943dbd..c42824a8b5 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -219,7 +219,8 @@ use utils::lsn::Lsn;
 
 use crate::metrics::{
     MeasureRemoteOp, RemoteOpFileKind, RemoteOpKind, RemoteTimelineClientMetrics,
-    REMOTE_ONDEMAND_DOWNLOADED_BYTES, REMOTE_ONDEMAND_DOWNLOADED_LAYERS,
+    RemoteTimelineClientMetricsCallTrackSize, REMOTE_ONDEMAND_DOWNLOADED_BYTES,
+    REMOTE_ONDEMAND_DOWNLOADED_LAYERS,
 };
 use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
 use crate::{
@@ -367,9 +368,13 @@ impl RemoteTimelineClient {
 
     /// Download index file
     pub async fn download_index_file(&self) -> Result<IndexPart, DownloadError> {
-        let _unfinished_gauge_guard = self
-            .metrics
-            .call_begin(&RemoteOpFileKind::Index, &RemoteOpKind::Download);
+        let _unfinished_gauge_guard = self.metrics.call_begin(
+            &RemoteOpFileKind::Index,
+            &RemoteOpKind::Download,
+            crate::metrics::RemoteTimelineClientMetricsCallTrackSize::DontTrackSize {
+                reason: "no need for a downloads gauge",
+            },
+        );
 
         download::download_index_part(
             self.conf,
@@ -398,9 +403,13 @@ impl RemoteTimelineClient {
         layer_metadata: &LayerFileMetadata,
     ) -> anyhow::Result<u64> {
         let downloaded_size = {
-            let _unfinished_gauge_guard = self
-                .metrics
-                .call_begin(&RemoteOpFileKind::Layer, &RemoteOpKind::Download);
+            let _unfinished_gauge_guard = self.metrics.call_begin(
+                &RemoteOpFileKind::Layer,
+                &RemoteOpKind::Download,
+                crate::metrics::RemoteTimelineClientMetricsCallTrackSize::DontTrackSize {
+                    reason: "no need for a downloads gauge",
+                },
+            );
             download::download_layer_file(
                 self.conf,
                 &self.storage_impl,
@@ -886,11 +895,32 @@ impl RemoteTimelineClient {
     fn calls_unfinished_metric_impl(
         &self,
         op: &UploadOp,
-    ) -> Option<(RemoteOpFileKind, RemoteOpKind)> {
+    ) -> Option<(
+        RemoteOpFileKind,
+        RemoteOpKind,
+        RemoteTimelineClientMetricsCallTrackSize,
+    )> {
+        use RemoteTimelineClientMetricsCallTrackSize::DontTrackSize;
         let res = match op {
-            UploadOp::UploadLayer(_, _) => (RemoteOpFileKind::Layer, RemoteOpKind::Upload),
-            UploadOp::UploadMetadata(_, _) => (RemoteOpFileKind::Index, RemoteOpKind::Upload),
-            UploadOp::Delete(file_kind, _) => (*file_kind, RemoteOpKind::Delete),
+            UploadOp::UploadLayer(_, m) => (
+                RemoteOpFileKind::Layer,
+                RemoteOpKind::Upload,
+                RemoteTimelineClientMetricsCallTrackSize::Bytes(m.file_size()),
+            ),
+            UploadOp::UploadMetadata(_, _) => (
+                RemoteOpFileKind::Index,
+                RemoteOpKind::Upload,
+                DontTrackSize {
+                    reason: "metadata uploads are tiny",
+                },
+            ),
+            UploadOp::Delete(file_kind, _) => (
+                *file_kind,
+                RemoteOpKind::Delete,
+                DontTrackSize {
+                    reason: "should we track deletes? positive or negative sign?",
+                },
+            ),
             UploadOp::Barrier(_) => {
                 // we do not account these
                 return None;
@@ -900,20 +930,20 @@ impl RemoteTimelineClient {
     }
 
     fn calls_unfinished_metric_begin(&self, op: &UploadOp) {
-        let (file_kind, op_kind) = match self.calls_unfinished_metric_impl(op) {
+        let (file_kind, op_kind, track_bytes) = match self.calls_unfinished_metric_impl(op) {
             Some(x) => x,
             None => return,
         };
-        let guard = self.metrics.call_begin(&file_kind, &op_kind);
+        let guard = self.metrics.call_begin(&file_kind, &op_kind, track_bytes);
         guard.will_decrement_manually(); // in unfinished_ops_metric_end()
     }
 
     fn calls_unfinished_metric_end(&self, op: &UploadOp) {
-        let (file_kind, op_kind) = match self.calls_unfinished_metric_impl(op) {
+        let (file_kind, op_kind, track_bytes) = match self.calls_unfinished_metric_impl(op) {
             Some(x) => x,
             None => return,
         };
-        self.metrics.call_end(&file_kind, &op_kind);
+        self.metrics.call_end(&file_kind, &op_kind, track_bytes);
     }
 
     fn stop(&self) {
@@ -981,11 +1011,19 @@ impl RemoteTimelineClient {
 mod tests {
     use super::*;
     use crate::{
-        tenant::harness::{TenantHarness, TIMELINE_ID},
+        context::RequestContext,
+        tenant::{
+            harness::{TenantHarness, TIMELINE_ID},
+            Tenant,
+        },
         DEFAULT_PG_VERSION,
     };
     use remote_storage::{RemoteStorageConfig, RemoteStorageKind};
-    use std::{collections::HashSet, path::Path};
+    use std::{
+        collections::HashSet,
+        path::{Path, PathBuf},
+    };
+    use tokio::runtime::EnterGuard;
     use utils::lsn::Lsn;
 
     pub(super) fn dummy_contents(name: &str) -> Vec<u8> {
@@ -1034,39 +1072,80 @@ mod tests {
         assert_eq!(found, expected);
     }
 
+    struct TestSetup {
+        runtime: &'static tokio::runtime::Runtime,
+        entered_runtime: EnterGuard<'static>,
+        harness: TenantHarness<'static>,
+        tenant: Arc<Tenant>,
+        tenant_ctx: RequestContext,
+        remote_fs_dir: PathBuf,
+        client: Arc<RemoteTimelineClient>,
+    }
+
+    impl TestSetup {
+        fn new(test_name: &str) -> anyhow::Result<Self> {
+            // Use a current-thread runtime in the test
+            let runtime = Box::leak(Box::new(
+                tokio::runtime::Builder::new_current_thread()
+                    .enable_all()
+                    .build()?,
+            ));
+            let entered_runtime = runtime.enter();
+
+            let test_name = Box::leak(Box::new(format!("remote_timeline_client__{test_name}")));
+            let harness = TenantHarness::create(test_name)?;
+            let (tenant, ctx) = runtime.block_on(harness.load());
+            // create an empty timeline directory
+            let timeline =
+                tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?;
+            let _ = timeline.initialize(&ctx).unwrap();
+
+            let remote_fs_dir = harness.conf.workdir.join("remote_fs");
+            std::fs::create_dir_all(remote_fs_dir)?;
+            let remote_fs_dir = std::fs::canonicalize(harness.conf.workdir.join("remote_fs"))?;
+
+            let storage_config = RemoteStorageConfig {
+                max_concurrent_syncs: std::num::NonZeroUsize::new(
+                    remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS,
+                )
+                .unwrap(),
+                max_sync_errors: std::num::NonZeroU32::new(
+                    remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS,
+                )
+                .unwrap(),
+                storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()),
+            };
+
+            let storage = GenericRemoteStorage::from_config(&storage_config).unwrap();
+
+            let client = Arc::new(RemoteTimelineClient {
+                conf: harness.conf,
+                runtime,
+                tenant_id: harness.tenant_id,
+                timeline_id: TIMELINE_ID,
+                storage_impl: storage,
+                upload_queue: Mutex::new(UploadQueue::Uninitialized),
+                metrics: Arc::new(RemoteTimelineClientMetrics::new(
+                    &harness.tenant_id,
+                    &TIMELINE_ID,
+                )),
+            });
+
+            Ok(Self {
+                runtime,
+                entered_runtime,
+                harness,
+                tenant,
+                tenant_ctx: ctx,
+                remote_fs_dir,
+                client,
+            })
+        }
+    }
+
     // Test scheduling
     #[test]
     fn upload_scheduling() -> anyhow::Result<()> {
-        // Use a current-thread runtime in the test
-        let runtime = Box::leak(Box::new(
-            tokio::runtime::Builder::new_current_thread()
-                .enable_all()
-                .build()?,
-        ));
-        let _entered = runtime.enter();
-
-        let harness = TenantHarness::create("upload_scheduling")?;
-        let (tenant, ctx) = runtime.block_on(harness.load());
-        let _timeline =
-            tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?;
-        let timeline_path = harness.timeline_path(&TIMELINE_ID);
-
-        let remote_fs_dir = harness.conf.workdir.join("remote_fs");
-        std::fs::create_dir_all(remote_fs_dir)?;
-        let remote_fs_dir = std::fs::canonicalize(harness.conf.workdir.join("remote_fs"))?;
-
-        let storage_config = RemoteStorageConfig {
-            max_concurrent_syncs: std::num::NonZeroUsize::new(
-                remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS,
-            )
-            .unwrap(),
-            max_sync_errors: std::num::NonZeroU32::new(
-                remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS,
-            )
-            .unwrap(),
-            storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()),
-        };
-
         // Test outline:
         //
         // Schedule upload of a bunch of layers. Check that they are started immediately, not queued
@@ -1081,21 +1160,19 @@ mod tests {
         // Schedule another deletion. Check that it's launched immediately.
         // Schedule index upload. Check that it's queued
 
-        println!("workdir: {}", harness.conf.workdir.display());
-
-        let storage_impl = GenericRemoteStorage::from_config(&storage_config)?;
-        let client = Arc::new(RemoteTimelineClient {
-            conf: harness.conf,
+        let TestSetup {
             runtime,
-            tenant_id: harness.tenant_id,
-            timeline_id: TIMELINE_ID,
-            storage_impl,
-            upload_queue: Mutex::new(UploadQueue::Uninitialized),
-            metrics: Arc::new(RemoteTimelineClientMetrics::new(
-                &harness.tenant_id,
-                &TIMELINE_ID,
-            )),
-        });
+            entered_runtime: _entered_runtime,
+            harness,
+            tenant: _tenant,
+            tenant_ctx: _tenant_ctx,
+            remote_fs_dir,
+            client,
+        } = TestSetup::new("upload_scheduling").unwrap();
+
+        let timeline_path = harness.timeline_path(&TIMELINE_ID);
+
+        println!("workdir: {}", harness.conf.workdir.display());
 
         let remote_timeline_dir =
             remote_fs_dir.join(timeline_path.strip_prefix(&harness.conf.workdir)?);
@@ -1216,4 +1293,90 @@ mod tests {
 
         Ok(())
     }
+
+    #[test]
+    fn bytes_unfinished_gauge_for_layer_file_uploads() -> anyhow::Result<()> {
+        // Setup
+
+        let TestSetup {
+            runtime,
+            harness,
+            client,
+            ..
+        } = TestSetup::new("metrics")?;
+
+        let metadata = dummy_metadata(Lsn(0x10));
+        client.init_upload_queue_for_empty_remote(&metadata)?;
+
+        let timeline_path = harness.timeline_path(&TIMELINE_ID);
+
+        let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
+        let content_1 = dummy_contents("foo");
+        std::fs::write(
+            timeline_path.join(layer_file_name_1.file_name()),
+            &content_1,
+        )?;
+
+        #[derive(Debug, PartialEq)]
+        struct BytesStartedFinished {
+            started: Option<usize>,
+            finished: Option<usize>,
+        }
+        let get_bytes_started_stopped = || {
+            let started = client
+                .metrics
+                .get_bytes_started_counter_value(&RemoteOpFileKind::Layer, &RemoteOpKind::Upload)
+                .map(|v| v.try_into().unwrap());
+            let stopped = client
+                .metrics
+                .get_bytes_finished_counter_value(&RemoteOpFileKind::Layer, &RemoteOpKind::Upload)
+                .map(|v| v.try_into().unwrap());
+            BytesStartedFinished {
+                started,
+                finished: stopped,
+            }
+        };
+
+        // Test
+
+        let init = get_bytes_started_stopped();
+
+        client.schedule_layer_file_upload(
+            &layer_file_name_1,
+            &LayerFileMetadata::new(content_1.len() as u64),
+        )?;
+
+        let pre = get_bytes_started_stopped();
+
+        runtime.block_on(client.wait_completion())?;
+
+        let post = get_bytes_started_stopped();
+
+        // Validate
+
+        assert_eq!(
+            init,
+            BytesStartedFinished {
+                started: None,
+                finished: None
+            }
+        );
+        assert_eq!(
+            pre,
+            BytesStartedFinished {
+                started: Some(content_1.len()),
+                // assert that the _finished metric is created eagerly so that subtractions work on first sample
+                finished: Some(0),
+            }
+        );
+        assert_eq!(
+            post,
+            BytesStartedFinished {
+                started: Some(content_1.len()),
+                finished: Some(content_1.len())
+            }
+        );
+
+        Ok(())
+    }
 }
diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py
index c88b985c8e..5fed6fcf84 100644
--- a/test_runner/fixtures/metrics.py
+++ b/test_runner/fixtures/metrics.py
@@ -45,6 +45,8 @@ PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS: Tuple[str, ...] = (
     *[f"pageserver_remote_timeline_client_calls_started_{x}" for x in ["bucket", "count", "sum"]],
     *[f"pageserver_remote_operation_seconds_{x}" for x in ["bucket", "count", "sum"]],
     "pageserver_remote_physical_size",
+    "pageserver_remote_timeline_client_bytes_started_total",
+    "pageserver_remote_timeline_client_bytes_finished_total",
 )
 
 PAGESERVER_GLOBAL_METRICS: Tuple[str, ...] = (

From cb9473928df94148b42297fe30b0b99682609249 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Tue, 25 Apr 2023 16:22:16 +0300
Subject: [PATCH 06/31] feat: add rough timings for basebackup (#4062)

just record the time needed for waiting the lsn and then the basebackup
in a log message in millis. this is related to ongoing investigations to
cold start performance.

this could also be a a counter. it cannot be added next to smgr
histograms, because we don't want another histogram per timeline.

the aim is to allow drilling deeper into which timelines were slow, and
to understand why some need two basebackups.
---
 pageserver/src/page_service.rs | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index bd38a7a2f3..135f08e846 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -700,6 +700,8 @@ impl PageServerHandler {
         full_backup: bool,
         ctx: RequestContext,
     ) -> anyhow::Result<()> {
+        let started = std::time::Instant::now();
+
         // check that the timeline exists
         let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?;
         let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
@@ -712,6 +714,8 @@ impl PageServerHandler {
                 .context("invalid basebackup lsn")?;
         }
 
+        let lsn_awaited_after = started.elapsed();
+
         // switch client to COPYOUT
         pgb.write_message_noflush(&BeMessage::CopyOutResponse)?;
         pgb.flush().await?;
@@ -732,7 +736,17 @@ impl PageServerHandler {
 
         pgb.write_message_noflush(&BeMessage::CopyDone)?;
         pgb.flush().await?;
-        info!("basebackup complete");
+
+        let basebackup_after = started
+            .elapsed()
+            .checked_sub(lsn_awaited_after)
+            .unwrap_or(Duration::ZERO);
+
+        info!(
+            lsn_await_millis = lsn_awaited_after.as_millis(),
+            basebackup_millis = basebackup_after.as_millis(),
+            "basebackup complete"
+        );
 
         Ok(())
     }

From dbbe032c395f7f4a8a13e4e4631adb801a09c1bd Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Tue, 25 Apr 2023 15:33:30 +0200
Subject: [PATCH 07/31] neon_local: fix `tenant create -c eviction_policy:...`
 (#4004)

And add corresponding unit test.

The fix is to use `.remove()` instead of `.get()` when processing the
arugments hash map.
The code uses emptiness of the hash map to determine whether all
arguments have been processed.
This was likely a copy-paste error.


refs https://github.com/neondatabase/neon/issues/3942
---
 control_plane/src/pageserver.rs         |  4 ++--
 test_runner/regress/test_tenant_conf.py | 24 ++++++++++++++++++++++++
 2 files changed, 26 insertions(+), 2 deletions(-)

diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs
index b700d426ba..75991045a4 100644
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -359,8 +359,8 @@ impl PageServerNode {
                 .transpose()
                 .context("Failed to parse 'trace_read_requests' as bool")?,
             eviction_policy: settings
-                .get("eviction_policy")
-                .map(|x| serde_json::from_str(x))
+                .remove("eviction_policy")
+                .map(serde_json::from_str)
                 .transpose()
                 .context("Failed to parse 'eviction_policy' json")?,
             min_resident_size_override: settings
diff --git a/test_runner/regress/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py
index 1ed86d19a2..b83bd5fc99 100644
--- a/test_runner/regress/test_tenant_conf.py
+++ b/test_runner/regress/test_tenant_conf.py
@@ -1,3 +1,4 @@
+import json
 from contextlib import closing
 
 import psycopg2.extras
@@ -22,6 +23,7 @@ wait_lsn_timeout='111 s';
 checkpoint_distance = 10000
 compaction_target_size = 1048576
 evictions_low_residence_duration_metric_threshold = "2 days"
+eviction_policy = { "kind" = "LayerAccessThreshold", period = "20s", threshold = "23 hours" }
 """
 
     env = neon_env_builder.init_start()
@@ -44,6 +46,7 @@ evictions_low_residence_duration_metric_threshold = "2 days"
         "checkpoint_distance": "20000",
         "gc_period": "30sec",
         "evictions_low_residence_duration_metric_threshold": "42s",
+        "eviction_policy": json.dumps({"kind": "NoEviction"}),
     }
     tenant, _ = env.neon_cli.create_tenant(conf=new_conf)
 
@@ -84,6 +87,11 @@ evictions_low_residence_duration_metric_threshold = "2 days"
     assert effective_config["image_creation_threshold"] == 3
     assert effective_config["pitr_interval"] == "7days"
     assert effective_config["evictions_low_residence_duration_metric_threshold"] == "2days"
+    assert effective_config["eviction_policy"] == {
+        "kind": "LayerAccessThreshold",
+        "period": "20s",
+        "threshold": "23h",
+    }
 
     # check the configuration of the new tenant
     with closing(env.pageserver.connect()) as psconn:
@@ -121,6 +129,9 @@ evictions_low_residence_duration_metric_threshold = "2 days"
     assert (
         new_effective_config["evictions_low_residence_duration_metric_threshold"] == "42s"
     ), "Should override default value"
+    assert new_effective_config["eviction_policy"] == {
+        "kind": "NoEviction"
+    }, "Specific 'eviction_policy' config should override the default value"
     assert new_effective_config["compaction_target_size"] == 1048576
     assert new_effective_config["compaction_period"] == "20s"
     assert new_effective_config["compaction_threshold"] == 10
@@ -135,6 +146,9 @@ evictions_low_residence_duration_metric_threshold = "2 days"
         "compaction_period": "80sec",
         "image_creation_threshold": "2",
         "evictions_low_residence_duration_metric_threshold": "23h",
+        "eviction_policy": json.dumps(
+            {"kind": "LayerAccessThreshold", "period": "80s", "threshold": "42h"}
+        ),
     }
     env.neon_cli.config_tenant(
         tenant_id=tenant,
@@ -180,6 +194,11 @@ evictions_low_residence_duration_metric_threshold = "2 days"
     assert (
         updated_effective_config["evictions_low_residence_duration_metric_threshold"] == "23h"
     ), "Should override default value"
+    assert updated_effective_config["eviction_policy"] == {
+        "kind": "LayerAccessThreshold",
+        "period": "1m 20s",
+        "threshold": "1day 18h",
+    }, "Specific 'eviction_policy' config should override the default value"
     assert updated_effective_config["compaction_target_size"] == 1048576
     assert updated_effective_config["compaction_threshold"] == 10
     assert updated_effective_config["gc_horizon"] == 67108864
@@ -239,6 +258,11 @@ evictions_low_residence_duration_metric_threshold = "2 days"
     assert final_effective_config["gc_period"] == "1h"
     assert final_effective_config["image_creation_threshold"] == 3
     assert final_effective_config["evictions_low_residence_duration_metric_threshold"] == "2days"
+    assert final_effective_config["eviction_policy"] == {
+        "kind": "LayerAccessThreshold",
+        "period": "20s",
+        "threshold": "23h",
+    }
 
     # restart the pageserver and ensure that the config is still correct
     env.pageserver.stop()

From 78bbbccadbc66bef6715a5a2ad1324ccacb94587 Mon Sep 17 00:00:00 2001
From: Sergey Melnikov <sergey@neon.tech>
Date: Tue, 25 Apr 2023 16:46:52 +0200
Subject: [PATCH 08/31] Deploy proxies for preview enviroments (#4052)

## Describe your changes
Deploy `main` proxies to the preview environments
We don't deploy storage there yet, as it's tricky.

## Issue ticket number and link
https://github.com/neondatabase/cloud/issues/4737
---
 .../ansible/staging.eu-central-1.hosts.yaml   | 47 +++++++++++++
 ...u-central-1-alpha.neon-storage-broker.yaml | 52 ++++++++++++++
 .../preview-template.neon-proxy-scram.yaml    | 67 +++++++++++++++++++
 .github/workflows/deploy-dev.yml              | 52 +++++++++++++-
 4 files changed, 217 insertions(+), 1 deletion(-)
 create mode 100644 .github/ansible/staging.eu-central-1.hosts.yaml
 create mode 100644 .github/helm-values/dev-eu-central-1-alpha.neon-storage-broker.yaml
 create mode 100644 .github/helm-values/preview-template.neon-proxy-scram.yaml

diff --git a/.github/ansible/staging.eu-central-1.hosts.yaml b/.github/ansible/staging.eu-central-1.hosts.yaml
new file mode 100644
index 0000000000..db1d1adcff
--- /dev/null
+++ b/.github/ansible/staging.eu-central-1.hosts.yaml
@@ -0,0 +1,47 @@
+storage:
+  vars:
+    bucket_name: neon-dev-storage-eu-central-1
+    bucket_region: eu-central-1
+    # We only register/update storage in one preview console and manually copy to other instances
+    console_mgmt_base_url: http://neon-internal-api.helium.aws.neon.build
+    broker_endpoint: http://storage-broker-lb.alpha.eu-central-1.internal.aws.neon.build:50051
+    pageserver_config_stub:
+      pg_distrib_dir: /usr/local
+      metric_collection_endpoint: http://neon-internal-api.helium.aws.neon.build/billing/api/v1/usage_events
+      metric_collection_interval: 10min
+      disk_usage_based_eviction:
+        max_usage_pct: 80
+        min_avail_bytes: 0
+        period: "10s"
+      tenant_config:
+        eviction_policy:
+          kind: "LayerAccessThreshold"
+          period: "20m"
+          threshold: &default_eviction_threshold "20m"
+      evictions_low_residence_duration_metric_threshold: *default_eviction_threshold
+      remote_storage:
+        bucket_name: "{{ bucket_name }}"
+        bucket_region: "{{ bucket_region }}"
+        prefix_in_bucket: "pageserver/v1"
+    safekeeper_s3_prefix: safekeeper/v1/wal
+    hostname_suffix: ""
+    remote_user: ssm-user
+    ansible_aws_ssm_region: eu-central-1
+    ansible_aws_ssm_bucket_name: neon-dev-storage-eu-central-1
+    console_region_id: aws-eu-central-1
+    sentry_environment: staging
+
+  children:
+    pageservers:
+      hosts:
+        pageserver-0.eu-central-1.aws.neon.build:
+          ansible_host: i-011f93ec26cfba2d4
+
+    safekeepers:
+      hosts:
+        safekeeper-0.eu-central-1.aws.neon.build:
+          ansible_host: i-0ff026d27babf8ddd
+        safekeeper-1.eu-central-1.aws.neon.build:
+          ansible_host: i-03983a49ee54725d9
+        safekeeper-2.eu-central-1.aws.neon.build:
+          ansible_host: i-0bd025ecdb61b0db3
diff --git a/.github/helm-values/dev-eu-central-1-alpha.neon-storage-broker.yaml b/.github/helm-values/dev-eu-central-1-alpha.neon-storage-broker.yaml
new file mode 100644
index 0000000000..aaa1ec59b4
--- /dev/null
+++ b/.github/helm-values/dev-eu-central-1-alpha.neon-storage-broker.yaml
@@ -0,0 +1,52 @@
+# Helm chart values for neon-storage-broker
+podLabels:
+  neon_env: staging
+  neon_service: storage-broker
+
+# Use L4 LB
+service:
+  # service.annotations -- Annotations to add to the service
+  annotations:
+    service.beta.kubernetes.io/aws-load-balancer-type: external  # use newer AWS Load Balancer Controller
+    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
+    service.beta.kubernetes.io/aws-load-balancer-scheme: internal  # deploy LB to private subnet
+    # assign service to this name at external-dns
+    external-dns.alpha.kubernetes.io/hostname: storage-broker-lb.alpha.eu-central-1.internal.aws.neon.build
+  # service.type -- Service type
+  type: LoadBalancer
+  # service.port -- broker listen port
+  port: 50051
+
+ingress:
+  enabled: false
+
+metrics:
+  enabled: false
+
+extraManifests:
+  - apiVersion: operator.victoriametrics.com/v1beta1
+    kind: VMServiceScrape
+    metadata:
+      name: "{{ include \"neon-storage-broker.fullname\" . }}"
+      labels:
+        helm.sh/chart: neon-storage-broker-{{ .Chart.Version }}
+        app.kubernetes.io/name: neon-storage-broker
+        app.kubernetes.io/instance: neon-storage-broker
+        app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
+        app.kubernetes.io/managed-by: Helm
+      namespace: "{{ .Release.Namespace }}"
+    spec:
+      selector:
+        matchLabels:
+          app.kubernetes.io/name: "neon-storage-broker"
+      endpoints:
+        - port: broker
+          path: /metrics
+          interval: 10s
+          scrapeTimeout: 10s
+      namespaceSelector:
+        matchNames:
+          - "{{ .Release.Namespace }}"
+
+settings:
+  sentryEnvironment: "staging"
diff --git a/.github/helm-values/preview-template.neon-proxy-scram.yaml b/.github/helm-values/preview-template.neon-proxy-scram.yaml
new file mode 100644
index 0000000000..f4bd418e28
--- /dev/null
+++ b/.github/helm-values/preview-template.neon-proxy-scram.yaml
@@ -0,0 +1,67 @@
+# Helm chart values for neon-proxy-scram.
+# This is a YAML-formatted file.
+
+deploymentStrategy:
+  type: RollingUpdate
+  rollingUpdate:
+    maxSurge: 100%
+    maxUnavailable: 50%
+
+image:
+  repository: neondatabase/neon
+
+settings:
+  authBackend: "console"
+  authEndpoint: "http://neon-internal-api.${PREVIEW_NAME}.aws.neon.build/management/api/v2"
+  domain: "*.cloud.${PREVIEW_NAME}.aws.neon.build"
+  sentryEnvironment: "staging"
+  wssPort: 8443
+  metricCollectionEndpoint: "http://neon-internal-api.${PREVIEW_NAME}.aws.neon.build/billing/api/v1/usage_events"
+  metricCollectionInterval: "1min"
+
+# -- Additional labels for neon-proxy pods
+podLabels:
+  neon_service: proxy-scram
+  neon_env: test
+  neon_region: ${PREVIEW_NAME}.eu-central-1
+
+
+exposedService:
+  annotations:
+    service.beta.kubernetes.io/aws-load-balancer-type: external
+    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
+    service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
+    external-dns.alpha.kubernetes.io/hostname: cloud.${PREVIEW_NAME}.aws.neon.build
+  httpsPort: 443
+
+#metrics:
+#  enabled: true
+#  serviceMonitor:
+#    enabled: true
+#    selector:
+#      release: kube-prometheus-stack
+
+extraManifests:
+  - apiVersion: operator.victoriametrics.com/v1beta1
+    kind: VMServiceScrape
+    metadata:
+      name: "{{ include \"neon-proxy.fullname\" . }}"
+      labels:
+        helm.sh/chart: neon-proxy-{{ .Chart.Version }}
+        app.kubernetes.io/name: neon-proxy
+        app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}"
+        app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
+        app.kubernetes.io/managed-by: Helm
+      namespace: "{{ .Release.Namespace }}"
+    spec:
+      selector:
+        matchLabels:
+          app.kubernetes.io/name: "neon-proxy"
+      endpoints:
+        - port: http
+          path: /metrics
+          interval: 10s
+          scrapeTimeout: 10s
+      namespaceSelector:
+        matchNames:
+          - "{{ .Release.Namespace }}"
diff --git a/.github/workflows/deploy-dev.yml b/.github/workflows/deploy-dev.yml
index b080a29f7c..fba292f0f9 100644
--- a/.github/workflows/deploy-dev.yml
+++ b/.github/workflows/deploy-dev.yml
@@ -48,7 +48,8 @@ jobs:
         shell: bash
     strategy:
       matrix:
-        target_region: [ eu-west-1, us-east-2 ]
+        # TODO(sergey): Fix storage deploy in eu-central-1
+        target_region: [ eu-west-1, us-east-2]
     environment:
       name: dev-${{ matrix.target_region }}
     steps:
@@ -133,6 +134,53 @@ jobs:
   
       - name: Cleanup helm folder
         run: rm -rf ~/.cache
+
+  deploy-preview-proxy-new:
+    runs-on: [ self-hosted, gen3, small ]
+    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
+    if: inputs.deployProxy
+    defaults:
+      run:
+        shell: bash
+    strategy:
+      matrix:
+        include:
+          - target_region:  eu-central-1
+            target_cluster: dev-eu-central-1-alpha
+    environment:
+      name: dev-${{ matrix.target_region }}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+        with:
+          submodules: true
+          fetch-depth: 0
+          ref: ${{ inputs.branch }}
+  
+      - name: Configure AWS Credentials
+        uses: aws-actions/configure-aws-credentials@v1-node16
+        with:
+          role-to-assume: arn:aws:iam::369495373322:role/github-runner
+          aws-region: eu-central-1
+          role-skip-session-tagging: true
+          role-duration-seconds: 1800
+  
+      - name: Configure environment
+        run: |
+          helm repo add neondatabase https://neondatabase.github.io/helm-charts
+          aws --region ${{ matrix.target_region }} eks update-kubeconfig --name  ${{ matrix.target_cluster }}
+  
+      - name: Re-deploy preview proxies
+        run: |
+          DOCKER_TAG=${{ inputs.dockerTag }}
+          for PREVIEW_NAME in helium argon krypton xenon radon oganesson hydrogen nitrogen oxygen fluorine chlorine; do
+            export PREVIEW_NAME
+            envsubst <.github/helm-values/preview-template.neon-proxy-scram.yaml >preview-${PREVIEW_NAME}.neon-proxy-scram.yaml
+            helm upgrade neon-proxy-scram-${PREVIEW_NAME} neondatabase/neon-proxy --namespace neon-proxy-${PREVIEW_NAME} --create-namespace --install --atomic -f preview-${PREVIEW_NAME}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
+          done
+
+      - name: Cleanup helm folder
+        run: rm -rf ~/.cache
   
   deploy-storage-broker-new:
     runs-on: [ self-hosted, gen3, small ]
@@ -148,6 +196,8 @@ jobs:
             target_cluster: dev-us-east-2-beta
           - target_region:  eu-west-1
             target_cluster: dev-eu-west-1-zeta
+          - target_region:  eu-central-1
+            target_cluster: dev-central-1-alpha
     environment:
       name: dev-${{ matrix.target_region }}
     steps:

From 7f80230fd21cacfb20cae09befc7725abb9c0efe Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Tue, 25 Apr 2023 18:07:04 +0300
Subject: [PATCH 09/31] fix: stop dead_code rustc lint (#4070)

only happens without `--all-features` which is what `./run_clippy.sh`
uses.
---
 pageserver/src/http/routes.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 3318e5263c..b1251123b2 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -1201,6 +1201,7 @@ async fn handler_404(_: Request<Body>) -> Result<Response<Body>, ApiError> {
     )
 }
 
+#[cfg(feature = "testing")]
 async fn post_tracing_event_handler(mut r: Request<Body>) -> Result<Response<Body>, ApiError> {
     #[derive(Debug, serde::Deserialize)]
     #[serde(rename_all = "lowercase")]

From bfd45dd6713a2e9038954cf0368b1a082937b045 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Tue, 25 Apr 2023 18:41:09 +0300
Subject: [PATCH 10/31] test_tenant_config: allow ERROR from eviction task
 (#4074)

---
 test_runner/regress/test_tenant_conf.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/test_runner/regress/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py
index b83bd5fc99..8677a554f7 100644
--- a/test_runner/regress/test_tenant_conf.py
+++ b/test_runner/regress/test_tenant_conf.py
@@ -27,6 +27,8 @@ eviction_policy = { "kind" = "LayerAccessThreshold", period = "20s", threshold =
 """
 
     env = neon_env_builder.init_start()
+    # we configure eviction but no remote storage, there might be error lines
+    env.pageserver.allowed_errors.append(".* no remote storage configured, cannot evict layers .*")
     http_client = env.pageserver.http_client()
 
     # Check that we raise on misspelled configs

From 05ac0e2493ce18992aab525c5c7419b954c1649a Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Tue, 25 Apr 2023 17:54:10 +0100
Subject: [PATCH 11/31] Login to ECR and Docker Hub at once (#4067)

- Update kaniko to 1.9.2 (from 1.7.0), problem with reproducible build is fixed
- Login to ECR and Docker Hub at once, so we can push to several
registries, it makes job `push-docker-hub` unneeded
- `push-docker-hub` replaced with `promote-images` in `needs:` clause,
Pushing images to production ECR moved to `promote-images` job
---
 .github/workflows/build_and_test.yml | 216 ++++++++++++++-------------
 1 file changed, 115 insertions(+), 101 deletions(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 3212b76731..bdcf2463bc 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -541,7 +541,7 @@ jobs:
     container:
       image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
       options: --init
-    needs: [ push-docker-hub, tag ]
+    needs: [ promote-images, tag ]
     steps:
       - name: Set PR's status to pending and request a remote CI test
         run: |
@@ -584,8 +584,7 @@ jobs:
   neon-image:
     runs-on: [ self-hosted, gen3, large ]
     needs: [ tag ]
-    # https://github.com/GoogleContainerTools/kaniko/issues/2005
-    container: gcr.io/kaniko-project/executor:v1.7.0-debug
+    container: gcr.io/kaniko-project/executor:v1.9.2-debug
     defaults:
       run:
         shell: sh -eu {0}
@@ -597,11 +596,32 @@ jobs:
           submodules: true
           fetch-depth: 0
 
-      - name: Configure ECR login
-        run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
+      - name: Configure ECR and Docker Hub login
+        run: |
+          DOCKERHUB_AUTH=$(echo -n "${{ secrets.NEON_DOCKERHUB_USERNAME }}:${{ secrets.NEON_DOCKERHUB_PASSWORD }}" | base64)
+          echo "::add-mask::${DOCKERHUB_AUTH}"
+
+          cat <<-EOF > /kaniko/.docker/config.json
+            {
+              "auths": {
+                "https://index.docker.io/v1/": {
+                  "auth": "${DOCKERHUB_AUTH}"
+                }
+              },
+              "credHelpers": {
+                "369495373322.dkr.ecr.eu-central-1.amazonaws.com": "ecr-login"
+              }
+            }
+          EOF
 
       - name: Kaniko build neon
-        run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
+        run:
+          /kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true
+                           --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
+                           --context .
+                           --build-arg GIT_VERSION=${{ github.sha }}
+                           --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
+                           --destination neondatabase/neon:${{needs.tag.outputs.build-tag}}
 
       # Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied
       - name: Cleanup ECR folder
@@ -652,7 +672,7 @@ jobs:
   compute-tools-image:
     runs-on: [ self-hosted, gen3, large ]
     needs: [ tag ]
-    container: gcr.io/kaniko-project/executor:v1.7.0-debug
+    container: gcr.io/kaniko-project/executor:v1.9.2-debug
     defaults:
       run:
         shell: sh -eu {0}
@@ -661,18 +681,41 @@ jobs:
       - name: Checkout
         uses: actions/checkout@v1 # v3 won't work with kaniko
 
-      - name: Configure ECR login
-        run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
+      - name: Configure ECR and Docker Hub login
+        run: |
+          DOCKERHUB_AUTH=$(echo -n "${{ secrets.NEON_DOCKERHUB_USERNAME }}:${{ secrets.NEON_DOCKERHUB_PASSWORD }}" | base64)
+          echo "::add-mask::${DOCKERHUB_AUTH}"
+
+          cat <<-EOF > /kaniko/.docker/config.json
+            {
+              "auths": {
+                "https://index.docker.io/v1/": {
+                  "auth": "${DOCKERHUB_AUTH}"
+                }
+              },
+              "credHelpers": {
+                "369495373322.dkr.ecr.eu-central-1.amazonaws.com": "ecr-login"
+              }
+            }
+          EOF
 
       - name: Kaniko build compute tools
-        run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-tools --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}}
+        run:
+          /kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true
+                           --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
+                           --context .
+                           --build-arg GIT_VERSION=${{ github.sha }}
+                           --dockerfile Dockerfile.compute-tools
+                           --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}}
+                           --destination neondatabase/compute-tools:${{needs.tag.outputs.build-tag}}
 
+      # Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied
       - name: Cleanup ECR folder
         run: rm -rf ~/.ecr
 
   compute-node-image:
     runs-on: [ self-hosted, gen3, large ]
-    container: gcr.io/kaniko-project/executor:v1.7.0-debug
+    container: gcr.io/kaniko-project/executor:v1.9.2-debug
     needs: [ tag ]
     strategy:
       fail-fast: false
@@ -689,12 +732,36 @@ jobs:
           submodules: true
           fetch-depth: 0
 
-      - name: Configure ECR login
-        run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
+      - name: Configure ECR and Docker Hub login
+        run: |
+          DOCKERHUB_AUTH=$(echo -n "${{ secrets.NEON_DOCKERHUB_USERNAME }}:${{ secrets.NEON_DOCKERHUB_PASSWORD }}" | base64)
+          echo "::add-mask::${DOCKERHUB_AUTH}"
+
+          cat <<-EOF > /kaniko/.docker/config.json
+            {
+              "auths": {
+                "https://index.docker.io/v1/": {
+                  "auth": "${DOCKERHUB_AUTH}"
+                }
+              },
+              "credHelpers": {
+                "369495373322.dkr.ecr.eu-central-1.amazonaws.com": "ecr-login"
+              }
+            }
+          EOF
 
       - name: Kaniko build compute node with extensions
-        run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache  --context . --build-arg GIT_VERSION=${{ github.sha }} --build-arg PG_VERSION=${{ matrix.version }} --dockerfile Dockerfile.compute-node --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
+        run:
+          /kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true
+                           --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
+                           --context .
+                           --build-arg GIT_VERSION=${{ github.sha }}
+                           --build-arg PG_VERSION=${{ matrix.version }}
+                           --dockerfile Dockerfile.compute-node
+                           --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
+                           --destination neondatabase/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
 
+      # Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied
       - name: Cleanup ECR folder
         run: rm -rf ~/.ecr
 
@@ -786,26 +853,45 @@ jobs:
     runs-on: [ self-hosted, gen3, small ]
     needs: [ tag, test-images, vm-compute-node-image ]
     container: golang:1.19-bullseye
-    if: github.event_name != 'workflow_dispatch'
+    # Don't add if-condition here.
+    # The job should always be run because we have dependant other jobs that shouldn't be skipped
 
     steps:
       - name: Install Crane & ECR helper
         if: |
           (github.ref_name == 'main' || github.ref_name == 'release') &&
-          github.event_name != 'workflow_dispatch'
+           github.event_name != 'workflow_dispatch'
         run: |
           go install github.com/google/go-containerregistry/cmd/crane@31786c6cbb82d6ec4fb8eb79cd9387905130534e # v0.11.0
           go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@69c85dc22db6511932bbf119e1a0cc5c90c69a7f # v0.6.0
 
-      - name: Configure ECR login
+      - name: Configure ECR and Docker Hub login
+        if: |
+          (github.ref_name == 'main' || github.ref_name == 'release') &&
+           github.event_name != 'workflow_dispatch'
         run: |
+          DOCKERHUB_AUTH=$(echo -n "${{ secrets.NEON_DOCKERHUB_USERNAME }}:${{ secrets.NEON_DOCKERHUB_PASSWORD }}" | base64)
+          echo "::add-mask::${DOCKERHUB_AUTH}"
+
           mkdir /github/home/.docker/
-          echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json
+          cat <<-EOF > /github/home/.docker/config.json
+            {
+              "auths": {
+                "https://index.docker.io/v1/": {
+                  "auth": "${DOCKERHUB_AUTH}"
+                }
+              },
+              "credHelpers": {
+                "369495373322.dkr.ecr.eu-central-1.amazonaws.com": "ecr-login",
+                "093970136003.dkr.ecr.eu-central-1.amazonaws.com": "ecr-login"
+              }
+            }
+          EOF
 
       - name: Add latest tag to images
         if: |
           (github.ref_name == 'main' || github.ref_name == 'release') &&
-          github.event_name != 'workflow_dispatch'
+           github.event_name != 'workflow_dispatch'
         run: |
           crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} latest
           crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} latest
@@ -814,50 +900,17 @@ jobs:
           crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
           crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest
 
-      - name: Cleanup ECR folder
-        run: rm -rf ~/.ecr
-
-  push-docker-hub:
-    runs-on: [ self-hosted, dev, x64 ]
-    needs: [ promote-images, tag ]
-    container: golang:1.19-bullseye
-
-    steps:
-      - name: Install Crane & ECR helper
-        run: |
-          go install github.com/google/go-containerregistry/cmd/crane@31786c6cbb82d6ec4fb8eb79cd9387905130534e # v0.11.0
-          go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@69c85dc22db6511932bbf119e1a0cc5c90c69a7f # v0.6.0
-
-      - name: Configure ECR login
-        run: |
-          mkdir /github/home/.docker/
-          echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json
-
-      - name: Pull neon image from ECR
-        run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} neon
-
-      - name: Pull compute tools image from ECR
-        run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} compute-tools
-
-      - name: Pull compute node v14 image from ECR
-        run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} compute-node-v14
-
-      - name: Pull vm compute node v14 image from ECR
-        run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} vm-compute-node-v14
-
-      - name: Pull compute node v15 image from ECR
-        run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} compute-node-v15
-
-      - name: Pull vm compute node v15 image from ECR
-        run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} vm-compute-node-v15
-
-      - name: Pull rust image from ECR
-        run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned rust
+          crane tag neondatabase/neon:${{needs.tag.outputs.build-tag}} latest
+          crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest
+          crane tag neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}} latest
+          crane tag neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest
+          crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
+          crane tag neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest
 
       - name: Push images to production ECR
         if: |
           (github.ref_name == 'main' || github.ref_name == 'release') &&
-          github.event_name != 'workflow_dispatch'
+           github.event_name != 'workflow_dispatch'
         run: |
           crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/neon:latest
           crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest
@@ -866,45 +919,6 @@ jobs:
           crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:latest
           crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:latest
 
-      - name: Configure Docker Hub login
-        run: |
-          # ECR Credential Helper & Docker Hub don't work together in config, hence reset
-          echo "" > /github/home/.docker/config.json
-          crane auth login -u ${{ secrets.NEON_DOCKERHUB_USERNAME }} -p ${{ secrets.NEON_DOCKERHUB_PASSWORD }} index.docker.io
-
-      - name: Push neon image to Docker Hub
-        run: crane push neon neondatabase/neon:${{needs.tag.outputs.build-tag}}
-
-      - name: Push compute tools image to Docker Hub
-        run: crane push compute-tools neondatabase/compute-tools:${{needs.tag.outputs.build-tag}}
-
-      - name: Push compute node v14 image to Docker Hub
-        run: crane push compute-node-v14 neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}}
-
-      - name: Push vm compute node v14 image to Docker Hub
-        run: crane push vm-compute-node-v14 neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}}
-
-      - name: Push compute node v15 image to Docker Hub
-        run: crane push compute-node-v15 neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}}
-
-      - name: Push vm compute node v15 image to Docker Hub
-        run: crane push vm-compute-node-v15 neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}}
-
-      - name: Push rust image to Docker Hub
-        run: crane push rust neondatabase/rust:pinned
-
-      - name: Add latest tag to images in Docker Hub
-        if: |
-          (github.ref_name == 'main' || github.ref_name == 'release') &&
-          github.event_name != 'workflow_dispatch'
-        run: |
-          crane tag neondatabase/neon:${{needs.tag.outputs.build-tag}} latest
-          crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest
-          crane tag neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}} latest
-          crane tag neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest
-          crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
-          crane tag neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest
-
       - name: Cleanup ECR folder
         run: rm -rf ~/.ecr
 
@@ -913,7 +927,7 @@ jobs:
     container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
     # We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version.
     # If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly
-    needs: [ push-docker-hub, tag, regress-tests ]
+    needs: [ promote-images, tag, regress-tests ]
     if: |
       contains(github.event.pull_request.labels.*.name, 'deploy-test-storage') &&
       github.event_name != 'workflow_dispatch'
@@ -947,7 +961,7 @@ jobs:
   deploy:
     runs-on: [ self-hosted, gen3, small ]
     container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
-    needs: [ push-docker-hub, tag, regress-tests ]
+    needs: [ promote-images, tag, regress-tests ]
     if: ( github.ref_name == 'main' || github.ref_name == 'release' ) && github.event_name != 'workflow_dispatch'
     steps:
       - name: Fix git ownership
@@ -984,7 +998,7 @@ jobs:
     container:
       image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
       options: --init
-    needs: [ push-docker-hub, tag, regress-tests ]
+    needs: [ promote-images, tag, regress-tests ]
     if: github.ref_name == 'release' && github.event_name != 'workflow_dispatch'
     steps:
       - name: Promote compatibility snapshot for the release

From 8945fbdb31d9d28aa88194153b56eee6e4a39605 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Tue, 25 Apr 2023 20:45:36 +0300
Subject: [PATCH 12/31] Enable OpenTelemetry tracing in proxy in staging.
 (#4065)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Depends on https://github.com/neondatabase/helm-charts/pull/32

Co-authored-by: Lassi Pölönen <lassi.polonen@iki.fi>
---
 .github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml     | 1 +
 .github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml      | 1 +
 .../helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml  | 1 +
 .github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml     | 1 +
 4 files changed, 4 insertions(+)

diff --git a/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml b/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml
index a8567665d3..a7d8587ec2 100644
--- a/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml
+++ b/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml
@@ -23,6 +23,7 @@ settings:
   authBackend: "console"
   authEndpoint: "http://neon-internal-api.aws.neon.build/management/api/v2"
   domain: "*.eu-west-1.aws.neon.build"
+  otelExporterOtlpEndpoint: "https://otel-collector.zeta.eu-west-1.internal.aws.neon.build"
   sentryEnvironment: "staging"
   wssPort: 8443
   metricCollectionEndpoint: "http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events"
diff --git a/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml b/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml
index feca05aff6..893e0fab10 100644
--- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml
+++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml
@@ -9,6 +9,7 @@ settings:
   authEndpoint: "https://console.stage.neon.tech/authenticate_proxy_request/"
   uri: "https://console.stage.neon.tech/psql_session/"
   domain: "pg.neon.build"
+  otelExporterOtlpEndpoint: "https://otel-collector.beta.us-east-2.internal.aws.neon.build"
   sentryEnvironment: "staging"
   metricCollectionEndpoint: "http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events"
   metricCollectionInterval: "1min"
diff --git a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml
index 46cfdd2e69..77f6cf080e 100644
--- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml
+++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml
@@ -24,6 +24,7 @@ settings:
   authBackend: "console"
   authEndpoint: "http://neon-internal-api.aws.neon.build/management/api/v2"
   domain: "*.cloud.stage.neon.tech"
+  otelExporterOtlpEndpoint: "https://otel-collector.beta.us-east-2.internal.aws.neon.build"
   sentryEnvironment: "staging"
   wssPort: 8443
   metricCollectionEndpoint: "http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events"
diff --git a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml
index fdd869c122..2510d624cd 100644
--- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml
+++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml
@@ -25,6 +25,7 @@ settings:
   authEndpoint: "http://neon-internal-api.aws.neon.build/management/api/v2"
   domain: "*.us-east-2.aws.neon.build"
   extraDomains: ["*.us-east-2.postgres.zenith.tech", "*.us-east-2.retooldb-staging.com"]
+  otelExporterOtlpEndpoint: "https://otel-collector.beta.us-east-2.internal.aws.neon.build"
   sentryEnvironment: "staging"
   wssPort: 8443
   metricCollectionEndpoint: "http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events"

From 2d6fd72177c89645b2b718880796a6e04ff4ebfa Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Tue, 25 Apr 2023 23:58:59 +0100
Subject: [PATCH 13/31] GitHub Workflows: Fix crane for several registries
 (#4076)

Follow-up fix after https://github.com/neondatabase/neon/pull/4067

```
+ crane tag neondatabase/vm-compute-node-v14:3064 latest
Error: fetching "neondatabase/vm-compute-node-v14:3064": GET https://index.docker.io/v2/neondatabase/vm-compute-node-v14/manifests/3064: MANIFEST_UNKNOWN: manifest unknown; unknown tag=3064
```

I reverted back the previous approach for promoting images
(login to one registry, save images to local fs, logout and login to
another registry, and push images from local fs). It turns out what
works for one Google project (kaniko), doesn't work for another (crane)
[sigh]
---
 .github/workflows/build_and_test.yml | 60 ++++++++++++++--------------
 1 file changed, 30 insertions(+), 30 deletions(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index bdcf2463bc..15a6a611b1 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -858,35 +858,19 @@ jobs:
 
     steps:
       - name: Install Crane & ECR helper
-        if: |
-          (github.ref_name == 'main' || github.ref_name == 'release') &&
-           github.event_name != 'workflow_dispatch'
         run: |
           go install github.com/google/go-containerregistry/cmd/crane@31786c6cbb82d6ec4fb8eb79cd9387905130534e # v0.11.0
           go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@69c85dc22db6511932bbf119e1a0cc5c90c69a7f # v0.6.0
 
-      - name: Configure ECR and Docker Hub login
-        if: |
-          (github.ref_name == 'main' || github.ref_name == 'release') &&
-           github.event_name != 'workflow_dispatch'
+      - name: Configure ECR login
         run: |
-          DOCKERHUB_AUTH=$(echo -n "${{ secrets.NEON_DOCKERHUB_USERNAME }}:${{ secrets.NEON_DOCKERHUB_PASSWORD }}" | base64)
-          echo "::add-mask::${DOCKERHUB_AUTH}"
-
           mkdir /github/home/.docker/
-          cat <<-EOF > /github/home/.docker/config.json
-            {
-              "auths": {
-                "https://index.docker.io/v1/": {
-                  "auth": "${DOCKERHUB_AUTH}"
-                }
-              },
-              "credHelpers": {
-                "369495373322.dkr.ecr.eu-central-1.amazonaws.com": "ecr-login",
-                "093970136003.dkr.ecr.eu-central-1.amazonaws.com": "ecr-login"
-              }
-            }
-          EOF
+          echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json
+
+      - name: Copy vm-compute-node images to Docker Hub
+        run: |
+          crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} vm-compute-node-v14
+          crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} vm-compute-node-v15
 
       - name: Add latest tag to images
         if: |
@@ -900,13 +884,6 @@ jobs:
           crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
           crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest
 
-          crane tag neondatabase/neon:${{needs.tag.outputs.build-tag}} latest
-          crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest
-          crane tag neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}} latest
-          crane tag neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest
-          crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
-          crane tag neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest
-
       - name: Push images to production ECR
         if: |
           (github.ref_name == 'main' || github.ref_name == 'release') &&
@@ -919,6 +896,29 @@ jobs:
           crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:latest
           crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:latest
 
+      - name: Configure Docker Hub login
+        run: |
+          # ECR Credential Helper & Docker Hub don't work together in config, hence reset
+          echo "" > /github/home/.docker/config.json
+          crane auth login -u ${{ secrets.NEON_DOCKERHUB_USERNAME }} -p ${{ secrets.NEON_DOCKERHUB_PASSWORD }} index.docker.io
+
+      - name: Push vm-compute-node to Docker Hub
+        run: |
+          crane push vm-compute-node-v14 neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}}
+          crane push vm-compute-node-v15 neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}}
+
+      - name: Push latest tags to Docker Hub
+        if: |
+          (github.ref_name == 'main' || github.ref_name == 'release') &&
+          github.event_name != 'workflow_dispatch'
+        run: |
+          crane tag neondatabase/neon:${{needs.tag.outputs.build-tag}} latest
+          crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest
+          crane tag neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}} latest
+          crane tag neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest
+          crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
+          crane tag neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest
+
       - name: Cleanup ECR folder
         run: rm -rf ~/.ecr
 

From 9d0cf08d5f26ba63691335f7169409454e3e608f Mon Sep 17 00:00:00 2001
From: Sergey Melnikov <sergey@neon.tech>
Date: Wed, 26 Apr 2023 09:29:44 +0200
Subject: [PATCH 14/31] Fix new storage-broker deploy for eu-central-1 (#4079)

---
 .github/workflows/deploy-dev.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/deploy-dev.yml b/.github/workflows/deploy-dev.yml
index fba292f0f9..5d1c6e0e16 100644
--- a/.github/workflows/deploy-dev.yml
+++ b/.github/workflows/deploy-dev.yml
@@ -197,7 +197,7 @@ jobs:
           - target_region:  eu-west-1
             target_cluster: dev-eu-west-1-zeta
           - target_region:  eu-central-1
-            target_cluster: dev-central-1-alpha
+            target_cluster: dev-eu-central-1-alpha
     environment:
       name: dev-${{ matrix.target_region }}
     steps:

From f19b70b379f426bc48fe692f368dab94f4a6af25 Mon Sep 17 00:00:00 2001
From: Sergey Melnikov <sergey@neon.tech>
Date: Wed, 26 Apr 2023 09:36:26 +0200
Subject: [PATCH 15/31] Configure extra domain for us-east-1 (#4078)

---
 .../helm-values/prod-us-east-1-theta.neon-proxy-scram.yaml    | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/helm-values/prod-us-east-1-theta.neon-proxy-scram.yaml b/.github/helm-values/prod-us-east-1-theta.neon-proxy-scram.yaml
index f113d1f861..1c7e646810 100644
--- a/.github/helm-values/prod-us-east-1-theta.neon-proxy-scram.yaml
+++ b/.github/helm-values/prod-us-east-1-theta.neon-proxy-scram.yaml
@@ -23,8 +23,8 @@ settings:
   authBackend: "console"
   authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2"
   domain: "*.us-east-1.aws.neon.tech"
-  # These domains haven't been delegated yet.
-  # extraDomains: ["*.us-east-1.retooldb.com", "*.us-east-1.postgres.vercel-storage.com"]
+  # *.us-east-1.retooldb.com hasn't been delegated yet.
+  extraDomains: ["*.us-east-1.postgres.vercel-storage.com"]
   sentryEnvironment: "production"
   wssPort: 8443
   metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events"

From 850f6b1cb9baae004a879027d91858237546c56f Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Wed, 26 Apr 2023 11:49:29 +0300
Subject: [PATCH 16/31] refactor: drop pageserver_ondisk_layers (#4071)

I didn't get through #3775 fast enough so we wanted to remove this
metric.

Fixes #3705.
---
 pageserver/src/metrics.rs          | 11 +++--------
 pageserver/src/tenant/layer_map.rs |  4 ----
 2 files changed, 3 insertions(+), 12 deletions(-)

diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index cf60a1a404..d6978a8cf6 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -1,9 +1,9 @@
 use metrics::core::{AtomicU64, GenericCounter};
 use metrics::{
     register_counter_vec, register_histogram, register_histogram_vec, register_int_counter,
-    register_int_counter_vec, register_int_gauge, register_int_gauge_vec, register_uint_gauge_vec,
-    Counter, CounterVec, Histogram, HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec,
-    UIntGauge, UIntGaugeVec,
+    register_int_counter_vec, register_int_gauge_vec, register_uint_gauge_vec, Counter, CounterVec,
+    Histogram, HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge,
+    UIntGaugeVec,
 };
 use once_cell::sync::Lazy;
 use pageserver_api::models::TenantState;
@@ -350,11 +350,6 @@ pub static LIVE_CONNECTIONS_COUNT: Lazy<IntGaugeVec> = Lazy::new(|| {
     .expect("failed to define a metric")
 });
 
-pub static NUM_ONDISK_LAYERS: Lazy<IntGauge> = Lazy::new(|| {
-    register_int_gauge!("pageserver_ondisk_layers", "Number of layers on-disk")
-        .expect("failed to define a metric")
-});
-
 // remote storage metrics
 
 /// NB: increment _after_ recording the current value into [`REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST`].
diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs
index 02159ee291..0ee0c6f77d 100644
--- a/pageserver/src/tenant/layer_map.rs
+++ b/pageserver/src/tenant/layer_map.rs
@@ -48,7 +48,6 @@ mod layer_coverage;
 
 use crate::context::RequestContext;
 use crate::keyspace::KeyPartitioning;
-use crate::metrics::NUM_ONDISK_LAYERS;
 use crate::repository::Key;
 use crate::tenant::storage_layer::InMemoryLayer;
 use crate::tenant::storage_layer::Layer;
@@ -288,7 +287,6 @@ where
             self.l0_delta_layers.push(layer);
         }
 
-        NUM_ONDISK_LAYERS.inc();
         Ok(())
     }
 
@@ -314,8 +312,6 @@ where
                 "failed to locate removed historic layer from l0_delta_layers"
             );
         }
-
-        NUM_ONDISK_LAYERS.dec();
     }
 
     pub(self) fn replace_historic_noflush(

From 4625da316447a6bf8e345fefbfd30f860fb51074 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Wed, 26 Apr 2023 12:07:45 +0300
Subject: [PATCH 17/31] build: remove busted sk-1.us-east-2 from staging hosts
 (#4082)

this should give us complete deployments while a new one is being
brought up.
---
 .github/ansible/staging.us-east-2.hosts.yaml | 2 --
 1 file changed, 2 deletions(-)

diff --git a/.github/ansible/staging.us-east-2.hosts.yaml b/.github/ansible/staging.us-east-2.hosts.yaml
index e63ed6e639..dacc5567c3 100644
--- a/.github/ansible/staging.us-east-2.hosts.yaml
+++ b/.github/ansible/staging.us-east-2.hosts.yaml
@@ -48,8 +48,6 @@ storage:
       hosts:
         safekeeper-0.us-east-2.aws.neon.build:
           ansible_host: i-027662bd552bf5db0
-        safekeeper-1.us-east-2.aws.neon.build:
-          ansible_host: i-0171efc3604a7b907
         safekeeper-2.us-east-2.aws.neon.build:
           ansible_host: i-0de0b03a51676a6ce
         safekeeper-99.us-east-2.aws.neon.build:

From 381c8fca4f1d700ad5118800e6b2b3f9e33a07b5 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Wed, 26 Apr 2023 12:39:17 +0300
Subject: [PATCH 18/31] feat: log how long tenant activation takes (#4080)

Adds just a counter counting up from the creation to the tenant, logged
after activation. Might help guide us with the investigation of #4025.
---
 pageserver/src/tenant.rs | 33 +++++++++++++++++++++++++++++++--
 1 file changed, 31 insertions(+), 2 deletions(-)

diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 11415b47c4..b5966b4618 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -118,6 +118,10 @@ pub struct Tenant {
     // Global pageserver config parameters
     pub conf: &'static PageServerConf,
 
+    /// The value creation timestamp, used to measure activation delay, see:
+    /// <https://github.com/neondatabase/neon/issues/4025>
+    loading_started_at: Instant,
+
     state: watch::Sender<TenantState>,
 
     // Overridden tenant-specific config parameters.
@@ -1476,7 +1480,7 @@ impl Tenant {
                 TenantState::Loading | TenantState::Attaching => {
                     *current_state = TenantState::Active;
 
-                    info!("Activating tenant {}", self.tenant_id);
+                    debug!(tenant_id = %self.tenant_id, "Activating tenant");
 
                     let timelines_accessor = self.timelines.lock().unwrap();
                     let not_broken_timelines = timelines_accessor
@@ -1487,12 +1491,17 @@ impl Tenant {
                     // down when they notice that the tenant is inactive.
                     tasks::start_background_loops(self.tenant_id);
 
+                    let mut activated_timelines = 0;
+                    let mut timelines_broken_during_activation = 0;
+
                     for timeline in not_broken_timelines {
                         match timeline
                             .activate(ctx)
                             .context("timeline activation for activating tenant")
                         {
-                            Ok(()) => {}
+                            Ok(()) => {
+                                activated_timelines += 1;
+                            }
                             Err(e) => {
                                 error!(
                                     "Failed to activate timeline {}: {:#}",
@@ -1503,9 +1512,26 @@ impl Tenant {
                                     "failed to activate timeline {}: {}",
                                     timeline.timeline_id, e
                                 ));
+
+                                timelines_broken_during_activation += 1;
                             }
                         }
                     }
+
+                    let elapsed = self.loading_started_at.elapsed();
+                    let total_timelines = timelines_accessor.len();
+
+                    // log a lot of stuff, because some tenants sometimes suffer from user-visible
+                    // times to activate. see https://github.com/neondatabase/neon/issues/4025
+                    info!(
+                        since_creation_millis = elapsed.as_millis(),
+                        tenant_id = %self.tenant_id,
+                        activated_timelines,
+                        timelines_broken_during_activation,
+                        total_timelines,
+                        post_state = <&'static str>::from(&*current_state),
+                        "activation attempt finished"
+                    );
                 }
             }
         });
@@ -1812,6 +1838,9 @@ impl Tenant {
         Tenant {
             tenant_id,
             conf,
+            // using now here is good enough approximation to catch tenants with really long
+            // activation times.
+            loading_started_at: Instant::now(),
             tenant_conf: Arc::new(RwLock::new(tenant_conf)),
             timelines: Mutex::new(HashMap::new()),
             gc_cs: tokio::sync::Mutex::new(()),

From 31a3910fd9b60043651380d58771f97558f10771 Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Fri, 14 Apr 2023 12:59:08 +0400
Subject: [PATCH 19/31] Remove wait_for_sk_commit_lsn_to_reach_remote_storage.

It had a couple of inherent races:

1) Even if compute is killed before the call, some more data might still arrive
to safekeepers after commit_lsn on them is polled, advancing it. Then checkpoint
on pageserver might not include this tail, and so upload of expected LSN won't
happen until one more checkpoint.

2) commit_lsn is updated asynchronously -- compute can commit transaction before
communicating commit_lsn to even single safekeeper (sync-safekeepers can be used
to forces the advancement). This makes semantics of
wait_for_sk_commit_lsn_to_reach_remote_storage quite complicated.

Replace it with last_flush_lsn_upload which
1) Learns last flush LSN on compute;
2) Waits for it to arrive to pageserver;
3) Checkpoints it;
4) Waits for the upload.

In some tests this keeps compute alive longer than before, but this doesn't seem
to be important.

There is a chance this fixes https://github.com/neondatabase/neon/issues/3209
---
 test_runner/fixtures/neon_fixtures.py         | 40 ++++++-------------
 test_runner/fixtures/pageserver/utils.py      |  5 +--
 test_runner/regress/test_layer_eviction.py    |  9 ++---
 test_runner/regress/test_ondemand_download.py | 11 ++---
 .../test_tenants_with_remote_storage.py       |  7 +---
 5 files changed, 23 insertions(+), 49 deletions(-)

diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index c6610ba062..f209dca560 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -2928,32 +2928,18 @@ def fork_at_current_lsn(
     return env.neon_cli.create_branch(new_branch_name, ancestor_branch_name, tenant_id, current_lsn)
 
 
-def wait_for_sk_commit_lsn_to_arrive_at_pageserver_last_record_lsn(
-    tenant_id: TenantId,
-    timeline_id: TimelineId,
-    safekeepers: List[Safekeeper],
-    pageserver: NeonPageserver,
-):
-    sk_commit_lsns = [
-        sk.http_client().timeline_status(tenant_id, timeline_id).commit_lsn for sk in safekeepers
-    ]
-    lsn = max(sk_commit_lsns)
-    ps_http = pageserver.http_client()
-    wait_for_last_record_lsn(ps_http, tenant_id, timeline_id, lsn)
-    return lsn
-
-
-def wait_for_sk_commit_lsn_to_reach_remote_storage(
-    tenant_id: TenantId,
-    timeline_id: TimelineId,
-    safekeepers: List[Safekeeper],
-    pageserver: NeonPageserver,
-):
-    lsn = wait_for_sk_commit_lsn_to_arrive_at_pageserver_last_record_lsn(
-        tenant_id, timeline_id, safekeepers, pageserver
-    )
-    ps_http = pageserver.http_client()
+def last_flush_lsn_upload(
+    env: NeonEnv, endpoint: Endpoint, tenant_id: TenantId, timeline_id: TimelineId
+) -> Lsn:
+    """
+    Wait for pageserver to catch to the latest flush LSN of given endpoint,
+    checkpoint pageserver, and wait for it to be uploaded (remote_consistent_lsn
+    reaching flush LSN).
+    """
+    last_flush_lsn = wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
+    ps_http = env.pageserver.http_client()
+    wait_for_last_record_lsn(ps_http, tenant_id, timeline_id, last_flush_lsn)
     # force a checkpoint to trigger upload
     ps_http.timeline_checkpoint(tenant_id, timeline_id)
-    wait_for_upload(ps_http, tenant_id, timeline_id, lsn)
-    return lsn
+    wait_for_upload(ps_http, tenant_id, timeline_id, last_flush_lsn)
+    return last_flush_lsn
diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py
index c060fc8dea..7f8bb40bda 100644
--- a/test_runner/fixtures/pageserver/utils.py
+++ b/test_runner/fixtures/pageserver/utils.py
@@ -54,10 +54,9 @@ def wait_for_upload(
         if current_lsn >= lsn:
             log.info("wait finished")
             return
+        lr_lsn = last_record_lsn(pageserver_http, tenant, timeline)
         log.info(
-            "waiting for remote_consistent_lsn to reach {}, now {}, iteration {}".format(
-                lsn, current_lsn, i + 1
-            )
+            f"waiting for remote_consistent_lsn to reach {lsn}, now {current_lsn}, last_record_lsn={lr_lsn}, iteration {i + 1}"
         )
         time.sleep(1)
     raise Exception(
diff --git a/test_runner/regress/test_layer_eviction.py b/test_runner/regress/test_layer_eviction.py
index 1ae32fb398..a96532c0d8 100644
--- a/test_runner/regress/test_layer_eviction.py
+++ b/test_runner/regress/test_layer_eviction.py
@@ -6,7 +6,6 @@ from fixtures.neon_fixtures import (
     NeonEnvBuilder,
     RemoteStorageKind,
     wait_for_last_flush_lsn,
-    wait_for_sk_commit_lsn_to_reach_remote_storage,
 )
 from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload
 from fixtures.types import Lsn, TenantId, TimelineId
@@ -199,7 +198,7 @@ def test_gc_of_remote_layers(neon_env_builder: NeonEnvBuilder):
                     # with image_creation_threshold=1 which we will use on the last compaction
                     cur.execute("vacuum")
 
-                wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
+                last_lsn = wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
 
                 if i == 1 and j == 2 and k == 1:
                     # last iteration; stop before checkpoint to avoid leaving an inmemory layer
@@ -222,10 +221,8 @@ def test_gc_of_remote_layers(neon_env_builder: NeonEnvBuilder):
         tenant_update_config({"image_creation_threshold": "1"})
         ps_http.timeline_compact(tenant_id, timeline_id)
 
-    # wait for all uploads to finish
-    wait_for_sk_commit_lsn_to_reach_remote_storage(
-        tenant_id, timeline_id, env.safekeepers, env.pageserver
-    )
+    # wait for all uploads to finish (checkpoint has been done above)
+    wait_for_upload(ps_http, tenant_id, timeline_id, last_lsn)
 
     # shutdown safekeepers to avoid on-demand downloads from walreceiver
     for sk in env.safekeepers:
diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py
index cb08b014fd..5c02708457 100644
--- a/test_runner/regress/test_ondemand_download.py
+++ b/test_runner/regress/test_ondemand_download.py
@@ -12,8 +12,8 @@ from fixtures.neon_fixtures import (
     NeonEnvBuilder,
     RemoteStorageKind,
     available_remote_storages,
+    last_flush_lsn_upload,
     wait_for_last_flush_lsn,
-    wait_for_sk_commit_lsn_to_reach_remote_storage,
 )
 from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
 from fixtures.pageserver.utils import (
@@ -207,9 +207,7 @@ def test_ondemand_download_timetravel(
     env.endpoints.stop_all()
 
     # wait until pageserver has successfully uploaded all the data to remote storage
-    wait_for_sk_commit_lsn_to_reach_remote_storage(
-        tenant_id, timeline_id, env.safekeepers, env.pageserver
-    )
+    wait_for_upload(client, tenant_id, timeline_id, current_lsn)
 
     def get_api_current_physical_size():
         d = client.timeline_detail(tenant_id, timeline_id)
@@ -347,12 +345,9 @@ def test_download_remote_layers_api(
         """
         )
 
+    last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id)
     env.endpoints.stop_all()
 
-    wait_for_sk_commit_lsn_to_reach_remote_storage(
-        tenant_id, timeline_id, env.safekeepers, env.pageserver
-    )
-
     def get_api_current_physical_size():
         d = client.timeline_detail(tenant_id, timeline_id)
         return d["current_physical_size"]
diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py
index d7c0814570..dca2cd3d28 100644
--- a/test_runner/regress/test_tenants_with_remote_storage.py
+++ b/test_runner/regress/test_tenants_with_remote_storage.py
@@ -21,7 +21,7 @@ from fixtures.neon_fixtures import (
     NeonEnvBuilder,
     RemoteStorageKind,
     available_remote_storages,
-    wait_for_sk_commit_lsn_to_reach_remote_storage,
+    last_flush_lsn_upload,
 )
 from fixtures.pageserver.utils import (
     assert_tenant_state,
@@ -174,12 +174,9 @@ def test_tenants_attached_after_download(
     )
 
     ##### Stop the pageserver, erase its layer file to force it being downloaded from S3
+    last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id)
     env.endpoints.stop_all()
 
-    wait_for_sk_commit_lsn_to_reach_remote_storage(
-        tenant_id, timeline_id, env.safekeepers, env.pageserver
-    )
-
     env.pageserver.stop()
 
     timeline_dir = Path(env.repo_dir) / "tenants" / str(tenant_id) / "timelines" / str(timeline_id)

From 11df2ee5d70d23bac233051d5e974d830222a967 Mon Sep 17 00:00:00 2001
From: Sergey Melnikov <sergey@neon.tech>
Date: Wed, 26 Apr 2023 13:40:36 +0200
Subject: [PATCH 20/31] Add safekeeper-3.us-east-2.aws.neon.build (#4085)

---
 .github/ansible/staging.us-east-2.hosts.yaml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/ansible/staging.us-east-2.hosts.yaml b/.github/ansible/staging.us-east-2.hosts.yaml
index dacc5567c3..fb218c443d 100644
--- a/.github/ansible/staging.us-east-2.hosts.yaml
+++ b/.github/ansible/staging.us-east-2.hosts.yaml
@@ -50,5 +50,7 @@ storage:
           ansible_host: i-027662bd552bf5db0
         safekeeper-2.us-east-2.aws.neon.build:
           ansible_host: i-0de0b03a51676a6ce
+        safekeeper-3.us-east-2.aws.neon.build:
+          ansible_host: i-05f8ba2cda243bd18
         safekeeper-99.us-east-2.aws.neon.build:
           ansible_host: i-0d61b6a2ea32028d5

From 6861259be7ee63f6a4bb2a9fdb5546147bf20389 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Wed, 26 Apr 2023 15:18:26 +0200
Subject: [PATCH 21/31] add global metric for unexpected on-demand downloads
 (#4069)

Until we have toned down the prod logs to zero WARN and ERROR, we want a
dedicated metric for which we can have a dedicated alert.

fixes https://github.com/neondatabase/neon/issues/3924
---
 pageserver/src/bin/pageserver.rs  |  1 +
 pageserver/src/lib.rs             |  2 ++
 pageserver/src/metrics.rs         | 16 ++++++++++++++++
 pageserver/src/tenant/timeline.rs |  3 ++-
 test_runner/fixtures/metrics.py   |  1 +
 5 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs
index ed23a18ee0..8e4897c09c 100644
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -226,6 +226,7 @@ fn start_pageserver(
     );
     set_build_info_metric(GIT_VERSION);
     set_launch_timestamp_metric(launch_ts);
+    pageserver::preinitialize_metrics();
 
     // If any failpoints were set from FAILPOINTS environment variable,
     // print them to the log for debugging purposes
diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs
index 278658eba3..04863886cb 100644
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -44,6 +44,8 @@ pub const DELTA_FILE_MAGIC: u16 = 0x5A61;
 
 static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]);
 
+pub use crate::metrics::preinitialize_metrics;
+
 pub async fn shutdown_pageserver(exit_code: i32) {
     // Shut down the libpq endpoint task. This prevents new connections from
     // being accepted.
diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index d6978a8cf6..deb20f21f8 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -205,6 +205,15 @@ static EVICTIONS_WITH_LOW_RESIDENCE_DURATION: Lazy<IntCounterVec> = Lazy::new(||
     .expect("failed to define a metric")
 });
 
+pub static UNEXPECTED_ONDEMAND_DOWNLOADS: Lazy<IntCounter> = Lazy::new(|| {
+    register_int_counter!(
+        "pageserver_unexpected_ondemand_downloads_count",
+        "Number of unexpected on-demand downloads. \
+         We log more context for each increment, so, forgo any labels in this metric.",
+    )
+    .expect("failed to define a metric")
+});
+
 /// Each [`Timeline`]'s  [`EVICTIONS_WITH_LOW_RESIDENCE_DURATION`] metric.
 #[derive(Debug)]
 pub struct EvictionsWithLowResidenceDuration {
@@ -1132,3 +1141,10 @@ impl<F: Future<Output = Result<O, E>>, O, E> Future for MeasuredRemoteOp<F> {
         poll_result
     }
 }
+
+pub fn preinitialize_metrics() {
+    // We want to alert on this metric increasing.
+    // Initialize it eagerly, so that our alert rule can distinguish absence of the metric from metric value 0.
+    assert_eq!(UNEXPECTED_ONDEMAND_DOWNLOADS.get(), 0);
+    UNEXPECTED_ONDEMAND_DOWNLOADS.reset();
+}
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index b8b1f963e5..6c34f5a5b5 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -48,7 +48,7 @@ use crate::tenant::{
 
 use crate::config::PageServerConf;
 use crate::keyspace::{KeyPartitioning, KeySpace};
-use crate::metrics::TimelineMetrics;
+use crate::metrics::{TimelineMetrics, UNEXPECTED_ONDEMAND_DOWNLOADS};
 use crate::pgdatadir_mapping::LsnForTimestamp;
 use crate::pgdatadir_mapping::{is_rel_fsm_block_key, is_rel_vm_block_key};
 use crate::pgdatadir_mapping::{BlockNumber, CalculateLogicalSizeError};
@@ -2355,6 +2355,7 @@ impl Timeline {
                             id,
                             ctx.task_kind()
                         );
+                        UNEXPECTED_ONDEMAND_DOWNLOADS.inc();
                         timeline.download_remote_layer(remote_layer).await?;
                         continue 'layer_map_search;
                     }
diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py
index 5fed6fcf84..0e958ddd06 100644
--- a/test_runner/fixtures/metrics.py
+++ b/test_runner/fixtures/metrics.py
@@ -53,6 +53,7 @@ PAGESERVER_GLOBAL_METRICS: Tuple[str, ...] = (
     "pageserver_storage_operations_seconds_global_count",
     "pageserver_storage_operations_seconds_global_sum",
     "pageserver_storage_operations_seconds_global_bucket",
+    "pageserver_unexpected_ondemand_downloads_count_total",
     "libmetrics_launch_timestamp",
     "libmetrics_build_info",
     "libmetrics_tracing_event_count_total",

From 92214578af3311c8d2ea6885f59562c9b53df628 Mon Sep 17 00:00:00 2001
From: Anastasia Lubennikova <anastasia@neon.tech>
Date: Wed, 26 Apr 2023 17:47:54 +0300
Subject: [PATCH 22/31] Fix proxy_io_bytes_per_client metric: use branch_id
 identifier properly. (#4084)

It fixes the miscalculation of the metric for projects that use multiple
branches for the same endpoint.
We were under billing users with such projects. So we need to
communicate the change in Release Notes.
---
 proxy/src/metrics.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs
index 445c2e930c..6ae1e3a447 100644
--- a/proxy/src/metrics.rs
+++ b/proxy/src/metrics.rs
@@ -95,7 +95,7 @@ fn gather_proxy_io_bytes_per_client() -> Vec<(Ids, (u64, DateTime<Utc>))> {
                     current_metrics.push((
                         Ids {
                             endpoint_id: endpoint_id.to_string(),
-                            branch_id: "".to_string(),
+                            branch_id: branch_id.to_string(),
                         },
                         (value, Utc::now()),
                     ));

From 0112a602e1b748b959bf578e7eaaecef392c09a3 Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Tue, 25 Apr 2023 12:22:58 +0400
Subject: [PATCH 23/31] Add timeout on proxy -> compute connection
 establishment.

Otherwise we sit up to default tcp_syn_retries (about 2+ min) before gettings os
error 110 if compute has been migrated to another pod.
---
 proxy/src/compute.rs | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs
index b5efc72803..0465703ae6 100644
--- a/proxy/src/compute.rs
+++ b/proxy/src/compute.rs
@@ -1,8 +1,8 @@
 use crate::{cancellation::CancelClosure, error::UserFacingError};
-use futures::TryFutureExt;
+use futures::{FutureExt, TryFutureExt};
 use itertools::Itertools;
 use pq_proto::StartupMessageParams;
-use std::{io, net::SocketAddr};
+use std::{io, net::SocketAddr, time::Duration};
 use thiserror::Error;
 use tokio::net::TcpStream;
 use tokio_postgres::NoTls;
@@ -130,9 +130,23 @@ impl ConnCfg {
     async fn connect_raw(&self) -> io::Result<(SocketAddr, TcpStream)> {
         use tokio_postgres::config::Host;
 
+        // wrap TcpStream::connect with timeout
+        let connect_with_timeout = |host, port| {
+            let connection_timeout = Duration::from_millis(10000);
+            tokio::time::timeout(connection_timeout, TcpStream::connect((host, port))).map(
+                move |res| match res {
+                    Ok(tcpstream_connect_res) => tcpstream_connect_res,
+                    Err(_) => Err(io::Error::new(
+                        io::ErrorKind::TimedOut,
+                        format!("exceeded connection timeout {connection_timeout:?}"),
+                    )),
+                },
+            )
+        };
+
         let connect_once = |host, port| {
             info!("trying to connect to compute node at {host}:{port}");
-            TcpStream::connect((host, port)).and_then(|socket| async {
+            connect_with_timeout(host, port).and_then(|socket| async {
                 let socket_addr = socket.peer_addr()?;
                 // This prevents load balancer from severing the connection.
                 socket2::SockRef::from(&socket).set_keepalive(true)?;
@@ -165,7 +179,6 @@ impl ConnCfg {
                 Host::Unix(_) => continue, // unix sockets are not welcome here
             };
 
-            // TODO: maybe we should add a timeout.
             match connect_once(host, *port).await {
                 Ok(socket) => return Ok(socket),
                 Err(err) => {

From 9ea7b5dd38cd1fc89311eba3fcb6e8987d51e787 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Thu, 27 Apr 2023 11:54:48 +0200
Subject: [PATCH 24/31] clean up logging around on-demand downloads (#4030)

- Remove repeated tenant & timeline from span
- Demote logging of the path to debug level
- Log completion at info level, in the same function where we log errors
- distinguish between layer file download success & on-demand download
succeeding as a whole in the log message wording
- Assert that the span contains a tenant id and a timeline id

fixes https://github.com/neondatabase/neon/issues/3945

Before:

```
  INFO compaction_loop{tenant_id=$TENANT_ID}:compact_timeline{timeline=$TIMELINE_ID}:download_remote_layer{tenant_id=$TENANT_ID timeline_id=$TIMELINE_ID layer=000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000020C8A71-00000000020CAF91}: download complete: /storage/pageserver/data/tenants/$TENANT_ID/timelines/$TIMELINE_ID/000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000020C8A71-00000000020CAF91
  INFO compaction_loop{tenant_id=$TENANT_ID}:compact_timeline{timeline=$TIMELINE_ID}:download_remote_layer{tenant_id=$TENANT_ID timeline_id=$TIMELINE_ID layer=000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000020C8A71-00000000020CAF91}: Rebuilt layer map. Did 9 insertions to process a batch of 1 updates.
```

After:

```
  INFO compaction_loop{tenant_id=$TENANT_ID}:compact_timeline{timeline=$TIMELINE_ID}:download_remote_layer{layer=000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000020C8A71-00000000020CAF91}: layer file download finished
  INFO compaction_loop{tenant_id=$TENANT_ID}:compact_timeline{timeline=$TIMELINE_ID}:download_remote_layer{layer=000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000020C8A71-00000000020CAF91}: Rebuilt layer map. Did 9 insertions to process a batch of 1 updates.
  INFO compaction_loop{tenant_id=$TENANT_ID}:compact_timeline{timeline=$TIMELINE_ID}:download_remote_layer{layer=000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000020C8A71-00000000020CAF91}: on-demand download successful
```
---
 Cargo.lock                                    |  11 +
 Cargo.toml                                    |   1 +
 libs/remote_storage/tests/pagination_tests.rs |   6 +-
 libs/utils/Cargo.toml                         |   3 +-
 libs/utils/src/lib.rs                         |   2 +
 libs/utils/src/logging.rs                     |  50 ++-
 libs/utils/src/tracing_span_assert.rs         | 287 ++++++++++++++++++
 pageserver/src/bin/pageserver.rs              |  16 +-
 pageserver/src/tenant.rs                      |   8 +-
 .../tenant/remote_timeline_client/download.rs |   5 +-
 pageserver/src/tenant/timeline.rs             |  39 ++-
 .../walreceiver/connection_manager.rs         |   2 +-
 safekeeper/src/bin/safekeeper.rs              |   5 +-
 storage_broker/src/bin/storage_broker.rs      |   5 +-
 14 files changed, 413 insertions(+), 27 deletions(-)
 create mode 100644 libs/utils/src/tracing_span_assert.rs

diff --git a/Cargo.lock b/Cargo.lock
index ce24bbcee8..08b24d263c 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4629,6 +4629,16 @@ dependencies = [
  "valuable",
 ]
 
+[[package]]
+name = "tracing-error"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d686ec1c0f384b1277f097b2f279a2ecc11afe8c133c1aabf036a27cb4cd206e"
+dependencies = [
+ "tracing",
+ "tracing-subscriber",
+]
+
 [[package]]
 name = "tracing-futures"
 version = "0.2.5"
@@ -4879,6 +4889,7 @@ dependencies = [
  "thiserror",
  "tokio",
  "tracing",
+ "tracing-error",
  "tracing-subscriber",
  "url",
  "uuid",
diff --git a/Cargo.toml b/Cargo.toml
index 0b545e6190..f4872433cd 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -110,6 +110,7 @@ toml = "0.7"
 toml_edit = "0.19"
 tonic = {version = "0.9", features = ["tls", "tls-roots"]}
 tracing = "0.1"
+tracing-error = "0.2.0"
 tracing-opentelemetry = "0.18.0"
 tracing-subscriber = { version = "0.3", features = ["env-filter"] }
 url = "2.2"
diff --git a/libs/remote_storage/tests/pagination_tests.rs b/libs/remote_storage/tests/pagination_tests.rs
index 048e99d841..86a6888f98 100644
--- a/libs/remote_storage/tests/pagination_tests.rs
+++ b/libs/remote_storage/tests/pagination_tests.rs
@@ -99,7 +99,11 @@ struct S3WithTestBlobs {
 #[async_trait::async_trait]
 impl AsyncTestContext for MaybeEnabledS3 {
     async fn setup() -> Self {
-        utils::logging::init(utils::logging::LogFormat::Test).expect("logging init failed");
+        utils::logging::init(
+            utils::logging::LogFormat::Test,
+            utils::logging::TracingErrorLayerEnablement::Disabled,
+        )
+        .expect("logging init failed");
         if env::var(ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME).is_err() {
             info!(
                 "`{}` env variable is not set, skipping the test",
diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml
index dc6326e73e..2b04dfdef6 100644
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -27,7 +27,8 @@ signal-hook.workspace = true
 thiserror.workspace = true
 tokio.workspace = true
 tracing.workspace = true
-tracing-subscriber = { workspace = true, features = ["json"] }
+tracing-error.workspace = true
+tracing-subscriber = { workspace = true, features = ["json", "registry"] }
 rand.workspace = true
 serde_with.workspace = true
 strum.workspace = true
diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs
index d4176911ac..9b52aa75b7 100644
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -54,6 +54,8 @@ pub mod measured_stream;
 pub mod serde_percent;
 pub mod serde_regex;
 
+pub mod tracing_span_assert;
+
 /// use with fail::cfg("$name", "return(2000)")
 #[macro_export]
 macro_rules! failpoint_sleep_millis_async {
diff --git a/libs/utils/src/logging.rs b/libs/utils/src/logging.rs
index ed856b6804..2b8c852d86 100644
--- a/libs/utils/src/logging.rs
+++ b/libs/utils/src/logging.rs
@@ -56,7 +56,20 @@ where
     }
 }
 
-pub fn init(log_format: LogFormat) -> anyhow::Result<()> {
+/// Whether to add the `tracing_error` crate's `ErrorLayer`
+/// to the global tracing subscriber.
+///
+pub enum TracingErrorLayerEnablement {
+    /// Do not add the `ErrorLayer`.
+    Disabled,
+    /// Add the `ErrorLayer` with the filter specified by RUST_LOG, defaulting to `info` if `RUST_LOG` is unset.
+    EnableWithRustLogFilter,
+}
+
+pub fn init(
+    log_format: LogFormat,
+    tracing_error_layer_enablement: TracingErrorLayerEnablement,
+) -> anyhow::Result<()> {
     // We fall back to printing all spans at info-level or above if
     // the RUST_LOG environment variable is not set.
     let rust_log_env_filter = || {
@@ -67,21 +80,26 @@ pub fn init(log_format: LogFormat) -> anyhow::Result<()> {
     // NB: the order of the with() calls does not matter.
     // See https://docs.rs/tracing-subscriber/0.3.16/tracing_subscriber/layer/index.html#per-layer-filtering
     use tracing_subscriber::prelude::*;
-    tracing_subscriber::registry()
-        .with({
-            let log_layer = tracing_subscriber::fmt::layer()
-                .with_target(false)
-                .with_ansi(atty::is(atty::Stream::Stdout))
-                .with_writer(std::io::stdout);
-            let log_layer = match log_format {
-                LogFormat::Json => log_layer.json().boxed(),
-                LogFormat::Plain => log_layer.boxed(),
-                LogFormat::Test => log_layer.with_test_writer().boxed(),
-            };
-            log_layer.with_filter(rust_log_env_filter())
-        })
-        .with(TracingEventCountLayer(&TRACING_EVENT_COUNT).with_filter(rust_log_env_filter()))
-        .init();
+    let r = tracing_subscriber::registry();
+    let r = r.with({
+        let log_layer = tracing_subscriber::fmt::layer()
+            .with_target(false)
+            .with_ansi(atty::is(atty::Stream::Stdout))
+            .with_writer(std::io::stdout);
+        let log_layer = match log_format {
+            LogFormat::Json => log_layer.json().boxed(),
+            LogFormat::Plain => log_layer.boxed(),
+            LogFormat::Test => log_layer.with_test_writer().boxed(),
+        };
+        log_layer.with_filter(rust_log_env_filter())
+    });
+    let r = r.with(TracingEventCountLayer(&TRACING_EVENT_COUNT).with_filter(rust_log_env_filter()));
+    match tracing_error_layer_enablement {
+        TracingErrorLayerEnablement::EnableWithRustLogFilter => r
+            .with(tracing_error::ErrorLayer::default().with_filter(rust_log_env_filter()))
+            .init(),
+        TracingErrorLayerEnablement::Disabled => r.init(),
+    }
 
     Ok(())
 }
diff --git a/libs/utils/src/tracing_span_assert.rs b/libs/utils/src/tracing_span_assert.rs
new file mode 100644
index 0000000000..b9f7986442
--- /dev/null
+++ b/libs/utils/src/tracing_span_assert.rs
@@ -0,0 +1,287 @@
+//! Assert that the current [`tracing::Span`] has a given set of fields.
+//!
+//! # Usage
+//!
+//! ```
+//! use tracing_subscriber::prelude::*;
+//! let registry = tracing_subscriber::registry()
+//!    .with(tracing_error::ErrorLayer::default());
+//!
+//! // Register the registry as the global subscriber.
+//! // In this example, we'll only use it as a thread-local subscriber.
+//! let _guard = tracing::subscriber::set_default(registry);
+//!
+//! // Then, in the main code:
+//!
+//! let span = tracing::info_span!("TestSpan", test_id = 1);
+//! let _guard = span.enter();
+//!
+//! // ... down the call stack
+//!
+//! use utils::tracing_span_assert::{check_fields_present, MultiNameExtractor};
+//! let extractor = MultiNameExtractor::new("TestExtractor", ["test", "test_id"]);
+//! match check_fields_present([&extractor]) {
+//!    Ok(()) => {},
+//!    Err(missing) => {
+//!        panic!("Missing fields: {:?}", missing.into_iter().map(|f| f.name() ).collect::<Vec<_>>());
+//!    }
+//! }
+//! ```
+//!
+//! Recommended reading: https://docs.rs/tracing-subscriber/0.3.16/tracing_subscriber/layer/index.html#per-layer-filtering
+//!
+
+use std::{
+    collections::HashSet,
+    fmt::{self},
+    hash::{Hash, Hasher},
+};
+
+pub enum ExtractionResult {
+    Present,
+    Absent,
+}
+
+pub trait Extractor: Send + Sync + std::fmt::Debug {
+    fn name(&self) -> &str;
+    fn extract(&self, fields: &tracing::field::FieldSet) -> ExtractionResult;
+}
+
+#[derive(Debug)]
+pub struct MultiNameExtractor<const L: usize> {
+    name: &'static str,
+    field_names: [&'static str; L],
+}
+
+impl<const L: usize> MultiNameExtractor<L> {
+    pub fn new(name: &'static str, field_names: [&'static str; L]) -> MultiNameExtractor<L> {
+        MultiNameExtractor { name, field_names }
+    }
+}
+impl<const L: usize> Extractor for MultiNameExtractor<L> {
+    fn name(&self) -> &str {
+        self.name
+    }
+    fn extract(&self, fields: &tracing::field::FieldSet) -> ExtractionResult {
+        if fields.iter().any(|f| self.field_names.contains(&f.name())) {
+            ExtractionResult::Present
+        } else {
+            ExtractionResult::Absent
+        }
+    }
+}
+
+struct MemoryIdentity<'a>(&'a dyn Extractor);
+
+impl<'a> MemoryIdentity<'a> {
+    fn as_ptr(&self) -> *const () {
+        self.0 as *const _ as *const ()
+    }
+}
+impl<'a> PartialEq for MemoryIdentity<'a> {
+    fn eq(&self, other: &Self) -> bool {
+        self.as_ptr() == other.as_ptr()
+    }
+}
+impl<'a> Eq for MemoryIdentity<'a> {}
+impl<'a> Hash for MemoryIdentity<'a> {
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        self.as_ptr().hash(state);
+    }
+}
+impl<'a> fmt::Debug for MemoryIdentity<'a> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{:p}: {}", self.as_ptr(), self.0.name())
+    }
+}
+
+/// The extractor names passed as keys to [`new`].
+pub fn check_fields_present<const L: usize>(
+    must_be_present: [&dyn Extractor; L],
+) -> Result<(), Vec<&dyn Extractor>> {
+    let mut missing: HashSet<MemoryIdentity> =
+        HashSet::from_iter(must_be_present.into_iter().map(|r| MemoryIdentity(r)));
+    let trace = tracing_error::SpanTrace::capture();
+    trace.with_spans(|md, _formatted_fields| {
+        missing.retain(|extractor| match extractor.0.extract(md.fields()) {
+            ExtractionResult::Present => false,
+            ExtractionResult::Absent => true,
+        });
+        !missing.is_empty() // continue walking up until we've found all missing
+    });
+    if missing.is_empty() {
+        Ok(())
+    } else {
+        Err(missing.into_iter().map(|mi| mi.0).collect())
+    }
+}
+
+#[cfg(test)]
+mod tests {
+
+    use tracing_subscriber::prelude::*;
+
+    use super::*;
+
+    struct Setup {
+        _current_thread_subscriber_guard: tracing::subscriber::DefaultGuard,
+        tenant_extractor: MultiNameExtractor<2>,
+        timeline_extractor: MultiNameExtractor<2>,
+    }
+
+    fn setup_current_thread() -> Setup {
+        let tenant_extractor = MultiNameExtractor::new("TenantId", ["tenant_id", "tenant"]);
+        let timeline_extractor = MultiNameExtractor::new("TimelineId", ["timeline_id", "timeline"]);
+
+        let registry = tracing_subscriber::registry()
+            .with(tracing_subscriber::fmt::layer())
+            .with(tracing_error::ErrorLayer::default());
+
+        let guard = tracing::subscriber::set_default(registry);
+
+        Setup {
+            _current_thread_subscriber_guard: guard,
+            tenant_extractor,
+            timeline_extractor,
+        }
+    }
+
+    fn assert_missing(missing: Vec<&dyn Extractor>, expected: Vec<&dyn Extractor>) {
+        let missing: HashSet<MemoryIdentity> =
+            HashSet::from_iter(missing.into_iter().map(MemoryIdentity));
+        let expected: HashSet<MemoryIdentity> =
+            HashSet::from_iter(expected.into_iter().map(MemoryIdentity));
+        assert_eq!(missing, expected);
+    }
+
+    #[test]
+    fn positive_one_level() {
+        let setup = setup_current_thread();
+        let span = tracing::info_span!("root", tenant_id = "tenant-1", timeline_id = "timeline-1");
+        let _guard = span.enter();
+        check_fields_present([&setup.tenant_extractor, &setup.timeline_extractor]).unwrap();
+    }
+
+    #[test]
+    fn negative_one_level() {
+        let setup = setup_current_thread();
+        let span = tracing::info_span!("root", timeline_id = "timeline-1");
+        let _guard = span.enter();
+        let missing =
+            check_fields_present([&setup.tenant_extractor, &setup.timeline_extractor]).unwrap_err();
+        assert_missing(missing, vec![&setup.tenant_extractor]);
+    }
+
+    #[test]
+    fn positive_multiple_levels() {
+        let setup = setup_current_thread();
+
+        let span = tracing::info_span!("root");
+        let _guard = span.enter();
+
+        let span = tracing::info_span!("child", tenant_id = "tenant-1");
+        let _guard = span.enter();
+
+        let span = tracing::info_span!("grandchild", timeline_id = "timeline-1");
+        let _guard = span.enter();
+
+        check_fields_present([&setup.tenant_extractor, &setup.timeline_extractor]).unwrap();
+    }
+
+    #[test]
+    fn negative_multiple_levels() {
+        let setup = setup_current_thread();
+
+        let span = tracing::info_span!("root");
+        let _guard = span.enter();
+
+        let span = tracing::info_span!("child", timeline_id = "timeline-1");
+        let _guard = span.enter();
+
+        let missing = check_fields_present([&setup.tenant_extractor]).unwrap_err();
+        assert_missing(missing, vec![&setup.tenant_extractor]);
+    }
+
+    #[test]
+    fn positive_subset_one_level() {
+        let setup = setup_current_thread();
+        let span = tracing::info_span!("root", tenant_id = "tenant-1", timeline_id = "timeline-1");
+        let _guard = span.enter();
+        check_fields_present([&setup.tenant_extractor]).unwrap();
+    }
+
+    #[test]
+    fn positive_subset_multiple_levels() {
+        let setup = setup_current_thread();
+
+        let span = tracing::info_span!("root");
+        let _guard = span.enter();
+
+        let span = tracing::info_span!("child", tenant_id = "tenant-1");
+        let _guard = span.enter();
+
+        let span = tracing::info_span!("grandchild", timeline_id = "timeline-1");
+        let _guard = span.enter();
+
+        check_fields_present([&setup.tenant_extractor]).unwrap();
+    }
+
+    #[test]
+    fn negative_subset_one_level() {
+        let setup = setup_current_thread();
+        let span = tracing::info_span!("root", timeline_id = "timeline-1");
+        let _guard = span.enter();
+        let missing = check_fields_present([&setup.tenant_extractor]).unwrap_err();
+        assert_missing(missing, vec![&setup.tenant_extractor]);
+    }
+
+    #[test]
+    fn negative_subset_multiple_levels() {
+        let setup = setup_current_thread();
+
+        let span = tracing::info_span!("root");
+        let _guard = span.enter();
+
+        let span = tracing::info_span!("child", timeline_id = "timeline-1");
+        let _guard = span.enter();
+
+        let missing = check_fields_present([&setup.tenant_extractor]).unwrap_err();
+        assert_missing(missing, vec![&setup.tenant_extractor]);
+    }
+
+    #[test]
+    fn tracing_error_subscriber_not_set_up() {
+        // no setup
+
+        let span = tracing::info_span!("foo", e = "some value");
+        let _guard = span.enter();
+
+        let extractor = MultiNameExtractor::new("E", ["e"]);
+        let missing = check_fields_present([&extractor]).unwrap_err();
+        assert_missing(missing, vec![&extractor]);
+    }
+
+    #[test]
+    #[should_panic]
+    fn panics_if_tracing_error_subscriber_has_wrong_filter() {
+        let r = tracing_subscriber::registry().with({
+            tracing_error::ErrorLayer::default().with_filter(
+                tracing_subscriber::filter::dynamic_filter_fn(|md, _| {
+                    if md.is_span() && *md.level() == tracing::Level::INFO {
+                        return false;
+                    }
+                    true
+                }),
+            )
+        });
+
+        let _guard = tracing::subscriber::set_default(r);
+
+        let span = tracing::info_span!("foo", e = "some value");
+        let _guard = span.enter();
+
+        let extractor = MultiNameExtractor::new("E", ["e"]);
+        let missing = check_fields_present([&extractor]).unwrap_err();
+        assert_missing(missing, vec![&extractor]);
+    }
+}
diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs
index 8e4897c09c..d843b01ed7 100644
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -25,6 +25,7 @@ use pageserver::{
     virtual_file,
 };
 use postgres_backend::AuthType;
+use utils::logging::TracingErrorLayerEnablement;
 use utils::signals::ShutdownSignals;
 use utils::{
     auth::JwtAuth, logging, project_git_version, sentry_init::init_sentry, signals::Signal,
@@ -86,8 +87,19 @@ fn main() -> anyhow::Result<()> {
         }
     };
 
-    // Initialize logging, which must be initialized before the custom panic hook is installed.
-    logging::init(conf.log_format)?;
+    // Initialize logging.
+    //
+    // It must be initialized before the custom panic hook is installed below.
+    //
+    // Regarding tracing_error enablement: at this time, we only use the
+    // tracing_error crate to debug_assert that log spans contain tenant and timeline ids.
+    // See `debug_assert_current_span_has_tenant_and_timeline_id` in the timeline module
+    let tracing_error_layer_enablement = if cfg!(debug_assertions) {
+        TracingErrorLayerEnablement::EnableWithRustLogFilter
+    } else {
+        TracingErrorLayerEnablement::Disabled
+    };
+    logging::init(conf.log_format, tracing_error_layer_enablement)?;
 
     // mind the order required here: 1. logging, 2. panic_hook, 3. sentry.
     // disarming this hook on pageserver, because we never tear down tracing.
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index b5966b4618..d69d5e4b45 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -2886,7 +2886,13 @@ pub mod harness {
             };
 
             LOG_HANDLE.get_or_init(|| {
-                logging::init(logging::LogFormat::Test).expect("Failed to init test logging")
+                logging::init(
+                    logging::LogFormat::Test,
+                    // enable it in case in case the tests exercise code paths that use
+                    // debug_assert_current_span_has_tenant_and_timeline_id
+                    logging::TracingErrorLayerEnablement::EnableWithRustLogFilter,
+                )
+                .expect("Failed to init test logging")
             });
 
             let repo_dir = PageServerConf::test_repo_dir(test_name);
diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs
index bda095d850..a0d8c0193a 100644
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -16,6 +16,7 @@ use tracing::{info, warn};
 
 use crate::config::PageServerConf;
 use crate::tenant::storage_layer::LayerFileName;
+use crate::tenant::timeline::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS};
 use remote_storage::{DownloadError, GenericRemoteStorage};
 use utils::crashsafe::path_with_suffix_extension;
@@ -43,6 +44,8 @@ pub async fn download_layer_file<'a>(
     layer_file_name: &'a LayerFileName,
     layer_metadata: &'a LayerFileMetadata,
 ) -> Result<u64, DownloadError> {
+    debug_assert_current_span_has_tenant_and_timeline_id();
+
     let timeline_path = conf.timeline_path(&timeline_id, &tenant_id);
 
     let local_path = timeline_path.join(layer_file_name.file_name());
@@ -154,7 +157,7 @@ pub async fn download_layer_file<'a>(
         .with_context(|| format!("Could not fsync layer file {}", local_path.display(),))
         .map_err(DownloadError::Other)?;
 
-    tracing::info!("download complete: {}", local_path.display());
+    tracing::debug!("download complete: {}", local_path.display());
 
     Ok(bytes_amount)
 }
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 6c34f5a5b5..87f03f30b6 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -19,6 +19,7 @@ use tokio::sync::{oneshot, watch, Semaphore, TryAcquireError};
 use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::id::TenantTimelineId;
+use utils::tracing_span_assert;
 
 use std::cmp::{max, min, Ordering};
 use std::collections::BinaryHeap;
@@ -936,6 +937,7 @@ impl Timeline {
         }
     }
 
+    #[instrument(skip_all, fields(tenant = %self.tenant_id, timeline = %self.timeline_id))]
     pub async fn download_layer(&self, layer_file_name: &str) -> anyhow::Result<Option<bool>> {
         let Some(layer) = self.find_layer(layer_file_name) else { return Ok(None) };
         let Some(remote_layer) = layer.downcast_remote_layer() else { return  Ok(Some(false)) };
@@ -3819,11 +3821,13 @@ impl Timeline {
     /// If the caller has a deadline or needs a timeout, they can simply stop polling:
     /// we're **cancellation-safe** because the download happens in a separate task_mgr task.
     /// So, the current download attempt will run to completion even if we stop polling.
-    #[instrument(skip_all, fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id, layer=%remote_layer.short_id()))]
+    #[instrument(skip_all, fields(layer=%remote_layer.short_id()))]
     pub async fn download_remote_layer(
         &self,
         remote_layer: Arc<RemoteLayer>,
     ) -> anyhow::Result<()> {
+        debug_assert_current_span_has_tenant_and_timeline_id();
+
         use std::sync::atomic::Ordering::Relaxed;
 
         let permit = match Arc::clone(&remote_layer.ongoing_download)
@@ -3867,6 +3871,8 @@ impl Timeline {
                     .await;
 
                 if let Ok(size) = &result {
+                    info!("layer file download finished");
+
                     // XXX the temp file is still around in Err() case
                     // and consumes space until we clean up upon pageserver restart.
                     self_clone.metrics.resident_physical_size_gauge.add(*size);
@@ -3938,6 +3944,8 @@ impl Timeline {
                     updates.flush();
                     drop(layers);
 
+                    info!("on-demand download successful");
+
                     // Now that we've inserted the download into the layer map,
                     // close the semaphore. This will make other waiters for
                     // this download return Ok(()).
@@ -3945,7 +3953,7 @@ impl Timeline {
                     remote_layer.ongoing_download.close();
                 } else {
                     // Keep semaphore open. We'll drop the permit at the end of the function.
-                    error!("on-demand download failed: {:?}", result.as_ref().unwrap_err());
+                    error!("layer file download failed: {:?}", result.as_ref().unwrap_err());
                 }
 
                 // Don't treat it as an error if the task that triggered the download
@@ -4256,3 +4264,30 @@ fn rename_to_backup(path: &Path) -> anyhow::Result<()> {
 
     bail!("couldn't find an unused backup number for {:?}", path)
 }
+
+#[inline]
+pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() {
+    pub static TENANT_ID_EXTRACTOR: once_cell::sync::Lazy<
+        tracing_span_assert::MultiNameExtractor<2>,
+    > = once_cell::sync::Lazy::new(|| {
+        tracing_span_assert::MultiNameExtractor::new("TenantId", ["tenant_id", "tenant"])
+    });
+
+    pub static TIMELINE_ID_EXTRACTOR: once_cell::sync::Lazy<
+        tracing_span_assert::MultiNameExtractor<2>,
+    > = once_cell::sync::Lazy::new(|| {
+        tracing_span_assert::MultiNameExtractor::new("TimelineId", ["timeline_id", "timeline"])
+    });
+
+    #[cfg(debug_assertions)]
+    match tracing_span_assert::check_fields_present([
+        &*TENANT_ID_EXTRACTOR,
+        &*TIMELINE_ID_EXTRACTOR,
+    ]) {
+        Ok(()) => (),
+        Err(missing) => panic!(
+            "missing extractors: {:?}",
+            missing.into_iter().map(|e| e.name()).collect::<Vec<_>>()
+        ),
+    }
+}
diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
index efcbfbce3d..731c5c4644 100644
--- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
@@ -348,7 +348,7 @@ impl ConnectionManagerState {
                 .context("walreceiver connection handling failure")
             }
             .instrument(
-                info_span!("walreceiver_connection", id = %id, node_id = %new_sk.safekeeper_id),
+                info_span!("walreceiver_connection", tenant_id = %id.tenant_id, timeline_id = %id.timeline_id, node_id = %new_sk.safekeeper_id),
             )
         });
 
diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs
index ace921a26d..3699a2a74c 100644
--- a/safekeeper/src/bin/safekeeper.rs
+++ b/safekeeper/src/bin/safekeeper.rs
@@ -134,7 +134,10 @@ fn main() -> anyhow::Result<()> {
     // 1. init logging
     // 2. tracing panic hook
     // 3. sentry
-    logging::init(LogFormat::from_config(&args.log_format)?)?;
+    logging::init(
+        LogFormat::from_config(&args.log_format)?,
+        logging::TracingErrorLayerEnablement::Disabled,
+    )?;
     logging::replace_panic_hook_with_tracing_panic_hook().forget();
     info!("version: {GIT_VERSION}");
 
diff --git a/storage_broker/src/bin/storage_broker.rs b/storage_broker/src/bin/storage_broker.rs
index de7b634ba0..597d9860d8 100644
--- a/storage_broker/src/bin/storage_broker.rs
+++ b/storage_broker/src/bin/storage_broker.rs
@@ -430,7 +430,10 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
     // 1. init logging
     // 2. tracing panic hook
     // 3. sentry
-    logging::init(LogFormat::from_config(&args.log_format)?)?;
+    logging::init(
+        LogFormat::from_config(&args.log_format)?,
+        logging::TracingErrorLayerEnablement::Disabled,
+    )?;
     logging::replace_panic_hook_with_tracing_panic_hook().forget();
     // initialize sentry if SENTRY_DSN is provided
     let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]);

From 5b911e1f9f6f5e49fabfb3fde12084b1e69bd4a2 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Thu, 27 Apr 2023 14:01:27 +0200
Subject: [PATCH 25/31] build: run clippy for powerset of features (#4077)

This will catch compiler & clippy warnings in all feature combinations.

We should probably use cargo hack for build and test as well, but,
that's quite expensive and would add to overall CI wait times.

obsoletes https://github.com/neondatabase/neon/pull/4073
refs https://github.com/neondatabase/neon/pull/4070
---
 .github/workflows/build_and_test.yml | 17 +++++++++++++++--
 .neon_clippy_args                    |  4 ++++
 run_clippy.sh                        | 15 ++++++++++-----
 3 files changed, 29 insertions(+), 7 deletions(-)
 create mode 100644 .neon_clippy_args

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 15a6a611b1..e5ba7aa3eb 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -111,8 +111,21 @@ jobs:
       - name: Get postgres headers
         run: make postgres-headers -j$(nproc)
 
-      - name: Run cargo clippy
-        run: ./run_clippy.sh
+      # cargo hack runs the given cargo subcommand (clippy in this case) for all feature combinations.
+      # This will catch compiler & clippy warnings in all feature combinations.
+      # TODO: use cargo hack for build and test as well, but, that's quite expensive.
+      # NB: keep clippy args in sync with ./run_clippy.sh
+      - run: |
+          CLIPPY_COMMON_ARGS="$( source .neon_clippy_args; echo "$CLIPPY_COMMON_ARGS")"
+          if [ "$CLIPPY_COMMON_ARGS" = "" ]; then
+            echo "No clippy args found in .neon_clippy_args"
+            exit 1
+          fi
+          echo "CLIPPY_COMMON_ARGS=${CLIPPY_COMMON_ARGS}" >> $GITHUB_ENV
+      - name: Run cargo clippy (debug)
+        run: cargo hack --feature-powerset clippy $CLIPPY_COMMON_ARGS
+      - name: Run cargo clippy (release)
+        run: cargo hack --feature-powerset clippy --release $CLIPPY_COMMON_ARGS
 
       # Use `${{ !cancelled() }}` to run quck tests after the longer clippy run
       - name: Check formatting
diff --git a/.neon_clippy_args b/.neon_clippy_args
new file mode 100644
index 0000000000..25e09c61a6
--- /dev/null
+++ b/.neon_clippy_args
@@ -0,0 +1,4 @@
+# * `-A unknown_lints` – do not warn about unknown lint suppressions
+#                        that people with newer toolchains might use
+# * `-D warnings`      - fail on any warnings (`cargo` returns non-zero exit status)
+export CLIPPY_COMMON_ARGS="--locked --workspace --all-targets -- -A unknown_lints -D warnings"
diff --git a/run_clippy.sh b/run_clippy.sh
index 9adfddedc2..ae2a17ec0c 100755
--- a/run_clippy.sh
+++ b/run_clippy.sh
@@ -1,4 +1,5 @@
-#!/bin/bash
+#!/usr/bin/env bash
+set -euo pipefail
 
 # If you save this in your path under the name "cargo-zclippy" (or whatever
 # name you like), then you can run it as "cargo zclippy" from the shell prompt.
@@ -8,7 +9,11 @@
 # warnings and errors right in the editor.
 # In vscode, this setting is Rust-analyzer>Check On Save:Command
 
-# * `-A unknown_lints` – do not warn about unknown lint suppressions
-#                        that people with newer toolchains might use
-# * `-D warnings`      - fail on any warnings (`cargo` returns non-zero exit status)
-cargo clippy --locked --all --all-targets --all-features -- -A unknown_lints -D warnings
+# NB: the CI runs the full feature powerset, so, it catches slightly more errors
+# at the expense of longer runtime. This script is used by developers, so, don't
+# do that here.
+
+thisscript="${BASH_SOURCE[0]}"
+thisscript_dir="$(dirname "$thisscript")"
+CLIPPY_COMMON_ARGS="$( source .neon_clippy_args; echo "$CLIPPY_COMMON_ARGS")"
+exec cargo clippy --all-features $CLIPPY_COMMON_ARGS

From e6ec2400fc0a8c2975b96f23c5b391064362e0da Mon Sep 17 00:00:00 2001
From: MMeent <matthias@neon.tech>
Date: Thu, 27 Apr 2023 15:26:44 +0200
Subject: [PATCH 26/31] Enable hot standby PostgreSQL replicas.

Notes:
 - This still needs UI support from the Console
 - I've not tuned any GUCs for PostgreSQL to make this work better
 - Safekeeper has gotten a tweak in which WAL is sent and how: It now
sends zero-ed WAL data from the start of the timeline's first segment up to
the first byte of the timeline to be compatible with normal PostgreSQL
WAL streaming.
 - This includes the commits of #3714

Fixes one part of https://github.com/neondatabase/neon/issues/769

Co-authored-by: Anastasia Lubennikova <anastasia@neon.tech>
---
 compute_tools/src/compute.rs                  |  62 +++++-
 compute_tools/src/pg_helpers.rs               |   7 +
 compute_tools/src/spec.rs                     |  16 ++
 control_plane/src/bin/neon_local.rs           |  79 ++++++-
 control_plane/src/endpoint.rs                 | 182 ++++++++++------
 control_plane/src/postgresql_conf.rs          |   2 +-
 .../var/db/postgres/specs/spec.json           |   5 -
 libs/postgres_ffi/src/lib.rs                  |   7 +-
 libs/postgres_ffi/src/pg_constants.rs         |   1 +
 libs/postgres_ffi/src/xlog_utils.rs           |  57 ++++-
 libs/utils/src/lsn.rs                         |  19 ++
 pageserver/src/basebackup.rs                  |  10 +-
 pgxn/neon/file_cache.c                        |  69 ++++++-
 pgxn/neon/libpagestore.c                      |   6 +
 pgxn/neon/neon.c                              |   1 +
 pgxn/neon/neon.h                              |   8 +
 pgxn/neon/pagestore_client.h                  |   1 +
 pgxn/neon/pagestore_smgr.c                    | 194 +++++++++++++++++-
 pgxn/neon/walproposer.c                       |  20 +-
 safekeeper/src/handler.rs                     |   9 +-
 safekeeper/src/wal_storage.rs                 |  86 +++++++-
 test_runner/fixtures/neon_fixtures.py         |  43 ++++
 test_runner/regress/test_compute_ctl.py       |   5 -
 test_runner/regress/test_hot_standby.py       |  79 +++++++
 vendor/postgres-v14                           |   2 +-
 vendor/postgres-v15                           |   2 +-
 26 files changed, 851 insertions(+), 121 deletions(-)
 create mode 100644 test_runner/regress/test_hot_standby.py

diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs
index 507dac9c0d..b6bc234beb 100644
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -249,18 +249,63 @@ impl ComputeNode {
     /// safekeepers sync, basebackup, etc.
     #[instrument(skip(self, compute_state))]
     pub fn prepare_pgdata(&self, compute_state: &ComputeState) -> Result<()> {
+        #[derive(Clone)]
+        enum Replication {
+            Primary,
+            Static { lsn: Lsn },
+            HotStandby,
+        }
+
         let pspec = compute_state.pspec.as_ref().expect("spec must be set");
+        let spec = &pspec.spec;
         let pgdata_path = Path::new(&self.pgdata);
 
+        let hot_replica = if let Some(option) = spec.cluster.settings.find_ref("hot_standby") {
+            if let Some(value) = &option.value {
+                anyhow::ensure!(option.vartype == "bool");
+                matches!(value.as_str(), "on" | "yes" | "true")
+            } else {
+                false
+            }
+        } else {
+            false
+        };
+
+        let replication = if hot_replica {
+            Replication::HotStandby
+        } else if let Some(lsn) = spec.cluster.settings.find("recovery_target_lsn") {
+            Replication::Static {
+                lsn: Lsn::from_str(&lsn)?,
+            }
+        } else {
+            Replication::Primary
+        };
+
         // Remove/create an empty pgdata directory and put configuration there.
         self.create_pgdata()?;
         config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &pspec.spec)?;
 
-        info!("starting safekeepers syncing");
-        let lsn = self
-            .sync_safekeepers(pspec.storage_auth_token.clone())
-            .with_context(|| "failed to sync safekeepers")?;
-        info!("safekeepers synced at LSN {}", lsn);
+        // Syncing safekeepers is only safe with primary nodes: if a primary
+        // is already connected it will be kicked out, so a secondary (standby)
+        // cannot sync safekeepers.
+        let lsn = match &replication {
+            Replication::Primary => {
+                info!("starting safekeepers syncing");
+                let lsn = self
+                    .sync_safekeepers(pspec.storage_auth_token.clone())
+                    .with_context(|| "failed to sync safekeepers")?;
+                info!("safekeepers synced at LSN {}", lsn);
+                lsn
+            }
+            Replication::Static { lsn } => {
+                info!("Starting read-only node at static LSN {}", lsn);
+                *lsn
+            }
+            Replication::HotStandby => {
+                info!("Initializing standby from latest Pageserver LSN");
+                Lsn(0)
+            }
+        };
 
         info!(
             "getting basebackup@{} from pageserver {}",
@@ -276,6 +321,13 @@ impl ComputeNode {
         // Update pg_hba.conf received with basebackup.
         update_pg_hba(pgdata_path)?;
 
+        match &replication {
+            Replication::Primary | Replication::Static { .. } => {}
+            Replication::HotStandby => {
+                add_standby_signal(pgdata_path)?;
+            }
+        }
+
         Ok(())
     }
 
diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs
index bb787d0506..40dbea6907 100644
--- a/compute_tools/src/pg_helpers.rs
+++ b/compute_tools/src/pg_helpers.rs
@@ -94,6 +94,7 @@ impl PgOptionsSerialize for GenericOptions {
 
 pub trait GenericOptionsSearch {
     fn find(&self, name: &str) -> Option<String>;
+    fn find_ref(&self, name: &str) -> Option<&GenericOption>;
 }
 
 impl GenericOptionsSearch for GenericOptions {
@@ -103,6 +104,12 @@ impl GenericOptionsSearch for GenericOptions {
         let op = ops.iter().find(|s| s.name == name)?;
         op.value.clone()
     }
+
+    /// Lookup option by name, returning ref
+    fn find_ref(&self, name: &str) -> Option<&GenericOption> {
+        let ops = self.as_ref()?;
+        ops.iter().find(|s| s.name == name)
+    }
 }
 
 pub trait RoleExt {
diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs
index 28e0ef41b7..bf3c407202 100644
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -1,3 +1,4 @@
+use std::fs::File;
 use std::path::Path;
 use std::str::FromStr;
 
@@ -145,6 +146,21 @@ pub fn update_pg_hba(pgdata_path: &Path) -> Result<()> {
     Ok(())
 }
 
+/// Create a standby.signal file
+pub fn add_standby_signal(pgdata_path: &Path) -> Result<()> {
+    // XXX: consider making it a part of spec.json
+    info!("adding standby.signal");
+    let signalfile = pgdata_path.join("standby.signal");
+
+    if !signalfile.exists() {
+        info!("created standby.signal");
+        File::create(signalfile)?;
+    } else {
+        info!("reused pre-existing standby.signal");
+    }
+    Ok(())
+}
+
 /// Given a cluster spec json and open transaction it handles roles creation,
 /// deletion and update.
 #[instrument(skip_all)]
diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs
index 665cad8783..09278e1726 100644
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -8,6 +8,7 @@
 use anyhow::{anyhow, bail, Context, Result};
 use clap::{value_parser, Arg, ArgAction, ArgMatches, Command};
 use control_plane::endpoint::ComputeControlPlane;
+use control_plane::endpoint::Replication;
 use control_plane::local_env::LocalEnv;
 use control_plane::pageserver::PageServerNode;
 use control_plane::safekeeper::SafekeeperNode;
@@ -474,7 +475,14 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
             env.register_branch_mapping(name.to_string(), tenant_id, timeline_id)?;
 
             println!("Creating endpoint for imported timeline ...");
-            cplane.new_endpoint(tenant_id, name, timeline_id, None, None, pg_version)?;
+            cplane.new_endpoint(
+                tenant_id,
+                name,
+                timeline_id,
+                None,
+                pg_version,
+                Replication::Primary,
+            )?;
             println!("Done");
         }
         Some(("branch", branch_match)) => {
@@ -560,20 +568,20 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                 .iter()
                 .filter(|(_, endpoint)| endpoint.tenant_id == tenant_id)
             {
-                let lsn_str = match endpoint.lsn {
-                    None => {
-                        // -> primary endpoint
+                let lsn_str = match endpoint.replication {
+                    Replication::Static(lsn) => {
+                        // -> read-only endpoint
+                        // Use the node's LSN.
+                        lsn.to_string()
+                    }
+                    _ => {
+                        // -> primary endpoint or hot replica
                         // Use the LSN at the end of the timeline.
                         timeline_infos
                             .get(&endpoint.timeline_id)
                             .map(|bi| bi.last_record_lsn.to_string())
                             .unwrap_or_else(|| "?".to_string())
                     }
-                    Some(lsn) => {
-                        // -> read-only endpoint
-                        // Use the endpoint's LSN.
-                        lsn.to_string()
-                    }
                 };
 
                 let branch_name = timeline_name_mappings
@@ -619,7 +627,26 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                 .copied()
                 .context("Failed to parse postgres version from the argument string")?;
 
-            cplane.new_endpoint(tenant_id, &endpoint_id, timeline_id, lsn, port, pg_version)?;
+            let hot_standby = sub_args
+                .get_one::<bool>("hot-standby")
+                .copied()
+                .unwrap_or(false);
+
+            let replication = match (lsn, hot_standby) {
+                (Some(lsn), false) => Replication::Static(lsn),
+                (None, true) => Replication::Replica,
+                (None, false) => Replication::Primary,
+                (Some(_), true) => anyhow::bail!("cannot specify both lsn and hot-standby"),
+            };
+
+            cplane.new_endpoint(
+                tenant_id,
+                &endpoint_id,
+                timeline_id,
+                port,
+                pg_version,
+                replication,
+            )?;
         }
         "start" => {
             let port: Option<u16> = sub_args.get_one::<u16>("port").copied();
@@ -637,7 +664,21 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                 None
             };
 
+            let hot_standby = sub_args
+                .get_one::<bool>("hot-standby")
+                .copied()
+                .unwrap_or(false);
+
             if let Some(endpoint) = endpoint {
+                match (&endpoint.replication, hot_standby) {
+                    (Replication::Static(_), true) => {
+                        bail!("Cannot start a node in hot standby mode when it is already configured as a static replica")
+                    }
+                    (Replication::Primary, true) => {
+                        bail!("Cannot start a node as a hot standby replica, it is already configured as primary node")
+                    }
+                    _ => {}
+                }
                 println!("Starting existing endpoint {endpoint_id}...");
                 endpoint.start(&auth_token)?;
             } else {
@@ -659,6 +700,14 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                     .get_one::<u32>("pg-version")
                     .copied()
                     .context("Failed to `pg-version` from the argument string")?;
+
+                let replication = match (lsn, hot_standby) {
+                    (Some(lsn), false) => Replication::Static(lsn),
+                    (None, true) => Replication::Replica,
+                    (None, false) => Replication::Primary,
+                    (Some(_), true) => anyhow::bail!("cannot specify both lsn and hot-standby"),
+                };
+
                 // when used with custom port this results in non obvious behaviour
                 // port is remembered from first start command, i e
                 // start --port X
@@ -670,9 +719,9 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                     tenant_id,
                     endpoint_id,
                     timeline_id,
-                    lsn,
                     port,
                     pg_version,
+                    replication,
                 )?;
                 ep.start(&auth_token)?;
             }
@@ -928,6 +977,12 @@ fn cli() -> Command {
         .help("Specify Lsn on the timeline to start from. By default, end of the timeline would be used.")
         .required(false);
 
+    let hot_standby_arg = Arg::new("hot-standby")
+        .value_parser(value_parser!(bool))
+        .long("hot-standby")
+        .help("If set, the node will be a hot replica on the specified timeline")
+        .required(false);
+
     Command::new("Neon CLI")
         .arg_required_else_help(true)
         .version(GIT_VERSION)
@@ -1052,6 +1107,7 @@ fn cli() -> Command {
                             .long("config-only")
                             .required(false))
                     .arg(pg_version_arg.clone())
+                    .arg(hot_standby_arg.clone())
                 )
                 .subcommand(Command::new("start")
                     .about("Start postgres.\n If the endpoint doesn't exist yet, it is created.")
@@ -1062,6 +1118,7 @@ fn cli() -> Command {
                     .arg(lsn_arg)
                     .arg(port_arg)
                     .arg(pg_version_arg)
+                    .arg(hot_standby_arg)
                 )
                 .subcommand(
                     Command::new("stop")
diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs
index 9e85138e68..7d3485518f 100644
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -68,18 +68,19 @@ impl ComputeControlPlane {
         tenant_id: TenantId,
         name: &str,
         timeline_id: TimelineId,
-        lsn: Option<Lsn>,
         port: Option<u16>,
         pg_version: u32,
+        replication: Replication,
     ) -> Result<Arc<Endpoint>> {
         let port = port.unwrap_or_else(|| self.get_port());
+
         let ep = Arc::new(Endpoint {
             name: name.to_owned(),
             address: SocketAddr::new("127.0.0.1".parse().unwrap(), port),
             env: self.env.clone(),
             pageserver: Arc::clone(&self.pageserver),
             timeline_id,
-            lsn,
+            replication,
             tenant_id,
             pg_version,
         });
@@ -95,6 +96,18 @@ impl ComputeControlPlane {
 
 ///////////////////////////////////////////////////////////////////////////////
 
+#[derive(Debug, Clone, Eq, PartialEq)]
+pub enum Replication {
+    // Regular read-write node
+    Primary,
+    // if recovery_target_lsn is provided, and we want to pin the node to a specific LSN
+    Static(Lsn),
+    // Hot standby; read-only replica.
+    // Future versions may want to distinguish between replicas with hot standby
+    // feedback and other kinds of replication configurations.
+    Replica,
+}
+
 #[derive(Debug)]
 pub struct Endpoint {
     /// used as the directory name
@@ -102,7 +115,7 @@ pub struct Endpoint {
     pub tenant_id: TenantId,
     pub timeline_id: TimelineId,
     // Some(lsn) if this is a read-only endpoint anchored at 'lsn'. None for the primary.
-    pub lsn: Option<Lsn>,
+    pub replication: Replication,
 
     // port and address of the Postgres server
     pub address: SocketAddr,
@@ -153,9 +166,17 @@ impl Endpoint {
             fs::read_to_string(pg_version_path).unwrap_or_else(|_| DEFAULT_PG_VERSION.to_string());
         let pg_version = u32::from_str(&pg_version_str)?;
 
-        // parse recovery_target_lsn, if any
-        let recovery_target_lsn: Option<Lsn> =
-            conf.parse_field_optional("recovery_target_lsn", &context)?;
+        // parse recovery_target_lsn and primary_conninfo into Recovery Target, if any
+        let replication = if let Some(lsn_str) = conf.get("recovery_target_lsn") {
+            Replication::Static(Lsn::from_str(lsn_str)?)
+        } else if let Some(slot_name) = conf.get("primary_slot_name") {
+            let slot_name = slot_name.to_string();
+            let prefix = format!("repl_{}_", timeline_id);
+            assert!(slot_name.starts_with(&prefix));
+            Replication::Replica
+        } else {
+            Replication::Primary
+        };
 
         // ok now
         Ok(Endpoint {
@@ -164,7 +185,7 @@ impl Endpoint {
             env: env.clone(),
             pageserver: Arc::clone(pageserver),
             timeline_id,
-            lsn: recovery_target_lsn,
+            replication,
             tenant_id,
             pg_version,
         })
@@ -299,50 +320,83 @@ impl Endpoint {
         conf.append("neon.pageserver_connstring", &pageserver_connstr);
         conf.append("neon.tenant_id", &self.tenant_id.to_string());
         conf.append("neon.timeline_id", &self.timeline_id.to_string());
-        if let Some(lsn) = self.lsn {
-            conf.append("recovery_target_lsn", &lsn.to_string());
-        }
 
         conf.append_line("");
-        // Configure backpressure
-        // - Replication write lag depends on how fast the walreceiver can process incoming WAL.
-        //   This lag determines latency of get_page_at_lsn. Speed of applying WAL is about 10MB/sec,
-        //   so to avoid expiration of 1 minute timeout, this lag should not be larger than 600MB.
-        //   Actually latency should be much smaller (better if < 1sec). But we assume that recently
-        //   updates pages are not requested from pageserver.
-        // - Replication flush lag depends on speed of persisting data by checkpointer (creation of
-        //   delta/image layers) and advancing disk_consistent_lsn. Safekeepers are able to
-        //   remove/archive WAL only beyond disk_consistent_lsn. Too large a lag can cause long
-        //   recovery time (in case of pageserver crash) and disk space overflow at safekeepers.
-        // - Replication apply lag depends on speed of uploading changes to S3 by uploader thread.
-        //   To be able to restore database in case of pageserver node crash, safekeeper should not
-        //   remove WAL beyond this point. Too large lag can cause space exhaustion in safekeepers
-        //   (if they are not able to upload WAL to S3).
-        conf.append("max_replication_write_lag", "15MB");
-        conf.append("max_replication_flush_lag", "10GB");
+        // Replication-related configurations, such as WAL sending
+        match &self.replication {
+            Replication::Primary => {
+                // Configure backpressure
+                // - Replication write lag depends on how fast the walreceiver can process incoming WAL.
+                //   This lag determines latency of get_page_at_lsn. Speed of applying WAL is about 10MB/sec,
+                //   so to avoid expiration of 1 minute timeout, this lag should not be larger than 600MB.
+                //   Actually latency should be much smaller (better if < 1sec). But we assume that recently
+                //   updates pages are not requested from pageserver.
+                // - Replication flush lag depends on speed of persisting data by checkpointer (creation of
+                //   delta/image layers) and advancing disk_consistent_lsn. Safekeepers are able to
+                //   remove/archive WAL only beyond disk_consistent_lsn. Too large a lag can cause long
+                //   recovery time (in case of pageserver crash) and disk space overflow at safekeepers.
+                // - Replication apply lag depends on speed of uploading changes to S3 by uploader thread.
+                //   To be able to restore database in case of pageserver node crash, safekeeper should not
+                //   remove WAL beyond this point. Too large lag can cause space exhaustion in safekeepers
+                //   (if they are not able to upload WAL to S3).
+                conf.append("max_replication_write_lag", "15MB");
+                conf.append("max_replication_flush_lag", "10GB");
 
-        if !self.env.safekeepers.is_empty() {
-            // Configure Postgres to connect to the safekeepers
-            conf.append("synchronous_standby_names", "walproposer");
+                if !self.env.safekeepers.is_empty() {
+                    // Configure Postgres to connect to the safekeepers
+                    conf.append("synchronous_standby_names", "walproposer");
 
-            let safekeepers = self
-                .env
-                .safekeepers
-                .iter()
-                .map(|sk| format!("localhost:{}", sk.pg_port))
-                .collect::<Vec<String>>()
-                .join(",");
-            conf.append("neon.safekeepers", &safekeepers);
-        } else {
-            // We only use setup without safekeepers for tests,
-            // and don't care about data durability on pageserver,
-            // so set more relaxed synchronous_commit.
-            conf.append("synchronous_commit", "remote_write");
+                    let safekeepers = self
+                        .env
+                        .safekeepers
+                        .iter()
+                        .map(|sk| format!("localhost:{}", sk.pg_port))
+                        .collect::<Vec<String>>()
+                        .join(",");
+                    conf.append("neon.safekeepers", &safekeepers);
+                } else {
+                    // We only use setup without safekeepers for tests,
+                    // and don't care about data durability on pageserver,
+                    // so set more relaxed synchronous_commit.
+                    conf.append("synchronous_commit", "remote_write");
 
-            // Configure the node to stream WAL directly to the pageserver
-            // This isn't really a supported configuration, but can be useful for
-            // testing.
-            conf.append("synchronous_standby_names", "pageserver");
+                    // Configure the node to stream WAL directly to the pageserver
+                    // This isn't really a supported configuration, but can be useful for
+                    // testing.
+                    conf.append("synchronous_standby_names", "pageserver");
+                }
+            }
+            Replication::Static(lsn) => {
+                conf.append("recovery_target_lsn", &lsn.to_string());
+            }
+            Replication::Replica => {
+                assert!(!self.env.safekeepers.is_empty());
+
+                // TODO: use future host field from safekeeper spec
+                // Pass the list of safekeepers to the replica so that it can connect to any of them,
+                // whichever is availiable.
+                let sk_ports = self
+                    .env
+                    .safekeepers
+                    .iter()
+                    .map(|x| x.pg_port.to_string())
+                    .collect::<Vec<_>>()
+                    .join(",");
+                let sk_hosts = vec!["localhost"; self.env.safekeepers.len()].join(",");
+
+                let connstr = format!(
+                    "host={} port={} options='-c timeline_id={} tenant_id={}' application_name=replica replication=true",
+                    sk_hosts,
+                    sk_ports,
+                    &self.timeline_id.to_string(),
+                    &self.tenant_id.to_string(),
+                );
+
+                let slot_name = format!("repl_{}_", self.timeline_id);
+                conf.append("primary_conninfo", connstr.as_str());
+                conf.append("primary_slot_name", slot_name.as_str());
+                conf.append("hot_standby", "on");
+            }
         }
 
         let mut file = File::create(self.pgdata().join("postgresql.conf"))?;
@@ -355,21 +409,27 @@ impl Endpoint {
     }
 
     fn load_basebackup(&self, auth_token: &Option<String>) -> Result<()> {
-        let backup_lsn = if let Some(lsn) = self.lsn {
-            Some(lsn)
-        } else if !self.env.safekeepers.is_empty() {
-            // LSN 0 means that it is bootstrap and we need to download just
-            // latest data from the pageserver. That is a bit clumsy but whole bootstrap
-            // procedure evolves quite actively right now, so let's think about it again
-            // when things would be more stable (TODO).
-            let lsn = self.sync_safekeepers(auth_token, self.pg_version)?;
-            if lsn == Lsn(0) {
-                None
-            } else {
-                Some(lsn)
+        let backup_lsn = match &self.replication {
+            Replication::Primary => {
+                if !self.env.safekeepers.is_empty() {
+                    // LSN 0 means that it is bootstrap and we need to download just
+                    // latest data from the pageserver. That is a bit clumsy but whole bootstrap
+                    // procedure evolves quite actively right now, so let's think about it again
+                    // when things would be more stable (TODO).
+                    let lsn = self.sync_safekeepers(auth_token, self.pg_version)?;
+                    if lsn == Lsn(0) {
+                        None
+                    } else {
+                        Some(lsn)
+                    }
+                } else {
+                    None
+                }
+            }
+            Replication::Static(lsn) => Some(*lsn),
+            Replication::Replica => {
+                None // Take the latest snapshot available to start with
             }
-        } else {
-            None
         };
 
         self.do_basebackup(backup_lsn)?;
@@ -466,7 +526,7 @@ impl Endpoint {
         // 3. Load basebackup
         self.load_basebackup(auth_token)?;
 
-        if self.lsn.is_some() {
+        if self.replication != Replication::Primary {
             File::create(self.pgdata().join("standby.signal"))?;
         }
 
diff --git a/control_plane/src/postgresql_conf.rs b/control_plane/src/postgresql_conf.rs
index 34dc769e78..638575eb82 100644
--- a/control_plane/src/postgresql_conf.rs
+++ b/control_plane/src/postgresql_conf.rs
@@ -13,7 +13,7 @@ use std::io::BufRead;
 use std::str::FromStr;
 
 /// In-memory representation of a postgresql.conf file
-#[derive(Default)]
+#[derive(Default, Debug)]
 pub struct PostgresConf {
     lines: Vec<String>,
     hash: HashMap<String, String>,
diff --git a/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json b/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json
index 10ae0b0ecf..565e5e368e 100644
--- a/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json
+++ b/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json
@@ -28,11 +28,6 @@
                 "value": "replica",
                 "vartype": "enum"
             },
-            {
-                "name": "hot_standby",
-                "value": "on",
-                "vartype": "bool"
-            },
             {
                 "name": "wal_log_hints",
                 "value": "on",
diff --git a/libs/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs
index 492ec9748a..b8eb469cb0 100644
--- a/libs/postgres_ffi/src/lib.rs
+++ b/libs/postgres_ffi/src/lib.rs
@@ -95,10 +95,13 @@ pub fn generate_wal_segment(
     segno: u64,
     system_id: u64,
     pg_version: u32,
+    lsn: Lsn,
 ) -> Result<Bytes, SerializeError> {
+    assert_eq!(segno, lsn.segment_number(WAL_SEGMENT_SIZE));
+
     match pg_version {
-        14 => v14::xlog_utils::generate_wal_segment(segno, system_id),
-        15 => v15::xlog_utils::generate_wal_segment(segno, system_id),
+        14 => v14::xlog_utils::generate_wal_segment(segno, system_id, lsn),
+        15 => v15::xlog_utils::generate_wal_segment(segno, system_id, lsn),
         _ => Err(SerializeError::BadInput),
     }
 }
diff --git a/libs/postgres_ffi/src/pg_constants.rs b/libs/postgres_ffi/src/pg_constants.rs
index 09678353af..6bc89ed37e 100644
--- a/libs/postgres_ffi/src/pg_constants.rs
+++ b/libs/postgres_ffi/src/pg_constants.rs
@@ -195,6 +195,7 @@ pub const FIRST_NORMAL_OBJECT_ID: u32 = 16384;
 
 pub const XLOG_CHECKPOINT_SHUTDOWN: u8 = 0x00;
 pub const XLOG_CHECKPOINT_ONLINE: u8 = 0x10;
+pub const XLP_FIRST_IS_CONTRECORD: u16 = 0x0001;
 pub const XLP_LONG_HEADER: u16 = 0x0002;
 
 /* From fsm_internals.h */
diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs
index 272c4d6dcc..8ed00a9e13 100644
--- a/libs/postgres_ffi/src/xlog_utils.rs
+++ b/libs/postgres_ffi/src/xlog_utils.rs
@@ -270,6 +270,11 @@ impl XLogPageHeaderData {
         use utils::bin_ser::LeSer;
         XLogPageHeaderData::des_from(&mut buf.reader())
     }
+
+    pub fn encode(&self) -> Result<Bytes, SerializeError> {
+        use utils::bin_ser::LeSer;
+        self.ser().map(|b| b.into())
+    }
 }
 
 impl XLogLongPageHeaderData {
@@ -328,22 +333,32 @@ impl CheckPoint {
     }
 }
 
-//
-// Generate new, empty WAL segment.
-// We need this segment to start compute node.
-//
-pub fn generate_wal_segment(segno: u64, system_id: u64) -> Result<Bytes, SerializeError> {
+/// Generate new, empty WAL segment, with correct block headers at the first
+/// page of the segment and the page that contains the given LSN.
+/// We need this segment to start compute node.
+pub fn generate_wal_segment(segno: u64, system_id: u64, lsn: Lsn) -> Result<Bytes, SerializeError> {
     let mut seg_buf = BytesMut::with_capacity(WAL_SEGMENT_SIZE);
 
     let pageaddr = XLogSegNoOffsetToRecPtr(segno, 0, WAL_SEGMENT_SIZE);
+
+    let page_off = lsn.block_offset();
+    let seg_off = lsn.segment_offset(WAL_SEGMENT_SIZE);
+
+    let first_page_only = seg_off < XLOG_BLCKSZ;
+    let (shdr_rem_len, infoflags) = if first_page_only {
+        (seg_off, pg_constants::XLP_FIRST_IS_CONTRECORD) 
+    } else {
+        (0, 0)
+    };
+
     let hdr = XLogLongPageHeaderData {
         std: {
             XLogPageHeaderData {
                 xlp_magic: XLOG_PAGE_MAGIC as u16,
-                xlp_info: pg_constants::XLP_LONG_HEADER,
+                xlp_info: pg_constants::XLP_LONG_HEADER | infoflags,
                 xlp_tli: PG_TLI,
                 xlp_pageaddr: pageaddr,
-                xlp_rem_len: 0,
+                xlp_rem_len: shdr_rem_len as u32,
                 ..Default::default() // Put 0 in padding fields.
             }
         },
@@ -357,9 +372,37 @@ pub fn generate_wal_segment(segno: u64, system_id: u64) -> Result<Bytes, Seriali
 
     //zero out the rest of the file
     seg_buf.resize(WAL_SEGMENT_SIZE, 0);
+    
+    if !first_page_only {
+        let block_offset = lsn.page_offset_in_segment(WAL_SEGMENT_SIZE) as usize;
+        let header = XLogPageHeaderData {
+            xlp_magic: XLOG_PAGE_MAGIC as u16,
+            xlp_info: if page_off >= pg_constants::SIZE_OF_PAGE_HEADER as u64 {
+                pg_constants::XLP_FIRST_IS_CONTRECORD
+            } else {
+                0
+            },
+            xlp_tli: PG_TLI,
+            xlp_pageaddr: lsn.page_lsn().0,
+            xlp_rem_len: if page_off >= pg_constants::SIZE_OF_PAGE_HEADER as u64 {
+                page_off as u32
+            } else {
+                0u32
+            },
+            ..Default::default() // Put 0 in padding fields.
+        };
+        let hdr_bytes = header.encode()?;
+
+        debug_assert!(seg_buf.len() > block_offset + hdr_bytes.len());
+        debug_assert_ne!(block_offset, 0);
+
+        seg_buf[block_offset..block_offset + hdr_bytes.len()].copy_from_slice(&hdr_bytes[..]);
+    }
+
     Ok(seg_buf.freeze())
 }
 
+
 #[repr(C)]
 #[derive(Serialize)]
 struct XlLogicalMessage {
diff --git a/libs/utils/src/lsn.rs b/libs/utils/src/lsn.rs
index acf5ea28d7..0493d43088 100644
--- a/libs/utils/src/lsn.rs
+++ b/libs/utils/src/lsn.rs
@@ -62,29 +62,48 @@ impl Lsn {
     }
 
     /// Compute the offset into a segment
+    #[inline]
     pub fn segment_offset(self, seg_sz: usize) -> usize {
         (self.0 % seg_sz as u64) as usize
     }
 
     /// Compute LSN of the segment start.
+    #[inline]
     pub fn segment_lsn(self, seg_sz: usize) -> Lsn {
         Lsn(self.0 - (self.0 % seg_sz as u64))
     }
 
     /// Compute the segment number
+    #[inline]
     pub fn segment_number(self, seg_sz: usize) -> u64 {
         self.0 / seg_sz as u64
     }
 
     /// Compute the offset into a block
+    #[inline]
     pub fn block_offset(self) -> u64 {
         const BLCKSZ: u64 = XLOG_BLCKSZ as u64;
         self.0 % BLCKSZ
     }
 
+    /// Compute the block offset of the first byte of this Lsn within this
+    /// segment
+    #[inline]
+    pub fn page_lsn(self) -> Lsn {
+        Lsn(self.0 - self.block_offset())
+    }
+
+    /// Compute the block offset of the first byte of this Lsn within this
+    /// segment
+    #[inline]
+    pub fn page_offset_in_segment(self, seg_sz: usize) -> u64 {
+        (self.0 - self.block_offset()) - self.segment_lsn(seg_sz).0
+    }
+
     /// Compute the bytes remaining in this block
     ///
     /// If the LSN is already at the block boundary, it will return `XLOG_BLCKSZ`.
+    #[inline]
     pub fn remaining_in_block(self) -> u64 {
         const BLCKSZ: u64 = XLOG_BLCKSZ as u64;
         BLCKSZ - (self.0 % BLCKSZ)
diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs
index 41fa0a67bb..c666fc785c 100644
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -463,9 +463,13 @@ where
         let wal_file_path = format!("pg_wal/{}", wal_file_name);
         let header = new_tar_header(&wal_file_path, WAL_SEGMENT_SIZE as u64)?;
 
-        let wal_seg =
-            postgres_ffi::generate_wal_segment(segno, system_identifier, self.timeline.pg_version)
-                .map_err(|e| anyhow!(e).context("Failed generating wal segment"))?;
+        let wal_seg = postgres_ffi::generate_wal_segment(
+            segno,
+            system_identifier,
+            self.timeline.pg_version,
+            self.lsn,
+        )
+        .map_err(|e| anyhow!(e).context("Failed generating wal segment"))?;
         ensure!(wal_seg.len() == WAL_SEGMENT_SIZE);
         self.ar.append(&header, &wal_seg[..]).await?;
         Ok(())
diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c
index 8dff259f02..cc46fb5a25 100644
--- a/pgxn/neon/file_cache.c
+++ b/pgxn/neon/file_cache.c
@@ -370,6 +370,74 @@ lfc_cache_contains(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno)
 	return found;
 }
 
+/*
+ * Evict a page (if present) from the local file cache
+ */
+void
+lfc_evict(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno)
+{
+	BufferTag tag;
+	FileCacheEntry* entry;
+	ssize_t rc;
+	bool found;
+	int chunk_offs = blkno & (BLOCKS_PER_CHUNK-1);
+	uint32 hash;
+
+	if (lfc_size_limit == 0) /* fast exit if file cache is disabled */
+		return;
+
+	INIT_BUFFERTAG(tag, rnode, forkNum, (blkno & ~(BLOCKS_PER_CHUNK-1)));
+
+	hash = get_hash_value(lfc_hash, &tag);
+
+	LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
+	entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, &found);
+
+	if (!found)
+	{
+		/* nothing to do */
+		LWLockRelease(lfc_lock);
+		return;
+	}
+
+	/* remove the page from the cache */
+	entry->bitmap[chunk_offs >> 5] &= ~(1 << (chunk_offs & (32 - 1)));
+
+	/*
+	 * If the chunk has no live entries, we can position the chunk to be
+	 * recycled first.
+	 */
+	if (entry->bitmap[chunk_offs >> 5] == 0)
+	{
+		bool has_remaining_pages;
+
+		for (int i = 0; i < (BLOCKS_PER_CHUNK / 32); i++) {
+			if (entry->bitmap[i] != 0)
+			{
+				has_remaining_pages = true;
+				break;
+			}
+		}
+
+		/*
+		 * Put the entry at the position that is first to be reclaimed when
+		 * we have no cached pages remaining in the chunk
+		 */
+		if (!has_remaining_pages)
+		{
+			dlist_delete(&entry->lru_node);
+			dlist_push_head(&lfc_ctl->lru, &entry->lru_node);
+		}
+	}
+
+	/*
+	 * Done: apart from empty chunks, we don't move chunks in the LRU when
+	 * they're empty because eviction isn't usage.
+	 */
+
+	LWLockRelease(lfc_lock);
+}
+
 /*
  * Try to read page from local cache.
  * Returns true if page is found in local cache.
@@ -528,7 +596,6 @@ lfc_write(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
 	LWLockRelease(lfc_lock);
 }
 
-
 /*
  * Record structure holding the to be exposed cache data.
  */
diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c
index c44e8fcda5..21330c018f 100644
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -17,6 +17,8 @@
 #include "pagestore_client.h"
 #include "fmgr.h"
 #include "access/xlog.h"
+#include "access/xlogutils.h"
+#include "storage/buf_internals.h"
 
 #include "libpq-fe.h"
 #include "libpq/pqformat.h"
@@ -57,6 +59,8 @@ int			n_unflushed_requests = 0;
 int			flush_every_n_requests = 8;
 int			readahead_buffer_size = 128;
 
+bool	(*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id) = NULL;
+
 static void pageserver_flush(void);
 
 static bool
@@ -467,6 +471,8 @@ pg_init_libpagestore(void)
 		smgr_hook = smgr_neon;
 		smgr_init_hook = smgr_init_neon;
 		dbsize_hook = neon_dbsize;
+		old_redo_read_buffer_filter = redo_read_buffer_filter;
+		redo_read_buffer_filter = neon_redo_read_buffer_filter;
 	}
 	lfc_init();
 }
diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c
index 5c98902554..217c1974a0 100644
--- a/pgxn/neon/neon.c
+++ b/pgxn/neon/neon.c
@@ -24,6 +24,7 @@
 
 #include "neon.h"
 #include "walproposer.h"
+#include "pagestore_client.h"
 
 PG_MODULE_MAGIC;
 void		_PG_init(void);
diff --git a/pgxn/neon/neon.h b/pgxn/neon/neon.h
index 3eac8f4570..60d321a945 100644
--- a/pgxn/neon/neon.h
+++ b/pgxn/neon/neon.h
@@ -11,6 +11,7 @@
 
 #ifndef NEON_H
 #define NEON_H
+#include "access/xlogreader.h"
 
 /* GUCs */
 extern char *neon_auth_token;
@@ -20,4 +21,11 @@ extern char *neon_tenant;
 extern void pg_init_libpagestore(void);
 extern void pg_init_walproposer(void);
 
+/*
+ * Returns true if we shouldn't do REDO on that block in record indicated by
+ * block_id; false otherwise.
+ */
+extern bool neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id);
+extern bool	(*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id);
+
 #endif							/* NEON_H */
diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h
index a1f05ac685..22f5cdb73a 100644
--- a/pgxn/neon/pagestore_client.h
+++ b/pgxn/neon/pagestore_client.h
@@ -207,6 +207,7 @@ extern void forget_cached_relsize(RelFileNode rnode, ForkNumber forknum);
 extern void lfc_write(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, char *buffer);
 extern bool lfc_read(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, char *buffer);
 extern bool lfc_cache_contains(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno);
+extern void lfc_evict(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno);
 extern void lfc_init(void);
 
 
diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c
index 5b30641856..528d4eb051 100644
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -189,6 +189,7 @@ typedef struct PrfHashEntry {
 #define SH_DEFINE
 #define SH_DECLARE
 #include "lib/simplehash.h"
+#include "neon.h"
 
 /*
  * PrefetchState maintains the state of (prefetch) getPage@LSN requests.
@@ -1209,6 +1210,9 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, ch
 
 	if (ShutdownRequestPending)
 		return;
+	/* Don't log any pages if we're not allowed to do so. */
+	if (!XLogInsertAllowed())
+		return;
 
 	/*
 	 * Whenever a VM or FSM page is evicted, WAL-log it. FSM and (some) VM
@@ -1375,8 +1379,18 @@ neon_get_request_lsn(bool *latest, RelFileNode rnode, ForkNumber forknum, BlockN
 
 	if (RecoveryInProgress())
 	{
+		/*
+		 * We don't know if WAL has been generated but not yet replayed, so
+		 * we're conservative in our estimates about latest pages.
+		 */
 		*latest = false;
-		lsn = GetXLogReplayRecPtr(NULL);
+
+		/*
+		 * Get the last written LSN of this page.
+		 */
+		lsn = GetLastWrittenLSN(rnode, forknum, blkno);
+		lsn = nm_adjust_lsn(lsn);
+
 		elog(DEBUG1, "neon_get_request_lsn GetXLogReplayRecPtr %X/%X request lsn 0 ",
 			 (uint32) ((lsn) >> 32), (uint32) (lsn));
 	}
@@ -1559,6 +1573,15 @@ neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
 	/*
 	 * Newly created relation is empty, remember that in the relsize cache.
 	 *
+	 * Note that in REDO, this is called to make sure the relation fork exists,
+	 * but it does not truncate the relation. So, we can only update the
+	 * relsize if it didn't exist before.
+	 * 
+	 * Also, in redo, we must make sure to update the cached size of the
+	 * relation, as that is the primary source of truth for REDO's
+	 * file length considerations, and as file extension isn't (perfectly)
+	 * logged, we need to take care of that before we hit file size checks.
+	 *
 	 * FIXME: This is currently not just an optimization, but required for
 	 * correctness. Postgres can call smgrnblocks() on the newly-created
 	 * relation. Currently, we don't call SetLastWrittenLSN() when a new
@@ -1566,7 +1589,14 @@ neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
 	 * cache, we might call smgrnblocks() on the newly-created relation before
 	 * the creation WAL record hass been received by the page server.
 	 */
-	set_cached_relsize(reln->smgr_rnode.node, forkNum, 0);
+	if (isRedo)
+	{
+		update_cached_relsize(reln->smgr_rnode.node, forkNum, 0);
+		get_cached_relsize(reln->smgr_rnode.node, forkNum,
+						   &reln->smgr_cached_nblocks[forkNum]);
+	}
+	else
+		set_cached_relsize(reln->smgr_rnode.node, forkNum, 0);
 
 #ifdef DEBUG_COMPARE_LOCAL
 	if (IS_LOCAL_REL(reln))
@@ -1831,6 +1861,26 @@ neon_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
 		.blockNum = blkno,
 	};
 
+	/*
+	 * The redo process does not lock pages that it needs to replay but are
+	 * not in the shared buffers, so a concurrent process may request the
+	 * page after redo has decided it won't redo that page and updated the
+	 * LwLSN for that page.
+	 * If we're in hot standby we need to take care that we don't return
+	 * until after REDO has finished replaying up to that LwLSN, as the page
+	 * should have been locked up to that point.
+	 *
+	 * See also the description on neon_redo_read_buffer_filter below.
+	 *
+	 * NOTE: It is possible that the WAL redo process will still do IO due to
+	 * concurrent failed read IOs. Those IOs should never have a request_lsn
+	 * that is as large as the WAL record we're currently replaying, if it
+	 * weren't for the behaviour of the LwLsn cache that uses the highest
+	 * value of the LwLsn cache when the entry is not found. 
+	 */
+	if (RecoveryInProgress() && !(MyBackendType == B_STARTUP))
+		XLogWaitForReplayOf(request_lsn);
+
 	/*
 	 * Try to find prefetched page in the list of received pages.
 	 */
@@ -2584,3 +2634,143 @@ smgr_init_neon(void)
 	smgr_init_standard();
 	neon_init();
 }
+
+
+/*
+ * Return whether we can skip the redo for this block.
+ * 
+ * The conditions for skipping the IO are:
+ *
+ * - The block is not in the shared buffers, and
+ * - The block is not in the local file cache
+ *
+ * ... because any subsequent read of the page requires us to read
+ * the new version of the page from the PageServer. We do not
+ * check the local file cache; we instead evict the page from LFC: it
+ * is cheaper than going through the FS calls to read the page, and
+ * limits the number of lock operations used in the REDO process.
+ *
+ * We have one exception to the rules for skipping IO: We always apply
+ * changes to shared catalogs' pages. Although this is mostly out of caution,
+ * catalog updates usually result in backends rebuilding their catalog snapshot,
+ * which means it's quite likely the modified page is going to be used soon.
+ *
+ * It is important to note that skipping WAL redo for a page also means
+ * the page isn't locked by the redo process, as there is no Buffer
+ * being returned, nor is there a buffer descriptor to lock.
+ * This means that any IO that wants to read this block needs to wait
+ * for the WAL REDO process to finish processing the WAL record before
+ * it allows the system to start reading the block, as releasing the
+ * block early could lead to phantom reads.
+ *
+ * For example, REDO for a WAL record that modifies 3 blocks could skip
+ * the first block, wait for a lock on the second, and then modify the
+ * third block. Without skipping, all blocks would be locked and phantom
+ * reads would not occur, but with skipping, a concurrent process could
+ * read block 1 with post-REDO contents and read block 3 with pre-REDO
+ * contents, where with REDO locking it would wait on block 1 and see
+ * block 3 with post-REDO contents only.
+ */
+bool
+neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
+{
+	XLogRecPtr	end_recptr = record->EndRecPtr;
+	XLogRecPtr	prev_end_recptr = record->ReadRecPtr - 1;
+	RelFileNode	rnode;
+	ForkNumber	forknum;
+	BlockNumber	blkno;
+	BufferTag	tag;
+	uint32		hash;
+	LWLock	   *partitionLock;
+	Buffer		buffer;
+	bool		no_redo_needed;
+	BlockNumber relsize;
+
+	if (old_redo_read_buffer_filter && old_redo_read_buffer_filter(record, block_id))
+		return true;
+
+#if PG_VERSION_NUM < 150000
+	if (!XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blkno))
+		elog(PANIC, "failed to locate backup block with ID %d", block_id);
+#else
+	XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blkno);
+#endif
+
+	/*
+	 * Out of an abundance of caution, we always run redo on shared catalogs,
+	 * regardless of whether the block is stored in shared buffers.
+	 * See also this function's top comment.
+	 */
+	if (!OidIsValid(rnode.dbNode))
+		return false;
+
+	INIT_BUFFERTAG(tag, rnode, forknum, blkno);
+	hash = BufTableHashCode(&tag);
+	partitionLock = BufMappingPartitionLock(hash);
+
+	/*
+	 * Lock the partition of shared_buffers so that it can't be updated
+	 * concurrently.
+	 */
+	LWLockAcquire(partitionLock, LW_SHARED);
+
+	/* Try to find the relevant buffer */
+	buffer = BufTableLookup(&tag, hash);
+
+	no_redo_needed = buffer < 0;
+
+	/* we don't have the buffer in memory, update lwLsn past this record */
+	if (no_redo_needed)
+	{
+		SetLastWrittenLSNForBlock(end_recptr, rnode, forknum, blkno);
+		lfc_evict(rnode, forknum, blkno);
+	}
+	else
+	{
+		SetLastWrittenLSNForBlock(prev_end_recptr, rnode, forknum, blkno);
+	}
+
+	LWLockRelease(partitionLock);
+
+	/* Extend the relation if we know its size */
+	if (get_cached_relsize(rnode, forknum, &relsize))
+	{
+		if (relsize < blkno + 1)
+			update_cached_relsize(rnode, forknum, blkno + 1);
+	}
+	else
+	{
+		/*
+		 * Size was not cached. We populate the cache now, with the size of the
+		 * relation measured after this WAL record is applied.
+		 *
+		 * This length is later reused when we open the smgr to read the block,
+		 * which is fine and expected.
+		 */
+
+		NeonResponse *response;
+		NeonNblocksResponse *nbresponse;
+		NeonNblocksRequest request = {
+			.req = (NeonRequest) {
+				.lsn = end_recptr,
+				.latest = false,
+				.tag = T_NeonNblocksRequest,
+			},
+			.rnode = rnode,
+			.forknum = forknum,
+		};
+
+		response = page_server_request(&request);
+
+		Assert(response->tag == T_NeonNblocksResponse);
+		nbresponse = (NeonNblocksResponse *) response;
+
+		Assert(nbresponse->n_blocks > blkno);
+
+		set_cached_relsize(rnode, forknum, nbresponse->n_blocks);
+
+		elog(SmgrTrace, "Set length to %d", nbresponse->n_blocks);
+	}
+
+	return no_redo_needed;
+}
diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c
index 45037a8c01..a99be40955 100644
--- a/pgxn/neon/walproposer.c
+++ b/pgxn/neon/walproposer.c
@@ -1964,18 +1964,26 @@ CombineHotStanbyFeedbacks(HotStandbyFeedback * hs)
 	{
 		if (safekeeper[i].appendResponse.hs.ts != 0)
 		{
-			if (FullTransactionIdPrecedes(safekeeper[i].appendResponse.hs.xmin, hs->xmin))
+			HotStandbyFeedback *skhs = &safekeeper[i].appendResponse.hs;
+			if (FullTransactionIdIsNormal(skhs->xmin)
+				&& FullTransactionIdPrecedes(skhs->xmin, hs->xmin))
 			{
-				hs->xmin = safekeeper[i].appendResponse.hs.xmin;
-				hs->ts = safekeeper[i].appendResponse.hs.ts;
+				hs->xmin = skhs->xmin;
+				hs->ts = skhs->ts;
 			}
-			if (FullTransactionIdPrecedes(safekeeper[i].appendResponse.hs.catalog_xmin, hs->catalog_xmin))
+			if (FullTransactionIdIsNormal(skhs->catalog_xmin)
+				&& FullTransactionIdPrecedes(skhs->catalog_xmin, hs->xmin))
 			{
-				hs->catalog_xmin = safekeeper[i].appendResponse.hs.catalog_xmin;
-				hs->ts = safekeeper[i].appendResponse.hs.ts;
+				hs->catalog_xmin = skhs->catalog_xmin;
+				hs->ts = skhs->ts;
 			}
 		}
 	}
+
+	if (hs->xmin.value == ~0)
+		hs->xmin = InvalidFullTransactionId;
+	if (hs->catalog_xmin.value == ~0)
+		hs->catalog_xmin = InvalidFullTransactionId;
 }
 
 /*
diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs
index a589fe1869..2c3d1cea0e 100644
--- a/safekeeper/src/handler.rs
+++ b/safekeeper/src/handler.rs
@@ -3,6 +3,7 @@
 
 use anyhow::Context;
 use std::str;
+use std::str::FromStr;
 use tokio::io::{AsyncRead, AsyncWrite};
 use tracing::{info, info_span, Instrument};
 
@@ -49,12 +50,14 @@ fn parse_cmd(cmd: &str) -> anyhow::Result<SafekeeperPostgresCommand> {
     if cmd.starts_with("START_WAL_PUSH") {
         Ok(SafekeeperPostgresCommand::StartWalPush)
     } else if cmd.starts_with("START_REPLICATION") {
-        let re =
-            Regex::new(r"START_REPLICATION(?: PHYSICAL)? ([[:xdigit:]]+/[[:xdigit:]]+)").unwrap();
+        let re = Regex::new(
+            r"START_REPLICATION(?: SLOT [^ ]+)?(?: PHYSICAL)? ([[:xdigit:]]+/[[:xdigit:]]+)",
+        )
+        .unwrap();
         let mut caps = re.captures_iter(cmd);
         let start_lsn = caps
             .next()
-            .map(|cap| cap[1].parse::<Lsn>())
+            .map(|cap| Lsn::from_str(&cap[1]))
             .context("parse start LSN from START_REPLICATION command")??;
         Ok(SafekeeperPostgresCommand::StartReplication { start_lsn })
     } else if cmd.starts_with("IDENTIFY_SYSTEM") {
diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs
index 9b385630c2..54e27714ea 100644
--- a/safekeeper/src/wal_storage.rs
+++ b/safekeeper/src/wal_storage.rs
@@ -18,6 +18,7 @@ use postgres_ffi::v14::xlog_utils::{IsPartialXLogFileName, IsXLogFileName, XLogF
 use postgres_ffi::{XLogSegNo, PG_TLI};
 use std::cmp::{max, min};
 
+use bytes::Bytes;
 use std::fs::{self, remove_file, File, OpenOptions};
 use std::io::Write;
 use std::path::{Path, PathBuf};
@@ -36,6 +37,7 @@ use postgres_ffi::XLOG_BLCKSZ;
 
 use postgres_ffi::waldecoder::WalStreamDecoder;
 
+use pq_proto::SystemId;
 use tokio::io::{AsyncReadExt, AsyncSeekExt};
 
 pub trait Storage {
@@ -478,6 +480,13 @@ pub struct WalReader {
 
     // We don't have WAL locally if LSN is less than local_start_lsn
     local_start_lsn: Lsn,
+    // We will respond with zero-ed bytes before this Lsn as long as
+    // pos is in the same segment as timeline_start_lsn.
+    timeline_start_lsn: Lsn,
+    // integer version number of PostgreSQL, e.g. 14; 15; 16
+    pg_version: u32,
+    system_id: SystemId,
+    timeline_start_segment: Option<Bytes>,
 }
 
 impl WalReader {
@@ -488,19 +497,27 @@ impl WalReader {
         start_pos: Lsn,
         enable_remote_read: bool,
     ) -> Result<Self> {
-        if start_pos < state.timeline_start_lsn {
+        if state.server.wal_seg_size == 0 || state.local_start_lsn == Lsn(0) {
+            bail!("state uninitialized, no data to read");
+        }
+
+        // TODO: Upgrade to bail!() once we know this couldn't possibly happen
+        if state.timeline_start_lsn == Lsn(0) {
+            warn!("timeline_start_lsn uninitialized before initializing wal reader");
+        }
+
+        if start_pos
+            < state
+                .timeline_start_lsn
+                .segment_lsn(state.server.wal_seg_size as usize)
+        {
             bail!(
-                "Requested streaming from {}, which is before the start of the timeline {}",
+                "Requested streaming from {}, which is before the start of the timeline {}, and also doesn't start at the first segment of that timeline",
                 start_pos,
                 state.timeline_start_lsn
             );
         }
 
-        // TODO: add state.timeline_start_lsn == Lsn(0) check
-        if state.server.wal_seg_size == 0 || state.local_start_lsn == Lsn(0) {
-            bail!("state uninitialized, no data to read");
-        }
-
         Ok(Self {
             workdir,
             timeline_dir,
@@ -509,10 +526,65 @@ impl WalReader {
             wal_segment: None,
             enable_remote_read,
             local_start_lsn: state.local_start_lsn,
+            timeline_start_lsn: state.timeline_start_lsn,
+            pg_version: state.server.pg_version / 10000,
+            system_id: state.server.system_id,
+            timeline_start_segment: None,
         })
     }
 
     pub async fn read(&mut self, buf: &mut [u8]) -> Result<usize> {
+        // If this timeline is new, we may not have a full segment yet, so
+        // we pad the first bytes of the timeline's first WAL segment with 0s
+        if self.pos < self.timeline_start_lsn {
+            debug_assert_eq!(
+                self.pos.segment_number(self.wal_seg_size),
+                self.timeline_start_lsn.segment_number(self.wal_seg_size)
+            );
+
+            // All bytes after timeline_start_lsn are in WAL, but those before
+            // are not, so we manually construct an empty segment for the bytes
+            // not available in this timeline.
+            if self.timeline_start_segment.is_none() {
+                let it = postgres_ffi::generate_wal_segment(
+                    self.timeline_start_lsn.segment_number(self.wal_seg_size),
+                    self.system_id,
+                    self.pg_version,
+                    self.timeline_start_lsn,
+                )?;
+                self.timeline_start_segment = Some(it);
+            }
+
+            assert!(self.timeline_start_segment.is_some());
+            let segment = self.timeline_start_segment.take().unwrap();
+
+            let seg_bytes = &segment[..];
+
+            // How much of the current segment have we already consumed?
+            let pos_seg_offset = self.pos.segment_offset(self.wal_seg_size);
+
+            // How many bytes may we consume in total?
+            let tl_start_seg_offset = self.timeline_start_lsn.segment_offset(self.wal_seg_size);
+
+            debug_assert!(seg_bytes.len() > pos_seg_offset);
+            debug_assert!(seg_bytes.len() > tl_start_seg_offset);
+
+            // Copy as many bytes as possible into the buffer
+            let len = (tl_start_seg_offset - pos_seg_offset).min(buf.len());
+            buf[0..len].copy_from_slice(&seg_bytes[pos_seg_offset..pos_seg_offset + len]);
+
+            self.pos += len as u64;
+
+            // If we're done with the segment, we can release it's memory.
+            // However, if we're not yet done, store it so that we don't have to
+            // construct the segment the next time this function is called.
+            if self.pos < self.timeline_start_lsn {
+                self.timeline_start_segment = Some(segment);
+            }
+
+            return Ok(len);
+        }
+
         let mut wal_segment = match self.wal_segment.take() {
             Some(reader) => reader,
             None => self.open_segment().await?,
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index f209dca560..a46c19d7fd 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1451,6 +1451,7 @@ class NeonCli(AbstractNeonCli):
         branch_name: str,
         endpoint_id: Optional[str] = None,
         tenant_id: Optional[TenantId] = None,
+        hot_standby: bool = False,
         lsn: Optional[Lsn] = None,
         port: Optional[int] = None,
     ) -> "subprocess.CompletedProcess[str]":
@@ -1470,6 +1471,8 @@ class NeonCli(AbstractNeonCli):
             args.extend(["--port", str(port)])
         if endpoint_id is not None:
             args.append(endpoint_id)
+        if hot_standby:
+            args.extend(["--hot-standby", "true"])
 
         res = self.raw_cli(args)
         res.check_returncode()
@@ -2206,6 +2209,7 @@ class Endpoint(PgProtocol):
         super().__init__(host="localhost", port=port, user="cloud_admin", dbname="postgres")
         self.env = env
         self.running = False
+        self.branch_name: Optional[str] = None  # dubious
         self.endpoint_id: Optional[str] = None  # dubious, see asserts below
         self.pgdata_dir: Optional[str] = None  # Path to computenode PGDATA
         self.tenant_id = tenant_id
@@ -2217,6 +2221,7 @@ class Endpoint(PgProtocol):
         self,
         branch_name: str,
         endpoint_id: Optional[str] = None,
+        hot_standby: bool = False,
         lsn: Optional[Lsn] = None,
         config_lines: Optional[List[str]] = None,
     ) -> "Endpoint":
@@ -2231,12 +2236,14 @@ class Endpoint(PgProtocol):
         if endpoint_id is None:
             endpoint_id = self.env.generate_endpoint_id()
         self.endpoint_id = endpoint_id
+        self.branch_name = branch_name
 
         self.env.neon_cli.endpoint_create(
             branch_name,
             endpoint_id=self.endpoint_id,
             tenant_id=self.tenant_id,
             lsn=lsn,
+            hot_standby=hot_standby,
             port=self.port,
         )
         path = Path("endpoints") / self.endpoint_id / "pgdata"
@@ -2361,6 +2368,7 @@ class Endpoint(PgProtocol):
         self,
         branch_name: str,
         endpoint_id: Optional[str] = None,
+        hot_standby: bool = False,
         lsn: Optional[Lsn] = None,
         config_lines: Optional[List[str]] = None,
     ) -> "Endpoint":
@@ -2375,6 +2383,7 @@ class Endpoint(PgProtocol):
             branch_name=branch_name,
             endpoint_id=endpoint_id,
             config_lines=config_lines,
+            hot_standby=hot_standby,
             lsn=lsn,
         ).start()
 
@@ -2408,6 +2417,7 @@ class EndpointFactory:
         endpoint_id: Optional[str] = None,
         tenant_id: Optional[TenantId] = None,
         lsn: Optional[Lsn] = None,
+        hot_standby: bool = False,
         config_lines: Optional[List[str]] = None,
     ) -> Endpoint:
         ep = Endpoint(
@@ -2421,6 +2431,7 @@ class EndpointFactory:
         return ep.create_start(
             branch_name=branch_name,
             endpoint_id=endpoint_id,
+            hot_standby=hot_standby,
             config_lines=config_lines,
             lsn=lsn,
         )
@@ -2431,6 +2442,7 @@ class EndpointFactory:
         endpoint_id: Optional[str] = None,
         tenant_id: Optional[TenantId] = None,
         lsn: Optional[Lsn] = None,
+        hot_standby: bool = False,
         config_lines: Optional[List[str]] = None,
     ) -> Endpoint:
         ep = Endpoint(
@@ -2449,6 +2461,7 @@ class EndpointFactory:
             branch_name=branch_name,
             endpoint_id=endpoint_id,
             lsn=lsn,
+            hot_standby=hot_standby,
             config_lines=config_lines,
         )
 
@@ -2458,6 +2471,36 @@ class EndpointFactory:
 
         return self
 
+    def new_replica(self, origin: Endpoint, endpoint_id: str, config_lines: Optional[List[str]]):
+        branch_name = origin.branch_name
+        assert origin in self.endpoints
+        assert branch_name is not None
+
+        return self.create(
+            branch_name=branch_name,
+            endpoint_id=endpoint_id,
+            tenant_id=origin.tenant_id,
+            lsn=None,
+            hot_standby=True,
+            config_lines=config_lines,
+        )
+
+    def new_replica_start(
+        self, origin: Endpoint, endpoint_id: str, config_lines: Optional[List[str]] = None
+    ):
+        branch_name = origin.branch_name
+        assert origin in self.endpoints
+        assert branch_name is not None
+
+        return self.create_start(
+            branch_name=branch_name,
+            endpoint_id=endpoint_id,
+            tenant_id=origin.tenant_id,
+            lsn=None,
+            hot_standby=True,
+            config_lines=config_lines,
+        )
+
 
 @dataclass
 class SafekeeperPort:
diff --git a/test_runner/regress/test_compute_ctl.py b/test_runner/regress/test_compute_ctl.py
index aa99a01c83..d72ffe078d 100644
--- a/test_runner/regress/test_compute_ctl.py
+++ b/test_runner/regress/test_compute_ctl.py
@@ -59,11 +59,6 @@ def test_sync_safekeepers_logs(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
                 "value": "replica",
                 "vartype": "enum"
             },
-            {
-                "name": "hot_standby",
-                "value": "on",
-                "vartype": "bool"
-            },
             {
                 "name": "neon.safekeepers",
                 "value": """
diff --git a/test_runner/regress/test_hot_standby.py b/test_runner/regress/test_hot_standby.py
new file mode 100644
index 0000000000..12e034cea2
--- /dev/null
+++ b/test_runner/regress/test_hot_standby.py
@@ -0,0 +1,79 @@
+import pytest
+from fixtures.neon_fixtures import NeonEnv
+
+
+@pytest.mark.timeout(1800)
+def test_hot_standby(neon_simple_env: NeonEnv):
+    env = neon_simple_env
+
+    with env.endpoints.create_start(
+        branch_name="main",
+        endpoint_id="primary",
+    ) as primary:
+        with env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary") as secondary:
+            primary_lsn = None
+            cought_up = False
+            queries = [
+                "SHOW neon.timeline_id",
+                "SHOW neon.tenant_id",
+                "SELECT relname FROM pg_class WHERE relnamespace = current_schema()::regnamespace::oid",
+                "SELECT COUNT(*), SUM(i) FROM test",
+            ]
+            responses = dict()
+
+            with primary.connect() as p_con:
+                with p_con.cursor() as p_cur:
+                    p_cur.execute("CREATE TABLE test AS SELECT generate_series(1, 100) AS i")
+
+                # Explicit commit to make sure other connections (and replicas) can
+                # see the changes of this commit.
+                p_con.commit()
+
+                with p_con.cursor() as p_cur:
+                    p_cur.execute("SELECT pg_current_wal_insert_lsn()::text")
+                    res = p_cur.fetchone()
+                    assert res is not None
+                    (lsn,) = res
+                    primary_lsn = lsn
+
+                # Explicit commit to make sure other connections (and replicas) can
+                # see the changes of this commit.
+                # Note that this may generate more WAL if the transaction has changed
+                # things, but we don't care about that.
+                p_con.commit()
+
+                for query in queries:
+                    with p_con.cursor() as p_cur:
+                        p_cur.execute(query)
+                        res = p_cur.fetchone()
+                        assert res is not None
+                        response = res
+                        responses[query] = response
+
+            with secondary.connect() as s_con:
+                with s_con.cursor() as s_cur:
+                    s_cur.execute("SELECT 1 WHERE pg_is_in_recovery()")
+                    res = s_cur.fetchone()
+                    assert res is not None
+
+                while not cought_up:
+                    with s_con.cursor() as secondary_cursor:
+                        secondary_cursor.execute("SELECT pg_last_wal_replay_lsn()")
+                        res = secondary_cursor.fetchone()
+                        assert res is not None
+                        (secondary_lsn,) = res
+                        # There may be more changes on the primary after we got our LSN
+                        # due to e.g. autovacuum, but that shouldn't impact the content
+                        # of the tables, so we check whether we've replayed up to at
+                        # least after the commit of the `test` table.
+                        cought_up = secondary_lsn >= primary_lsn
+
+                # Explicit commit to flush any transient transaction-level state.
+                s_con.commit()
+
+                for query in queries:
+                    with s_con.cursor() as secondary_cursor:
+                        secondary_cursor.execute(query)
+                        response = secondary_cursor.fetchone()
+                        assert response is not None
+                        assert response == responses[query]
diff --git a/vendor/postgres-v14 b/vendor/postgres-v14
index 3e70693c91..a2daebc6b4 160000
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
@@ -1 +1 @@
-Subproject commit 3e70693c9178878404d14a61c96b15b74eb02688
+Subproject commit a2daebc6b445dcbcca9c18e1711f47c1db7ffb04
diff --git a/vendor/postgres-v15 b/vendor/postgres-v15
index 4ad87b0f36..aee72b7be9 160000
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
@@ -1 +1 @@
-Subproject commit 4ad87b0f364a2313600c1d9774ca33df00e606f4
+Subproject commit aee72b7be903e52d9bdc6449aa4c17fb852d8708

From 3be81dd36bcda1288ad25ea2ff5d3acd8b26b24f Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Thu, 27 Apr 2023 16:07:25 +0200
Subject: [PATCH 27/31] fix `clippy --release` failure introduced in #4030
 (#4095)

PR `build: run clippy for powerset of features (#4077)` brought us a
`clippy --release` pass.

It was merged after #4030, which fails under `clippy --release` with

```
error: static `TENANT_ID_EXTRACTOR` is never used
    --> pageserver/src/tenant/timeline.rs:4270:16
     |
4270 |     pub static TENANT_ID_EXTRACTOR: once_cell::sync::Lazy<
     |                ^^^^^^^^^^^^^^^^^^^
     |
     = note: `-D dead-code` implied by `-D warnings`

error: static `TIMELINE_ID_EXTRACTOR` is never used
    --> pageserver/src/tenant/timeline.rs:4276:16
     |
4276 |     pub static TIMELINE_ID_EXTRACTOR: once_cell::sync::Lazy<
     |                ^^^^^^^^^^^^^^^^^^^^^
```

A merge queue would have prevented this.
---
 pageserver/src/tenant/timeline.rs | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 87f03f30b6..5c671ffd63 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -19,7 +19,6 @@ use tokio::sync::{oneshot, watch, Semaphore, TryAcquireError};
 use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::id::TenantTimelineId;
-use utils::tracing_span_assert;
 
 use std::cmp::{max, min, Ordering};
 use std::collections::BinaryHeap;
@@ -4265,8 +4264,15 @@ fn rename_to_backup(path: &Path) -> anyhow::Result<()> {
     bail!("couldn't find an unused backup number for {:?}", path)
 }
 
+#[cfg(not(debug_assertions))]
+#[inline]
+pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() {}
+
+#[cfg(debug_assertions)]
 #[inline]
 pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() {
+    use utils::tracing_span_assert;
+
     pub static TENANT_ID_EXTRACTOR: once_cell::sync::Lazy<
         tracing_span_assert::MultiNameExtractor<2>,
     > = once_cell::sync::Lazy::new(|| {
@@ -4279,7 +4285,6 @@ pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() {
         tracing_span_assert::MultiNameExtractor::new("TimelineId", ["timeline_id", "timeline"])
     });
 
-    #[cfg(debug_assertions)]
     match tracing_span_assert::check_fields_present([
         &*TENANT_ID_EXTRACTOR,
         &*TIMELINE_ID_EXTRACTOR,

From f5b4697c90cb37cc1386b77b03cc2a013fde1af3 Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Wed, 5 Apr 2023 14:55:55 +0400
Subject: [PATCH 28/31] Log session_id when proxy per client task errors out.

---
 proxy/src/proxy.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs
index 9945e3697f..1169d76160 100644
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -95,9 +95,9 @@ pub async fn task_main(
 
                         handle_client(config, &cancel_map, session_id, socket).await
                     }
-                    .unwrap_or_else(|e| {
+                    .unwrap_or_else(move |e| {
                         // Acknowledge that the task has finished with an error.
-                        error!("per-client task finished with an error: {e:#}");
+                        error!(?session_id, "per-client task finished with an error: {e:#}");
                     }),
                 );
             }

From d1e86d65dc64635bf0a3cef1aaa26766e683cd4c Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Thu, 27 Apr 2023 17:27:21 +0300
Subject: [PATCH 29/31] Run rustfmt to fix whitespace.

Commit e6ec2400fc introduced some trivial whitespace issues.
---
 libs/postgres_ffi/src/xlog_utils.rs | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs
index 8ed00a9e13..4d7bb61883 100644
--- a/libs/postgres_ffi/src/xlog_utils.rs
+++ b/libs/postgres_ffi/src/xlog_utils.rs
@@ -346,7 +346,7 @@ pub fn generate_wal_segment(segno: u64, system_id: u64, lsn: Lsn) -> Result<Byte
 
     let first_page_only = seg_off < XLOG_BLCKSZ;
     let (shdr_rem_len, infoflags) = if first_page_only {
-        (seg_off, pg_constants::XLP_FIRST_IS_CONTRECORD) 
+        (seg_off, pg_constants::XLP_FIRST_IS_CONTRECORD)
     } else {
         (0, 0)
     };
@@ -372,7 +372,7 @@ pub fn generate_wal_segment(segno: u64, system_id: u64, lsn: Lsn) -> Result<Byte
 
     //zero out the rest of the file
     seg_buf.resize(WAL_SEGMENT_SIZE, 0);
-    
+
     if !first_page_only {
         let block_offset = lsn.page_offset_in_segment(WAL_SEGMENT_SIZE) as usize;
         let header = XLogPageHeaderData {
@@ -402,7 +402,6 @@ pub fn generate_wal_segment(segno: u64, system_id: u64, lsn: Lsn) -> Result<Byte
     Ok(seg_buf.freeze())
 }
 
-
 #[repr(C)]
 #[derive(Serialize)]
 struct XlLogicalMessage {

From fdf5e4db5ea15550d2cf26109db504092179958b Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Thu, 27 Apr 2023 18:51:57 +0300
Subject: [PATCH 30/31] refactor: Cleanup page service (#4097)

Refactoring part of #4093.

Numerious `Send + Sync` bounds were a distraction, that were not needed
at all. The proper `Bytes` usage and one `"error_message".to_string()`
are just drive-by fixes.

Not using the `PostgresBackendTCP` allows us to start setting read
timeouts (and more). `PostgresBackendTCP` is still used from proxy, so
it cannot be removed.
---
 pageserver/src/import_datadir.rs |  8 ++--
 pageserver/src/page_service.rs   | 64 +++++++++++++++++++++-----------
 2 files changed, 46 insertions(+), 26 deletions(-)

diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs
index 39e434a023..936de35eb9 100644
--- a/pageserver/src/import_datadir.rs
+++ b/pageserver/src/import_datadir.rs
@@ -114,7 +114,7 @@ async fn import_rel(
     path: &Path,
     spcoid: Oid,
     dboid: Oid,
-    reader: &mut (impl AsyncRead + Send + Sync + Unpin),
+    reader: &mut (impl AsyncRead + Unpin),
     len: usize,
     ctx: &RequestContext,
 ) -> anyhow::Result<()> {
@@ -200,7 +200,7 @@ async fn import_slru(
     modification: &mut DatadirModification<'_>,
     slru: SlruKind,
     path: &Path,
-    reader: &mut (impl AsyncRead + Send + Sync + Unpin),
+    reader: &mut (impl AsyncRead + Unpin),
     len: usize,
     ctx: &RequestContext,
 ) -> anyhow::Result<()> {
@@ -612,8 +612,8 @@ async fn import_file(
     Ok(None)
 }
 
-async fn read_all_bytes(reader: &mut (impl AsyncRead + Send + Sync + Unpin)) -> Result<Bytes> {
+async fn read_all_bytes(reader: &mut (impl AsyncRead + Unpin)) -> Result<Bytes> {
     let mut buf: Vec<u8> = vec![];
     reader.read_to_end(&mut buf).await?;
-    Ok(Bytes::copy_from_slice(&buf[..]))
+    Ok(Bytes::from(buf))
 }
diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index 135f08e846..3610704f2c 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -20,7 +20,6 @@ use pageserver_api::models::{
     PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetPageResponse,
     PagestreamNblocksRequest, PagestreamNblocksResponse,
 };
-use postgres_backend::PostgresBackendTCP;
 use postgres_backend::{self, is_expected_io_error, AuthType, PostgresBackend, QueryError};
 use pq_proto::framed::ConnectionError;
 use pq_proto::FeStartupPacket;
@@ -32,6 +31,7 @@ use std::str;
 use std::str::FromStr;
 use std::sync::Arc;
 use std::time::Duration;
+use tokio::io::{AsyncRead, AsyncWrite};
 use tokio_util::io::StreamReader;
 use tracing::*;
 use utils::id::ConnectionId;
@@ -57,7 +57,10 @@ use crate::trace::Tracer;
 use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
 use postgres_ffi::BLCKSZ;
 
-fn copyin_stream(pgb: &mut PostgresBackendTCP) -> impl Stream<Item = io::Result<Bytes>> + '_ {
+fn copyin_stream<IO>(pgb: &mut PostgresBackend<IO>) -> impl Stream<Item = io::Result<Bytes>> + '_
+where
+    IO: AsyncRead + AsyncWrite + Unpin,
+{
     async_stream::try_stream! {
         loop {
             let msg = tokio::select! {
@@ -65,8 +68,8 @@ fn copyin_stream(pgb: &mut PostgresBackendTCP) -> impl Stream<Item = io::Result<
 
                 _ = task_mgr::shutdown_watcher() => {
                     // We were requested to shut down.
-                    let msg = "pageserver is shutting down".to_string();
-                    let _ = pgb.write_message_noflush(&BeMessage::ErrorResponse(&msg, None));
+                    let msg = "pageserver is shutting down";
+                    let _ = pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, None));
                     Err(QueryError::Other(anyhow::anyhow!(msg)))
                 }
 
@@ -125,7 +128,7 @@ fn copyin_stream(pgb: &mut PostgresBackendTCP) -> impl Stream<Item = io::Result<
 ///
 /// XXX: Currently, any trailing data after the EOF marker prints a warning.
 /// Perhaps it should be a hard error?
-async fn read_tar_eof(mut reader: (impl tokio::io::AsyncRead + Unpin)) -> anyhow::Result<()> {
+async fn read_tar_eof(mut reader: (impl AsyncRead + Unpin)) -> anyhow::Result<()> {
     use tokio::io::AsyncReadExt;
     let mut buf = [0u8; 512];
 
@@ -245,12 +248,14 @@ async fn page_service_conn_main(
         .set_nodelay(true)
         .context("could not set TCP_NODELAY")?;
 
+    let peer_addr = socket.peer_addr().context("get peer address")?;
+
     // XXX: pgbackend.run() should take the connection_ctx,
     // and create a child per-query context when it invokes process_query.
     // But it's in a shared crate, so, we store connection_ctx inside PageServerHandler
     // and create the per-query context in process_query ourselves.
     let mut conn_handler = PageServerHandler::new(conf, auth, connection_ctx);
-    let pgbackend = PostgresBackend::new(socket, auth_type, None)?;
+    let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?;
 
     match pgbackend
         .run(&mut conn_handler, task_mgr::shutdown_watcher)
@@ -332,13 +337,16 @@ impl PageServerHandler {
     }
 
     #[instrument(skip(self, pgb, ctx))]
-    async fn handle_pagerequests(
+    async fn handle_pagerequests<IO>(
         &self,
-        pgb: &mut PostgresBackendTCP,
+        pgb: &mut PostgresBackend<IO>,
         tenant_id: TenantId,
         timeline_id: TimelineId,
         ctx: RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> anyhow::Result<()>
+    where
+        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
+    {
         // NOTE: pagerequests handler exits when connection is closed,
         //       so there is no need to reset the association
         task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
@@ -436,16 +444,19 @@ impl PageServerHandler {
 
     #[allow(clippy::too_many_arguments)]
     #[instrument(skip(self, pgb, ctx))]
-    async fn handle_import_basebackup(
+    async fn handle_import_basebackup<IO>(
         &self,
-        pgb: &mut PostgresBackendTCP,
+        pgb: &mut PostgresBackend<IO>,
         tenant_id: TenantId,
         timeline_id: TimelineId,
         base_lsn: Lsn,
         _end_lsn: Lsn,
         pg_version: u32,
         ctx: RequestContext,
-    ) -> Result<(), QueryError> {
+    ) -> Result<(), QueryError>
+    where
+        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
+    {
         task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
         // Create empty timeline
         info!("creating new timeline");
@@ -486,15 +497,18 @@ impl PageServerHandler {
     }
 
     #[instrument(skip(self, pgb, ctx))]
-    async fn handle_import_wal(
+    async fn handle_import_wal<IO>(
         &self,
-        pgb: &mut PostgresBackendTCP,
+        pgb: &mut PostgresBackend<IO>,
         tenant_id: TenantId,
         timeline_id: TimelineId,
         start_lsn: Lsn,
         end_lsn: Lsn,
         ctx: RequestContext,
-    ) -> Result<(), QueryError> {
+    ) -> Result<(), QueryError>
+    where
+        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
+    {
         task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
 
         let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?;
@@ -690,16 +704,19 @@ impl PageServerHandler {
 
     #[allow(clippy::too_many_arguments)]
     #[instrument(skip(self, pgb, ctx))]
-    async fn handle_basebackup_request(
+    async fn handle_basebackup_request<IO>(
         &mut self,
-        pgb: &mut PostgresBackendTCP,
+        pgb: &mut PostgresBackend<IO>,
         tenant_id: TenantId,
         timeline_id: TimelineId,
         lsn: Option<Lsn>,
         prev_lsn: Option<Lsn>,
         full_backup: bool,
         ctx: RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> anyhow::Result<()>
+    where
+        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
+    {
         let started = std::time::Instant::now();
 
         // check that the timeline exists
@@ -770,10 +787,13 @@ impl PageServerHandler {
 }
 
 #[async_trait::async_trait]
-impl postgres_backend::Handler<tokio::net::TcpStream> for PageServerHandler {
+impl<IO> postgres_backend::Handler<IO> for PageServerHandler
+where
+    IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
+{
     fn check_auth_jwt(
         &mut self,
-        _pgb: &mut PostgresBackendTCP,
+        _pgb: &mut PostgresBackend<IO>,
         jwt_response: &[u8],
     ) -> Result<(), QueryError> {
         // this unwrap is never triggered, because check_auth_jwt only called when auth_type is NeonJWT
@@ -801,7 +821,7 @@ impl postgres_backend::Handler<tokio::net::TcpStream> for PageServerHandler {
 
     fn startup(
         &mut self,
-        _pgb: &mut PostgresBackendTCP,
+        _pgb: &mut PostgresBackend<IO>,
         _sm: &FeStartupPacket,
     ) -> Result<(), QueryError> {
         Ok(())
@@ -809,7 +829,7 @@ impl postgres_backend::Handler<tokio::net::TcpStream> for PageServerHandler {
 
     async fn process_query(
         &mut self,
-        pgb: &mut PostgresBackendTCP,
+        pgb: &mut PostgresBackend<IO>,
         query_string: &str,
     ) -> Result<(), QueryError> {
         let ctx = self.connection_ctx.attached_child();

From c4e1cafb6304f4cc7e2c65b77b6e3b4ef4afb17b Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Thu, 27 Apr 2023 17:08:00 +0100
Subject: [PATCH 31/31] scripts/flaky_tests.py: handle connection error (#4096)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Increase `connect_timeout` to 30s, which should be enough for
most of the cases
- If the script cannot connect to the DB (or any other
`psycopg2.OperationalError` occur) — do not fail the script, log
the error and proceed. Problems with fetching flaky tests shouldn't
block the PR
---
 scripts/flaky_tests.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/scripts/flaky_tests.py b/scripts/flaky_tests.py
index 829cc814e8..262950b61d 100755
--- a/scripts/flaky_tests.py
+++ b/scripts/flaky_tests.py
@@ -42,12 +42,16 @@ def main(args: argparse.Namespace):
     res: DefaultDict[str, DefaultDict[str, Dict[str, bool]]]
     res = defaultdict(lambda: defaultdict(dict))
 
-    logging.info("connecting to the database...")
-    with psycopg2.connect(connstr, connect_timeout=10) as conn:
-        with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
-            logging.info("fetching flaky tests...")
-            cur.execute(FLAKY_TESTS_QUERY, (interval_days,))
-            rows = cur.fetchall()
+    try:
+        logging.info("connecting to the database...")
+        with psycopg2.connect(connstr, connect_timeout=30) as conn:
+            with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
+                logging.info("fetching flaky tests...")
+                cur.execute(FLAKY_TESTS_QUERY, (interval_days,))
+                rows = cur.fetchall()
+    except psycopg2.OperationalError as exc:
+        logging.error("cannot fetch flaky tests from the DB due to an error", exc)
+        rows = []
 
     for row in rows:
         logging.info(f"\t{row['parent_suite'].replace('.', '/')}/{row['suite']}.py::{row['test']}")