From cb619449822d223a5012feb292ba2ea9975df33f Mon Sep 17 00:00:00 2001
From: Egor Suvorov <egor@neon.tech>
Date: Fri, 16 Dec 2022 19:39:38 +0200
Subject: [PATCH 01/42] Safekeeper: refactor auth validation

* Load public auth key on startup and store it in the config.
* Get rid of a separate `auth` parameter which was passed all over the place.
---
 safekeeper/src/bin/safekeeper.rs | 34 +++++++++++++++-----------------
 safekeeper/src/handler.rs        | 10 ++++------
 safekeeper/src/http/routes.rs    |  8 +++-----
 safekeeper/src/lib.rs            |  6 ++++--
 safekeeper/src/wal_service.rs    | 33 +++++++++----------------------
 5 files changed, 36 insertions(+), 55 deletions(-)

diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs
index 5ad88276e8..394a4815bb 100644
--- a/safekeeper/src/bin/safekeeper.rs
+++ b/safekeeper/src/bin/safekeeper.rs
@@ -143,6 +143,19 @@ fn main() -> anyhow::Result<()> {
         return Ok(());
     }
 
+    let auth = match args.auth_validation_public_key_path.as_ref() {
+        None => {
+            info!("auth is disabled");
+            None
+        }
+        Some(path) => {
+            info!("loading JWT auth key from {}", path.display());
+            Some(Arc::new(
+                JwtAuth::from_key_path(path).context("failed to load the auth key")?,
+            ))
+        }
+    };
+
     let conf = SafeKeeperConf {
         workdir,
         my_id: id,
@@ -156,7 +169,7 @@ fn main() -> anyhow::Result<()> {
         max_offloader_lag_bytes: args.max_offloader_lag,
         backup_runtime_threads: args.wal_backup_threads,
         wal_backup_enabled: !args.disable_wal_backup,
-        auth_validation_public_key_path: args.auth_validation_public_key_path,
+        auth,
     };
 
     // initialize sentry if SENTRY_DSN is provided
@@ -186,19 +199,6 @@ fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
         e
     })?;
 
-    let auth = match conf.auth_validation_public_key_path.as_ref() {
-        None => {
-            info!("auth is disabled");
-            None
-        }
-        Some(path) => {
-            info!("loading JWT auth key from {}", path.display());
-            Some(Arc::new(
-                JwtAuth::from_key_path(path).context("failed to load the auth key")?,
-            ))
-        }
-    };
-
     // Register metrics collector for active timelines. It's important to do this
     // after daemonizing, otherwise process collector will be upset.
     let timeline_collector = safekeeper::metrics::TimelineCollector::new();
@@ -212,12 +212,11 @@ fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
     GlobalTimelines::init(conf.clone(), wal_backup_launcher_tx)?;
 
     let conf_ = conf.clone();
-    let auth_ = auth.clone();
     threads.push(
         thread::Builder::new()
             .name("http_endpoint_thread".into())
             .spawn(|| {
-                let router = http::make_router(conf_, auth_);
+                let router = http::make_router(conf_);
                 endpoint::serve_thread_main(
                     router,
                     http_listener,
@@ -231,7 +230,7 @@ fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
     let safekeeper_thread = thread::Builder::new()
         .name("safekeeper thread".into())
         .spawn(|| {
-            if let Err(e) = wal_service::thread_main(conf_cloned, pg_listener, auth) {
+            if let Err(e) = wal_service::thread_main(conf_cloned, pg_listener) {
                 info!("safekeeper thread terminated: {e}");
             }
         })
@@ -244,7 +243,6 @@ fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
         thread::Builder::new()
             .name("broker thread".into())
             .spawn(|| {
-                // TODO: add auth?
                 broker::thread_main(conf_);
             })?,
     );
diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs
index 05527303ca..c692e9fc12 100644
--- a/safekeeper/src/handler.rs
+++ b/safekeeper/src/handler.rs
@@ -15,9 +15,8 @@ use regex::Regex;
 
 use pq_proto::{BeMessage, FeStartupPacket, RowDescriptor, INT4_OID, TEXT_OID};
 use std::str;
-use std::sync::Arc;
 use tracing::info;
-use utils::auth::{Claims, JwtAuth, Scope};
+use utils::auth::{Claims, Scope};
 use utils::{
     id::{TenantId, TenantTimelineId, TimelineId},
     lsn::Lsn,
@@ -32,7 +31,6 @@ pub struct SafekeeperPostgresHandler {
     pub tenant_id: Option<TenantId>,
     pub timeline_id: Option<TimelineId>,
     pub ttid: TenantTimelineId,
-    auth: Option<Arc<JwtAuth>>,
     claims: Option<Claims>,
 }
 
@@ -107,6 +105,7 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler {
         // this unwrap is never triggered, because check_auth_jwt only called when auth_type is NeonJWT
         // which requires auth to be present
         let data = self
+            .conf
             .auth
             .as_ref()
             .unwrap()
@@ -166,14 +165,13 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler {
 }
 
 impl SafekeeperPostgresHandler {
-    pub fn new(conf: SafeKeeperConf, auth: Option<Arc<JwtAuth>>) -> Self {
+    pub fn new(conf: SafeKeeperConf) -> Self {
         SafekeeperPostgresHandler {
             conf,
             appname: None,
             tenant_id: None,
             timeline_id: None,
             ttid: TenantTimelineId::empty(),
-            auth,
             claims: None,
         }
     }
@@ -181,7 +179,7 @@ impl SafekeeperPostgresHandler {
     // when accessing management api supply None as an argument
     // when using to authorize tenant pass corresponding tenant id
     fn check_permission(&self, tenant_id: Option<TenantId>) -> Result<()> {
-        if self.auth.is_none() {
+        if self.conf.auth.is_none() {
             // auth is set to Trust, nothing to check so just return ok
             return Ok(());
         }
diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs
index a9a9eb3388..a917d61678 100644
--- a/safekeeper/src/http/routes.rs
+++ b/safekeeper/src/http/routes.rs
@@ -277,12 +277,9 @@ async fn record_safekeeper_info(mut request: Request<Body>) -> Result<Response<B
 }
 
 /// Safekeeper http router.
-pub fn make_router(
-    conf: SafeKeeperConf,
-    auth: Option<Arc<JwtAuth>>,
-) -> RouterBuilder<hyper::Body, ApiError> {
+pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError> {
     let mut router = endpoint::make_router();
-    if auth.is_some() {
+    if conf.auth.is_some() {
         router = router.middleware(auth_middleware(|request| {
             #[allow(clippy::mutable_key_type)]
             static ALLOWLIST_ROUTES: Lazy<HashSet<Uri>> =
@@ -298,6 +295,7 @@ pub fn make_router(
 
     // NB: on any changes do not forget to update the OpenAPI spec
     // located nearby (/safekeeper/src/http/openapi_spec.yaml).
+    let auth = conf.auth.clone();
     router
         .data(Arc::new(conf))
         .data(auth)
diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs
index 5decfe64de..891d73533f 100644
--- a/safekeeper/src/lib.rs
+++ b/safekeeper/src/lib.rs
@@ -24,7 +24,9 @@ pub mod wal_service;
 pub mod wal_storage;
 
 mod timelines_global_map;
+use std::sync::Arc;
 pub use timelines_global_map::GlobalTimelines;
+use utils::auth::JwtAuth;
 
 pub mod defaults {
     pub use safekeeper_api::{
@@ -57,7 +59,7 @@ pub struct SafeKeeperConf {
     pub max_offloader_lag_bytes: u64,
     pub backup_runtime_threads: Option<usize>,
     pub wal_backup_enabled: bool,
-    pub auth_validation_public_key_path: Option<PathBuf>,
+    pub auth: Option<Arc<JwtAuth>>,
 }
 
 impl SafeKeeperConf {
@@ -87,7 +89,7 @@ impl SafeKeeperConf {
             broker_keepalive_interval: Duration::from_secs(5),
             backup_runtime_threads: None,
             wal_backup_enabled: true,
-            auth_validation_public_key_path: None,
+            auth: None,
             heartbeat_timeout: Duration::new(5, 0),
             max_offloader_lag_bytes: defaults::DEFAULT_MAX_OFFLOADER_LAG_BYTES,
         }
diff --git a/safekeeper/src/wal_service.rs b/safekeeper/src/wal_service.rs
index fd8f9d9dcf..0fea00fe1b 100644
--- a/safekeeper/src/wal_service.rs
+++ b/safekeeper/src/wal_service.rs
@@ -5,32 +5,25 @@
 use anyhow::Result;
 use regex::Regex;
 use std::net::{TcpListener, TcpStream};
-use std::sync::Arc;
 use std::thread;
 use tracing::*;
-use utils::auth::JwtAuth;
 
 use crate::handler::SafekeeperPostgresHandler;
 use crate::SafeKeeperConf;
 use utils::postgres_backend::{AuthType, PostgresBackend};
 
 /// Accept incoming TCP connections and spawn them into a background thread.
-pub fn thread_main(
-    conf: SafeKeeperConf,
-    listener: TcpListener,
-    auth: Option<Arc<JwtAuth>>,
-) -> Result<()> {
+pub fn thread_main(conf: SafeKeeperConf, listener: TcpListener) -> Result<()> {
     loop {
         match listener.accept() {
             Ok((socket, peer_addr)) => {
                 debug!("accepted connection from {}", peer_addr);
                 let conf = conf.clone();
 
-                let auth = auth.clone();
                 let _ = thread::Builder::new()
                     .name("WAL service thread".into())
                     .spawn(move || {
-                        if let Err(err) = handle_socket(socket, conf, auth) {
+                        if let Err(err) = handle_socket(socket, conf) {
                             error!("connection handler exited: {}", err);
                         }
                     })
@@ -51,25 +44,17 @@ fn get_tid() -> u64 {
 
 /// This is run by `thread_main` above, inside a background thread.
 ///
-fn handle_socket(
-    socket: TcpStream,
-    conf: SafeKeeperConf,
-    auth: Option<Arc<JwtAuth>>,
-) -> Result<()> {
+fn handle_socket(socket: TcpStream, conf: SafeKeeperConf) -> Result<()> {
     let _enter = info_span!("", tid = ?get_tid()).entered();
 
     socket.set_nodelay(true)?;
 
-    let mut conn_handler = SafekeeperPostgresHandler::new(conf, auth.clone());
-    let pgbackend = PostgresBackend::new(
-        socket,
-        match auth {
-            None => AuthType::Trust,
-            Some(_) => AuthType::NeonJWT,
-        },
-        None,
-        false,
-    )?;
+    let auth_type = match conf.auth {
+        None => AuthType::Trust,
+        Some(_) => AuthType::NeonJWT,
+    };
+    let mut conn_handler = SafekeeperPostgresHandler::new(conf);
+    let pgbackend = PostgresBackend::new(socket, auth_type, None, false)?;
     // libpq replication protocol between safekeeper and replicas/pagers
     pgbackend.run(&mut conn_handler)?;
 

From 9f94d098aa7e843428085acc8cf80550ad35219a Mon Sep 17 00:00:00 2001
From: Egor Suvorov <egor@neon.tech>
Date: Fri, 16 Dec 2022 20:18:15 +0200
Subject: [PATCH 02/42] Remove unused AuthType::MD5

---
 libs/utils/src/postgres_backend.rs       | 26 ------------------------
 libs/utils/src/postgres_backend_async.rs | 23 ---------------------
 pageserver/src/bin/pageserver.rs         |  2 +-
 3 files changed, 1 insertion(+), 50 deletions(-)

diff --git a/libs/utils/src/postgres_backend.rs b/libs/utils/src/postgres_backend.rs
index 5b34c7adfb..bac6f861c3 100644
--- a/libs/utils/src/postgres_backend.rs
+++ b/libs/utils/src/postgres_backend.rs
@@ -7,7 +7,6 @@ use crate::sock_split::{BidiStream, ReadStream, WriteStream};
 use anyhow::{bail, ensure, Context, Result};
 use bytes::{Bytes, BytesMut};
 use pq_proto::{BeMessage, FeMessage, FeStartupPacket};
-use rand::Rng;
 use serde::{Deserialize, Serialize};
 use std::fmt;
 use std::io::{self, Write};
@@ -33,11 +32,6 @@ pub trait Handler {
         Ok(())
     }
 
-    /// Check auth md5
-    fn check_auth_md5(&mut self, _pgb: &mut PostgresBackend, _md5_response: &[u8]) -> Result<()> {
-        bail!("MD5 auth failed")
-    }
-
     /// Check auth jwt
     fn check_auth_jwt(&mut self, _pgb: &mut PostgresBackend, _jwt_response: &[u8]) -> Result<()> {
         bail!("JWT auth failed")
@@ -61,7 +55,6 @@ pub enum ProtoState {
 #[derive(Debug, PartialEq, Eq, Clone, Copy, Serialize, Deserialize)]
 pub enum AuthType {
     Trust,
-    MD5,
     // This mimics postgres's AuthenticationCleartextPassword but instead of password expects JWT
     NeonJWT,
 }
@@ -72,7 +65,6 @@ impl FromStr for AuthType {
     fn from_str(s: &str) -> Result<Self, Self::Err> {
         match s {
             "Trust" => Ok(Self::Trust),
-            "MD5" => Ok(Self::MD5),
             "NeonJWT" => Ok(Self::NeonJWT),
             _ => bail!("invalid value \"{s}\" for auth type"),
         }
@@ -83,7 +75,6 @@ impl fmt::Display for AuthType {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         f.write_str(match self {
             AuthType::Trust => "Trust",
-            AuthType::MD5 => "MD5",
             AuthType::NeonJWT => "NeonJWT",
         })
     }
@@ -134,7 +125,6 @@ pub struct PostgresBackend {
 
     pub state: ProtoState,
 
-    md5_salt: [u8; 4],
     auth_type: AuthType,
 
     peer_addr: SocketAddr,
@@ -187,7 +177,6 @@ impl PostgresBackend {
             stream: Some(Stream::Bidirectional(BidiStream::from_tcp(socket))),
             buf_out: BytesMut::with_capacity(10 * 1024),
             state: ProtoState::Initialization,
-            md5_salt: [0u8; 4],
             auth_type,
             tls_config,
             peer_addr,
@@ -367,13 +356,6 @@ impl PostgresBackend {
                                     .write_message(&BeMessage::ReadyForQuery)?;
                                 self.state = ProtoState::Established;
                             }
-                            AuthType::MD5 => {
-                                rand::thread_rng().fill(&mut self.md5_salt);
-                                self.write_message(&BeMessage::AuthenticationMD5Password(
-                                    self.md5_salt,
-                                ))?;
-                                self.state = ProtoState::Authentication;
-                            }
                             AuthType::NeonJWT => {
                                 self.write_message(&BeMessage::AuthenticationCleartextPassword)?;
                                 self.state = ProtoState::Authentication;
@@ -393,14 +375,6 @@ impl PostgresBackend {
 
                 match self.auth_type {
                     AuthType::Trust => unreachable!(),
-                    AuthType::MD5 => {
-                        let (_, md5_response) = m.split_last().context("protocol violation")?;
-
-                        if let Err(e) = handler.check_auth_md5(self, md5_response) {
-                            self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?;
-                            bail!("auth failed: {}", e);
-                        }
-                    }
                     AuthType::NeonJWT => {
                         let (_, jwt_response) = m.split_last().context("protocol violation")?;
 
diff --git a/libs/utils/src/postgres_backend_async.rs b/libs/utils/src/postgres_backend_async.rs
index a22774c69e..dc93131b61 100644
--- a/libs/utils/src/postgres_backend_async.rs
+++ b/libs/utils/src/postgres_backend_async.rs
@@ -7,7 +7,6 @@ use crate::postgres_backend::AuthType;
 use anyhow::{bail, Context, Result};
 use bytes::{Bytes, BytesMut};
 use pq_proto::{BeMessage, FeMessage, FeStartupPacket};
-use rand::Rng;
 use std::future::Future;
 use std::net::SocketAddr;
 use std::pin::Pin;
@@ -35,11 +34,6 @@ pub trait Handler {
         Ok(())
     }
 
-    /// Check auth md5
-    fn check_auth_md5(&mut self, _pgb: &mut PostgresBackend, _md5_response: &[u8]) -> Result<()> {
-        bail!("MD5 auth failed")
-    }
-
     /// Check auth jwt
     fn check_auth_jwt(&mut self, _pgb: &mut PostgresBackend, _jwt_response: &[u8]) -> Result<()> {
         bail!("JWT auth failed")
@@ -125,7 +119,6 @@ pub struct PostgresBackend {
 
     pub state: ProtoState,
 
-    md5_salt: [u8; 4],
     auth_type: AuthType,
 
     peer_addr: SocketAddr,
@@ -160,7 +153,6 @@ impl PostgresBackend {
             stream: Stream::Unencrypted(BufReader::new(socket)),
             buf_out: BytesMut::with_capacity(10 * 1024),
             state: ProtoState::Initialization,
-            md5_salt: [0u8; 4],
             auth_type,
             tls_config,
             peer_addr,
@@ -337,13 +329,6 @@ impl PostgresBackend {
                                     .write_message(&BeMessage::ReadyForQuery)?;
                                 self.state = ProtoState::Established;
                             }
-                            AuthType::MD5 => {
-                                rand::thread_rng().fill(&mut self.md5_salt);
-                                self.write_message(&BeMessage::AuthenticationMD5Password(
-                                    self.md5_salt,
-                                ))?;
-                                self.state = ProtoState::Authentication;
-                            }
                             AuthType::NeonJWT => {
                                 self.write_message(&BeMessage::AuthenticationCleartextPassword)?;
                                 self.state = ProtoState::Authentication;
@@ -364,14 +349,6 @@ impl PostgresBackend {
 
                 match self.auth_type {
                     AuthType::Trust => unreachable!(),
-                    AuthType::MD5 => {
-                        let (_, md5_response) = m.split_last().context("protocol violation")?;
-
-                        if let Err(e) = handler.check_auth_md5(self, md5_response) {
-                            self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?;
-                            bail!("auth failed: {}", e);
-                        }
-                    }
                     AuthType::NeonJWT => {
                         let (_, jwt_response) = m.split_last().context("protocol violation")?;
 
diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs
index b3d9b0f809..a124bf85c2 100644
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -255,7 +255,7 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
 
     // Initialize authentication for incoming connections
     let auth = match &conf.auth_type {
-        AuthType::Trust | AuthType::MD5 => None,
+        AuthType::Trust => None,
         AuthType::NeonJWT => {
             // unwrap is ok because check is performed when creating config, so path is set and file exists
             let key_path = conf.auth_validation_public_key_path.as_ref().unwrap();

From 3468db8a2beed8977c29597af4c58286c6f9f0ff Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Mon, 2 Jan 2023 08:47:28 +0100
Subject: [PATCH 03/42] Bump setuptools from 65.5.0 to 65.5.1 (#3212)

---
 poetry.lock | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/poetry.lock b/poetry.lock
index 1b04230cef..edbcddd576 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1418,7 +1418,7 @@ pbr = "*"
 
 [[package]]
 name = "setuptools"
-version = "65.5.0"
+version = "65.5.1"
 description = "Easily download, build, install, upgrade, and uninstall Python packages"
 category = "main"
 optional = false
@@ -1426,7 +1426,7 @@ python-versions = ">=3.7"
 
 [package.extras]
 docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-hoverxref (<2)", "sphinx-inline-tabs", "sphinx-notfound-page (==0.8.3)", "sphinx-reredirects", "sphinxcontrib-towncrier"]
-testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8 (<5)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "mock", "pip (>=19.1)", "pip-run (>=8.8)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"]
+testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8 (<5)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pip (>=19.1)", "pip-run (>=8.8)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-timeout", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"]
 testing-integration = ["build[virtualenv]", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"]
 
 [[package]]
@@ -2283,8 +2283,8 @@ sarif-om = [
     {file = "sarif_om-1.0.4.tar.gz", hash = "sha256:cd5f416b3083e00d402a92e449a7ff67af46f11241073eea0461802a3b5aef98"},
 ]
 setuptools = [
-    {file = "setuptools-65.5.0-py3-none-any.whl", hash = "sha256:f62ea9da9ed6289bfe868cd6845968a2c854d1427f8548d52cae02a42b4f0356"},
-    {file = "setuptools-65.5.0.tar.gz", hash = "sha256:512e5536220e38146176efb833d4a62aa726b7bbff82cfbc8ba9eaa3996e0b17"},
+    {file = "setuptools-65.5.1-py3-none-any.whl", hash = "sha256:d0b9a8433464d5800cbe05094acf5c6d52a91bfac9b52bcfc4d41382be5d5d31"},
+    {file = "setuptools-65.5.1.tar.gz", hash = "sha256:e197a19aa8ec9722928f2206f8de752def0e4c9fc6953527360d1c36d94ddb2f"},
 ]
 six = [
     {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"},

From 81afd7011c512db1114063ae568feba1af7c3125 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Sat, 31 Dec 2022 02:45:36 +0200
Subject: [PATCH 04/42] Use rustls for everything.

I looked at "cargo tree" output and noticed that through various
dependencies, we are depending on both native-tls and rustls. We have
tried to standardize on rustls for everything, but dependencies on
native-tls have crept in recently. One such dependency came from
'reqwest' with default features in pageserver, used for
consumption_metrics. Another dependency was from 'sentry'. Both
'reqwest' and 'sentry' use native-tls by default, but can use 'rustls'
if compiled with the right feature flags.
---
 Cargo.lock                | 119 ++------------------------------------
 libs/utils/Cargo.toml     |   2 +-
 pageserver/Cargo.toml     |   2 +-
 workspace_hack/Cargo.toml |   2 +-
 4 files changed, 8 insertions(+), 117 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 2737a4d934..4daeef1f06 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1342,21 +1342,6 @@ version = "1.0.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
 
-[[package]]
-name = "foreign-types"
-version = "0.3.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1"
-dependencies = [
- "foreign-types-shared",
-]
-
-[[package]]
-name = "foreign-types-shared"
-version = "0.1.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b"
-
 [[package]]
 name = "form_urlencoded"
 version = "1.1.0"
@@ -1757,19 +1742,6 @@ dependencies = [
  "tokio-io-timeout",
 ]
 
-[[package]]
-name = "hyper-tls"
-version = "0.5.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d6183ddfa99b85da61a140bea0efc93fdf56ceaa041b37d553518030827f9905"
-dependencies = [
- "bytes",
- "hyper",
- "native-tls",
- "tokio",
- "tokio-native-tls",
-]
-
 [[package]]
 name = "iana-time-zone"
 version = "0.1.53"
@@ -2141,24 +2113,6 @@ version = "0.8.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a"
 
-[[package]]
-name = "native-tls"
-version = "0.2.11"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "07226173c32f2926027b63cce4bcd8076c3552846cbe7925f3aaffeac0a3b92e"
-dependencies = [
- "lazy_static",
- "libc",
- "log",
- "openssl",
- "openssl-probe",
- "openssl-sys",
- "schannel",
- "security-framework",
- "security-framework-sys",
- "tempfile",
-]
-
 [[package]]
 name = "nix"
 version = "0.23.2"
@@ -2305,51 +2259,12 @@ version = "11.1.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575"
 
-[[package]]
-name = "openssl"
-version = "0.10.44"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "29d971fd5722fec23977260f6e81aa67d2f22cadbdc2aa049f1022d9a3be1566"
-dependencies = [
- "bitflags",
- "cfg-if",
- "foreign-types",
- "libc",
- "once_cell",
- "openssl-macros",
- "openssl-sys",
-]
-
-[[package]]
-name = "openssl-macros"
-version = "0.1.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b501e44f11665960c7e7fcf062c7d96a14ade4aa98116c004b2e37b5be7d736c"
-dependencies = [
- "proc-macro2",
- "quote",
- "syn",
-]
-
 [[package]]
 name = "openssl-probe"
 version = "0.1.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf"
 
-[[package]]
-name = "openssl-sys"
-version = "0.9.79"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5454462c0eced1e97f2ec09036abc8da362e66802f66fd20f86854d9d8cbcbc4"
-dependencies = [
- "autocfg",
- "cc",
- "libc",
- "pkg-config",
- "vcpkg",
-]
-
 [[package]]
 name = "os_info"
 version = "3.5.1"
@@ -2583,12 +2498,6 @@ version = "0.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184"
 
-[[package]]
-name = "pkg-config"
-version = "0.3.26"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6ac9a59f73473f1b8d852421e59e64809f025994837ef743615c6d0c5b305160"
-
 [[package]]
 name = "plotters"
 version = "0.3.4"
@@ -3095,12 +3004,10 @@ dependencies = [
  "http-body",
  "hyper",
  "hyper-rustls",
- "hyper-tls",
  "ipnet",
  "js-sys",
  "log",
  "mime",
- "native-tls",
  "once_cell",
  "percent-encoding",
  "pin-project-lite",
@@ -3110,7 +3017,6 @@ dependencies = [
  "serde_json",
  "serde_urlencoded",
  "tokio",
- "tokio-native-tls",
  "tokio-rustls",
  "tower-service",
  "url",
@@ -3423,15 +3329,14 @@ version = "0.29.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "17ad137b9df78294b98cab1a650bef237cc6c950e82e5ce164655e674d07c5cc"
 dependencies = [
- "httpdate",
- "native-tls",
  "reqwest",
+ "rustls",
  "sentry-backtrace",
  "sentry-contexts",
  "sentry-core",
  "sentry-panic",
- "tokio",
  "ureq",
+ "webpki-roots",
 ]
 
 [[package]]
@@ -4004,16 +3909,6 @@ dependencies = [
  "syn",
 ]
 
-[[package]]
-name = "tokio-native-tls"
-version = "0.3.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f7d995660bd2b7f8c1568414c1126076c13fbb725c40112dc0120b78eb9b717b"
-dependencies = [
- "native-tls",
- "tokio",
-]
-
 [[package]]
 name = "tokio-postgres"
 version = "0.7.7"
@@ -4362,9 +4257,11 @@ dependencies = [
  "base64 0.13.1",
  "chunked_transfer",
  "log",
- "native-tls",
  "once_cell",
+ "rustls",
  "url",
+ "webpki",
+ "webpki-roots",
 ]
 
 [[package]]
@@ -4447,12 +4344,6 @@ version = "0.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d"
 
-[[package]]
-name = "vcpkg"
-version = "0.2.15"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426"
-
 [[package]]
 name = "version_check"
 version = "0.9.4"
diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml
index 47639e8205..9324a862b4 100644
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -4,7 +4,7 @@ version = "0.1.0"
 edition = "2021"
 
 [dependencies]
-sentry = "0.29.0"
+sentry = { version = "0.29.0", default-features = false, features = ["backtrace", "contexts", "panic", "rustls" ] }
 async-trait = "0.1"
 anyhow = "1.0"
 bincode = "1.3"
diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml
index f5acfcbdc0..cd12ee0cc9 100644
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -69,7 +69,7 @@ storage_broker = { version = "0.1", path = "../storage_broker" }
 tenant_size_model = { path = "../libs/tenant_size_model" }
 utils = { path = "../libs/utils" }
 workspace_hack = { version = "0.1", path = "../workspace_hack" }
-reqwest = "0.11.13"
+reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] }
 
 [dev-dependencies]
 criterion = "0.4"
diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml
index 6c81756fe1..e36075921f 100644
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -37,7 +37,7 @@ prost = { version = "0.11", features = ["prost-derive", "std"] }
 rand = { version = "0.8", features = ["alloc", "getrandom", "libc", "rand_chacha", "rand_hc", "small_rng", "std", "std_rng"] }
 regex = { version = "1", features = ["aho-corasick", "memchr", "perf", "perf-cache", "perf-dfa", "perf-inline", "perf-literal", "std", "unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] }
 regex-syntax = { version = "0.6", features = ["unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] }
-reqwest = { version = "0.11", features = ["__rustls", "__tls", "blocking", "default-tls", "hyper-rustls", "hyper-tls", "json", "native-tls-crate", "rustls", "rustls-pemfile", "rustls-tls", "rustls-tls-webpki-roots", "serde_json", "tokio-native-tls", "tokio-rustls", "webpki-roots"] }
+reqwest = { version = "0.11", default-features = false, features = ["__rustls", "__tls", "blocking", "hyper-rustls", "json", "rustls", "rustls-pemfile", "rustls-tls", "rustls-tls-webpki-roots", "serde_json", "tokio-rustls", "webpki-roots"] }
 scopeguard = { version = "1", features = ["use_std"] }
 serde = { version = "1", features = ["alloc", "derive", "serde_derive", "std"] }
 serde_json = { version = "1", features = ["raw_value", "std"] }

From 41b8e673052e55a77b1403f3eac087446c2dbc38 Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Mon, 2 Jan 2023 14:50:51 +0400
Subject: [PATCH 05/42] Fix 81afd7011 by enabling reqwest feature for sentry.

That commit disabled sentry's transport altogether.
---
 Cargo.lock                | 3 ++-
 libs/utils/Cargo.toml     | 2 +-
 workspace_hack/Cargo.toml | 1 -
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 4daeef1f06..46170717d6 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3329,12 +3329,14 @@ version = "0.29.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "17ad137b9df78294b98cab1a650bef237cc6c950e82e5ce164655e674d07c5cc"
 dependencies = [
+ "httpdate",
  "reqwest",
  "rustls",
  "sentry-backtrace",
  "sentry-contexts",
  "sentry-core",
  "sentry-panic",
+ "tokio",
  "ureq",
  "webpki-roots",
 ]
@@ -4666,7 +4668,6 @@ dependencies = [
  "rand",
  "regex",
  "regex-syntax",
- "reqwest",
  "scopeguard",
  "serde",
  "serde_json",
diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml
index 9324a862b4..670270b63e 100644
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -4,7 +4,7 @@ version = "0.1.0"
 edition = "2021"
 
 [dependencies]
-sentry = { version = "0.29.0", default-features = false, features = ["backtrace", "contexts", "panic", "rustls" ] }
+sentry = { version = "0.29.0", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] }
 async-trait = "0.1"
 anyhow = "1.0"
 bincode = "1.3"
diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml
index e36075921f..4c7fbd8333 100644
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -37,7 +37,6 @@ prost = { version = "0.11", features = ["prost-derive", "std"] }
 rand = { version = "0.8", features = ["alloc", "getrandom", "libc", "rand_chacha", "rand_hc", "small_rng", "std", "std_rng"] }
 regex = { version = "1", features = ["aho-corasick", "memchr", "perf", "perf-cache", "perf-dfa", "perf-inline", "perf-literal", "std", "unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] }
 regex-syntax = { version = "0.6", features = ["unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] }
-reqwest = { version = "0.11", default-features = false, features = ["__rustls", "__tls", "blocking", "hyper-rustls", "json", "rustls", "rustls-pemfile", "rustls-tls", "rustls-tls-webpki-roots", "serde_json", "tokio-rustls", "webpki-roots"] }
 scopeguard = { version = "1", features = ["use_std"] }
 serde = { version = "1", features = ["alloc", "derive", "serde_derive", "std"] }
 serde_json = { version = "1", features = ["raw_value", "std"] }

From 56a4466d0a85a9498bfd2a78a4ad3a2facb58167 Mon Sep 17 00:00:00 2001
From: Kirill Bulatov <kirill@neon.tech>
Date: Mon, 2 Jan 2023 14:34:06 +0200
Subject: [PATCH 06/42] Run Python tests in 8 threads (#3206)

I have experimented with the number of runner threads, and it looks like 8
threads win us a few seconds.

Bumping the thread count more did not improve the situation much:
* 20 threads were not allowed by pytest
* 16 threads were flaking quite notably

My guess is that all the pageservers, safekeepers, and other nodes we
start occupy too much of the CPU and other resources for this approach
to scale further.
---
 .github/actions/run-python-test-set/action.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml
index 990c7e25a9..95167ecf6c 100644
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -123,8 +123,8 @@ runs:
           exit 1
         fi
         if [[ "${{ inputs.run_in_parallel }}" == "true" ]]; then
-          # -n4 uses four processes to run tests via pytest-xdist
-          EXTRA_PARAMS="-n4 $EXTRA_PARAMS"
+          # -n8 uses eight processes to run tests via pytest-xdist
+          EXTRA_PARAMS="-n8 $EXTRA_PARAMS"
 
           # --dist=loadgroup points tests marked with @pytest.mark.xdist_group
           # to the same worker to make @pytest.mark.order work with xdist

From 6fd64cd5f67fbc6cfc8286138004e900a28e4d3b Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Mon, 2 Jan 2023 16:03:26 +0400
Subject: [PATCH 07/42] Allow failure to report metrics in
 test_metric_collection.

Per CI
https://github.com/neondatabase/neon/actions/runs/3822039946/attempts/1
shutdown seems to be racy.
---
 test_runner/regress/test_metric_collection.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/test_runner/regress/test_metric_collection.py b/test_runner/regress/test_metric_collection.py
index ac9f163801..0fff86f268 100644
--- a/test_runner/regress/test_metric_collection.py
+++ b/test_runner/regress/test_metric_collection.py
@@ -107,6 +107,9 @@ def test_metric_collection(
 
     # spin up neon,  after http server is ready
     env = neon_env_builder.init_start()
+    # Order of fixture shutdown is not specified, and if the http server goes down
+    # before the pageserver, the pageserver log might contain such errors at the end.
+    env.pageserver.allowed_errors.append(".*metrics endpoint refused the sent metrics*")
     env.neon_cli.create_branch("test_metric_collection")
     pg = env.postgres.create_start("test_metric_collection")
 

From a9cca7a0fd7c7585334ae9e5cdb3f13b20db324a Mon Sep 17 00:00:00 2001
From: Kirill Bulatov <kirill@neon.tech>
Date: Mon, 2 Jan 2023 16:51:05 +0200
Subject: [PATCH 08/42] Use proper error code for BeMessage error responses
 (#3240)

Based on
https://github.com/neondatabase/neon/pull/3227#discussion_r1059430067

It seems that the constant used for the internal error code during BeMessage
error response serialization is incorrect.
The one currently used is `CXX000`, yet all docs mention `XX000` instead:

* https://www.postgresql.org/docs/current/errcodes-appendix.html
* https://docs.rs/postgres/latest/postgres/error/struct.SqlState.html#associatedconstant.INTERNAL_ERROR

I have checked it with the patch and logs described in
https://github.com/neondatabase/neon/pull/3227#discussion_r1059949982
---
 libs/pq_proto/src/lib.rs | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs
index 278f044c15..d31a2d51f2 100644
--- a/libs/pq_proto/src/lib.rs
+++ b/libs/pq_proto/src/lib.rs
@@ -626,6 +626,8 @@ fn read_cstr(buf: &mut Bytes) -> anyhow::Result<Bytes> {
     Ok(result)
 }
 
+const SQLSTATE_INTERNAL_ERROR: &str = "XX000\0";
+
 impl<'a> BeMessage<'a> {
     /// Write message to the given buf.
     // Unlike the reading side, we use BytesMut
@@ -776,7 +778,7 @@ impl<'a> BeMessage<'a> {
                     buf.put_slice(b"ERROR\0");
 
                     buf.put_u8(b'C'); // SQLSTATE error code
-                    buf.put_slice(b"CXX000\0");
+                    buf.put_slice(SQLSTATE_INTERNAL_ERROR.as_bytes());
 
                     buf.put_u8(b'M'); // the message
                     write_cstr(error_msg, buf)?;
@@ -799,7 +801,7 @@ impl<'a> BeMessage<'a> {
                     buf.put_slice(b"NOTICE\0");
 
                     buf.put_u8(b'C'); // SQLSTATE error code
-                    buf.put_slice(b"CXX000\0");
+                    buf.put_slice(SQLSTATE_INTERNAL_ERROR.as_bytes());
 
                     buf.put_u8(b'M'); // the message
                     write_cstr(error_msg.as_bytes(), buf)?;

From 182dc785d6d5af3ef91e83f94688d71f9652175f Mon Sep 17 00:00:00 2001
From: Shany Pozin <shany@neon.tech>
Date: Mon, 2 Jan 2023 18:05:23 +0200
Subject: [PATCH 09/42] Set PITR default to 7 days (#3245)

https://github.com/neondatabase/cloud/issues/3406
---
 pageserver/src/tenant/config.rs         | 2 +-
 test_runner/regress/test_tenant_conf.py | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs
index 8569c70217..c95a98fbc7 100644
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -30,7 +30,7 @@ pub mod defaults {
     pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024;
     pub const DEFAULT_GC_PERIOD: &str = "100 s";
     pub const DEFAULT_IMAGE_CREATION_THRESHOLD: usize = 3;
-    pub const DEFAULT_PITR_INTERVAL: &str = "30 days";
+    pub const DEFAULT_PITR_INTERVAL: &str = "7 days";
     pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "2 seconds";
     pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "3 seconds";
     pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 10 * 1024 * 1024;
diff --git a/test_runner/regress/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py
index 6d621fbb77..29cdcb18ce 100644
--- a/test_runner/regress/test_tenant_conf.py
+++ b/test_runner/regress/test_tenant_conf.py
@@ -59,7 +59,7 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}"""
                     "gc_horizon": 67108864,
                     "gc_period": 100,
                     "image_creation_threshold": 3,
-                    "pitr_interval": 2592000,
+                    "pitr_interval": 604800,  # 7 days
                 }.items()
             )
 
@@ -79,7 +79,7 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}"""
                     "gc_horizon": 67108864,
                     "gc_period": 30,
                     "image_creation_threshold": 3,
-                    "pitr_interval": 2592000,
+                    "pitr_interval": 604800,
                 }.items()
             )
 
@@ -107,7 +107,7 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}"""
                     "gc_horizon": 67108864,
                     "gc_period": 80,
                     "image_creation_threshold": 3,
-                    "pitr_interval": 2592000,
+                    "pitr_interval": 604800,
                 }.items()
             )
 
@@ -130,7 +130,7 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}"""
                     "gc_horizon": 67108864,
                     "gc_period": 80,
                     "image_creation_threshold": 3,
-                    "pitr_interval": 2592000,
+                    "pitr_interval": 604800,
                 }.items()
             )
 

From 4c4d3dc87a731734881ed7a88d535f89f02b046f Mon Sep 17 00:00:00 2001
From: Sergey Melnikov <sergey@neon.tech>
Date: Mon, 2 Jan 2023 22:14:05 +0400
Subject: [PATCH 10/42] Add new pageserver to us-east-2 staging (#3248)

---
 .github/ansible/staging.us-east-2.hosts.yaml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/ansible/staging.us-east-2.hosts.yaml b/.github/ansible/staging.us-east-2.hosts.yaml
index 11c7992444..1d1b8dbfa4 100644
--- a/.github/ansible/staging.us-east-2.hosts.yaml
+++ b/.github/ansible/staging.us-east-2.hosts.yaml
@@ -27,6 +27,8 @@ storage:
           ansible_host: i-0c3e70929edb5d691
         pageserver-1.us-east-2.aws.neon.build:
           ansible_host: i-0565a8b4008aa3f40
+        pageserver-2.us-east-2.aws.neon.build:
+          ansible_host: i-01e31cdf7e970586a
 
     safekeepers:
       hosts:

From 5bc9f8eae01deffc0d8cc95dfae482b922bb773e Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Mon, 2 Jan 2023 16:54:39 +0100
Subject: [PATCH 11/42] README: Fedora needs protobuf-devel

Otherwise, common protobufs such as Google's empty.proto are missing,
resulting in storage_broker build.rs failure.

I encountered this on Fedora 36.
---
 README.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 30bde949a9..fa5c1626e4 100644
--- a/README.md
+++ b/README.md
@@ -31,7 +31,8 @@ libssl-dev clang pkg-config libpq-dev cmake postgresql-client protobuf-compiler
 * On Fedora, these packages are needed:
 ```bash
 dnf install flex bison readline-devel zlib-devel openssl-devel \
-  libseccomp-devel perl clang cmake postgresql postgresql-contrib protobuf-compiler
+  libseccomp-devel perl clang cmake postgresql postgresql-contrib protobuf-compiler \
+  protobuf-devel
 ```
 
 2. [Install Rust](https://www.rust-lang.org/tools/install)

From 0a0e55c3d08ce8b6956b9235888946d4ff97a4f7 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Tue, 3 Jan 2023 12:39:11 +0200
Subject: [PATCH 12/42] Replace 'tar' crate with 'tokio-tar' (#3202)

The synchronous 'tar' crate has required us to use block_in_place and
SyncIoBridge to work together with the async I/O in the client
connection. Switch to 'tokio-tar' crate that uses async I/O natively.

As part of this, move the CopyDataWriter implementation to
postgres_backend_async.rs. Even though it's only used in one place
currently, it's in principle generally applicable whenever you want to
use COPY out.

Unfortunately we cannot use the 'tokio-tar' as it is: the Builder
implementation requires the writer to have 'static lifetime. So we
have to use a modified version without that requirement. The 'static
lifetime was required just for the Drop implementation that writes
the end-of-archive sections if the Builder is dropped without calling
`finish`. But we don't actually want that behavior anyway; in fact
we had to jump through some hoops with the AbortableWrite hack to skip
those. With the modified version of 'tokio-tar' without that Drop
implementation, we don't need AbortableWrite either.

Co-authored-by: Kirill Bulatov <kirill@neon.tech>
---
 Cargo.lock                               |  16 +-
 libs/utils/src/postgres_backend_async.rs | 105 ++++++++++-
 pageserver/Cargo.toml                    |   2 +-
 pageserver/src/basebackup.rs             | 229 +++++++++--------------
 pageserver/src/import_datadir.rs         | 103 +++++-----
 pageserver/src/page_service.rs           |  52 +----
 pageserver/src/tenant.rs                 |  32 ++--
 test_runner/regress/test_config.py       |   0
 8 files changed, 278 insertions(+), 261 deletions(-)
 mode change 100644 => 100755 test_runner/regress/test_config.py

diff --git a/Cargo.lock b/Cargo.lock
index 46170717d6..ad1fc67219 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2339,12 +2339,12 @@ dependencies = [
  "signal-hook",
  "storage_broker",
  "svg_fmt",
- "tar",
  "tempfile",
  "tenant_size_model",
  "thiserror",
  "tokio",
  "tokio-postgres",
+ "tokio-tar",
  "tokio-util",
  "toml_edit",
  "tracing",
@@ -3970,6 +3970,20 @@ dependencies = [
  "tokio",
 ]
 
+[[package]]
+name = "tokio-tar"
+version = "0.3.0"
+source = "git+https://github.com/neondatabase/tokio-tar.git?rev=404df61437de0feef49ba2ccdbdd94eb8ad6e142#404df61437de0feef49ba2ccdbdd94eb8ad6e142"
+dependencies = [
+ "filetime",
+ "futures-core",
+ "libc",
+ "redox_syscall",
+ "tokio",
+ "tokio-stream",
+ "xattr",
+]
+
 [[package]]
 name = "tokio-util"
 version = "0.7.4"
diff --git a/libs/utils/src/postgres_backend_async.rs b/libs/utils/src/postgres_backend_async.rs
index dc93131b61..de547c3242 100644
--- a/libs/utils/src/postgres_backend_async.rs
+++ b/libs/utils/src/postgres_backend_async.rs
@@ -5,7 +5,7 @@
 
 use crate::postgres_backend::AuthType;
 use anyhow::{bail, Context, Result};
-use bytes::{Bytes, BytesMut};
+use bytes::{Buf, Bytes, BytesMut};
 use pq_proto::{BeMessage, FeMessage, FeStartupPacket};
 use std::future::Future;
 use std::net::SocketAddr;
@@ -114,7 +114,10 @@ impl AsyncRead for Stream {
 
 pub struct PostgresBackend {
     stream: Stream,
+
     // Output buffer. c.f. BeMessage::write why we are using BytesMut here.
+    // The data between 0 and the "current position", as tracked by the bytes::Buf
+    // implementation of BytesMut, has already been written.
     buf_out: BytesMut,
 
     pub state: ProtoState,
@@ -174,10 +177,13 @@ impl PostgresBackend {
     }
 
     /// Flush output buffer into the socket.
-    pub async fn flush(&mut self) -> std::io::Result<&mut Self> {
-        self.stream.write_all(&self.buf_out).await?;
+    pub async fn flush(&mut self) -> std::io::Result<()> {
+        while self.buf_out.has_remaining() {
+            let bytes_written = self.stream.write(self.buf_out.chunk()).await?;
+            self.buf_out.advance(bytes_written);
+        }
         self.buf_out.clear();
-        Ok(self)
+        Ok(())
     }
 
     /// Write message into internal output buffer.
@@ -186,6 +192,36 @@ impl PostgresBackend {
         Ok(self)
     }
 
+    /// Returns an AsyncWrite implementation that wraps all the data written
+    /// to it in CopyData messages, and writes them to the connection
+    ///
+    /// The caller is responsible for sending CopyOutResponse and CopyDone messages.
+    pub fn copyout_writer(&mut self) -> CopyDataWriter {
+        CopyDataWriter { pgb: self }
+    }
+
+    /// A polling function that tries to write all the data from 'buf_out' to the
+    /// underlying stream.
+    fn poll_write_buf(
+        &mut self,
+        cx: &mut std::task::Context<'_>,
+    ) -> Poll<Result<(), std::io::Error>> {
+        while self.buf_out.has_remaining() {
+            match Pin::new(&mut self.stream).poll_write(cx, self.buf_out.chunk()) {
+                Poll::Ready(Ok(bytes_written)) => {
+                    self.buf_out.advance(bytes_written);
+                }
+                Poll::Ready(Err(err)) => return Poll::Ready(Err(err)),
+                Poll::Pending => return Poll::Pending,
+            }
+        }
+        Poll::Ready(Ok(()))
+    }
+
+    fn poll_flush(&mut self, cx: &mut std::task::Context<'_>) -> Poll<Result<(), std::io::Error>> {
+        Pin::new(&mut self.stream).poll_flush(cx)
+    }
+
     // Wrapper for run_message_loop() that shuts down socket when we are done
     pub async fn run<F, S>(mut self, handler: &mut impl Handler, shutdown_watcher: F) -> Result<()>
     where
@@ -458,3 +494,64 @@ impl PostgresBackend {
         Ok(ProcessMsgResult::Continue)
     }
 }
+
+///
+/// A tokio::io::AsyncWrite implementation that wraps all data written to it in CopyData
+/// messages.
+///
+
+pub struct CopyDataWriter<'a> {
+    pgb: &'a mut PostgresBackend,
+}
+
+impl<'a> AsyncWrite for CopyDataWriter<'a> {
+    fn poll_write(
+        self: Pin<&mut Self>,
+        cx: &mut std::task::Context<'_>,
+        buf: &[u8],
+    ) -> Poll<Result<usize, std::io::Error>> {
+        let this = self.get_mut();
+
+        // It's not strictly required to flush between each message, but makes it easier
+        // to view in wireshark, and usually the messages that the callers write are
+        // decently-sized anyway.
+        match this.pgb.poll_write_buf(cx) {
+            Poll::Ready(Ok(())) => {}
+            Poll::Ready(Err(err)) => return Poll::Ready(Err(err)),
+            Poll::Pending => return Poll::Pending,
+        }
+
+        // CopyData
+        // XXX: if the input is large, we should split it into multiple messages.
+        // Not sure what the threshold should be, but the ultimate hard limit is that
+        // the length cannot exceed u32.
+        this.pgb.write_message(&BeMessage::CopyData(buf))?;
+
+        Poll::Ready(Ok(buf.len()))
+    }
+
+    fn poll_flush(
+        self: Pin<&mut Self>,
+        cx: &mut std::task::Context<'_>,
+    ) -> Poll<Result<(), std::io::Error>> {
+        let this = self.get_mut();
+        match this.pgb.poll_write_buf(cx) {
+            Poll::Ready(Ok(())) => {}
+            Poll::Ready(Err(err)) => return Poll::Ready(Err(err)),
+            Poll::Pending => return Poll::Pending,
+        }
+        this.pgb.poll_flush(cx)
+    }
+    fn poll_shutdown(
+        self: Pin<&mut Self>,
+        cx: &mut std::task::Context<'_>,
+    ) -> Poll<Result<(), std::io::Error>> {
+        let this = self.get_mut();
+        match this.pgb.poll_write_buf(cx) {
+            Poll::Ready(Ok(())) => {}
+            Poll::Ready(Err(err)) => return Poll::Ready(Err(err)),
+            Poll::Pending => return Poll::Pending,
+        }
+        this.pgb.poll_flush(cx)
+    }
+}
diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml
index cd12ee0cc9..c0f3c76c4e 100644
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -49,7 +49,7 @@ serde_json = { version = "1.0", features = ["raw_value"] }
 serde_with = "2.0"
 signal-hook = "0.3.10"
 svg_fmt = "0.4.1"
-tar = "0.4.33"
+tokio-tar = { git = "https://github.com/neondatabase/tokio-tar.git", rev="404df61437de0feef49ba2ccdbdd94eb8ad6e142" }
 thiserror = "1.0"
 tokio = { version = "1.17", features = ["process", "sync", "macros", "fs", "rt", "io-util", "time"] }
 tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs
index 36664e119e..e537048489 100644
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -13,17 +13,22 @@
 use anyhow::{anyhow, bail, ensure, Context, Result};
 use bytes::{BufMut, BytesMut};
 use fail::fail_point;
-use itertools::Itertools;
 use std::fmt::Write as FmtWrite;
-use std::io;
-use std::io::Write;
 use std::sync::Arc;
 use std::time::SystemTime;
-use tar::{Builder, EntryType, Header};
+use tokio::io;
+use tokio::io::AsyncWrite;
 use tracing::*;
 
-use crate::task_mgr;
-use crate::tenant::{with_ondemand_download, PageReconstructResult, Timeline};
+/// NB: This relies on a modified version of tokio_tar that does *not* write the
+/// end-of-archive marker (1024 zero bytes), when the Builder struct is dropped
+/// without explicitly calling 'finish' or 'into_inner'!
+///
+/// See https://github.com/neondatabase/tokio-tar/pull/1
+///
+use tokio_tar::{Builder, EntryType, Header};
+
+use crate::tenant::{with_ondemand_download, Timeline};
 use pageserver_api::reltag::{RelTag, SlruKind};
 
 use postgres_ffi::pg_constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID};
@@ -39,14 +44,13 @@ use utils::lsn::Lsn;
 /// used for constructing tarball.
 pub struct Basebackup<'a, W>
 where
-    W: Write,
+    W: AsyncWrite + Send + Sync + Unpin,
 {
-    ar: Builder<AbortableWrite<W>>,
+    ar: Builder<&'a mut W>,
     timeline: &'a Arc<Timeline>,
     pub lsn: Lsn,
     prev_record_lsn: Lsn,
     full_backup: bool,
-    finished: bool,
 }
 
 // Create basebackup with non-rel data in it.
@@ -59,10 +63,10 @@ where
 //    to start the replication.
 impl<'a, W> Basebackup<'a, W>
 where
-    W: Write,
+    W: AsyncWrite + Send + Sync + Unpin,
 {
     pub fn new(
-        write: W,
+        write: &'a mut W,
         timeline: &'a Arc<Timeline>,
         req_lsn: Option<Lsn>,
         prev_lsn: Option<Lsn>,
@@ -117,22 +121,21 @@ where
         );
 
         Ok(Basebackup {
-            ar: Builder::new(AbortableWrite::new(write)),
+            ar: Builder::new_non_terminated(write),
             timeline,
             lsn: backup_lsn,
             prev_record_lsn: prev_lsn,
             full_backup,
-            finished: false,
         })
     }
 
-    pub fn send_tarball(mut self) -> anyhow::Result<()> {
+    pub async fn send_tarball(mut self) -> anyhow::Result<()> {
         // TODO include checksum
 
         // Create pgdata subdirs structure
         for dir in PGDATA_SUBDIRS.iter() {
             let header = new_tar_header_dir(dir)?;
-            self.ar.append(&header, &mut io::empty())?;
+            self.ar.append(&header, &mut io::empty()).await?;
         }
 
         // Send empty config files.
@@ -140,10 +143,10 @@ where
             if *filepath == "pg_hba.conf" {
                 let data = PG_HBA.as_bytes();
                 let header = new_tar_header(filepath, data.len() as u64)?;
-                self.ar.append(&header, data)?;
+                self.ar.append(&header, data).await?;
             } else {
                 let header = new_tar_header(filepath, 0)?;
-                self.ar.append(&header, &mut io::empty())?;
+                self.ar.append(&header, &mut io::empty()).await?;
             }
         }
 
@@ -154,29 +157,30 @@ where
             SlruKind::MultiXactMembers,
         ] {
             for segno in
-                with_ondemand_download_sync(|| self.timeline.list_slru_segments(kind, self.lsn))?
+                with_ondemand_download(|| self.timeline.list_slru_segments(kind, self.lsn)).await?
             {
-                self.add_slru_segment(kind, segno)?;
+                self.add_slru_segment(kind, segno).await?;
             }
         }
 
         // Create tablespace directories
         for ((spcnode, dbnode), has_relmap_file) in
-            with_ondemand_download_sync(|| self.timeline.list_dbdirs(self.lsn))?
+            with_ondemand_download(|| self.timeline.list_dbdirs(self.lsn)).await?
         {
-            self.add_dbdir(spcnode, dbnode, has_relmap_file)?;
+            self.add_dbdir(spcnode, dbnode, has_relmap_file).await?;
 
             // Gather and send relational files in each database if full backup is requested.
             if self.full_backup {
-                for rel in with_ondemand_download_sync(|| {
-                    self.timeline.list_rels(spcnode, dbnode, self.lsn)
-                })? {
-                    self.add_rel(rel)?;
+                for rel in
+                    with_ondemand_download(|| self.timeline.list_rels(spcnode, dbnode, self.lsn))
+                        .await?
+                {
+                    self.add_rel(rel).await?;
                 }
             }
         }
-        for xid in with_ondemand_download_sync(|| self.timeline.list_twophase_files(self.lsn))? {
-            self.add_twophase_file(xid)?;
+        for xid in with_ondemand_download(|| self.timeline.list_twophase_files(self.lsn)).await? {
+            self.add_twophase_file(xid).await?;
         }
 
         fail_point!("basebackup-before-control-file", |_| {
@@ -184,36 +188,32 @@ where
         });
 
         // Generate pg_control and bootstrap WAL segment.
-        self.add_pgcontrol_file()?;
-        self.ar.finish()?;
-        self.finished = true;
+        self.add_pgcontrol_file().await?;
+        self.ar.finish().await?;
         debug!("all tarred up!");
         Ok(())
     }
 
-    fn add_rel(&mut self, tag: RelTag) -> anyhow::Result<()> {
+    async fn add_rel(&mut self, tag: RelTag) -> anyhow::Result<()> {
         let nblocks =
-            with_ondemand_download_sync(|| self.timeline.get_rel_size(tag, self.lsn, false))?;
-
-        // Function that adds relation segment data to archive
-        let mut add_file = |segment_index, data: &Vec<u8>| -> anyhow::Result<()> {
-            let file_name = tag.to_segfile_name(segment_index as u32);
-            let header = new_tar_header(&file_name, data.len() as u64)?;
-            self.ar.append(&header, data.as_slice())?;
-            Ok(())
-        };
+            with_ondemand_download(|| self.timeline.get_rel_size(tag, self.lsn, false)).await?;
 
         // If the relation is empty, create an empty file
         if nblocks == 0 {
-            add_file(0, &vec![])?;
+            let file_name = tag.to_segfile_name(0);
+            let header = new_tar_header(&file_name, 0)?;
+            self.ar.append(&header, &mut io::empty()).await?;
             return Ok(());
         }
 
         // Add a file for each chunk of blocks (aka segment)
-        let chunks = (0..nblocks).chunks(RELSEG_SIZE as usize);
-        for (seg, blocks) in chunks.into_iter().enumerate() {
+        let mut startblk = 0;
+        let mut seg = 0;
+        while startblk < nblocks {
+            let endblk = std::cmp::min(startblk + RELSEG_SIZE, nblocks);
+
             let mut segment_data: Vec<u8> = vec![];
-            for blknum in blocks {
+            for blknum in startblk..endblk {
                 let img = self
                     .timeline
                     .get_rel_page_at_lsn(tag, blknum, self.lsn, false)
@@ -221,7 +221,12 @@ where
                 segment_data.extend_from_slice(&img[..]);
             }
 
-            add_file(seg, &segment_data)?;
+            let file_name = tag.to_segfile_name(seg as u32);
+            let header = new_tar_header(&file_name, segment_data.len() as u64)?;
+            self.ar.append(&header, segment_data.as_slice()).await?;
+
+            seg += 1;
+            startblk = endblk;
         }
 
         Ok(())
@@ -230,17 +235,18 @@ where
     //
     // Generate SLRU segment files from repository.
     //
-    fn add_slru_segment(&mut self, slru: SlruKind, segno: u32) -> anyhow::Result<()> {
-        let nblocks = with_ondemand_download_sync(|| {
-            self.timeline.get_slru_segment_size(slru, segno, self.lsn)
-        })?;
+    async fn add_slru_segment(&mut self, slru: SlruKind, segno: u32) -> anyhow::Result<()> {
+        let nblocks =
+            with_ondemand_download(|| self.timeline.get_slru_segment_size(slru, segno, self.lsn))
+                .await?;
 
         let mut slru_buf: Vec<u8> = Vec::with_capacity(nblocks as usize * BLCKSZ as usize);
         for blknum in 0..nblocks {
-            let img = with_ondemand_download_sync(|| {
+            let img = with_ondemand_download(|| {
                 self.timeline
                     .get_slru_page_at_lsn(slru, segno, blknum, self.lsn)
-            })?;
+            })
+            .await?;
 
             if slru == SlruKind::Clog {
                 ensure!(img.len() == BLCKSZ as usize || img.len() == BLCKSZ as usize + 8);
@@ -253,7 +259,7 @@ where
 
         let segname = format!("{}/{:>04X}", slru.to_str(), segno);
         let header = new_tar_header(&segname, slru_buf.len() as u64)?;
-        self.ar.append(&header, slru_buf.as_slice())?;
+        self.ar.append(&header, slru_buf.as_slice()).await?;
 
         trace!("Added to basebackup slru {} relsize {}", segname, nblocks);
         Ok(())
@@ -265,16 +271,16 @@ where
     // Each directory contains a PG_VERSION file, and the default database
     // directories also contain pg_filenode.map files.
     //
-    fn add_dbdir(
+    async fn add_dbdir(
         &mut self,
         spcnode: u32,
         dbnode: u32,
         has_relmap_file: bool,
     ) -> anyhow::Result<()> {
         let relmap_img = if has_relmap_file {
-            let img = with_ondemand_download_sync(|| {
-                self.timeline.get_relmap_file(spcnode, dbnode, self.lsn)
-            })?;
+            let img =
+                with_ondemand_download(|| self.timeline.get_relmap_file(spcnode, dbnode, self.lsn))
+                    .await?;
             ensure!(img.len() == 512);
             Some(img)
         } else {
@@ -284,14 +290,14 @@ where
         if spcnode == GLOBALTABLESPACE_OID {
             let pg_version_str = self.timeline.pg_version.to_string();
             let header = new_tar_header("PG_VERSION", pg_version_str.len() as u64)?;
-            self.ar.append(&header, pg_version_str.as_bytes())?;
+            self.ar.append(&header, pg_version_str.as_bytes()).await?;
 
             info!("timeline.pg_version {}", self.timeline.pg_version);
 
             if let Some(img) = relmap_img {
                 // filenode map for global tablespace
                 let header = new_tar_header("global/pg_filenode.map", img.len() as u64)?;
-                self.ar.append(&header, &img[..])?;
+                self.ar.append(&header, &img[..]).await?;
             } else {
                 warn!("global/pg_filenode.map is missing");
             }
@@ -321,18 +327,18 @@ where
             // Append dir path for each database
             let path = format!("base/{}", dbnode);
             let header = new_tar_header_dir(&path)?;
-            self.ar.append(&header, &mut io::empty())?;
+            self.ar.append(&header, &mut io::empty()).await?;
 
             if let Some(img) = relmap_img {
                 let dst_path = format!("base/{}/PG_VERSION", dbnode);
 
                 let pg_version_str = self.timeline.pg_version.to_string();
                 let header = new_tar_header(&dst_path, pg_version_str.len() as u64)?;
-                self.ar.append(&header, pg_version_str.as_bytes())?;
+                self.ar.append(&header, pg_version_str.as_bytes()).await?;
 
                 let relmap_path = format!("base/{}/pg_filenode.map", dbnode);
                 let header = new_tar_header(&relmap_path, img.len() as u64)?;
-                self.ar.append(&header, &img[..])?;
+                self.ar.append(&header, &img[..]).await?;
             }
         };
         Ok(())
@@ -341,8 +347,8 @@ where
     //
     // Extract twophase state files
     //
-    fn add_twophase_file(&mut self, xid: TransactionId) -> anyhow::Result<()> {
-        let img = with_ondemand_download_sync(|| self.timeline.get_twophase_file(xid, self.lsn))?;
+    async fn add_twophase_file(&mut self, xid: TransactionId) -> anyhow::Result<()> {
+        let img = with_ondemand_download(|| self.timeline.get_twophase_file(xid, self.lsn)).await?;
 
         let mut buf = BytesMut::new();
         buf.extend_from_slice(&img[..]);
@@ -350,7 +356,7 @@ where
         buf.put_u32_le(crc);
         let path = format!("pg_twophase/{:>08X}", xid);
         let header = new_tar_header(&path, buf.len() as u64)?;
-        self.ar.append(&header, &buf[..])?;
+        self.ar.append(&header, &buf[..]).await?;
 
         Ok(())
     }
@@ -359,7 +365,7 @@ where
     // Add generated pg_control file and bootstrap WAL segment.
     // Also send zenith.signal file with extra bootstrap data.
     //
-    fn add_pgcontrol_file(&mut self) -> anyhow::Result<()> {
+    async fn add_pgcontrol_file(&mut self) -> anyhow::Result<()> {
         // add zenith.signal file
         let mut zenith_signal = String::new();
         if self.prev_record_lsn == Lsn(0) {
@@ -371,17 +377,19 @@ where
         } else {
             write!(zenith_signal, "PREV LSN: {}", self.prev_record_lsn)?;
         }
-        self.ar.append(
-            &new_tar_header("zenith.signal", zenith_signal.len() as u64)?,
-            zenith_signal.as_bytes(),
-        )?;
+        self.ar
+            .append(
+                &new_tar_header("zenith.signal", zenith_signal.len() as u64)?,
+                zenith_signal.as_bytes(),
+            )
+            .await?;
 
-        let checkpoint_bytes =
-            with_ondemand_download_sync(|| self.timeline.get_checkpoint(self.lsn))
-                .context("failed to get checkpoint bytes")?;
-        let pg_control_bytes =
-            with_ondemand_download_sync(|| self.timeline.get_control_file(self.lsn))
-                .context("failed get control bytes")?;
+        let checkpoint_bytes = with_ondemand_download(|| self.timeline.get_checkpoint(self.lsn))
+            .await
+            .context("failed to get checkpoint bytes")?;
+        let pg_control_bytes = with_ondemand_download(|| self.timeline.get_control_file(self.lsn))
+            .await
+            .context("failed get control bytes")?;
 
         let (pg_control_bytes, system_identifier) = postgres_ffi::generate_pg_control(
             &pg_control_bytes,
@@ -392,7 +400,7 @@ where
 
         //send pg_control
         let header = new_tar_header("global/pg_control", pg_control_bytes.len() as u64)?;
-        self.ar.append(&header, &pg_control_bytes[..])?;
+        self.ar.append(&header, &pg_control_bytes[..]).await?;
 
         //send wal segment
         let segno = self.lsn.segment_number(WAL_SEGMENT_SIZE);
@@ -404,24 +412,11 @@ where
             postgres_ffi::generate_wal_segment(segno, system_identifier, self.timeline.pg_version)
                 .map_err(|e| anyhow!(e).context("Failed generating wal segment"))?;
         ensure!(wal_seg.len() == WAL_SEGMENT_SIZE);
-        self.ar.append(&header, &wal_seg[..])?;
+        self.ar.append(&header, &wal_seg[..]).await?;
         Ok(())
     }
 }
 
-impl<'a, W> Drop for Basebackup<'a, W>
-where
-    W: Write,
-{
-    /// If the basebackup was not finished, prevent the Archive::drop() from
-    /// writing the end-of-archive marker.
-    fn drop(&mut self) {
-        if !self.finished {
-            self.ar.get_mut().abort();
-        }
-    }
-}
-
 //
 // Create new tarball entry header
 //
@@ -457,57 +452,3 @@ fn new_tar_header_dir(path: &str) -> anyhow::Result<Header> {
     header.set_cksum();
     Ok(header)
 }
-
-/// A wrapper that passes through all data to the underlying Write,
-/// until abort() is called.
-///
-/// tar::Builder has an annoying habit of finishing the archive with
-/// a valid tar end-of-archive marker (two 512-byte sectors of zeros),
-/// even if an error occurs and we don't finish building the archive.
-/// We'd rather abort writing the tarball immediately than construct
-/// a seemingly valid but incomplete archive. This wrapper allows us
-/// to swallow the end-of-archive marker that Builder::drop() emits,
-/// without writing it to the underlying sink.
-///
-struct AbortableWrite<W> {
-    w: W,
-    aborted: bool,
-}
-
-impl<W> AbortableWrite<W> {
-    pub fn new(w: W) -> Self {
-        AbortableWrite { w, aborted: false }
-    }
-
-    pub fn abort(&mut self) {
-        self.aborted = true;
-    }
-}
-
-impl<W> Write for AbortableWrite<W>
-where
-    W: Write,
-{
-    fn write(&mut self, data: &[u8]) -> io::Result<usize> {
-        if self.aborted {
-            Ok(data.len())
-        } else {
-            self.w.write(data)
-        }
-    }
-    fn flush(&mut self) -> io::Result<()> {
-        if self.aborted {
-            Ok(())
-        } else {
-            self.w.flush()
-        }
-    }
-}
-
-fn with_ondemand_download_sync<F, T>(f: F) -> anyhow::Result<T>
-where
-    F: Send + Fn() -> PageReconstructResult<T>,
-    T: Send,
-{
-    task_mgr::COMPUTE_REQUEST_RUNTIME.block_on(with_ondemand_download(f))
-}
diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs
index 588b92c13f..bac27f69de 100644
--- a/pageserver/src/import_datadir.rs
+++ b/pageserver/src/import_datadir.rs
@@ -2,12 +2,13 @@
 //! Import data and WAL from a PostgreSQL data directory and WAL segments into
 //! a neon Timeline.
 //!
-use std::fs::File;
-use std::io::{Read, Seek, SeekFrom};
 use std::path::{Path, PathBuf};
 
 use anyhow::{bail, ensure, Context, Result};
 use bytes::Bytes;
+use futures::StreamExt;
+use tokio::io::{AsyncRead, AsyncReadExt};
+use tokio_tar::Archive;
 use tracing::*;
 use walkdir::WalkDir;
 
@@ -42,7 +43,7 @@ pub fn get_lsn_from_controlfile(path: &Path) -> Result<Lsn> {
 /// This is currently only used to import a cluster freshly created by initdb.
 /// The code that deals with the checkpoint would not work right if the
 /// cluster was not shut down cleanly.
-pub fn import_timeline_from_postgres_datadir(
+pub async fn import_timeline_from_postgres_datadir(
     tline: &Timeline,
     pgdata_path: &Path,
     pgdata_lsn: Lsn,
@@ -65,9 +66,11 @@ pub fn import_timeline_from_postgres_datadir(
             let absolute_path = entry.path();
             let relative_path = absolute_path.strip_prefix(pgdata_path)?;
 
-            let file = File::open(absolute_path)?;
+            let mut file = tokio::fs::File::open(absolute_path).await?;
             let len = metadata.len() as usize;
-            if let Some(control_file) = import_file(&mut modification, relative_path, file, len)? {
+            if let Some(control_file) =
+                import_file(&mut modification, relative_path, &mut file, len).await?
+            {
                 pg_control = Some(control_file);
             }
             modification.flush()?;
@@ -102,12 +105,12 @@ pub fn import_timeline_from_postgres_datadir(
 }
 
 // subroutine of import_timeline_from_postgres_datadir(), to load one relation file.
-fn import_rel<Reader: Read>(
-    modification: &mut DatadirModification,
+async fn import_rel(
+    modification: &mut DatadirModification<'_>,
     path: &Path,
     spcoid: Oid,
     dboid: Oid,
-    mut reader: Reader,
+    reader: &mut (impl AsyncRead + Send + Sync + Unpin),
     len: usize,
 ) -> anyhow::Result<()> {
     // Does it look like a relation file?
@@ -148,7 +151,7 @@ fn import_rel<Reader: Read>(
     }
 
     loop {
-        let r = reader.read_exact(&mut buf);
+        let r = reader.read_exact(&mut buf).await;
         match r {
             Ok(_) => {
                 modification.put_rel_page_image(rel, blknum, Bytes::copy_from_slice(&buf))?;
@@ -181,11 +184,11 @@ fn import_rel<Reader: Read>(
 
 /// Import an SLRU segment file
 ///
-fn import_slru<Reader: Read>(
-    modification: &mut DatadirModification,
+async fn import_slru(
+    modification: &mut DatadirModification<'_>,
     slru: SlruKind,
     path: &Path,
-    mut reader: Reader,
+    reader: &mut (impl AsyncRead + Send + Sync + Unpin),
     len: usize,
 ) -> anyhow::Result<()> {
     info!("importing slru file {path:?}");
@@ -206,7 +209,7 @@ fn import_slru<Reader: Read>(
 
     let mut rpageno = 0;
     loop {
-        let r = reader.read_exact(&mut buf);
+        let r = reader.read_exact(&mut buf).await;
         match r {
             Ok(_) => {
                 modification.put_slru_page_image(
@@ -243,6 +246,7 @@ fn import_wal(
     startpoint: Lsn,
     endpoint: Lsn,
 ) -> anyhow::Result<()> {
+    use std::io::Read;
     let mut waldecoder = WalStreamDecoder::new(startpoint, tline.pg_version);
 
     let mut segno = startpoint.segment_number(WAL_SEGMENT_SIZE);
@@ -265,10 +269,11 @@ fn import_wal(
         }
 
         // Slurp the WAL file
-        let mut file = File::open(&path)?;
+        let mut file = std::fs::File::open(&path)?;
 
         if offset > 0 {
-            file.seek(SeekFrom::Start(offset as u64))?;
+            use std::io::Seek;
+            file.seek(std::io::SeekFrom::Start(offset as u64))?;
         }
 
         let nread = file.read_to_end(&mut buf)?;
@@ -310,9 +315,9 @@ fn import_wal(
     Ok(())
 }
 
-pub fn import_basebackup_from_tar<Reader: Read>(
+pub async fn import_basebackup_from_tar(
     tline: &Timeline,
-    reader: Reader,
+    reader: &mut (impl AsyncRead + Send + Sync + Unpin),
     base_lsn: Lsn,
 ) -> Result<()> {
     info!("importing base at {base_lsn}");
@@ -322,21 +327,24 @@ pub fn import_basebackup_from_tar<Reader: Read>(
     let mut pg_control: Option<ControlFileData> = None;
 
     // Import base
-    for base_tar_entry in tar::Archive::new(reader).entries()? {
-        let entry = base_tar_entry?;
+    let mut entries = Archive::new(reader).entries()?;
+    while let Some(base_tar_entry) = entries.next().await {
+        let mut entry = base_tar_entry?;
         let header = entry.header();
         let len = header.entry_size()? as usize;
         let file_path = header.path()?.into_owned();
 
         match header.entry_type() {
-            tar::EntryType::Regular => {
-                if let Some(res) = import_file(&mut modification, file_path.as_ref(), entry, len)? {
+            tokio_tar::EntryType::Regular => {
+                if let Some(res) =
+                    import_file(&mut modification, file_path.as_ref(), &mut entry, len).await?
+                {
                     // We found the pg_control file.
                     pg_control = Some(res);
                 }
                 modification.flush()?;
             }
-            tar::EntryType::Directory => {
+            tokio_tar::EntryType::Directory => {
                 debug!("directory {:?}", file_path);
             }
             _ => {
@@ -356,9 +364,9 @@ pub fn import_basebackup_from_tar<Reader: Read>(
     Ok(())
 }
 
-pub fn import_wal_from_tar<Reader: Read>(
+pub async fn import_wal_from_tar(
     tline: &Timeline,
-    reader: Reader,
+    reader: &mut (impl AsyncRead + Send + Sync + Unpin),
     start_lsn: Lsn,
     end_lsn: Lsn,
 ) -> Result<()> {
@@ -371,16 +379,19 @@ pub fn import_wal_from_tar<Reader: Read>(
 
     // Ingest wal until end_lsn
     info!("importing wal until {}", end_lsn);
-    let mut pg_wal_tar = tar::Archive::new(reader);
-    let mut pg_wal_entries_iter = pg_wal_tar.entries()?;
+    let mut pg_wal_tar = Archive::new(reader);
+    let mut pg_wal_entries = pg_wal_tar.entries()?;
     while last_lsn <= end_lsn {
         let bytes = {
-            let entry = pg_wal_entries_iter.next().expect("expected more wal")?;
+            let mut entry = pg_wal_entries
+                .next()
+                .await
+                .ok_or_else(|| anyhow::anyhow!("expected more wal"))??;
             let header = entry.header();
             let file_path = header.path()?.into_owned();
 
             match header.entry_type() {
-                tar::EntryType::Regular => {
+                tokio_tar::EntryType::Regular => {
                     // FIXME: assume postgresql tli 1 for now
                     let expected_filename = XLogFileName(1, segno, WAL_SEGMENT_SIZE);
                     let file_name = file_path
@@ -390,9 +401,9 @@ pub fn import_wal_from_tar<Reader: Read>(
                     ensure!(expected_filename == file_name);
 
                     debug!("processing wal file {:?}", file_path);
-                    read_all_bytes(entry)?
+                    read_all_bytes(&mut entry).await?
                 }
-                tar::EntryType::Directory => {
+                tokio_tar::EntryType::Directory => {
                     debug!("directory {:?}", file_path);
                     continue;
                 }
@@ -433,7 +444,7 @@ pub fn import_wal_from_tar<Reader: Read>(
     }
 
     // Log any extra unused files
-    for e in &mut pg_wal_entries_iter {
+    while let Some(e) = pg_wal_entries.next().await {
         let entry = e?;
         let header = entry.header();
         let file_path = header.path()?.into_owned();
@@ -443,10 +454,10 @@ pub fn import_wal_from_tar<Reader: Read>(
     Ok(())
 }
 
-fn import_file<Reader: Read>(
-    modification: &mut DatadirModification,
+async fn import_file(
+    modification: &mut DatadirModification<'_>,
     file_path: &Path,
-    reader: Reader,
+    reader: &mut (impl AsyncRead + Send + Sync + Unpin),
     len: usize,
 ) -> Result<Option<ControlFileData>> {
     let file_name = match file_path.file_name() {
@@ -466,7 +477,7 @@ fn import_file<Reader: Read>(
 
         match file_name.as_ref() {
             "pg_control" => {
-                let bytes = read_all_bytes(reader)?;
+                let bytes = read_all_bytes(reader).await?;
 
                 // Extract the checkpoint record and import it separately.
                 let pg_control = ControlFileData::decode(&bytes[..])?;
@@ -479,7 +490,7 @@ fn import_file<Reader: Read>(
                 return Ok(Some(pg_control));
             }
             "pg_filenode.map" => {
-                let bytes = read_all_bytes(reader)?;
+                let bytes = read_all_bytes(reader).await?;
                 modification.put_relmap_file(spcnode, dbnode, bytes)?;
                 debug!("imported relmap file")
             }
@@ -487,7 +498,7 @@ fn import_file<Reader: Read>(
                 debug!("ignored PG_VERSION file");
             }
             _ => {
-                import_rel(modification, file_path, spcnode, dbnode, reader, len)?;
+                import_rel(modification, file_path, spcnode, dbnode, reader, len).await?;
                 debug!("imported rel creation");
             }
         }
@@ -502,7 +513,7 @@ fn import_file<Reader: Read>(
 
         match file_name.as_ref() {
             "pg_filenode.map" => {
-                let bytes = read_all_bytes(reader)?;
+                let bytes = read_all_bytes(reader).await?;
                 modification.put_relmap_file(spcnode, dbnode, bytes)?;
                 debug!("imported relmap file")
             }
@@ -510,36 +521,36 @@ fn import_file<Reader: Read>(
                 debug!("ignored PG_VERSION file");
             }
             _ => {
-                import_rel(modification, file_path, spcnode, dbnode, reader, len)?;
+                import_rel(modification, file_path, spcnode, dbnode, reader, len).await?;
                 debug!("imported rel creation");
             }
         }
     } else if file_path.starts_with("pg_xact") {
         let slru = SlruKind::Clog;
 
-        import_slru(modification, slru, file_path, reader, len)?;
+        import_slru(modification, slru, file_path, reader, len).await?;
         debug!("imported clog slru");
     } else if file_path.starts_with("pg_multixact/offsets") {
         let slru = SlruKind::MultiXactOffsets;
 
-        import_slru(modification, slru, file_path, reader, len)?;
+        import_slru(modification, slru, file_path, reader, len).await?;
         debug!("imported multixact offsets slru");
     } else if file_path.starts_with("pg_multixact/members") {
         let slru = SlruKind::MultiXactMembers;
 
-        import_slru(modification, slru, file_path, reader, len)?;
+        import_slru(modification, slru, file_path, reader, len).await?;
         debug!("imported multixact members slru");
     } else if file_path.starts_with("pg_twophase") {
         let xid = u32::from_str_radix(file_name.as_ref(), 16)?;
 
-        let bytes = read_all_bytes(reader)?;
+        let bytes = read_all_bytes(reader).await?;
         modification.put_twophase_file(xid, Bytes::copy_from_slice(&bytes[..]))?;
         debug!("imported twophase file");
     } else if file_path.starts_with("pg_wal") {
         debug!("found wal file in base section. ignore it");
     } else if file_path.starts_with("zenith.signal") {
         // Parse zenith signal file to set correct previous LSN
-        let bytes = read_all_bytes(reader)?;
+        let bytes = read_all_bytes(reader).await?;
         // zenith.signal format is "PREV LSN: prev_lsn"
         // TODO write serialization and deserialization in the same place.
         let zenith_signal = std::str::from_utf8(&bytes)?.trim();
@@ -576,8 +587,8 @@ fn import_file<Reader: Read>(
     Ok(None)
 }
 
-fn read_all_bytes<Reader: Read>(mut reader: Reader) -> Result<Bytes> {
+async fn read_all_bytes(reader: &mut (impl AsyncRead + Send + Sync + Unpin)) -> Result<Bytes> {
     let mut buf: Vec<u8> = vec![];
-    reader.read_to_end(&mut buf)?;
+    reader.read_to_end(&mut buf).await?;
     Ok(Bytes::copy_from_slice(&buf[..]))
 }
diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index b84b2694f4..5393fca780 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -26,9 +26,6 @@ use std::str;
 use std::str::FromStr;
 use std::sync::Arc;
 use std::time::Duration;
-use tokio::pin;
-use tokio_util::io::StreamReader;
-use tokio_util::io::SyncIoBridge;
 use tracing::*;
 use utils::id::ConnectionId;
 use utils::{
@@ -395,9 +392,7 @@ impl PageServerHandler {
         pgb.write_message(&BeMessage::CopyInResponse)?;
         pgb.flush().await?;
 
-        let copyin_stream = copyin_stream(pgb);
-        pin!(copyin_stream);
-
+        let mut copyin_stream = Box::pin(copyin_stream(pgb));
         timeline
             .import_basebackup_from_tar(&mut copyin_stream, base_lsn)
             .await?;
@@ -443,8 +438,8 @@ impl PageServerHandler {
         pgb.write_message(&BeMessage::CopyInResponse)?;
         pgb.flush().await?;
         let mut copyin_stream = Box::pin(copyin_stream(pgb));
-        let reader = SyncIoBridge::new(StreamReader::new(&mut copyin_stream));
-        tokio::task::block_in_place(|| import_wal_from_tar(&timeline, reader, start_lsn, end_lsn))?;
+        let mut reader = tokio_util::io::StreamReader::new(&mut copyin_stream);
+        import_wal_from_tar(&timeline, &mut reader, start_lsn, end_lsn).await?;
         info!("wal import complete");
 
         // Drain the rest of the Copy data
@@ -649,16 +644,14 @@ impl PageServerHandler {
         pgb.flush().await?;
 
         /* Send a tarball of the latest layer on the timeline */
-        let mut writer = CopyDataSink {
-            pgb,
-            rt: tokio::runtime::Handle::current(),
-        };
-        tokio::task::block_in_place(|| {
+        {
+            let mut writer = pgb.copyout_writer();
             let basebackup =
                 basebackup::Basebackup::new(&mut writer, &timeline, lsn, prev_lsn, full_backup)?;
             tracing::Span::current().record("lsn", basebackup.lsn.to_string().as_str());
-            basebackup.send_tarball()
-        })?;
+            basebackup.send_tarball().await?;
+        }
+
         pgb.write_message(&BeMessage::CopyDone)?;
         pgb.flush().await?;
         info!("basebackup complete");
@@ -966,32 +959,3 @@ async fn get_active_timeline_with_timeout(
         .await
         .and_then(|tenant| tenant.get_timeline(timeline_id, true))
 }
-
-///
-/// A std::io::Write implementation that wraps all data written to it in CopyData
-/// messages.
-///
-struct CopyDataSink<'a> {
-    pgb: &'a mut PostgresBackend,
-    rt: tokio::runtime::Handle,
-}
-
-impl<'a> io::Write for CopyDataSink<'a> {
-    fn write(&mut self, data: &[u8]) -> io::Result<usize> {
-        // CopyData
-        // FIXME: if the input is large, we should split it into multiple messages.
-        // Not sure what the threshold should be, but the ultimate hard limit is that
-        // the length cannot exceed u32.
-        // FIXME: flush isn't really required, but makes it easier
-        // to view in wireshark
-        self.pgb.write_message(&BeMessage::CopyData(data))?;
-        self.rt.block_on(self.pgb.flush())?;
-        trace!("CopyData sent for {} bytes!", data.len());
-
-        Ok(data.len())
-    }
-    fn flush(&mut self) -> io::Result<()> {
-        // no-op
-        Ok(())
-    }
-}
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 4c93490177..dcaa8ea268 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -18,8 +18,6 @@ use pageserver_api::models::TimelineState;
 use remote_storage::DownloadError;
 use remote_storage::GenericRemoteStorage;
 use tokio::sync::watch;
-use tokio_util::io::StreamReader;
-use tokio_util::io::SyncIoBridge;
 use tracing::*;
 use utils::crashsafe::path_with_suffix_extension;
 
@@ -36,7 +34,6 @@ use std::io::Write;
 use std::ops::Bound::Included;
 use std::path::Path;
 use std::path::PathBuf;
-use std::pin::Pin;
 use std::process::Command;
 use std::process::Stdio;
 use std::sync::Arc;
@@ -236,21 +233,15 @@ impl UninitializedTimeline<'_> {
     /// Prepares timeline data by loading it from the basebackup archive.
     pub async fn import_basebackup_from_tar(
         self,
-        mut copyin_stream: &mut Pin<&mut impl Stream<Item = io::Result<Bytes>>>,
+        copyin_stream: &mut (impl Stream<Item = io::Result<Bytes>> + Sync + Send + Unpin),
         base_lsn: Lsn,
     ) -> anyhow::Result<Arc<Timeline>> {
         let raw_timeline = self.raw_timeline()?;
 
-        // import_basebackup_from_tar() is not async, mainly because the Tar crate
-        // it uses is not async. So we need to jump through some hoops:
-        // - convert the input from client connection to a synchronous Read
-        // - use block_in_place()
-        let reader = SyncIoBridge::new(StreamReader::new(&mut copyin_stream));
-
-        tokio::task::block_in_place(|| {
-            import_datadir::import_basebackup_from_tar(raw_timeline, reader, base_lsn)
-                .context("Failed to import basebackup")
-        })?;
+        let mut reader = tokio_util::io::StreamReader::new(copyin_stream);
+        import_datadir::import_basebackup_from_tar(raw_timeline, &mut reader, base_lsn)
+            .await
+            .context("Failed to import basebackup")?;
 
         // Flush loop needs to be spawned in order to be able to flush.
         // We want to run proper checkpoint before we mark timeline as available to outside world
@@ -2139,13 +2130,12 @@ impl Tenant {
         let tenant_id = raw_timeline.owning_tenant.tenant_id;
         let unfinished_timeline = raw_timeline.raw_timeline()?;
 
-        tokio::task::block_in_place(|| {
-            import_datadir::import_timeline_from_postgres_datadir(
-                unfinished_timeline,
-                pgdata_path,
-                pgdata_lsn,
-            )
-        })
+        import_datadir::import_timeline_from_postgres_datadir(
+            unfinished_timeline,
+            pgdata_path,
+            pgdata_lsn,
+        )
+        .await
         .with_context(|| {
             format!("Failed to import pgdatadir for timeline {tenant_id}/{timeline_id}")
         })?;
diff --git a/test_runner/regress/test_config.py b/test_runner/regress/test_config.py
old mode 100644
new mode 100755

From 8b692e131bdb5010a784032cc5e399f15d256bd6 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Tue, 3 Jan 2023 14:44:42 +0200
Subject: [PATCH 13/42] Enable on-demand download in WalIngest. (#3233)

Makes the top-level functions in WalIngest async, and replaces
no_ondemand_download calls with with_ondemand_download.

This hopefully fixes the problem reported in issue #3230, although I
don't have a self-contained test case for it.
---
 pageserver/src/basebackup.rs                  |  15 +-
 pageserver/src/import_datadir.rs              |  13 +-
 pageserver/src/walingest.rs                   | 332 +++++++++---------
 .../src/walreceiver/walreceiver_connection.rs |  21 +-
 4 files changed, 187 insertions(+), 194 deletions(-)

diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs
index e537048489..4052f13875 100644
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -214,10 +214,11 @@ where
 
             let mut segment_data: Vec<u8> = vec![];
             for blknum in startblk..endblk {
-                let img = self
-                    .timeline
-                    .get_rel_page_at_lsn(tag, blknum, self.lsn, false)
-                    .no_ondemand_download()?;
+                let img = with_ondemand_download(|| {
+                    self.timeline
+                        .get_rel_page_at_lsn(tag, blknum, self.lsn, false)
+                })
+                .await?;
                 segment_data.extend_from_slice(&img[..]);
             }
 
@@ -313,10 +314,8 @@ where
             // XLOG_TBLSPC_DROP records. But we probably should just
             // throw an error on CREATE TABLESPACE in the first place.
             if !has_relmap_file
-                && self
-                    .timeline
-                    .list_rels(spcnode, dbnode, self.lsn)
-                    .no_ondemand_download()?
+                && with_ondemand_download(|| self.timeline.list_rels(spcnode, dbnode, self.lsn))
+                    .await?
                     .is_empty()
             {
                 return Ok(());
diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs
index bac27f69de..ca1514dd00 100644
--- a/pageserver/src/import_datadir.rs
+++ b/pageserver/src/import_datadir.rs
@@ -99,7 +99,8 @@ pub async fn import_timeline_from_postgres_datadir(
         tline,
         Lsn(pg_control.checkPointCopy.redo),
         pgdata_lsn,
-    )?;
+    )
+    .await?;
 
     Ok(())
 }
@@ -240,7 +241,7 @@ async fn import_slru(
 
 /// Scan PostgreSQL WAL files in given directory and load all records between
 /// 'startpoint' and 'endpoint' into the repository.
-fn import_wal(
+async fn import_wal(
     walpath: &Path,
     tline: &Timeline,
     startpoint: Lsn,
@@ -253,7 +254,7 @@ fn import_wal(
     let mut offset = startpoint.segment_offset(WAL_SEGMENT_SIZE);
     let mut last_lsn = startpoint;
 
-    let mut walingest = WalIngest::new(tline, startpoint).no_ondemand_download()?;
+    let mut walingest = WalIngest::new(tline, startpoint).await?;
 
     while last_lsn <= endpoint {
         // FIXME: assume postgresql tli 1 for now
@@ -291,7 +292,7 @@ fn import_wal(
             if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
                 walingest
                     .ingest_record(recdata, lsn, &mut modification, &mut decoded)
-                    .no_ondemand_download()?;
+                    .await?;
                 last_lsn = lsn;
 
                 nrecords += 1;
@@ -375,7 +376,7 @@ pub async fn import_wal_from_tar(
     let mut segno = start_lsn.segment_number(WAL_SEGMENT_SIZE);
     let mut offset = start_lsn.segment_offset(WAL_SEGMENT_SIZE);
     let mut last_lsn = start_lsn;
-    let mut walingest = WalIngest::new(tline, start_lsn).no_ondemand_download()?;
+    let mut walingest = WalIngest::new(tline, start_lsn).await?;
 
     // Ingest wal until end_lsn
     info!("importing wal until {}", end_lsn);
@@ -425,7 +426,7 @@ pub async fn import_wal_from_tar(
             if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
                 walingest
                     .ingest_record(recdata, lsn, &mut modification, &mut decoded)
-                    .no_ondemand_download()?;
+                    .await?;
                 last_lsn = lsn;
 
                 debug!("imported record at {} (end {})", lsn, end_lsn);
diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs
index 031b80a6e0..1c974f7e2a 100644
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -21,7 +21,6 @@
 //! redo Postgres process, but some records it can handle directly with
 //! bespoken Rust code.
 
-use anyhow::Context;
 use postgres_ffi::v14::nonrelfile_utils::clogpage_precedes;
 use postgres_ffi::v14::nonrelfile_utils::slru_may_delete_clogsegment;
 use postgres_ffi::{fsm_logical_to_physical, page_is_new, page_set_lsn};
@@ -31,12 +30,10 @@ use bytes::{Buf, Bytes, BytesMut};
 use tracing::*;
 
 use crate::pgdatadir_mapping::*;
-use crate::tenant::PageReconstructResult;
 use crate::tenant::Timeline;
-use crate::try_page_reconstruct_result as try_prr;
+use crate::tenant::{with_ondemand_download, PageReconstructError};
 use crate::walrecord::*;
 use crate::ZERO_PAGE;
-use crate::{try_no_ondemand_download, try_page_reconstruct_result};
 use pageserver_api::reltag::{RelTag, SlruKind};
 use postgres_ffi::pg_constants;
 use postgres_ffi::relfile_utils::{FSM_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM};
@@ -55,16 +52,15 @@ pub struct WalIngest<'a> {
 }
 
 impl<'a> WalIngest<'a> {
-    pub fn new(timeline: &Timeline, startpoint: Lsn) -> PageReconstructResult<WalIngest> {
+    pub async fn new(timeline: &Timeline, startpoint: Lsn) -> anyhow::Result<WalIngest> {
         // Fetch the latest checkpoint into memory, so that we can compare with it
         // quickly in `ingest_record` and update it when it changes.
-        let checkpoint_bytes = try_no_ondemand_download!(timeline.get_checkpoint(startpoint));
-        let checkpoint = try_page_reconstruct_result!(
-            CheckPoint::decode(&checkpoint_bytes).context("Failed to decode checkpoint bytes")
-        );
+        let checkpoint_bytes =
+            with_ondemand_download(|| timeline.get_checkpoint(startpoint)).await?;
+        let checkpoint = CheckPoint::decode(&checkpoint_bytes)?;
         trace!("CheckPoint.nextXid = {}", checkpoint.nextXid.value);
 
-        PageReconstructResult::Success(WalIngest {
+        Ok(WalIngest {
             timeline,
             checkpoint,
             checkpoint_modified: false,
@@ -79,18 +75,15 @@ impl<'a> WalIngest<'a> {
     /// Helper function to parse a WAL record and call the Timeline's PUT functions for all the
     /// relations/pages that the record affects.
     ///
-    pub fn ingest_record(
+    pub async fn ingest_record(
         &mut self,
         recdata: Bytes,
         lsn: Lsn,
-        modification: &mut DatadirModification,
+        modification: &mut DatadirModification<'_>,
         decoded: &mut DecodedWALRecord,
-    ) -> PageReconstructResult<()> {
+    ) -> anyhow::Result<()> {
         modification.lsn = lsn;
-        try_prr!(
-            decode_wal_record(recdata, decoded, self.timeline.pg_version)
-                .context("failed decoding wal record")
-        );
+        decode_wal_record(recdata, decoded, self.timeline.pg_version)?;
 
         let mut buf = decoded.record.clone();
         buf.advance(decoded.main_data_offset);
@@ -105,7 +98,8 @@ impl<'a> WalIngest<'a> {
         if decoded.xl_rmid == pg_constants::RM_HEAP_ID
             || decoded.xl_rmid == pg_constants::RM_HEAP2_ID
         {
-            try_prr!(self.ingest_heapam_record(&mut buf, modification, decoded));
+            self.ingest_heapam_record(&mut buf, modification, decoded)
+                .await?;
         }
         // Handle other special record types
         if decoded.xl_rmid == pg_constants::RM_SMGR_ID
@@ -113,13 +107,14 @@ impl<'a> WalIngest<'a> {
                 == pg_constants::XLOG_SMGR_CREATE
         {
             let create = XlSmgrCreate::decode(&mut buf);
-            try_prr!(self.ingest_xlog_smgr_create(modification, &create));
+            self.ingest_xlog_smgr_create(modification, &create)?;
         } else if decoded.xl_rmid == pg_constants::RM_SMGR_ID
             && (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
                 == pg_constants::XLOG_SMGR_TRUNCATE
         {
             let truncate = XlSmgrTruncate::decode(&mut buf);
-            try_prr!(self.ingest_xlog_smgr_truncate(modification, &truncate));
+            self.ingest_xlog_smgr_truncate(modification, &truncate)
+                .await?;
         } else if decoded.xl_rmid == pg_constants::RM_DBASE_ID {
             debug!(
                 "handle RM_DBASE_ID for Postgres version {:?}",
@@ -132,14 +127,15 @@ impl<'a> WalIngest<'a> {
                     let createdb = XlCreateDatabase::decode(&mut buf);
                     debug!("XLOG_DBASE_CREATE v14");
 
-                    try_prr!(self.ingest_xlog_dbase_create(modification, &createdb));
+                    self.ingest_xlog_dbase_create(modification, &createdb)
+                        .await?;
                 } else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
                     == postgres_ffi::v14::bindings::XLOG_DBASE_DROP
                 {
                     let dropdb = XlDropDatabase::decode(&mut buf);
                     for tablespace_id in dropdb.tablespace_ids {
                         trace!("Drop db {}, {}", tablespace_id, dropdb.db_id);
-                        try_prr!(modification.drop_dbdir(tablespace_id, dropdb.db_id));
+                        modification.drop_dbdir(tablespace_id, dropdb.db_id)?;
                     }
                 }
             } else if self.timeline.pg_version == 15 {
@@ -155,14 +151,15 @@ impl<'a> WalIngest<'a> {
                     // So we can reuse XlCreateDatabase here.
                     debug!("XLOG_DBASE_CREATE_FILE_COPY");
                     let createdb = XlCreateDatabase::decode(&mut buf);
-                    try_prr!(self.ingest_xlog_dbase_create(modification, &createdb));
+                    self.ingest_xlog_dbase_create(modification, &createdb)
+                        .await?;
                 } else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
                     == postgres_ffi::v15::bindings::XLOG_DBASE_DROP
                 {
                     let dropdb = XlDropDatabase::decode(&mut buf);
                     for tablespace_id in dropdb.tablespace_ids {
                         trace!("Drop db {}, {}", tablespace_id, dropdb.db_id);
-                        try_prr!(modification.drop_dbdir(tablespace_id, dropdb.db_id));
+                        modification.drop_dbdir(tablespace_id, dropdb.db_id)?;
                     }
                 }
             }
@@ -174,38 +171,42 @@ impl<'a> WalIngest<'a> {
                 let pageno = buf.get_u32_le();
                 let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
                 let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
-                try_prr!(self.put_slru_page_image(
+                self.put_slru_page_image(
                     modification,
                     SlruKind::Clog,
                     segno,
                     rpageno,
                     ZERO_PAGE.clone(),
-                ));
+                )
+                .await?;
             } else {
                 assert!(info == pg_constants::CLOG_TRUNCATE);
                 let xlrec = XlClogTruncate::decode(&mut buf);
-                try_prr!(self.ingest_clog_truncate_record(modification, &xlrec));
+                self.ingest_clog_truncate_record(modification, &xlrec)
+                    .await?;
             }
         } else if decoded.xl_rmid == pg_constants::RM_XACT_ID {
             let info = decoded.xl_info & pg_constants::XLOG_XACT_OPMASK;
             if info == pg_constants::XLOG_XACT_COMMIT || info == pg_constants::XLOG_XACT_ABORT {
                 let parsed_xact =
                     XlXactParsedRecord::decode(&mut buf, decoded.xl_xid, decoded.xl_info);
-                try_prr!(self.ingest_xact_record(
+                self.ingest_xact_record(
                     modification,
                     &parsed_xact,
                     info == pg_constants::XLOG_XACT_COMMIT,
-                ));
+                )
+                .await?;
             } else if info == pg_constants::XLOG_XACT_COMMIT_PREPARED
                 || info == pg_constants::XLOG_XACT_ABORT_PREPARED
             {
                 let parsed_xact =
                     XlXactParsedRecord::decode(&mut buf, decoded.xl_xid, decoded.xl_info);
-                try_prr!(self.ingest_xact_record(
+                self.ingest_xact_record(
                     modification,
                     &parsed_xact,
                     info == pg_constants::XLOG_XACT_COMMIT_PREPARED,
-                ));
+                )
+                .await?;
                 // Remove twophase file. see RemoveTwoPhaseFile() in postgres code
                 trace!(
                     "Drop twophaseFile for xid {} parsed_xact.xid {} here at {}",
@@ -213,10 +214,9 @@ impl<'a> WalIngest<'a> {
                     parsed_xact.xid,
                     lsn,
                 );
-                try_prr!(modification.drop_twophase_file(parsed_xact.xid));
+                modification.drop_twophase_file(parsed_xact.xid)?;
             } else if info == pg_constants::XLOG_XACT_PREPARE {
-                try_prr!(modification
-                    .put_twophase_file(decoded.xl_xid, Bytes::copy_from_slice(&buf[..])));
+                modification.put_twophase_file(decoded.xl_xid, Bytes::copy_from_slice(&buf[..]))?;
             }
         } else if decoded.xl_rmid == pg_constants::RM_MULTIXACT_ID {
             let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
@@ -225,34 +225,36 @@ impl<'a> WalIngest<'a> {
                 let pageno = buf.get_u32_le();
                 let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
                 let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
-                try_prr!(self.put_slru_page_image(
+                self.put_slru_page_image(
                     modification,
                     SlruKind::MultiXactOffsets,
                     segno,
                     rpageno,
                     ZERO_PAGE.clone(),
-                ));
+                )
+                .await?;
             } else if info == pg_constants::XLOG_MULTIXACT_ZERO_MEM_PAGE {
                 let pageno = buf.get_u32_le();
                 let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
                 let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
-                try_prr!(self.put_slru_page_image(
+                self.put_slru_page_image(
                     modification,
                     SlruKind::MultiXactMembers,
                     segno,
                     rpageno,
                     ZERO_PAGE.clone(),
-                ));
+                )
+                .await?;
             } else if info == pg_constants::XLOG_MULTIXACT_CREATE_ID {
                 let xlrec = XlMultiXactCreate::decode(&mut buf);
-                try_prr!(self.ingest_multixact_create_record(modification, &xlrec));
+                self.ingest_multixact_create_record(modification, &xlrec)?;
             } else if info == pg_constants::XLOG_MULTIXACT_TRUNCATE_ID {
                 let xlrec = XlMultiXactTruncate::decode(&mut buf);
-                try_prr!(self.ingest_multixact_truncate_record(modification, &xlrec));
+                self.ingest_multixact_truncate_record(modification, &xlrec)?;
             }
         } else if decoded.xl_rmid == pg_constants::RM_RELMAP_ID {
             let xlrec = XlRelmapUpdate::decode(&mut buf);
-            try_prr!(self.ingest_relmap_page(modification, &xlrec, decoded));
+            self.ingest_relmap_page(modification, &xlrec, decoded)?;
         } else if decoded.xl_rmid == pg_constants::RM_XLOG_ID {
             let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
             if info == pg_constants::XLOG_NEXTOID {
@@ -266,9 +268,7 @@ impl<'a> WalIngest<'a> {
             {
                 let mut checkpoint_bytes = [0u8; SIZEOF_CHECKPOINT];
                 buf.copy_to_slice(&mut checkpoint_bytes);
-                let xlog_checkpoint = try_prr!(
-                    CheckPoint::decode(&checkpoint_bytes).context("deserialize CheckPoint")
-                );
+                let xlog_checkpoint = CheckPoint::decode(&checkpoint_bytes)?;
                 trace!(
                     "xlog_checkpoint.oldestXid={}, checkpoint.oldestXid={}",
                     xlog_checkpoint.oldestXid,
@@ -289,32 +289,32 @@ impl<'a> WalIngest<'a> {
         // Iterate through all the blocks that the record modifies, and
         // "put" a separate copy of the record for each block.
         for blk in decoded.blocks.iter() {
-            try_no_ondemand_download!(self.ingest_decoded_block(modification, lsn, decoded, blk));
+            self.ingest_decoded_block(modification, lsn, decoded, blk)
+                .await?;
         }
 
         // If checkpoint data was updated, store the new version in the repository
         if self.checkpoint_modified {
-            let new_checkpoint_bytes =
-                try_prr!(self.checkpoint.encode().context("encode checkpoint"));
+            let new_checkpoint_bytes = self.checkpoint.encode()?;
 
-            try_prr!(modification.put_checkpoint(new_checkpoint_bytes));
+            modification.put_checkpoint(new_checkpoint_bytes)?;
             self.checkpoint_modified = false;
         }
 
         // Now that this record has been fully handled, including updating the
         // checkpoint data, let the repository know that it is up-to-date to this LSN
-        try_prr!(modification.commit());
+        modification.commit()?;
 
-        PageReconstructResult::Success(())
+        Ok(())
     }
 
-    fn ingest_decoded_block(
+    async fn ingest_decoded_block(
         &mut self,
-        modification: &mut DatadirModification,
+        modification: &mut DatadirModification<'_>,
         lsn: Lsn,
         decoded: &DecodedWALRecord,
         blk: &DecodedBkpBlock,
-    ) -> PageReconstructResult<()> {
+    ) -> Result<(), PageReconstructError> {
         let rel = RelTag {
             spcnode: blk.rnode_spcnode,
             dbnode: blk.rnode_dbnode,
@@ -334,7 +334,7 @@ impl<'a> WalIngest<'a> {
             && (decoded.xl_info == pg_constants::XLOG_FPI
                 || decoded.xl_info == pg_constants::XLOG_FPI_FOR_HINT)
         // compression of WAL is not yet supported: fall back to storing the original WAL record
-            && !try_prr!(postgres_ffi::bkpimage_is_compressed(blk.bimg_info, self.timeline.pg_version))
+            && !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, self.timeline.pg_version)?
         {
             // Extract page image from FPI record
             let img_len = blk.bimg_len as usize;
@@ -356,28 +356,25 @@ impl<'a> WalIngest<'a> {
                 page_set_lsn(&mut image, lsn)
             }
             assert_eq!(image.len(), BLCKSZ as usize);
-            try_no_ondemand_download!(self.put_rel_page_image(
-                modification,
-                rel,
-                blk.blkno,
-                image.freeze()
-            ));
+            self.put_rel_page_image(modification, rel, blk.blkno, image.freeze())
+                .await?;
         } else {
             let rec = NeonWalRecord::Postgres {
                 will_init: blk.will_init || blk.apply_image,
                 rec: decoded.record.clone(),
             };
-            try_prr!(self.put_rel_wal_record(modification, rel, blk.blkno, rec));
+            self.put_rel_wal_record(modification, rel, blk.blkno, rec)
+                .await?;
         }
-        PageReconstructResult::Success(())
+        Ok(())
     }
 
-    fn ingest_heapam_record(
+    async fn ingest_heapam_record(
         &mut self,
         buf: &mut Bytes,
-        modification: &mut DatadirModification,
+        modification: &mut DatadirModification<'_>,
         decoded: &mut DecodedWALRecord,
-    ) -> Result<()> {
+    ) -> anyhow::Result<()> {
         // Handle VM bit updates that are implicitly part of heap records.
 
         // First, look at the record to determine which VM bits need
@@ -456,7 +453,7 @@ impl<'a> WalIngest<'a> {
             // replaying it would fail to find the previous image of the page, because
             // it doesn't exist. So check if the VM page(s) exist, and skip the WAL
             // record if it doesn't.
-            let vm_size = self.get_relsize(vm_rel, modification.lsn)?;
+            let vm_size = self.get_relsize(vm_rel, modification.lsn).await?;
             if let Some(blknum) = new_vm_blk {
                 if blknum >= vm_size {
                     new_vm_blk = None;
@@ -481,7 +478,8 @@ impl<'a> WalIngest<'a> {
                             old_heap_blkno,
                             flags: pg_constants::VISIBILITYMAP_VALID_BITS,
                         },
-                    )?;
+                    )
+                    .await?;
                 } else {
                     // Clear VM bits for one heap page, or for two pages that reside on
                     // different VM pages.
@@ -495,7 +493,8 @@ impl<'a> WalIngest<'a> {
                                 old_heap_blkno: None,
                                 flags: pg_constants::VISIBILITYMAP_VALID_BITS,
                             },
-                        )?;
+                        )
+                        .await?;
                     }
                     if let Some(old_vm_blk) = old_vm_blk {
                         self.put_rel_wal_record(
@@ -507,7 +506,8 @@ impl<'a> WalIngest<'a> {
                                 old_heap_blkno,
                                 flags: pg_constants::VISIBILITYMAP_VALID_BITS,
                             },
-                        )?;
+                        )
+                        .await?;
                     }
                 }
             }
@@ -517,9 +517,9 @@ impl<'a> WalIngest<'a> {
     }
 
     /// Subroutine of ingest_record(), to handle an XLOG_DBASE_CREATE record.
-    fn ingest_xlog_dbase_create(
+    async fn ingest_xlog_dbase_create(
         &mut self,
-        modification: &mut DatadirModification,
+        modification: &mut DatadirModification<'_>,
         rec: &XlCreateDatabase,
     ) -> anyhow::Result<()> {
         let db_id = rec.db_id;
@@ -534,18 +534,22 @@ impl<'a> WalIngest<'a> {
         // get calls instead.
         let req_lsn = modification.tline.get_last_record_lsn();
 
-        let rels = modification
-            .tline
-            .list_rels(src_tablespace_id, src_db_id, req_lsn)
-            .no_ondemand_download()?;
+        let rels = with_ondemand_download(|| {
+            modification
+                .tline
+                .list_rels(src_tablespace_id, src_db_id, req_lsn)
+        })
+        .await?;
 
         debug!("ingest_xlog_dbase_create: {} rels", rels.len());
 
         // Copy relfilemap
-        let filemap = modification
-            .tline
-            .get_relmap_file(src_tablespace_id, src_db_id, req_lsn)
-            .no_ondemand_download()?;
+        let filemap = with_ondemand_download(|| {
+            modification
+                .tline
+                .get_relmap_file(src_tablespace_id, src_db_id, req_lsn)
+        })
+        .await?;
         modification.put_relmap_file(tablespace_id, db_id, filemap)?;
 
         let mut num_rels_copied = 0;
@@ -554,10 +558,9 @@ impl<'a> WalIngest<'a> {
             assert_eq!(src_rel.spcnode, src_tablespace_id);
             assert_eq!(src_rel.dbnode, src_db_id);
 
-            let nblocks = modification
-                .tline
-                .get_rel_size(src_rel, req_lsn, true)
-                .no_ondemand_download()?;
+            let nblocks =
+                with_ondemand_download(|| modification.tline.get_rel_size(src_rel, req_lsn, true))
+                    .await?;
             let dst_rel = RelTag {
                 spcnode: tablespace_id,
                 dbnode: db_id,
@@ -572,10 +575,12 @@ impl<'a> WalIngest<'a> {
             for blknum in 0..nblocks {
                 debug!("copying block {} from {} to {}", blknum, src_rel, dst_rel);
 
-                let content = modification
-                    .tline
-                    .get_rel_page_at_lsn(src_rel, blknum, req_lsn, true)
-                    .no_ondemand_download()?;
+                let content = with_ondemand_download(|| {
+                    modification
+                        .tline
+                        .get_rel_page_at_lsn(src_rel, blknum, req_lsn, true)
+                })
+                .await?;
                 modification.put_rel_page_image(dst_rel, blknum, content)?;
                 num_blocks_copied += 1;
             }
@@ -594,7 +599,7 @@ impl<'a> WalIngest<'a> {
         &mut self,
         modification: &mut DatadirModification,
         rec: &XlSmgrCreate,
-    ) -> Result<()> {
+    ) -> anyhow::Result<()> {
         let rel = RelTag {
             spcnode: rec.rnode.spcnode,
             dbnode: rec.rnode.dbnode,
@@ -608,11 +613,11 @@ impl<'a> WalIngest<'a> {
     /// Subroutine of ingest_record(), to handle an XLOG_SMGR_TRUNCATE record.
     ///
     /// This is the same logic as in PostgreSQL's smgr_redo() function.
-    fn ingest_xlog_smgr_truncate(
+    async fn ingest_xlog_smgr_truncate(
         &mut self,
-        modification: &mut DatadirModification,
+        modification: &mut DatadirModification<'_>,
         rec: &XlSmgrTruncate,
-    ) -> Result<()> {
+    ) -> anyhow::Result<()> {
         let spcnode = rec.rnode.spcnode;
         let dbnode = rec.rnode.dbnode;
         let relnode = rec.rnode.relnode;
@@ -642,7 +647,7 @@ impl<'a> WalIngest<'a> {
                 modification.put_rel_page_image(rel, fsm_physical_page_no, ZERO_PAGE.clone())?;
                 fsm_physical_page_no += 1;
             }
-            let nblocks = self.get_relsize(rel, modification.lsn)?;
+            let nblocks = self.get_relsize(rel, modification.lsn).await?;
             if nblocks > fsm_physical_page_no {
                 // check if something to do: FSM is larger than truncate position
                 self.put_rel_truncation(modification, rel, fsm_physical_page_no)?;
@@ -663,7 +668,7 @@ impl<'a> WalIngest<'a> {
                 modification.put_rel_page_image(rel, vm_page_no, ZERO_PAGE.clone())?;
                 vm_page_no += 1;
             }
-            let nblocks = self.get_relsize(rel, modification.lsn)?;
+            let nblocks = self.get_relsize(rel, modification.lsn).await?;
             if nblocks > vm_page_no {
                 // check if something to do: VM is larger than truncate position
                 self.put_rel_truncation(modification, rel, vm_page_no)?;
@@ -674,9 +679,9 @@ impl<'a> WalIngest<'a> {
 
     /// Subroutine of ingest_record(), to handle an XLOG_XACT_* records.
     ///
-    fn ingest_xact_record(
+    async fn ingest_xact_record(
         &mut self,
-        modification: &mut DatadirModification,
+        modification: &mut DatadirModification<'_>,
         parsed: &XlXactParsedRecord,
         is_commit: bool,
     ) -> anyhow::Result<()> {
@@ -735,10 +740,8 @@ impl<'a> WalIngest<'a> {
                     relnode: xnode.relnode,
                 };
                 let last_lsn = self.timeline.get_last_record_lsn();
-                if modification
-                    .tline
-                    .get_rel_exists(rel, last_lsn, true)
-                    .no_ondemand_download()?
+                if with_ondemand_download(|| modification.tline.get_rel_exists(rel, last_lsn, true))
+                    .await?
                 {
                     self.put_rel_drop(modification, rel)?;
                 }
@@ -747,9 +750,9 @@ impl<'a> WalIngest<'a> {
         Ok(())
     }
 
-    fn ingest_clog_truncate_record(
+    async fn ingest_clog_truncate_record(
         &mut self,
-        modification: &mut DatadirModification,
+        modification: &mut DatadirModification<'_>,
         xlrec: &XlClogTruncate,
     ) -> anyhow::Result<()> {
         info!(
@@ -791,11 +794,14 @@ impl<'a> WalIngest<'a> {
         // it. So we use the previous record's LSN in the get calls
         // instead.
         let req_lsn = modification.tline.get_last_record_lsn();
-        for segno in modification
-            .tline
-            .list_slru_segments(SlruKind::Clog, req_lsn)
-            .no_ondemand_download()?
-        {
+
+        let slru_segments = with_ondemand_download(|| {
+            modification
+                .tline
+                .list_slru_segments(SlruKind::Clog, req_lsn)
+        })
+        .await?;
+        for segno in slru_segments {
             let segpage = segno * pg_constants::SLRU_PAGES_PER_SEGMENT;
             if slru_may_delete_clogsegment(segpage, xlrec.pageno) {
                 modification.drop_slru_segment(SlruKind::Clog, segno)?;
@@ -944,27 +950,26 @@ impl<'a> WalIngest<'a> {
         Ok(())
     }
 
-    fn put_rel_page_image(
+    async fn put_rel_page_image(
         &mut self,
-        modification: &mut DatadirModification,
+        modification: &mut DatadirModification<'_>,
         rel: RelTag,
         blknum: BlockNumber,
         img: Bytes,
-    ) -> PageReconstructResult<()> {
-        try_no_ondemand_download!(self.handle_rel_extend(modification, rel, blknum));
-        try_prr!(modification.put_rel_page_image(rel, blknum, img));
-        PageReconstructResult::Success(())
+    ) -> anyhow::Result<()> {
+        self.handle_rel_extend(modification, rel, blknum).await?;
+        modification.put_rel_page_image(rel, blknum, img)?;
+        Ok(())
     }
 
-    fn put_rel_wal_record(
+    async fn put_rel_wal_record(
         &mut self,
-        modification: &mut DatadirModification,
+        modification: &mut DatadirModification<'_>,
         rel: RelTag,
         blknum: BlockNumber,
         rec: NeonWalRecord,
-    ) -> Result<()> {
-        self.handle_rel_extend(modification, rel, blknum)
-            .no_ondemand_download()?;
+    ) -> anyhow::Result<()> {
+        self.handle_rel_extend(modification, rel, blknum).await?;
         modification.put_rel_wal_record(rel, blknum, rec)?;
         Ok(())
     }
@@ -984,69 +989,67 @@ impl<'a> WalIngest<'a> {
         Ok(())
     }
 
-    fn get_relsize(&mut self, rel: RelTag, lsn: Lsn) -> anyhow::Result<BlockNumber> {
-        let nblocks = if !self
-            .timeline
-            .get_rel_exists(rel, lsn, true)
-            .no_ondemand_download()?
-        {
+    async fn get_relsize(&mut self, rel: RelTag, lsn: Lsn) -> anyhow::Result<BlockNumber> {
+        let exists =
+            with_ondemand_download(|| self.timeline.get_rel_exists(rel, lsn, true)).await?;
+        let nblocks = if !exists {
             0
         } else {
-            self.timeline
-                .get_rel_size(rel, lsn, true)
-                .no_ondemand_download()?
+            with_ondemand_download(|| self.timeline.get_rel_size(rel, lsn, true)).await?
         };
         Ok(nblocks)
     }
 
-    fn handle_rel_extend(
+    async fn handle_rel_extend(
         &mut self,
-        modification: &mut DatadirModification,
+        modification: &mut DatadirModification<'_>,
         rel: RelTag,
         blknum: BlockNumber,
-    ) -> PageReconstructResult<()> {
+    ) -> anyhow::Result<()> {
         let new_nblocks = blknum + 1;
         // Check if the relation exists. We implicitly create relations on first
         // record.
         // TODO: would be nice if to be more explicit about it
         let last_lsn = modification.lsn;
         let old_nblocks =
-            if !try_no_ondemand_download!(self.timeline.get_rel_exists(rel, last_lsn, true)) {
+            if !with_ondemand_download(|| self.timeline.get_rel_exists(rel, last_lsn, true)).await?
+            {
                 // create it with 0 size initially, the logic below will extend it
-                try_prr!(modification.put_rel_creation(rel, 0));
+                modification.put_rel_creation(rel, 0)?;
                 0
             } else {
-                try_no_ondemand_download!(self.timeline.get_rel_size(rel, last_lsn, true))
+                with_ondemand_download(|| self.timeline.get_rel_size(rel, last_lsn, true)).await?
             };
 
         if new_nblocks > old_nblocks {
             //info!("extending {} {} to {}", rel, old_nblocks, new_nblocks);
-            try_prr!(modification.put_rel_extend(rel, new_nblocks));
+            modification.put_rel_extend(rel, new_nblocks)?;
 
             // fill the gap with zeros
             for gap_blknum in old_nblocks..blknum {
-                try_prr!(modification.put_rel_page_image(rel, gap_blknum, ZERO_PAGE.clone()));
+                modification.put_rel_page_image(rel, gap_blknum, ZERO_PAGE.clone())?;
             }
         }
-        PageReconstructResult::Success(())
+        Ok(())
     }
 
-    fn put_slru_page_image(
+    async fn put_slru_page_image(
         &mut self,
-        modification: &mut DatadirModification,
+        modification: &mut DatadirModification<'_>,
         kind: SlruKind,
         segno: u32,
         blknum: BlockNumber,
         img: Bytes,
-    ) -> Result<()> {
-        self.handle_slru_extend(modification, kind, segno, blknum)?;
+    ) -> anyhow::Result<()> {
+        self.handle_slru_extend(modification, kind, segno, blknum)
+            .await?;
         modification.put_slru_page_image(kind, segno, blknum, img)?;
         Ok(())
     }
 
-    fn handle_slru_extend(
+    async fn handle_slru_extend(
         &mut self,
-        modification: &mut DatadirModification,
+        modification: &mut DatadirModification<'_>,
         kind: SlruKind,
         segno: u32,
         blknum: BlockNumber,
@@ -1060,18 +1063,17 @@ impl<'a> WalIngest<'a> {
         // record.
         // TODO: would be nice if to be more explicit about it
         let last_lsn = self.timeline.get_last_record_lsn();
-        let old_nblocks = if !self
-            .timeline
-            .get_slru_segment_exists(kind, segno, last_lsn)
-            .no_ondemand_download()?
+        let old_nblocks = if !with_ondemand_download(|| {
+            self.timeline.get_slru_segment_exists(kind, segno, last_lsn)
+        })
+        .await?
         {
             // create it with 0 size initially, the logic below will extend it
             modification.put_slru_segment_creation(kind, segno, 0)?;
             0
         } else {
-            self.timeline
-                .get_slru_segment_size(kind, segno, last_lsn)
-                .no_ondemand_download()?
+            with_ondemand_download(|| self.timeline.get_slru_segment_size(kind, segno, last_lsn))
+                .await?
         };
 
         if new_nblocks > old_nblocks {
@@ -1119,12 +1121,12 @@ mod tests {
 
     static ZERO_CHECKPOINT: Bytes = Bytes::from_static(&[0u8; SIZEOF_CHECKPOINT]);
 
-    fn init_walingest_test(tline: &Timeline) -> Result<WalIngest> {
+    async fn init_walingest_test(tline: &Timeline) -> Result<WalIngest> {
         let mut m = tline.begin_modification(Lsn(0x10));
         m.put_checkpoint(ZERO_CHECKPOINT.clone())?;
         m.put_relmap_file(0, 111, Bytes::from(""))?; // dummy relmapper file
         m.commit()?;
-        let walingest = WalIngest::new(tline, Lsn(0x10)).no_ondemand_download()?;
+        let walingest = WalIngest::new(tline, Lsn(0x10)).await?;
 
         Ok(walingest)
     }
@@ -1133,28 +1135,28 @@ mod tests {
     async fn test_relsize() -> Result<()> {
         let tenant = TenantHarness::create("test_relsize")?.load().await;
         let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?;
-        let mut walingest = init_walingest_test(&tline)?;
+        let mut walingest = init_walingest_test(&tline).await?;
 
         let mut m = tline.begin_modification(Lsn(0x20));
         walingest.put_rel_creation(&mut m, TESTREL_A)?;
         walingest
             .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"))
-            .no_ondemand_download()?;
+            .await?;
         m.commit()?;
         let mut m = tline.begin_modification(Lsn(0x30));
         walingest
             .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 3"))
-            .no_ondemand_download()?;
+            .await?;
         m.commit()?;
         let mut m = tline.begin_modification(Lsn(0x40));
         walingest
             .put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1 at 4"))
-            .no_ondemand_download()?;
+            .await?;
         m.commit()?;
         let mut m = tline.begin_modification(Lsn(0x50));
         walingest
             .put_rel_page_image(&mut m, TESTREL_A, 2, TEST_IMG("foo blk 2 at 5"))
-            .no_ondemand_download()?;
+            .await?;
         m.commit()?;
 
         assert_current_logical_size(&tline, Lsn(0x50));
@@ -1292,7 +1294,7 @@ mod tests {
         let mut m = tline.begin_modification(Lsn(0x70));
         walingest
             .put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1"))
-            .no_ondemand_download()?;
+            .await?;
         m.commit()?;
         assert_eq!(
             tline
@@ -1317,7 +1319,7 @@ mod tests {
         let mut m = tline.begin_modification(Lsn(0x80));
         walingest
             .put_rel_page_image(&mut m, TESTREL_A, 1500, TEST_IMG("foo blk 1500"))
-            .no_ondemand_download()?;
+            .await?;
         m.commit()?;
         assert_eq!(
             tline
@@ -1349,12 +1351,12 @@ mod tests {
     async fn test_drop_extend() -> Result<()> {
         let tenant = TenantHarness::create("test_drop_extend")?.load().await;
         let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?;
-        let mut walingest = init_walingest_test(&tline)?;
+        let mut walingest = init_walingest_test(&tline).await?;
 
         let mut m = tline.begin_modification(Lsn(0x20));
         walingest
             .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"))
-            .no_ondemand_download()?;
+            .await?;
         m.commit()?;
 
         // Check that rel exists and size is correct
@@ -1391,7 +1393,7 @@ mod tests {
         let mut m = tline.begin_modification(Lsn(0x40));
         walingest
             .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 4"))
-            .no_ondemand_download()?;
+            .await?;
         m.commit()?;
 
         // Check that rel exists and size is correct
@@ -1418,7 +1420,7 @@ mod tests {
     async fn test_truncate_extend() -> Result<()> {
         let tenant = TenantHarness::create("test_truncate_extend")?.load().await;
         let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?;
-        let mut walingest = init_walingest_test(&tline)?;
+        let mut walingest = init_walingest_test(&tline).await?;
 
         // Create a 20 MB relation (the size is arbitrary)
         let relsize = 20 * 1024 * 1024 / 8192;
@@ -1427,7 +1429,7 @@ mod tests {
             let data = format!("foo blk {} at {}", blkno, Lsn(0x20));
             walingest
                 .put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data))
-                .no_ondemand_download()?;
+                .await?;
         }
         m.commit()?;
 
@@ -1519,7 +1521,7 @@ mod tests {
             let data = format!("foo blk {} at {}", blkno, lsn);
             walingest
                 .put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data))
-                .no_ondemand_download()?;
+                .await?;
         }
         m.commit()?;
 
@@ -1556,7 +1558,7 @@ mod tests {
     async fn test_large_rel() -> Result<()> {
         let tenant = TenantHarness::create("test_large_rel")?.load().await;
         let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?;
-        let mut walingest = init_walingest_test(&tline)?;
+        let mut walingest = init_walingest_test(&tline).await?;
 
         let mut lsn = 0x10;
         for blknum in 0..RELSEG_SIZE + 1 {
@@ -1565,7 +1567,7 @@ mod tests {
             let img = TEST_IMG(&format!("foo blk {} at {}", blknum, Lsn(lsn)));
             walingest
                 .put_rel_page_image(&mut m, TESTREL_A, blknum as BlockNumber, img)
-                .no_ondemand_download()?;
+                .await?;
             m.commit()?;
         }
 
diff --git a/pageserver/src/walreceiver/walreceiver_connection.rs b/pageserver/src/walreceiver/walreceiver_connection.rs
index 3753807327..06aa132365 100644
--- a/pageserver/src/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/walreceiver/walreceiver_connection.rs
@@ -20,9 +20,7 @@ use tokio::{pin, select, sync::watch, time};
 use tokio_postgres::{replication::ReplicationStream, Client};
 use tracing::{debug, error, info, trace, warn};
 
-use crate::{
-    metrics::LIVE_CONNECTIONS_COUNT, tenant::with_ondemand_download, walreceiver::TaskStateUpdate,
-};
+use crate::{metrics::LIVE_CONNECTIONS_COUNT, walreceiver::TaskStateUpdate};
 use crate::{
     task_mgr,
     task_mgr::TaskKind,
@@ -175,8 +173,7 @@ pub async fn handle_walreceiver_connection(
 
     let mut waldecoder = WalStreamDecoder::new(startpoint, timeline.pg_version);
 
-    let mut walingest =
-        with_ondemand_download(|| WalIngest::new(timeline.as_ref(), startpoint)).await?;
+    let mut walingest = WalIngest::new(timeline.as_ref(), startpoint).await?;
 
     while let Some(replication_message) = {
         select! {
@@ -251,16 +248,10 @@ pub async fn handle_walreceiver_connection(
                         // at risk of hitting a deadlock.
                         ensure!(lsn.is_aligned());
 
-                        with_ondemand_download(|| {
-                            walingest.ingest_record(
-                                recdata.clone(),
-                                lsn,
-                                &mut modification,
-                                &mut decoded,
-                            )
-                        })
-                        .await
-                        .with_context(|| format!("could not ingest record at {lsn}"))?;
+                        walingest
+                            .ingest_record(recdata.clone(), lsn, &mut modification, &mut decoded)
+                            .await
+                            .with_context(|| format!("could not ingest record at {lsn}"))?;
 
                         fail_point!("walreceiver-after-ingest");
 

From 0b428f7c41679876a455505c1ed2dfb4d7dc03c0 Mon Sep 17 00:00:00 2001
From: Vadim Kharitonov <vadim@neon.tech>
Date: Fri, 30 Dec 2022 11:11:28 +0100
Subject: [PATCH 14/42] Enable licenses check for 3rd-parties

---
 .github/workflows/build_and_test.yml   |  6 ++
 compute_tools/Cargo.toml               |  1 +
 control_plane/Cargo.toml               |  1 +
 deny.toml                              | 90 ++++++++++++++++++++++++++
 libs/metrics/Cargo.toml                |  1 +
 libs/pageserver_api/Cargo.toml         |  1 +
 libs/postgres_connection/Cargo.toml    |  1 +
 libs/postgres_ffi/Cargo.toml           |  1 +
 libs/postgres_ffi/wal_craft/Cargo.toml |  2 +-
 libs/pq_proto/Cargo.toml               |  1 +
 libs/remote_storage/Cargo.toml         |  1 +
 libs/safekeeper_api/Cargo.toml         |  1 +
 libs/tenant_size_model/Cargo.toml      |  1 +
 libs/utils/Cargo.toml                  |  1 +
 pageserver/Cargo.toml                  |  1 +
 proxy/Cargo.toml                       |  1 +
 safekeeper/Cargo.toml                  |  1 +
 storage_broker/Cargo.toml              |  1 +
 18 files changed, 112 insertions(+), 1 deletion(-)
 create mode 100644 deny.toml

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 17c698482c..9021ac48d9 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -111,6 +111,7 @@ jobs:
       # Some of our rust modules use FFI and need those to be checked
       - name: Get postgres headers
         run: make postgres-headers -j$(nproc)
+
       - name: Run cargo clippy
         run: ./run_clippy.sh
 
@@ -126,6 +127,11 @@ jobs:
           cargo hakari generate --diff  # workspace-hack Cargo.toml is up-to-date
           cargo hakari manage-deps --dry-run  # all workspace crates depend on workspace-hack
 
+      # https://github.com/EmbarkStudios/cargo-deny
+      - name: Check rust licenses/bans/advisories/sources
+        if: ${{ !cancelled() }}
+        run: cargo deny check
+
   build-neon:
     runs-on: [ self-hosted, dev, x64 ]
     container:
diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml
index c40d870649..4c65649610 100644
--- a/compute_tools/Cargo.toml
+++ b/compute_tools/Cargo.toml
@@ -2,6 +2,7 @@
 name = "compute_tools"
 version = "0.1.0"
 edition = "2021"
+license = "Apache-2.0"
 
 [dependencies]
 anyhow = "1.0"
diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml
index 180508a01a..1c6cd6d882 100644
--- a/control_plane/Cargo.toml
+++ b/control_plane/Cargo.toml
@@ -2,6 +2,7 @@
 name = "control_plane"
 version = "0.1.0"
 edition = "2021"
+license = "Apache-2.0"
 
 [dependencies]
 anyhow = "1.0"
diff --git a/deny.toml b/deny.toml
new file mode 100644
index 0000000000..3a0fe36f87
--- /dev/null
+++ b/deny.toml
@@ -0,0 +1,90 @@
+# This file was auto-generated using `cargo deny init`.
+# cargo-deny is a cargo plugin that lets you lint your project's
+# dependency graph to ensure all your dependencies conform
+# to your expectations and requirements.
+
+# Root options
+targets = []
+all-features = false
+no-default-features = false
+feature-depth = 1
+
+# This section is considered when running `cargo deny check advisories`
+# More documentation for the advisories section can be found here:
+# https://embarkstudios.github.io/cargo-deny/checks/advisories/cfg.html
+[advisories]
+db-urls = ["https://github.com/rustsec/advisory-db"]
+vulnerability = "deny"
+unmaintained = "warn"
+yanked = "warn"
+notice = "warn"
+ignore = []
+
+# This section is considered when running `cargo deny check licenses`
+# More documentation for the licenses section can be found here:
+# https://embarkstudios.github.io/cargo-deny/checks/licenses/cfg.html
+[licenses]
+unlicensed = "deny"
+allow = [
+    "Apache-2.0",
+    "Artistic-2.0",
+    "BSD-2-Clause",
+    "BSD-3-Clause",
+    "ISC",
+    "MIT",
+    "MPL-2.0",
+    "OpenSSL",
+    "Unicode-DFS-2016",
+]
+deny = []
+copyleft = "warn"
+allow-osi-fsf-free = "neither"
+default = "deny"
+confidence-threshold = 0.8
+exceptions = [
+    # The Zlib license has some restrictions if we decide to change something
+    { allow = ["Zlib"], name = "const_format_proc_macros", version = "*" },
+    { allow = ["Zlib"], name = "const_format", version = "*" },
+]
+
+[[licenses.clarify]]
+name = "ring"
+version = "*"
+expression = "MIT AND ISC AND OpenSSL"
+license-files = [
+    { path = "LICENSE", hash = 0xbd0eed23 },
+]
+
+[licenses.private]
+ignore = true
+registries = []
+
+# This section is considered when running `cargo deny check bans`.
+# More documentation about the 'bans' section can be found here:
+# https://embarkstudios.github.io/cargo-deny/checks/bans/cfg.html
+[bans]
+multiple-versions = "warn"
+wildcards = "allow"
+highlight = "all"
+workspace-default-features = "allow"
+external-default-features = "allow"
+allow = []
+deny = []
+skip = []
+skip-tree = []
+
+# This section is considered when running `cargo deny check sources`.
+# More documentation about the 'sources' section can be found here:
+# https://embarkstudios.github.io/cargo-deny/checks/sources/cfg.html
+[sources]
+unknown-registry = "warn"
+unknown-git = "warn"
+allow-registry = ["https://github.com/rust-lang/crates.io-index"]
+allow-git = []
+
+[sources.allow-org]
+github = [
+    "neondatabase",
+]
+gitlab = []
+bitbucket = []
diff --git a/libs/metrics/Cargo.toml b/libs/metrics/Cargo.toml
index d0cd46d2a9..d155f1e07d 100644
--- a/libs/metrics/Cargo.toml
+++ b/libs/metrics/Cargo.toml
@@ -2,6 +2,7 @@
 name = "metrics"
 version = "0.1.0"
 edition = "2021"
+license = "Apache-2.0"
 
 [dependencies]
 prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
diff --git a/libs/pageserver_api/Cargo.toml b/libs/pageserver_api/Cargo.toml
index 2102ae5373..68d4c609f0 100644
--- a/libs/pageserver_api/Cargo.toml
+++ b/libs/pageserver_api/Cargo.toml
@@ -2,6 +2,7 @@
 name = "pageserver_api"
 version = "0.1.0"
 edition = "2021"
+license = "Apache-2.0"
 
 [dependencies]
 serde = { version = "1.0", features = ["derive"] }
diff --git a/libs/postgres_connection/Cargo.toml b/libs/postgres_connection/Cargo.toml
index 1924b260fa..12b7abcc93 100644
--- a/libs/postgres_connection/Cargo.toml
+++ b/libs/postgres_connection/Cargo.toml
@@ -2,6 +2,7 @@
 name = "postgres_connection"
 version = "0.1.0"
 edition = "2021"
+license = "Apache-2.0"
 
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
diff --git a/libs/postgres_ffi/Cargo.toml b/libs/postgres_ffi/Cargo.toml
index 59eec3de32..aa076b08d3 100644
--- a/libs/postgres_ffi/Cargo.toml
+++ b/libs/postgres_ffi/Cargo.toml
@@ -2,6 +2,7 @@
 name = "postgres_ffi"
 version = "0.1.0"
 edition = "2021"
+license = "Apache-2.0"
 
 [dependencies]
 rand = "0.8.3"
diff --git a/libs/postgres_ffi/wal_craft/Cargo.toml b/libs/postgres_ffi/wal_craft/Cargo.toml
index dd9f82a87a..abfc263550 100644
--- a/libs/postgres_ffi/wal_craft/Cargo.toml
+++ b/libs/postgres_ffi/wal_craft/Cargo.toml
@@ -2,7 +2,7 @@
 name = "wal_craft"
 version = "0.1.0"
 edition = "2021"
-
+license = "Apache-2.0"
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
 [dependencies]
diff --git a/libs/pq_proto/Cargo.toml b/libs/pq_proto/Cargo.toml
index 76d8fbf28d..daa0b593be 100644
--- a/libs/pq_proto/Cargo.toml
+++ b/libs/pq_proto/Cargo.toml
@@ -2,6 +2,7 @@
 name = "pq_proto"
 version = "0.1.0"
 edition = "2021"
+license = "Apache-2.0"
 
 [dependencies]
 anyhow = "1.0"
diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml
index ebd30fc1eb..5a39f27209 100644
--- a/libs/remote_storage/Cargo.toml
+++ b/libs/remote_storage/Cargo.toml
@@ -2,6 +2,7 @@
 name = "remote_storage"
 version = "0.1.0"
 edition = "2021"
+license = "Apache-2.0"
 
 [dependencies]
 anyhow = { version = "1.0", features = ["backtrace"] }
diff --git a/libs/safekeeper_api/Cargo.toml b/libs/safekeeper_api/Cargo.toml
index 15bdecd71d..32cda78be4 100644
--- a/libs/safekeeper_api/Cargo.toml
+++ b/libs/safekeeper_api/Cargo.toml
@@ -2,6 +2,7 @@
 name = "safekeeper_api"
 version = "0.1.0"
 edition = "2021"
+license = "Apache-2.0"
 
 [dependencies]
 serde = { version = "1.0", features = ["derive"] }
diff --git a/libs/tenant_size_model/Cargo.toml b/libs/tenant_size_model/Cargo.toml
index 1aabf5a4f9..3a1a0f7915 100644
--- a/libs/tenant_size_model/Cargo.toml
+++ b/libs/tenant_size_model/Cargo.toml
@@ -3,6 +3,7 @@ name = "tenant_size_model"
 version = "0.1.0"
 edition = "2021"
 publish = false
+license = "Apache-2.0"
 
 [dependencies]
 workspace_hack = { version = "0.1", path = "../../workspace_hack" }
diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml
index 670270b63e..9c7fcafe23 100644
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -2,6 +2,7 @@
 name = "utils"
 version = "0.1.0"
 edition = "2021"
+license = "Apache-2.0"
 
 [dependencies]
 sentry = { version = "0.29.0", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] }
diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml
index c0f3c76c4e..8f112fa670 100644
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -2,6 +2,7 @@
 name = "pageserver"
 version = "0.1.0"
 edition = "2021"
+license = "Apache-2.0"
 
 [features]
 default = []
diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml
index e630b2758d..0bf47c7b88 100644
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -2,6 +2,7 @@
 name = "proxy"
 version = "0.1.0"
 edition = "2021"
+license = "Apache-2.0"
 
 [dependencies]
 anyhow = "1.0"
diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml
index fbcb3f34f7..d0c804fe4e 100644
--- a/safekeeper/Cargo.toml
+++ b/safekeeper/Cargo.toml
@@ -2,6 +2,7 @@
 name = "safekeeper"
 version = "0.1.0"
 edition = "2021"
+license = "Apache-2.0"
 
 [dependencies]
 async-stream = "0.3"
diff --git a/storage_broker/Cargo.toml b/storage_broker/Cargo.toml
index 7aa33a5234..180c506254 100644
--- a/storage_broker/Cargo.toml
+++ b/storage_broker/Cargo.toml
@@ -2,6 +2,7 @@
 name = "storage_broker"
 version = "0.1.0"
 edition = "2021"
+license = "Apache-2.0"
 
 [features]
 bench = []

From e9583db73b3a930ca2cbf9267c5e05285cc1016f Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Tue, 3 Jan 2023 20:11:32 +0200
Subject: [PATCH 15/42] Remove code and test to generate flamegraph on GetPage
 requests. (#3257)

It was nice to have and useful at the time, but unfortunately the method
used to gather the profiling data doesn't play nicely with 'async'. PR
#3228 will turn the 'get_page_at_lsn' function async, which will break the
profiling support. Let's remove it, and re-introduce some kind of
profiling later, using some different method, if we feel like we need it
again.
---
 .github/workflows/build_and_test.yml         |   3 +-
 Cargo.lock                                   | 222 ++-----------------
 pageserver/Cargo.toml                        |   3 -
 pageserver/src/bin/pageserver.rs             |   9 +-
 pageserver/src/config.rs                     |  31 ---
 pageserver/src/lib.rs                        |   1 -
 pageserver/src/page_service.rs               |  11 +-
 pageserver/src/profiling.rs                  | 107 ---------
 run_clippy.sh                                |   4 +-
 test_runner/fixtures/neon_fixtures.py        |   4 -
 test_runner/fixtures/utils.py                |   2 +-
 test_runner/performance/README.md            |   8 +-
 test_runner/performance/test_perf_pgbench.py |  24 +-
 workspace_hack/Cargo.toml                    |   3 -
 14 files changed, 26 insertions(+), 406 deletions(-)
 delete mode 100644 pageserver/src/profiling.rs

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 9021ac48d9..2b0b0ba2bf 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -183,13 +183,12 @@ jobs:
       # corresponding Cargo.toml files for their descriptions.
       - name: Set env variables
         run: |
+          CARGO_FEATURES="--features testing"
           if [[ $BUILD_TYPE == "debug" ]]; then
             cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run"
-            CARGO_FEATURES="--features testing"
             CARGO_FLAGS="--locked $CARGO_FEATURES"
           elif [[ $BUILD_TYPE == "release" ]]; then
             cov_prefix=""
-            CARGO_FEATURES="--features testing,profiling"
             CARGO_FLAGS="--locked --release $CARGO_FEATURES"
           fi
           echo "cov_prefix=${cov_prefix}" >> $GITHUB_ENV
diff --git a/Cargo.lock b/Cargo.lock
index ad1fc67219..246d481ef9 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -66,12 +66,6 @@ dependencies = [
  "backtrace",
 ]
 
-[[package]]
-name = "arrayvec"
-version = "0.7.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8da52d66c7071e2e3fa2a1e5c6d088fec47b593032b254f5e980de8ea54454d6"
-
 [[package]]
 name = "asn1-rs"
 version = "0.5.1"
@@ -633,12 +627,6 @@ version = "3.11.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "572f695136211188308f16ad2ca5c851a712c464060ae6974944458eb83880ba"
 
-[[package]]
-name = "bytemuck"
-version = "1.12.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "aaa3a8d9a1ca92e282c96a32d6511b695d7d994d1d102ba85d279f9b2756947f"
-
 [[package]]
 name = "byteorder"
 version = "1.4.3"
@@ -899,7 +887,7 @@ dependencies = [
  "clap 4.0.29",
  "comfy-table",
  "git-version",
- "nix 0.25.1",
+ "nix",
  "once_cell",
  "pageserver_api",
  "postgres",
@@ -934,15 +922,6 @@ version = "0.8.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "5827cebf4670468b8772dd191856768aedcb1b0278a04f989f7766351917b9dc"
 
-[[package]]
-name = "cpp_demangle"
-version = "0.3.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "eeaa953eaad386a53111e47172c2fedba671e5684c8dd601a5f474f4f118710f"
-dependencies = [
- "cfg-if",
-]
-
 [[package]]
 name = "cpufeatures"
 version = "0.2.5"
@@ -1066,7 +1045,7 @@ dependencies = [
  "crossterm_winapi",
  "libc",
  "mio",
- "parking_lot 0.12.1",
+ "parking_lot",
  "signal-hook",
  "signal-hook-mio",
  "winapi",
@@ -1176,15 +1155,6 @@ version = "2.3.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "23d8666cb01533c39dde32bcbab8e227b4ed6679b2c925eba05feabea39508fb"
 
-[[package]]
-name = "debugid"
-version = "0.7.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d6ee87af31d84ef885378aebca32be3d682b0e0dc119d5b4860a2c5bb5046730"
-dependencies = [
- "uuid 0.8.2",
-]
-
 [[package]]
 name = "debugid"
 version = "0.8.0"
@@ -1192,7 +1162,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "bef552e6f588e446098f6ba40d89ac146c8c7b64aade83c051ee00bb5d2bc18d"
 dependencies = [
  "serde",
- "uuid 1.2.2",
+ "uuid",
 ]
 
 [[package]]
@@ -1318,18 +1288,6 @@ dependencies = [
  "windows-sys 0.42.0",
 ]
 
-[[package]]
-name = "findshlibs"
-version = "0.10.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "40b9e59cd0f7e0806cca4be089683ecb6434e602038df21fe6bf6711b2f07f64"
-dependencies = [
- "cc",
- "lazy_static",
- "libc",
- "winapi",
-]
-
 [[package]]
 name = "fixedbitset"
 version = "0.4.2"
@@ -1793,24 +1751,6 @@ dependencies = [
  "serde",
 ]
 
-[[package]]
-name = "inferno"
-version = "0.10.12"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "de3886428c6400486522cf44b8626e7b94ad794c14390290f2a274dcf728a58f"
-dependencies = [
- "ahash",
- "atty",
- "indexmap",
- "itoa",
- "lazy_static",
- "log",
- "num-format",
- "quick-xml",
- "rgb",
- "str_stack",
-]
-
 [[package]]
 name = "inotify"
 version = "0.9.6"
@@ -2037,15 +1977,6 @@ version = "2.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d"
 
-[[package]]
-name = "memmap2"
-version = "0.5.8"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4b182332558b18d807c4ce1ca8ca983b34c3ee32765e47b3f0f69b90355cc1dc"
-dependencies = [
- "libc",
-]
-
 [[package]]
 name = "memoffset"
 version = "0.6.5"
@@ -2113,19 +2044,6 @@ version = "0.8.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a"
 
-[[package]]
-name = "nix"
-version = "0.23.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8f3790c00a0150112de0f4cd161e3d7fc4b2d8a5542ffc35f099a2562aecb35c"
-dependencies = [
- "bitflags",
- "cc",
- "cfg-if",
- "libc",
- "memoffset 0.6.5",
-]
-
 [[package]]
 name = "nix"
 version = "0.25.1"
@@ -2189,16 +2107,6 @@ dependencies = [
  "num-traits",
 ]
 
-[[package]]
-name = "num-format"
-version = "0.4.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a652d9771a63711fd3c3deb670acfbe5c30a4072e664d7a3bf5a9e1056ac72c3"
-dependencies = [
- "arrayvec",
- "itoa",
-]
-
 [[package]]
 name = "num-integer"
 version = "0.1.45"
@@ -2315,7 +2223,7 @@ dependencies = [
  "hyper",
  "itertools",
  "metrics",
- "nix 0.25.1",
+ "nix",
  "num-traits",
  "once_cell",
  "pageserver_api",
@@ -2325,7 +2233,6 @@ dependencies = [
  "postgres-types",
  "postgres_connection",
  "postgres_ffi",
- "pprof",
  "pq_proto",
  "rand",
  "regex",
@@ -2369,17 +2276,6 @@ dependencies = [
  "workspace_hack",
 ]
 
-[[package]]
-name = "parking_lot"
-version = "0.11.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7d17b78036a60663b797adeaee46f5c9dfebb86948d1255007a1d6be0271ff99"
-dependencies = [
- "instant",
- "lock_api",
- "parking_lot_core 0.8.5",
-]
-
 [[package]]
 name = "parking_lot"
 version = "0.12.1"
@@ -2387,21 +2283,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f"
 dependencies = [
  "lock_api",
- "parking_lot_core 0.9.5",
-]
-
-[[package]]
-name = "parking_lot_core"
-version = "0.8.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d76e8e1493bcac0d2766c42737f34458f1c8c50c0d23bcb24ea953affb273216"
-dependencies = [
- "cfg-if",
- "instant",
- "libc",
- "redox_syscall",
- "smallvec",
- "winapi",
+ "parking_lot_core",
 ]
 
 [[package]]
@@ -2604,25 +2486,6 @@ dependencies = [
  "workspace_hack",
 ]
 
-[[package]]
-name = "pprof"
-version = "0.6.1"
-source = "git+https://github.com/neondatabase/pprof-rs.git?branch=wallclock-profiling#4e011a87d22fb4d21d15cc38bce81ff1c75e4bc9"
-dependencies = [
- "backtrace",
- "cfg-if",
- "findshlibs",
- "inferno",
- "lazy_static",
- "libc",
- "log",
- "nix 0.23.2",
- "parking_lot 0.11.2",
- "symbolic-demangle",
- "tempfile",
- "thiserror",
-]
-
 [[package]]
 name = "ppv-lite86"
 version = "0.2.17"
@@ -2717,7 +2580,7 @@ dependencies = [
  "lazy_static",
  "libc",
  "memchr",
- "parking_lot 0.12.1",
+ "parking_lot",
  "procfs",
  "thiserror",
 ]
@@ -2798,7 +2661,7 @@ dependencies = [
  "md5",
  "metrics",
  "once_cell",
- "parking_lot 0.12.1",
+ "parking_lot",
  "pin-project-lite",
  "pq_proto",
  "rand",
@@ -2822,20 +2685,11 @@ dependencies = [
  "tracing-subscriber",
  "url",
  "utils",
- "uuid 1.2.2",
+ "uuid",
  "workspace_hack",
  "x509-parser",
 ]
 
-[[package]]
-name = "quick-xml"
-version = "0.22.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8533f14c8382aaad0d592c812ac3b826162128b65662331e1127b45c3d18536b"
-dependencies = [
- "memchr",
-]
-
 [[package]]
 name = "quote"
 version = "1.0.21"
@@ -3027,15 +2881,6 @@ dependencies = [
  "winreg",
 ]
 
-[[package]]
-name = "rgb"
-version = "0.8.34"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3603b7d71ca82644f79b5a06d1220e9a58ede60bd32255f698cb1af8838b8db3"
-dependencies = [
- "bytemuck",
-]
-
 [[package]]
 name = "ring"
 version = "0.16.20"
@@ -3216,9 +3061,9 @@ dependencies = [
  "humantime",
  "hyper",
  "metrics",
- "nix 0.25.1",
+ "nix",
  "once_cell",
- "parking_lot 0.12.1",
+ "parking_lot",
  "postgres",
  "postgres-protocol",
  "postgres_ffi",
@@ -3396,7 +3241,7 @@ version = "0.29.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ccc95faa4078768a6bf8df45e2b894bbf372b3dbbfb364e9429c1c58ab7545c6"
 dependencies = [
- "debugid 0.8.0",
+ "debugid",
  "getrandom",
  "hex",
  "serde",
@@ -3404,7 +3249,7 @@ dependencies = [
  "thiserror",
  "time",
  "url",
- "uuid 1.2.2",
+ "uuid",
 ]
 
 [[package]]
@@ -3626,7 +3471,7 @@ dependencies = [
  "hyper",
  "metrics",
  "once_cell",
- "parking_lot 0.12.1",
+ "parking_lot",
  "prost",
  "tokio",
  "tokio-stream",
@@ -3637,12 +3482,6 @@ dependencies = [
  "workspace_hack",
 ]
 
-[[package]]
-name = "str_stack"
-version = "0.1.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9091b6114800a5f2141aee1d1b9d6ca3592ac062dc5decb3764ec5895a47b4eb"
-
 [[package]]
 name = "stringprep"
 version = "0.1.2"
@@ -3690,29 +3529,6 @@ version = "0.4.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "8fb1df15f412ee2e9dfc1c504260fa695c1c3f10fe9f4a6ee2d2184d7d6450e2"
 
-[[package]]
-name = "symbolic-common"
-version = "8.8.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f551f902d5642e58039aee6a9021a61037926af96e071816361644983966f540"
-dependencies = [
- "debugid 0.7.3",
- "memmap2",
- "stable_deref_trait",
- "uuid 0.8.2",
-]
-
-[[package]]
-name = "symbolic-demangle"
-version = "8.8.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4564ca7b4e6eb14105aa8bbbce26e080f6b5d9c4373e67167ab31f7b86443750"
-dependencies = [
- "cpp_demangle",
- "rustc-demangle",
- "symbolic-common",
-]
-
 [[package]]
 name = "syn"
 version = "1.0.105"
@@ -3923,7 +3739,7 @@ dependencies = [
  "futures-channel",
  "futures-util",
  "log",
- "parking_lot 0.12.1",
+ "parking_lot",
  "percent-encoding",
  "phf",
  "pin-project-lite",
@@ -4314,7 +4130,7 @@ dependencies = [
  "hyper",
  "jsonwebtoken",
  "metrics",
- "nix 0.25.1",
+ "nix",
  "once_cell",
  "pq_proto",
  "rand",
@@ -4338,12 +4154,6 @@ dependencies = [
  "workspace_hack",
 ]
 
-[[package]]
-name = "uuid"
-version = "0.8.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bc5cf98d8186244414c848017f0e2676b3fcb46807f6668a97dfe67359a3c4b7"
-
 [[package]]
 name = "uuid"
 version = "1.2.2"
@@ -4658,7 +4468,6 @@ dependencies = [
 name = "workspace_hack"
 version = "0.1.0"
 dependencies = [
- "ahash",
  "anyhow",
  "bytes",
  "chrono",
@@ -4686,7 +4495,6 @@ dependencies = [
  "serde",
  "serde_json",
  "socket2",
- "stable_deref_trait",
  "syn",
  "tokio",
  "tokio-util",
diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml
index 8f112fa670..1854b6762f 100644
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -10,8 +10,6 @@ default = []
 # which adds some runtime cost to run tests on outage conditions
 testing = ["fail/failpoints"]
 
-profiling = ["pprof"]
-
 [dependencies]
 amplify_num = { git = "https://github.com/hlinnaka/rust-amplify.git", branch = "unsigned-int-perf" }
 anyhow = { version = "1.0", features = ["backtrace"] }
@@ -40,7 +38,6 @@ pin-project-lite = "0.2.7"
 postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
 postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
 postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
-pprof = { git = "https://github.com/neondatabase/pprof-rs.git", branch = "wallclock-profiling", features = ["flamegraph"], optional = true }
 rand = "0.8.3"
 regex = "1.4.5"
 rstar = "0.9.3"
diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs
index a124bf85c2..18ec1ac68b 100644
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -13,7 +13,7 @@ use tracing::*;
 use metrics::set_build_info_metric;
 use pageserver::{
     config::{defaults::*, PageServerConf},
-    http, page_cache, page_service, profiling, task_mgr,
+    http, page_cache, page_service, task_mgr,
     task_mgr::TaskKind,
     task_mgr::{
         BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME, WALRECEIVER_RUNTIME,
@@ -40,8 +40,6 @@ const FEATURES: &[&str] = &[
     "testing",
     #[cfg(feature = "fail/failpoints")]
     "fail/failpoints",
-    #[cfg(feature = "profiling")]
-    "profiling",
 ];
 
 fn version() -> String {
@@ -247,9 +245,6 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
     // Install signal handlers
     let signals = signals::install_shutdown_handlers()?;
 
-    // Start profiler (if enabled)
-    let profiler_guard = profiling::init_profiler(conf);
-
     // Launch broker client
     WALRECEIVER_RUNTIME.block_on(pageserver::walreceiver::init_broker_client(conf))?;
 
@@ -372,7 +367,6 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
                 "Got {}. Terminating in immediate shutdown mode",
                 signal.name()
             );
-            profiling::exit_profiler(conf, &profiler_guard);
             std::process::exit(111);
         }
 
@@ -381,7 +375,6 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
                 "Got {}. Terminating gracefully in fast shutdown mode",
                 signal.name()
             );
-            profiling::exit_profiler(conf, &profiler_guard);
             BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(0));
             unreachable!()
         }
diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index deb79531a4..7b99d98581 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -138,7 +138,6 @@ pub struct PageServerConf {
     pub auth_validation_public_key_path: Option<PathBuf>,
     pub remote_storage_config: Option<RemoteStorageConfig>,
 
-    pub profiling: ProfilingConfig,
     pub default_tenant_conf: TenantConf,
 
     /// Storage broker endpoints to connect to.
@@ -165,25 +164,6 @@ pub struct PageServerConf {
 /// startup code to the connection code through a dozen layers.
 pub static SAFEKEEPER_AUTH_TOKEN: OnceCell<Arc<String>> = OnceCell::new();
 
-#[derive(Debug, Clone, PartialEq, Eq)]
-pub enum ProfilingConfig {
-    Disabled,
-    PageRequests,
-}
-
-impl FromStr for ProfilingConfig {
-    type Err = anyhow::Error;
-
-    fn from_str(s: &str) -> Result<ProfilingConfig, Self::Err> {
-        let result = match s {
-            "disabled"  => ProfilingConfig::Disabled,
-            "page_requests"  => ProfilingConfig::PageRequests,
-            _ => bail!("invalid value \"{s}\" for profiling option, valid values are \"disabled\" and \"page_requests\""),
-        };
-        Ok(result)
-    }
-}
-
 // use dedicated enum for builder to better indicate the intention
 // and avoid possible confusion with nested options
 pub enum BuilderValue<T> {
@@ -226,7 +206,6 @@ struct PageServerConfigBuilder {
 
     id: BuilderValue<NodeId>,
 
-    profiling: BuilderValue<ProfilingConfig>,
     broker_endpoint: BuilderValue<Uri>,
     broker_keepalive_interval: BuilderValue<Duration>,
 
@@ -262,7 +241,6 @@ impl Default for PageServerConfigBuilder {
             auth_validation_public_key_path: Set(None),
             remote_storage_config: Set(None),
             id: NotSet,
-            profiling: Set(ProfilingConfig::Disabled),
             broker_endpoint: Set(storage_broker::DEFAULT_ENDPOINT
                 .parse()
                 .expect("failed to parse default broker endpoint")),
@@ -348,10 +326,6 @@ impl PageServerConfigBuilder {
         self.id = BuilderValue::Set(node_id)
     }
 
-    pub fn profiling(&mut self, profiling: ProfilingConfig) {
-        self.profiling = BuilderValue::Set(profiling)
-    }
-
     pub fn log_format(&mut self, log_format: LogFormat) {
         self.log_format = BuilderValue::Set(log_format)
     }
@@ -405,7 +379,6 @@ impl PageServerConfigBuilder {
                 .remote_storage_config
                 .ok_or(anyhow!("missing remote_storage_config"))?,
             id: self.id.ok_or(anyhow!("missing id"))?,
-            profiling: self.profiling.ok_or(anyhow!("missing profiling"))?,
             // TenantConf is handled separately
             default_tenant_conf: TenantConf::default(),
             broker_endpoint: self
@@ -588,7 +561,6 @@ impl PageServerConf {
                     t_conf = Self::parse_toml_tenant_conf(item)?;
                 }
                 "id" => builder.id(NodeId(parse_toml_u64(key, item)?)),
-                "profiling" => builder.profiling(parse_toml_from_str(key, item)?),
                 "broker_endpoint" => builder.broker_endpoint(parse_toml_string(key, item)?.parse().context("failed to parse broker endpoint")?),
                 "broker_keepalive_interval" => builder.broker_keepalive_interval(parse_toml_duration(key, item)?),
                 "log_format" => builder.log_format(
@@ -722,7 +694,6 @@ impl PageServerConf {
             auth_type: AuthType::Trust,
             auth_validation_public_key_path: None,
             remote_storage_config: None,
-            profiling: ProfilingConfig::Disabled,
             default_tenant_conf: TenantConf::default(),
             broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(),
             broker_keepalive_interval: Duration::from_secs(5000),
@@ -898,7 +869,6 @@ log_format = 'json'
                 auth_type: AuthType::Trust,
                 auth_validation_public_key_path: None,
                 remote_storage_config: None,
-                profiling: ProfilingConfig::Disabled,
                 default_tenant_conf: TenantConf::default(),
                 broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(),
                 broker_keepalive_interval: humantime::parse_duration(
@@ -949,7 +919,6 @@ log_format = 'json'
                 auth_type: AuthType::Trust,
                 auth_validation_public_key_path: None,
                 remote_storage_config: None,
-                profiling: ProfilingConfig::Disabled,
                 default_tenant_conf: TenantConf::default(),
                 broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(),
                 broker_keepalive_interval: Duration::from_secs(5),
diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs
index 2f78c199b9..91cde477ad 100644
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -9,7 +9,6 @@ pub(crate) mod metrics;
 pub mod page_cache;
 pub mod page_service;
 pub mod pgdatadir_mapping;
-pub mod profiling;
 pub mod repository;
 pub mod task_mgr;
 pub mod tenant;
diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index 5393fca780..f123168211 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -39,10 +39,9 @@ use utils::{
 
 use crate::auth::check_permission;
 use crate::basebackup;
-use crate::config::{PageServerConf, ProfilingConfig};
+use crate::config::PageServerConf;
 use crate::import_datadir::import_wal_from_tar;
 use crate::metrics::{LIVE_CONNECTIONS_COUNT, SMGR_QUERY_TIME};
-use crate::profiling::profpoint_start;
 use crate::task_mgr;
 use crate::task_mgr::TaskKind;
 use crate::tenant::mgr;
@@ -250,7 +249,7 @@ impl PageRequestMetrics {
 
 #[derive(Debug)]
 struct PageServerHandler {
-    conf: &'static PageServerConf,
+    _conf: &'static PageServerConf,
     auth: Option<Arc<JwtAuth>>,
     claims: Option<Claims>,
 }
@@ -258,7 +257,7 @@ struct PageServerHandler {
 impl PageServerHandler {
     pub fn new(conf: &'static PageServerConf, auth: Option<Arc<JwtAuth>>) -> Self {
         PageServerHandler {
-            conf,
+            _conf: conf,
             auth,
             claims: None,
         }
@@ -604,10 +603,6 @@ impl PageServerHandler {
         */
 
         let page = crate::tenant::with_ondemand_download(|| {
-            // FIXME: this profiling now happens at different place than it used to. The
-            // current profiling is based on a thread-local variable, so it doesn't work
-            // across awaits
-            let _profiling_guard = profpoint_start(self.conf, ProfilingConfig::PageRequests);
             timeline.get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest)
         })
         .await?;
diff --git a/pageserver/src/profiling.rs b/pageserver/src/profiling.rs
deleted file mode 100644
index ad896cfa30..0000000000
--- a/pageserver/src/profiling.rs
+++ /dev/null
@@ -1,107 +0,0 @@
-//!
-//! Support for profiling
-//!
-//! This relies on a modified version of the 'pprof-rs' crate. That's not very
-//! nice, so to avoid a hard dependency on that, this is an optional feature.
-//!
-use crate::config::{PageServerConf, ProfilingConfig};
-
-/// The actual implementation is in the `profiling_impl` submodule. If the profiling
-/// feature is not enabled, it's just a dummy implementation that panics if you
-/// try to enabled profiling in the configuration.
-pub use profiling_impl::*;
-
-#[cfg(feature = "profiling")]
-mod profiling_impl {
-    use super::*;
-    use pprof;
-    use std::marker::PhantomData;
-
-    /// Start profiling the current thread. Returns a guard object;
-    /// the profiling continues until the guard is dropped.
-    ///
-    /// Note: profiling is not re-entrant. If you call 'profpoint_start' while
-    /// profiling is already started, nothing happens, and the profiling will be
-    /// stopped when either guard object is dropped.
-    #[inline]
-    pub fn profpoint_start(
-        conf: &crate::config::PageServerConf,
-        point: ProfilingConfig,
-    ) -> Option<ProfilingGuard> {
-        if conf.profiling == point {
-            pprof::start_profiling();
-            Some(ProfilingGuard(PhantomData))
-        } else {
-            None
-        }
-    }
-
-    /// A hack to remove Send and Sync from the ProfilingGuard. Because the
-    /// profiling is attached to current thread.
-    ////
-    /// See comments in https://github.com/rust-lang/rust/issues/68318
-    type PhantomUnsend = std::marker::PhantomData<*mut u8>;
-
-    pub struct ProfilingGuard(PhantomUnsend);
-
-    impl Drop for ProfilingGuard {
-        fn drop(&mut self) {
-            pprof::stop_profiling();
-        }
-    }
-
-    /// Initialize the profiler. This must be called before any 'profpoint_start' calls.
-    pub fn init_profiler(conf: &PageServerConf) -> Option<pprof::ProfilerGuard> {
-        if conf.profiling != ProfilingConfig::Disabled {
-            Some(pprof::ProfilerGuardBuilder::default().build().unwrap())
-        } else {
-            None
-        }
-    }
-
-    /// Exit the profiler. Writes the flamegraph to current workdir.
-    pub fn exit_profiler(_conf: &PageServerConf, profiler_guard: &Option<pprof::ProfilerGuard>) {
-        // Write out the flamegraph
-        if let Some(profiler_guard) = profiler_guard {
-            if let Ok(report) = profiler_guard.report().build() {
-                // this gets written under the workdir
-                let file = std::fs::File::create("flamegraph.svg").unwrap();
-                let mut options = pprof::flamegraph::Options::default();
-                options.image_width = Some(2500);
-                report.flamegraph_with_options(file, &mut options).unwrap();
-            }
-        }
-    }
-}
-
-/// Dummy implementation when compiling without profiling feature or for non-linux OSes.
-#[cfg(not(feature = "profiling"))]
-mod profiling_impl {
-    use super::*;
-
-    pub struct DummyProfilerGuard;
-
-    impl Drop for DummyProfilerGuard {
-        fn drop(&mut self) {
-            // do nothing, this exists to calm Clippy down
-        }
-    }
-
-    pub fn profpoint_start(
-        _conf: &PageServerConf,
-        _point: ProfilingConfig,
-    ) -> Option<DummyProfilerGuard> {
-        None
-    }
-
-    pub fn init_profiler(conf: &PageServerConf) -> Option<DummyProfilerGuard> {
-        if conf.profiling != ProfilingConfig::Disabled {
-            // shouldn't happen, we don't allow profiling in the config if the support
-            // for it is disabled.
-            panic!("profiling enabled but the binary was compiled without profiling support");
-        }
-        None
-    }
-
-    pub fn exit_profiler(_conf: &PageServerConf, _guard: &Option<DummyProfilerGuard>) {}
-}
diff --git a/run_clippy.sh b/run_clippy.sh
index bf770432d0..fe0e745d7d 100755
--- a/run_clippy.sh
+++ b/run_clippy.sh
@@ -9,8 +9,8 @@
 # In vscode, this setting is Rust-analyzer>Check On Save:Command
 
 
-# Not every feature is supported in macOS builds, e.g. `profiling`,
-# avoid running regular linting script that checks every feature.
+# Not every feature is supported in macOS builds. Avoid running regular linting
+# script that checks every feature.
 if [[ "$OSTYPE" == "darwin"* ]]; then
     # no extra features to test currently, add more here when needed
     cargo clippy --locked --all --all-targets --features testing -- -A unknown_lints -D warnings
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 5b00ebdea7..705ab70ab4 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1980,10 +1980,6 @@ class NeonPageserver(PgProtocol):
         if '"testing"' not in self.version:
             pytest.skip("pageserver was built without 'testing' feature")
 
-    def is_profiling_enabled_or_skip(self):
-        if '"profiling"' not in self.version:
-            pytest.skip("pageserver was built without 'profiling' feature")
-
     def http_client(self, auth_token: Optional[str] = None) -> PageserverHttpClient:
         return PageserverHttpClient(
             port=self.service_port.http,
diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py
index 1fb9eb72e6..df83fc6377 100644
--- a/test_runner/fixtures/utils.py
+++ b/test_runner/fixtures/utils.py
@@ -148,7 +148,7 @@ def get_scale_for_db(size_mb: int) -> int:
 
 
 ATTACHMENT_NAME_REGEX: re.Pattern = re.compile(  # type: ignore[type-arg]
-    r"flamegraph\.svg|regression\.diffs|.+\.(?:log|stderr|stdout|filediff|metrics|html)"
+    r"regression\.diffs|.+\.(?:log|stderr|stdout|filediff|metrics|html)"
 )
 
 
diff --git a/test_runner/performance/README.md b/test_runner/performance/README.md
index a32ce87c33..c1a57fb28b 100644
--- a/test_runner/performance/README.md
+++ b/test_runner/performance/README.md
@@ -1,12 +1,8 @@
 # Running locally
 
-First make a release build. The profiling flag is optional, used only for tests that
-generate flame graphs. The `-s` flag just silences a lot of output, and makes it
+First make a release build. The `-s` flag silences a lot of output, and makes it
 easier to see if you have compile errors without scrolling up.
-`BUILD_TYPE=release CARGO_BUILD_FLAGS="--features=testing,profiling" make -s -j8`
-
-NOTE: the `profiling` flag only works on linux because we use linux-specific
-libc APIs like `libc::timer_t`.
+`BUILD_TYPE=release CARGO_BUILD_FLAGS="--features=testing" make -s -j8`
 
 Then run the tests
 `NEON_BIN=./target/release poetry run pytest test_runner/performance"`
diff --git a/test_runner/performance/test_perf_pgbench.py b/test_runner/performance/test_perf_pgbench.py
index 50e5366c1e..2b8760dff2 100644
--- a/test_runner/performance/test_perf_pgbench.py
+++ b/test_runner/performance/test_perf_pgbench.py
@@ -8,7 +8,7 @@ from typing import Dict, List
 
 import pytest
 from fixtures.benchmark_fixture import MetricReport, PgBenchInitResult, PgBenchRunResult
-from fixtures.compare_fixtures import NeonCompare, PgCompare
+from fixtures.compare_fixtures import PgCompare
 from fixtures.utils import get_scale_for_db
 
 
@@ -176,28 +176,6 @@ def test_pgbench(neon_with_baseline: PgCompare, scale: int, duration: int):
     run_test_pgbench(neon_with_baseline, scale, duration, PgBenchLoadType.SELECT_ONLY)
 
 
-# Run the pgbench tests, and generate a flamegraph from it
-# This requires that the pageserver was built with the 'profiling' feature.
-#
-# TODO: If the profiling is cheap enough, there's no need to run the same test
-# twice, with and without profiling. But for now, run it separately, so that we
-# can see how much overhead the profiling adds.
-@pytest.mark.parametrize("scale", get_scales_matrix())
-@pytest.mark.parametrize("duration", get_durations_matrix())
-def test_pgbench_flamegraph(zenbenchmark, pg_bin, neon_env_builder, scale: int, duration: int):
-    neon_env_builder.pageserver_config_override = """
-profiling="page_requests"
-"""
-    env = neon_env_builder.init_start()
-    env.pageserver.is_profiling_enabled_or_skip()
-    env.neon_cli.create_branch("empty", "main")
-
-    neon_compare = NeonCompare(zenbenchmark, env, pg_bin, "pgbench")
-    run_test_pgbench(neon_compare, scale, duration, PgBenchLoadType.INIT)
-    run_test_pgbench(neon_compare, scale, duration, PgBenchLoadType.SIMPLE_UPDATE)
-    run_test_pgbench(neon_compare, scale, duration, PgBenchLoadType.SELECT_ONLY)
-
-
 # The following 3 tests run on an existing database as it was set up by previous tests,
 # and leaves the database in a state that would be used in the next tests.
 # Modifying the definition order of these functions or adding other remote tests in between will alter results.
diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml
index 4c7fbd8333..989cc9202e 100644
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -13,7 +13,6 @@ publish = false
 
 ### BEGIN HAKARI SECTION
 [dependencies]
-ahash = { version = "0.7", features = ["std"] }
 anyhow = { version = "1", features = ["backtrace", "std"] }
 bytes = { version = "1", features = ["serde", "std"] }
 chrono = { version = "0.4", default-features = false, features = ["clock", "iana-time-zone", "serde", "std", "winapi"] }
@@ -41,7 +40,6 @@ scopeguard = { version = "1", features = ["use_std"] }
 serde = { version = "1", features = ["alloc", "derive", "serde_derive", "std"] }
 serde_json = { version = "1", features = ["raw_value", "std"] }
 socket2 = { version = "0.4", default-features = false, features = ["all"] }
-stable_deref_trait = { version = "1", features = ["alloc", "std"] }
 tokio = { version = "1", features = ["bytes", "fs", "io-std", "io-util", "libc", "macros", "memchr", "mio", "net", "num_cpus", "once_cell", "process", "rt", "rt-multi-thread", "signal-hook-registry", "socket2", "sync", "time", "tokio-macros"] }
 tokio-util = { version = "0.7", features = ["codec", "io", "io-util", "tracing"] }
 tower = { version = "0.4", features = ["__common", "balance", "buffer", "discover", "futures-core", "futures-util", "indexmap", "limit", "load", "log", "make", "pin-project", "pin-project-lite", "rand", "ready-cache", "retry", "slab", "timeout", "tokio", "tokio-util", "tracing", "util"] }
@@ -50,7 +48,6 @@ tracing-core = { version = "0.1", features = ["once_cell", "std"] }
 url = { version = "2", features = ["serde"] }
 
 [build-dependencies]
-ahash = { version = "0.7", features = ["std"] }
 anyhow = { version = "1", features = ["backtrace", "std"] }
 bytes = { version = "1", features = ["serde", "std"] }
 either = { version = "1", features = ["use_std"] }

From 10dae79c6d78c8c0876dbd27bf46da8ca8b3b1ff Mon Sep 17 00:00:00 2001
From: Kirill Bulatov <kirill@neon.tech>
Date: Tue, 3 Jan 2023 22:42:04 +0200
Subject: [PATCH 16/42] Tone down safekeeper and pageserver walreceiver errors
 (#3227)

Closes https://github.com/neondatabase/neon/issues/3114

Adds more strongly typed errors for failures that occur while handling protocol messages (`FeMessage`) and in postgres and walreceiver connections.

Socket IO errors are now detected more reliably and logged at a lower level (INFO, DEBUG), without the traces that accompanied them before, when they were wrapped in an anyhow context.
---
 Cargo.lock                                    |   1 +
 libs/pq_proto/Cargo.toml                      |   1 +
 libs/pq_proto/src/lib.rs                      | 131 +++++++---
 libs/utils/src/postgres_backend.rs            | 130 +++++-----
 libs/utils/src/postgres_backend_async.rs      | 174 +++++++++----
 libs/utils/tests/ssl_test.rs                  |  11 +-
 pageserver/src/page_service.rs                | 239 ++++++++++++------
 .../src/walreceiver/walreceiver_connection.rs |  66 +++--
 proxy/src/mgmt.rs                             |  15 +-
 proxy/src/stream.rs                           |  17 +-
 safekeeper/src/bin/safekeeper.rs              |   6 +-
 safekeeper/src/handler.rs                     |  65 +++--
 safekeeper/src/json_ctrl.rs                   |  23 +-
 safekeeper/src/receive_wal.rs                 |  47 ++--
 safekeeper/src/send_wal.rs                    |  22 +-
 safekeeper/src/wal_service.rs                 |   6 +-
 test_runner/fixtures/neon_fixtures.py         |  14 +-
 test_runner/regress/test_wal_acceptor.py      |   1 -
 18 files changed, 635 insertions(+), 334 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 246d481ef9..fbf018e1c0 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2502,6 +2502,7 @@ dependencies = [
  "postgres-protocol",
  "rand",
  "serde",
+ "thiserror",
  "tokio",
  "tracing",
  "workspace_hack",
diff --git a/libs/pq_proto/Cargo.toml b/libs/pq_proto/Cargo.toml
index daa0b593be..b9c6a1eab0 100644
--- a/libs/pq_proto/Cargo.toml
+++ b/libs/pq_proto/Cargo.toml
@@ -13,5 +13,6 @@ rand = "0.8.3"
 serde = { version = "1.0", features = ["derive"] }
 tokio = { version = "1.17", features = ["macros"] }
 tracing = "0.1"
+thiserror = "1.0"
 
 workspace_hack = { version = "0.1", path = "../../workspace_hack" }
diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs
index d31a2d51f2..c5e4dbd1f0 100644
--- a/libs/pq_proto/src/lib.rs
+++ b/libs/pq_proto/src/lib.rs
@@ -5,7 +5,7 @@
 // Tools for calling certain async methods in sync contexts.
 pub mod sync;
 
-use anyhow::{bail, ensure, Context, Result};
+use anyhow::{ensure, Context, Result};
 use bytes::{Buf, BufMut, Bytes, BytesMut};
 use postgres_protocol::PG_EPOCH;
 use serde::{Deserialize, Serialize};
@@ -194,6 +194,35 @@ macro_rules! retry_read {
     };
 }
 
+/// An error occured during connection being open.
+#[derive(thiserror::Error, Debug)]
+pub enum ConnectionError {
+    /// IO error during writing to or reading from the connection socket.
+    #[error("Socket IO error: {0}")]
+    Socket(std::io::Error),
+    /// Invalid packet was received from client
+    #[error("Protocol error: {0}")]
+    Protocol(String),
+    /// Failed to parse a protocol mesage
+    #[error("Message parse error: {0}")]
+    MessageParse(anyhow::Error),
+}
+
+impl From<anyhow::Error> for ConnectionError {
+    fn from(e: anyhow::Error) -> Self {
+        Self::MessageParse(e)
+    }
+}
+
+impl ConnectionError {
+    pub fn into_io_error(self) -> io::Error {
+        match self {
+            ConnectionError::Socket(io) => io,
+            other => io::Error::new(io::ErrorKind::Other, other.to_string()),
+        }
+    }
+}
+
 impl FeMessage {
     /// Read one message from the stream.
     /// This function returns `Ok(None)` in case of EOF.
@@ -216,7 +245,9 @@ impl FeMessage {
     /// }
     /// ```
     #[inline(never)]
-    pub fn read(stream: &mut (impl io::Read + Unpin)) -> anyhow::Result<Option<FeMessage>> {
+    pub fn read(
+        stream: &mut (impl io::Read + Unpin),
+    ) -> Result<Option<FeMessage>, ConnectionError> {
         Self::read_fut(&mut AsyncishRead(stream)).wait()
     }
 
@@ -224,7 +255,7 @@ impl FeMessage {
     /// See documentation for `Self::read`.
     pub fn read_fut<Reader>(
         stream: &mut Reader,
-    ) -> SyncFuture<Reader, impl Future<Output = anyhow::Result<Option<FeMessage>>> + '_>
+    ) -> SyncFuture<Reader, impl Future<Output = Result<Option<FeMessage>, ConnectionError>> + '_>
     where
         Reader: tokio::io::AsyncRead + Unpin,
     {
@@ -238,17 +269,21 @@ impl FeMessage {
             let tag = match retry_read!(stream.read_u8().await) {
                 Ok(b) => b,
                 Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => return Ok(None),
-                Err(e) => return Err(e.into()),
+                Err(e) => return Err(ConnectionError::Socket(e)),
             };
 
             // The message length includes itself, so it better be at least 4.
-            let len = retry_read!(stream.read_u32().await)?
+            let len = retry_read!(stream.read_u32().await)
+                .map_err(ConnectionError::Socket)?
                 .checked_sub(4)
-                .context("invalid message length")?;
+                .ok_or_else(|| ConnectionError::Protocol("invalid message length".to_string()))?;
 
             let body = {
                 let mut buffer = vec![0u8; len as usize];
-                stream.read_exact(&mut buffer).await?;
+                stream
+                    .read_exact(&mut buffer)
+                    .await
+                    .map_err(ConnectionError::Socket)?;
                 Bytes::from(buffer)
             };
 
@@ -265,7 +300,11 @@ impl FeMessage {
                 b'c' => Ok(Some(FeMessage::CopyDone)),
                 b'f' => Ok(Some(FeMessage::CopyFail)),
                 b'p' => Ok(Some(FeMessage::PasswordMessage(body))),
-                tag => bail!("unknown message tag: {},'{:?}'", tag, body),
+                tag => {
+                    return Err(ConnectionError::Protocol(format!(
+                        "unknown message tag: {tag},'{body:?}'"
+                    )))
+                }
             }
         })
     }
@@ -275,7 +314,9 @@ impl FeStartupPacket {
     /// Read startup message from the stream.
     // XXX: It's tempting yet undesirable to accept `stream` by value,
     // since such a change will cause user-supplied &mut references to be consumed
-    pub fn read(stream: &mut (impl io::Read + Unpin)) -> anyhow::Result<Option<FeMessage>> {
+    pub fn read(
+        stream: &mut (impl io::Read + Unpin),
+    ) -> Result<Option<FeMessage>, ConnectionError> {
         Self::read_fut(&mut AsyncishRead(stream)).wait()
     }
 
@@ -284,7 +325,7 @@ impl FeStartupPacket {
     // since such a change will cause user-supplied &mut references to be consumed
     pub fn read_fut<Reader>(
         stream: &mut Reader,
-    ) -> SyncFuture<Reader, impl Future<Output = anyhow::Result<Option<FeMessage>>> + '_>
+    ) -> SyncFuture<Reader, impl Future<Output = Result<Option<FeMessage>, ConnectionError>> + '_>
     where
         Reader: tokio::io::AsyncRead + Unpin,
     {
@@ -302,31 +343,41 @@ impl FeStartupPacket {
             let len = match retry_read!(stream.read_u32().await) {
                 Ok(len) => len as usize,
                 Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => return Ok(None),
-                Err(e) => return Err(e.into()),
+                Err(e) => return Err(ConnectionError::Socket(e)),
             };
 
             #[allow(clippy::manual_range_contains)]
             if len < 4 || len > MAX_STARTUP_PACKET_LENGTH {
-                bail!("invalid message length");
+                return Err(ConnectionError::Protocol(format!(
+                    "invalid message length {len}"
+                )));
             }
 
-            let request_code = retry_read!(stream.read_u32().await)?;
+            let request_code =
+                retry_read!(stream.read_u32().await).map_err(ConnectionError::Socket)?;
 
             // the rest of startup packet are params
             let params_len = len - 8;
             let mut params_bytes = vec![0u8; params_len];
-            stream.read_exact(params_bytes.as_mut()).await?;
+            stream
+                .read_exact(params_bytes.as_mut())
+                .await
+                .map_err(ConnectionError::Socket)?;
 
             // Parse params depending on request code
             let req_hi = request_code >> 16;
             let req_lo = request_code & ((1 << 16) - 1);
             let message = match (req_hi, req_lo) {
                 (RESERVED_INVALID_MAJOR_VERSION, CANCEL_REQUEST_CODE) => {
-                    ensure!(params_len == 8, "expected 8 bytes for CancelRequest params");
+                    if params_len != 8 {
+                        return Err(ConnectionError::Protocol(
+                            "expected 8 bytes for CancelRequest params".to_string(),
+                        ));
+                    }
                     let mut cursor = Cursor::new(params_bytes);
                     FeStartupPacket::CancelRequest(CancelKeyData {
-                        backend_pid: cursor.read_i32().await?,
-                        cancel_key: cursor.read_i32().await?,
+                        backend_pid: cursor.read_i32().await.map_err(ConnectionError::Socket)?,
+                        cancel_key: cursor.read_i32().await.map_err(ConnectionError::Socket)?,
                     })
                 }
                 (RESERVED_INVALID_MAJOR_VERSION, NEGOTIATE_SSL_CODE) => {
@@ -338,7 +389,9 @@ impl FeStartupPacket {
                     FeStartupPacket::GssEncRequest
                 }
                 (RESERVED_INVALID_MAJOR_VERSION, unrecognized_code) => {
-                    bail!("Unrecognized request code {}", unrecognized_code)
+                    return Err(ConnectionError::Protocol(format!(
+                        "Unrecognized request code {unrecognized_code}"
+                    )));
                 }
                 // TODO bail if protocol major_version is not 3?
                 (major_version, minor_version) => {
@@ -346,15 +399,21 @@ impl FeStartupPacket {
                     // See `postgres: ProcessStartupPacket, build_startup_packet`.
                     let mut tokens = str::from_utf8(&params_bytes)
                         .context("StartupMessage params: invalid utf-8")?
-                        .strip_suffix('\0') // drop packet's own null terminator
-                        .context("StartupMessage params: missing null terminator")?
+                        .strip_suffix('\0') // drop packet's own null
+                        .ok_or_else(|| {
+                            ConnectionError::Protocol(
+                                "StartupMessage params: missing null terminator".to_string(),
+                            )
+                        })?
                         .split_terminator('\0');
 
                     let mut params = HashMap::new();
                     while let Some(name) = tokens.next() {
-                        let value = tokens
-                            .next()
-                            .context("StartupMessage params: key without value")?;
+                        let value = tokens.next().ok_or_else(|| {
+                            ConnectionError::Protocol(
+                                "StartupMessage params: key without value".to_string(),
+                            )
+                        })?;
 
                         params.insert(name.to_owned(), value.to_owned());
                     }
@@ -458,7 +517,7 @@ pub enum BeMessage<'a> {
     CloseComplete,
     // None means column is NULL
     DataRow(&'a [Option<&'a [u8]>]),
-    ErrorResponse(&'a str),
+    ErrorResponse(&'a str, Option<&'a [u8; 5]>),
     /// Single byte - used in response to SSLRequest/GSSENCRequest.
     EncryptionResponse(bool),
     NoData,
@@ -606,7 +665,7 @@ fn write_body<R>(buf: &mut BytesMut, f: impl FnOnce(&mut BytesMut) -> R) -> R {
 }
 
 /// Safe write of s into buf as cstring (String in the protocol).
-fn write_cstr(s: impl AsRef<[u8]>, buf: &mut BytesMut) -> Result<(), io::Error> {
+fn write_cstr(s: impl AsRef<[u8]>, buf: &mut BytesMut) -> io::Result<()> {
     let bytes = s.as_ref();
     if bytes.contains(&0) {
         return Err(io::Error::new(
@@ -626,7 +685,7 @@ fn read_cstr(buf: &mut Bytes) -> anyhow::Result<Bytes> {
     Ok(result)
 }
 
-const SQLSTATE_INTERNAL_ERROR: &str = "XX000\0";
+pub const SQLSTATE_INTERNAL_ERROR: &[u8; 5] = b"XX000";
 
 impl<'a> BeMessage<'a> {
     /// Write message to the given buf.
@@ -767,10 +826,7 @@ impl<'a> BeMessage<'a> {
             // First byte of each field represents type of this field. Set just enough fields
             // to satisfy rust-postgres client: 'S' -- severity, 'C' -- error, 'M' -- error
             // message text.
-            BeMessage::ErrorResponse(error_msg) => {
-                // For all the errors set Severity to Error and error code to
-                // 'internal error'.
-
+            BeMessage::ErrorResponse(error_msg, pg_error_code) => {
                 // 'E' signalizes ErrorResponse messages
                 buf.put_u8(b'E');
                 write_body(buf, |buf| {
@@ -778,7 +834,9 @@ impl<'a> BeMessage<'a> {
                     buf.put_slice(b"ERROR\0");
 
                     buf.put_u8(b'C'); // SQLSTATE error code
-                    buf.put_slice(SQLSTATE_INTERNAL_ERROR.as_bytes());
+                    buf.put_slice(&terminate_code(
+                        pg_error_code.unwrap_or(SQLSTATE_INTERNAL_ERROR),
+                    ));
 
                     buf.put_u8(b'M'); // the message
                     write_cstr(error_msg, buf)?;
@@ -801,7 +859,7 @@ impl<'a> BeMessage<'a> {
                     buf.put_slice(b"NOTICE\0");
 
                     buf.put_u8(b'C'); // SQLSTATE error code
-                    buf.put_slice(SQLSTATE_INTERNAL_ERROR.as_bytes());
+                    buf.put_slice(&terminate_code(SQLSTATE_INTERNAL_ERROR));
 
                     buf.put_u8(b'M'); // the message
                     write_cstr(error_msg.as_bytes(), buf)?;
@@ -1089,3 +1147,12 @@ mod tests {
         let _ = FeStartupPacket::read_fut(stream).await;
     }
 }
+
+fn terminate_code(code: &[u8; 5]) -> [u8; 6] {
+    let mut terminated = [0; 6];
+    for (i, &elem) in code.iter().enumerate() {
+        terminated[i] = elem;
+    }
+
+    terminated
+}
diff --git a/libs/utils/src/postgres_backend.rs b/libs/utils/src/postgres_backend.rs
index bac6f861c3..f3e3835bda 100644
--- a/libs/utils/src/postgres_backend.rs
+++ b/libs/utils/src/postgres_backend.rs
@@ -3,8 +3,9 @@
 //! implementation determining how to process the queries. Currently its API
 //! is rather narrow, but we can extend it once required.
 
+use crate::postgres_backend_async::{log_query_error, short_error, QueryError};
 use crate::sock_split::{BidiStream, ReadStream, WriteStream};
-use anyhow::{bail, ensure, Context, Result};
+use anyhow::Context;
 use bytes::{Bytes, BytesMut};
 use pq_proto::{BeMessage, FeMessage, FeStartupPacket};
 use serde::{Deserialize, Serialize};
@@ -21,20 +22,32 @@ pub trait Handler {
     /// postgres_backend will issue ReadyForQuery after calling this (this
     /// might be not what we want after CopyData streaming, but currently we don't
     /// care).
-    fn process_query(&mut self, pgb: &mut PostgresBackend, query_string: &str) -> Result<()>;
+    fn process_query(
+        &mut self,
+        pgb: &mut PostgresBackend,
+        query_string: &str,
+    ) -> Result<(), QueryError>;
 
     /// Called on startup packet receival, allows to process params.
     ///
     /// If Ok(false) is returned postgres_backend will skip auth -- that is needed for new users
     /// creation is the proxy code. That is quite hacky and ad-hoc solution, may be we could allow
     /// to override whole init logic in implementations.
-    fn startup(&mut self, _pgb: &mut PostgresBackend, _sm: &FeStartupPacket) -> Result<()> {
+    fn startup(
+        &mut self,
+        _pgb: &mut PostgresBackend,
+        _sm: &FeStartupPacket,
+    ) -> Result<(), QueryError> {
         Ok(())
     }
 
     /// Check auth jwt
-    fn check_auth_jwt(&mut self, _pgb: &mut PostgresBackend, _jwt_response: &[u8]) -> Result<()> {
-        bail!("JWT auth failed")
+    fn check_auth_jwt(
+        &mut self,
+        _pgb: &mut PostgresBackend,
+        _jwt_response: &[u8],
+    ) -> Result<(), QueryError> {
+        Err(QueryError::Other(anyhow::anyhow!("JWT auth failed")))
     }
 
     fn is_shutdown_requested(&self) -> bool {
@@ -66,7 +79,7 @@ impl FromStr for AuthType {
         match s {
             "Trust" => Ok(Self::Trust),
             "NeonJWT" => Ok(Self::NeonJWT),
-            _ => bail!("invalid value \"{s}\" for auth type"),
+            _ => anyhow::bail!("invalid value \"{s}\" for auth type"),
         }
     }
 }
@@ -154,7 +167,7 @@ pub fn is_socket_read_timed_out(error: &anyhow::Error) -> bool {
 }
 
 // Cast a byte slice to a string slice, dropping null terminator if there's one.
-fn cstr_to_str(bytes: &[u8]) -> Result<&str> {
+fn cstr_to_str(bytes: &[u8]) -> anyhow::Result<&str> {
     let without_null = bytes.strip_suffix(&[0]).unwrap_or(bytes);
     std::str::from_utf8(without_null).map_err(|e| e.into())
 }
@@ -188,10 +201,10 @@ impl PostgresBackend {
     }
 
     /// Get direct reference (into the Option) to the read stream.
-    fn get_stream_in(&mut self) -> Result<&mut BidiStream> {
+    fn get_stream_in(&mut self) -> anyhow::Result<&mut BidiStream> {
         match &mut self.stream {
             Some(Stream::Bidirectional(stream)) => Ok(stream),
-            _ => bail!("reader taken"),
+            _ => anyhow::bail!("reader taken"),
         }
     }
 
@@ -215,7 +228,7 @@ impl PostgresBackend {
     }
 
     /// Read full message or return None if connection is closed.
-    pub fn read_message(&mut self) -> Result<Option<FeMessage>> {
+    pub fn read_message(&mut self) -> Result<Option<FeMessage>, QueryError> {
         let (state, stream) = (self.state, self.get_stream_in()?);
 
         use ProtoState::*;
@@ -223,6 +236,7 @@ impl PostgresBackend {
             Initialization | Encrypted => FeStartupPacket::read(stream),
             Authentication | Established => FeMessage::read(stream),
         }
+        .map_err(QueryError::from)
     }
 
     /// Write message into internal output buffer.
@@ -246,7 +260,7 @@ impl PostgresBackend {
     }
 
     // Wrapper for run_message_loop() that shuts down socket when we are done
-    pub fn run(mut self, handler: &mut impl Handler) -> Result<()> {
+    pub fn run(mut self, handler: &mut impl Handler) -> Result<(), QueryError> {
         let ret = self.run_message_loop(handler);
         if let Some(stream) = self.stream.as_mut() {
             let _ = stream.shutdown(Shutdown::Both);
@@ -254,7 +268,7 @@ impl PostgresBackend {
         ret
     }
 
-    fn run_message_loop(&mut self, handler: &mut impl Handler) -> Result<()> {
+    fn run_message_loop(&mut self, handler: &mut impl Handler) -> Result<(), QueryError> {
         trace!("postgres backend to {:?} started", self.peer_addr);
 
         let mut unnamed_query_string = Bytes::new();
@@ -263,7 +277,7 @@ impl PostgresBackend {
             match self.read_message() {
                 Ok(message) => {
                     if let Some(msg) = message {
-                        trace!("got message {:?}", msg);
+                        trace!("got message {msg:?}");
 
                         match self.process_message(handler, msg, &mut unnamed_query_string)? {
                             ProcessMsgResult::Continue => continue,
@@ -274,10 +288,12 @@ impl PostgresBackend {
                     }
                 }
                 Err(e) => {
-                    // If it is a timeout error, continue the loop
-                    if !is_socket_read_timed_out(&e) {
-                        return Err(e);
+                    if let QueryError::Other(e) = &e {
+                        if is_socket_read_timed_out(e) {
+                            continue;
+                        }
                     }
+                    return Err(e);
                 }
             }
         }
@@ -295,7 +311,7 @@ impl PostgresBackend {
             }
             stream => {
                 self.stream = stream;
-                bail!("can't start TLs without bidi stream");
+                anyhow::bail!("can't start TLs without bidi stream");
             }
         }
     }
@@ -305,17 +321,16 @@ impl PostgresBackend {
         handler: &mut impl Handler,
         msg: FeMessage,
         unnamed_query_string: &mut Bytes,
-    ) -> Result<ProcessMsgResult> {
+    ) -> Result<ProcessMsgResult, QueryError> {
         // Allow only startup and password messages during auth. Otherwise client would be able to bypass auth
         // TODO: change that to proper top-level match of protocol state with separate message handling for each state
-        if self.state < ProtoState::Established {
-            ensure!(
-                matches!(
-                    msg,
-                    FeMessage::PasswordMessage(_) | FeMessage::StartupPacket(_)
-                ),
-                "protocol violation"
-            );
+        if self.state < ProtoState::Established
+            && !matches!(
+                msg,
+                FeMessage::PasswordMessage(_) | FeMessage::StartupPacket(_)
+            )
+        {
+            return Err(QueryError::Other(anyhow::anyhow!("protocol violation")));
         }
 
         let have_tls = self.tls_config.is_some();
@@ -339,8 +354,13 @@ impl PostgresBackend {
                     }
                     FeStartupPacket::StartupMessage { .. } => {
                         if have_tls && !matches!(self.state, ProtoState::Encrypted) {
-                            self.write_message(&BeMessage::ErrorResponse("must connect with TLS"))?;
-                            bail!("client did not connect with TLS");
+                            self.write_message(&BeMessage::ErrorResponse(
+                                "must connect with TLS",
+                                None,
+                            ))?;
+                            return Err(QueryError::Other(anyhow::anyhow!(
+                                "client did not connect with TLS"
+                            )));
                         }
 
                         // NB: startup() may change self.auth_type -- we are using that in proxy code
@@ -379,8 +399,11 @@ impl PostgresBackend {
                         let (_, jwt_response) = m.split_last().context("protocol violation")?;
 
                         if let Err(e) = handler.check_auth_jwt(self, jwt_response) {
-                            self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?;
-                            bail!("auth failed: {}", e);
+                            self.write_message(&BeMessage::ErrorResponse(
+                                &e.to_string(),
+                                Some(e.pg_error_code()),
+                            ))?;
+                            return Err(e);
                         }
                     }
                 }
@@ -394,33 +417,14 @@ impl PostgresBackend {
                 // remove null terminator
                 let query_string = cstr_to_str(&body)?;
 
-                trace!("got query {:?}", query_string);
-                // xxx distinguish fatal and recoverable errors?
+                trace!("got query {query_string:?}");
                 if let Err(e) = handler.process_query(self, query_string) {
-                    // ":?" uses the alternate formatting style, which makes anyhow display the
-                    // full cause of the error, not just the top-level context + its trace.
-                    // We don't want to send that in the ErrorResponse though,
-                    // because it's not relevant to the compute node logs.
-                    //
-                    // We also don't want to log full stacktrace when the error is primitive,
-                    // such as usual connection closed.
-                    let short_error = format!("{:#}", e);
-                    let root_cause = e.root_cause().to_string();
-                    if root_cause.contains("connection closed unexpectedly")
-                        || root_cause.contains("Broken pipe (os error 32)")
-                    {
-                        error!(
-                            "query handler for '{}' failed: {}",
-                            query_string, short_error
-                        );
-                    } else {
-                        error!("query handler for '{}' failed: {:?}", query_string, e);
-                    }
-                    self.write_message_noflush(&BeMessage::ErrorResponse(&short_error))?;
-                    // TODO: untangle convoluted control flow
-                    if e.to_string().contains("failed to run") {
-                        return Ok(ProcessMsgResult::Break);
-                    }
+                    log_query_error(query_string, &e);
+                    let short_error = short_error(&e);
+                    self.write_message_noflush(&BeMessage::ErrorResponse(
+                        &short_error,
+                        Some(e.pg_error_code()),
+                    ))?;
                 }
                 self.write_message(&BeMessage::ReadyForQuery)?;
             }
@@ -445,11 +449,13 @@ impl PostgresBackend {
 
             FeMessage::Execute(_) => {
                 let query_string = cstr_to_str(unnamed_query_string)?;
-                trace!("got execute {:?}", query_string);
-                // xxx distinguish fatal and recoverable errors?
+                trace!("got execute {query_string:?}");
                 if let Err(e) = handler.process_query(self, query_string) {
-                    error!("query handler for '{}' failed: {:?}", query_string, e);
-                    self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?;
+                    log_query_error(query_string, &e);
+                    self.write_message(&BeMessage::ErrorResponse(
+                        &e.to_string(),
+                        Some(e.pg_error_code()),
+                    ))?;
                 }
                 // NOTE there is no ReadyForQuery message. This handler is used
                 // for basebackup and it uses CopyOut which doesn't require
@@ -468,7 +474,9 @@ impl PostgresBackend {
             // We prefer explicit pattern matching to wildcards, because
             // this helps us spot the places where new variants are missing
             FeMessage::CopyData(_) | FeMessage::CopyDone | FeMessage::CopyFail => {
-                bail!("unexpected message type: {:?}", msg);
+                return Err(QueryError::Other(anyhow::anyhow!(
+                    "unexpected message type: {msg:?}"
+                )));
             }
         }
 
diff --git a/libs/utils/src/postgres_backend_async.rs b/libs/utils/src/postgres_backend_async.rs
index de547c3242..a4f523da04 100644
--- a/libs/utils/src/postgres_backend_async.rs
+++ b/libs/utils/src/postgres_backend_async.rs
@@ -4,39 +4,84 @@
 //! is rather narrow, but we can extend it once required.
 
 use crate::postgres_backend::AuthType;
-use anyhow::{bail, Context, Result};
+use anyhow::Context;
 use bytes::{Buf, Bytes, BytesMut};
-use pq_proto::{BeMessage, FeMessage, FeStartupPacket};
+use pq_proto::{BeMessage, ConnectionError, FeMessage, FeStartupPacket, SQLSTATE_INTERNAL_ERROR};
 use std::future::Future;
+use std::io;
 use std::net::SocketAddr;
 use std::pin::Pin;
 use std::sync::Arc;
 use std::task::Poll;
-use tracing::{debug, error, trace};
+use tracing::{debug, error, info, trace};
 
 use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt, BufReader};
 use tokio_rustls::TlsAcceptor;
 
+pub fn is_expected_io_error(e: &io::Error) -> bool {
+    use io::ErrorKind::*; // peer-initiated disconnects are routine and are logged at INFO, not ERROR
+    matches!(e.kind(), BrokenPipe | ConnectionRefused | ConnectionAborted | ConnectionReset)
+}
+
+/// An error, occurred during query processing:
+/// either during the connection ([`ConnectionError`]) or before/after it.
+#[derive(thiserror::Error, Debug)]
+pub enum QueryError {
+    /// The connection was lost while processing the query.
+    #[error(transparent)]
+    Disconnected(#[from] ConnectionError),
+    /// Some other error
+    #[error(transparent)]
+    Other(#[from] anyhow::Error),
+}
+
+impl From<io::Error> for QueryError {
+    fn from(e: io::Error) -> Self {
+        Self::Disconnected(ConnectionError::Socket(e))
+    }
+}
+
+impl QueryError {
+    pub fn pg_error_code(&self) -> &'static [u8; 5] {
+        match self {
+            Self::Disconnected(_) => b"08006",         // connection failure
+            Self::Other(_) => SQLSTATE_INTERNAL_ERROR, // internal error
+        }
+    }
+}
+
 #[async_trait::async_trait]
 pub trait Handler {
     /// Handle single query.
     /// postgres_backend will issue ReadyForQuery after calling this (this
     /// might be not what we want after CopyData streaming, but currently we don't
     /// care).
-    async fn process_query(&mut self, pgb: &mut PostgresBackend, query_string: &str) -> Result<()>;
+    async fn process_query(
+        &mut self,
+        pgb: &mut PostgresBackend,
+        query_string: &str,
+    ) -> Result<(), QueryError>;
 
     /// Called on startup packet receival, allows to process params.
     ///
     /// If Ok(false) is returned postgres_backend will skip auth -- that is needed for new users
     /// creation is the proxy code. That is quite hacky and ad-hoc solution, may be we could allow
     /// to override whole init logic in implementations.
-    fn startup(&mut self, _pgb: &mut PostgresBackend, _sm: &FeStartupPacket) -> Result<()> {
+    fn startup(
+        &mut self,
+        _pgb: &mut PostgresBackend,
+        _sm: &FeStartupPacket,
+    ) -> Result<(), QueryError> {
         Ok(())
     }
 
     /// Check auth jwt
-    fn check_auth_jwt(&mut self, _pgb: &mut PostgresBackend, _jwt_response: &[u8]) -> Result<()> {
-        bail!("JWT auth failed")
+    fn check_auth_jwt(
+        &mut self,
+        _pgb: &mut PostgresBackend,
+        _jwt_response: &[u8],
+    ) -> Result<(), QueryError> {
+        Err(QueryError::Other(anyhow::anyhow!("JWT auth failed")))
     }
 }
 
@@ -70,17 +115,14 @@ impl AsyncWrite for Stream {
         self: Pin<&mut Self>,
         cx: &mut std::task::Context<'_>,
         buf: &[u8],
-    ) -> Poll<Result<usize, std::io::Error>> {
+    ) -> Poll<io::Result<usize>> {
         match self.get_mut() {
             Self::Unencrypted(stream) => Pin::new(stream).poll_write(cx, buf),
             Self::Tls(stream) => Pin::new(stream).poll_write(cx, buf),
             Self::Broken => unreachable!(),
         }
     }
-    fn poll_flush(
-        self: Pin<&mut Self>,
-        cx: &mut std::task::Context<'_>,
-    ) -> Poll<Result<(), std::io::Error>> {
+    fn poll_flush(self: Pin<&mut Self>, cx: &mut std::task::Context<'_>) -> Poll<io::Result<()>> {
         match self.get_mut() {
             Self::Unencrypted(stream) => Pin::new(stream).poll_flush(cx),
             Self::Tls(stream) => Pin::new(stream).poll_flush(cx),
@@ -90,7 +132,7 @@ impl AsyncWrite for Stream {
     fn poll_shutdown(
         self: Pin<&mut Self>,
         cx: &mut std::task::Context<'_>,
-    ) -> Poll<Result<(), std::io::Error>> {
+    ) -> Poll<io::Result<()>> {
         match self.get_mut() {
             Self::Unencrypted(stream) => Pin::new(stream).poll_shutdown(cx),
             Self::Tls(stream) => Pin::new(stream).poll_shutdown(cx),
@@ -103,7 +145,7 @@ impl AsyncRead for Stream {
         self: Pin<&mut Self>,
         cx: &mut std::task::Context<'_>,
         buf: &mut tokio::io::ReadBuf<'_>,
-    ) -> Poll<Result<(), std::io::Error>> {
+    ) -> Poll<io::Result<()>> {
         match self.get_mut() {
             Self::Unencrypted(stream) => Pin::new(stream).poll_read(cx, buf),
             Self::Tls(stream) => Pin::new(stream).poll_read(cx, buf),
@@ -139,7 +181,7 @@ pub fn query_from_cstring(query_string: Bytes) -> Vec<u8> {
 }
 
 // Cast a byte slice to a string slice, dropping null terminator if there's one.
-fn cstr_to_str(bytes: &[u8]) -> Result<&str> {
+fn cstr_to_str(bytes: &[u8]) -> anyhow::Result<&str> {
     let without_null = bytes.strip_suffix(&[0]).unwrap_or(bytes);
     std::str::from_utf8(without_null).map_err(|e| e.into())
 }
@@ -149,7 +191,7 @@ impl PostgresBackend {
         socket: tokio::net::TcpStream,
         auth_type: AuthType,
         tls_config: Option<Arc<rustls::ServerConfig>>,
-    ) -> std::io::Result<Self> {
+    ) -> io::Result<Self> {
         let peer_addr = socket.peer_addr()?;
 
         Ok(Self {
@@ -167,17 +209,18 @@ impl PostgresBackend {
     }
 
     /// Read full message or return None if connection is closed.
-    pub async fn read_message(&mut self) -> Result<Option<FeMessage>> {
+    pub async fn read_message(&mut self) -> Result<Option<FeMessage>, QueryError> {
         use ProtoState::*;
         match self.state {
             Initialization | Encrypted => FeStartupPacket::read_fut(&mut self.stream).await,
             Authentication | Established => FeMessage::read_fut(&mut self.stream).await,
             Closed => Ok(None),
         }
+        .map_err(QueryError::from)
     }
 
     /// Flush output buffer into the socket.
-    pub async fn flush(&mut self) -> std::io::Result<()> {
+    pub async fn flush(&mut self) -> io::Result<()> {
         while self.buf_out.has_remaining() {
             let bytes_written = self.stream.write(self.buf_out.chunk()).await?;
             self.buf_out.advance(bytes_written);
@@ -187,7 +230,7 @@ impl PostgresBackend {
     }
 
     /// Write message into internal output buffer.
-    pub fn write_message(&mut self, message: &BeMessage<'_>) -> Result<&mut Self, std::io::Error> {
+    pub fn write_message(&mut self, message: &BeMessage<'_>) -> io::Result<&mut Self> {
         BeMessage::write(&mut self.buf_out, message)?;
         Ok(self)
     }
@@ -223,7 +266,11 @@ impl PostgresBackend {
     }
 
     // Wrapper for run_message_loop() that shuts down socket when we are done
-    pub async fn run<F, S>(mut self, handler: &mut impl Handler, shutdown_watcher: F) -> Result<()>
+    pub async fn run<F, S>(
+        mut self,
+        handler: &mut impl Handler,
+        shutdown_watcher: F,
+    ) -> Result<(), QueryError>
     where
         F: Fn() -> S,
         S: Future,
@@ -237,7 +284,7 @@ impl PostgresBackend {
         &mut self,
         handler: &mut impl Handler,
         shutdown_watcher: F,
-    ) -> Result<()>
+    ) -> Result<(), QueryError>
     where
         F: Fn() -> S,
         S: Future,
@@ -273,7 +320,7 @@ impl PostgresBackend {
                         return Ok(());
                     }
                 }
-                Ok::<(), anyhow::Error>(())
+                Ok::<(), QueryError>(())
             } => {
                 // Handshake complete.
                 result?;
@@ -318,14 +365,14 @@ impl PostgresBackend {
             self.stream = Stream::Tls(Box::new(tls_stream));
             return Ok(());
         };
-        bail!("TLS already started");
+        anyhow::bail!("TLS already started");
     }
 
     async fn process_handshake_message(
         &mut self,
         handler: &mut impl Handler,
         msg: FeMessage,
-    ) -> Result<ProcessMsgResult> {
+    ) -> Result<ProcessMsgResult, QueryError> {
         assert!(self.state < ProtoState::Established);
         let have_tls = self.tls_config.is_some();
         match msg {
@@ -348,8 +395,13 @@ impl PostgresBackend {
                     }
                     FeStartupPacket::StartupMessage { .. } => {
                         if have_tls && !matches!(self.state, ProtoState::Encrypted) {
-                            self.write_message(&BeMessage::ErrorResponse("must connect with TLS"))?;
-                            bail!("client did not connect with TLS");
+                            self.write_message(&BeMessage::ErrorResponse(
+                                "must connect with TLS",
+                                None,
+                            ))?;
+                            return Err(QueryError::Other(anyhow::anyhow!(
+                                "client did not connect with TLS"
+                            )));
                         }
 
                         // NB: startup() may change self.auth_type -- we are using that in proxy code
@@ -389,8 +441,11 @@ impl PostgresBackend {
                         let (_, jwt_response) = m.split_last().context("protocol violation")?;
 
                         if let Err(e) = handler.check_auth_jwt(self, jwt_response) {
-                            self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?;
-                            bail!("auth failed: {}", e);
+                            self.write_message(&BeMessage::ErrorResponse(
+                                &e.to_string(),
+                                Some(e.pg_error_code()),
+                            ))?;
+                            return Err(e);
                         }
                     }
                 }
@@ -413,33 +468,28 @@ impl PostgresBackend {
         handler: &mut impl Handler,
         msg: FeMessage,
         unnamed_query_string: &mut Bytes,
-    ) -> Result<ProcessMsgResult> {
+    ) -> Result<ProcessMsgResult, QueryError> {
         // Allow only startup and password messages during auth. Otherwise client would be able to bypass auth
         // TODO: change that to proper top-level match of protocol state with separate message handling for each state
         assert!(self.state == ProtoState::Established);
 
         match msg {
             FeMessage::StartupPacket(_) | FeMessage::PasswordMessage(_) => {
-                bail!("protocol violation");
+                return Err(QueryError::Other(anyhow::anyhow!("protocol violation")));
             }
 
             FeMessage::Query(body) => {
                 // remove null terminator
                 let query_string = cstr_to_str(&body)?;
 
-                trace!("got query {:?}", query_string);
-                // xxx distinguish fatal and recoverable errors?
+                trace!("got query {query_string:?}");
                 if let Err(e) = handler.process_query(self, query_string).await {
-                    // ":?" uses the alternate formatting style, which makes anyhow display the
-                    // full cause of the error, not just the top-level context + its trace.
-                    // We don't want to send that in the ErrorResponse though,
-                    // because it's not relevant to the compute node logs.
-                    error!("query handler for '{}' failed: {:?}", query_string, e);
-                    self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?;
-                    // TODO: untangle convoluted control flow
-                    if e.to_string().contains("failed to run") {
-                        return Ok(ProcessMsgResult::Break);
-                    }
+                    log_query_error(query_string, &e);
+                    let short_error = short_error(&e);
+                    self.write_message(&BeMessage::ErrorResponse(
+                        &short_error,
+                        Some(e.pg_error_code()),
+                    ))?;
                 }
                 self.write_message(&BeMessage::ReadyForQuery)?;
             }
@@ -464,11 +514,13 @@ impl PostgresBackend {
 
             FeMessage::Execute(_) => {
                 let query_string = cstr_to_str(unnamed_query_string)?;
-                trace!("got execute {:?}", query_string);
-                // xxx distinguish fatal and recoverable errors?
+                trace!("got execute {query_string:?}");
                 if let Err(e) = handler.process_query(self, query_string).await {
-                    error!("query handler for '{}' failed: {:?}", query_string, e);
-                    self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?;
+                    log_query_error(query_string, &e);
+                    self.write_message(&BeMessage::ErrorResponse(
+                        &e.to_string(),
+                        Some(e.pg_error_code()),
+                    ))?;
                 }
                 // NOTE there is no ReadyForQuery message. This handler is used
                 // for basebackup and it uses CopyOut which doesn't require
@@ -487,7 +539,10 @@ impl PostgresBackend {
             // We prefer explicit pattern matching to wildcards, because
             // this helps us spot the places where new variants are missing
             FeMessage::CopyData(_) | FeMessage::CopyDone | FeMessage::CopyFail => {
-                bail!("unexpected message type: {:?}", msg);
+                return Err(QueryError::Other(anyhow::anyhow!(
+                    "unexpected message type: {:?}",
+                    msg
+                )));
             }
         }
 
@@ -555,3 +610,28 @@ impl<'a> AsyncWrite for CopyDataWriter<'a> {
         this.pgb.poll_flush(cx)
     }
 }
+
+pub fn short_error(e: &QueryError) -> String {
+    match e {
+        QueryError::Disconnected(connection_error) => connection_error.to_string(),
+        QueryError::Other(e) => format!("{e:#}"),
+    }
+}
+
+pub(super) fn log_query_error(query: &str, e: &QueryError) {
+    match e {
+        QueryError::Disconnected(ConnectionError::Socket(io_error)) => {
+            if is_expected_io_error(io_error) {
+                info!("query handler for '{query}' failed with expected io error: {io_error}");
+            } else {
+                error!("query handler for '{query}' failed with io error: {io_error}");
+            }
+        }
+        QueryError::Disconnected(other_connection_error) => {
+            error!("query handler for '{query}' failed with connection error: {other_connection_error:?}")
+        }
+        QueryError::Other(e) => {
+            error!("query handler for '{query}' failed: {e:?}");
+        }
+    }
+}
diff --git a/libs/utils/tests/ssl_test.rs b/libs/utils/tests/ssl_test.rs
index 248400c2c1..fae707f049 100644
--- a/libs/utils/tests/ssl_test.rs
+++ b/libs/utils/tests/ssl_test.rs
@@ -9,7 +9,10 @@ use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt};
 use bytes::{Buf, BufMut, Bytes, BytesMut};
 use once_cell::sync::Lazy;
 
-use utils::postgres_backend::{AuthType, Handler, PostgresBackend};
+use utils::{
+    postgres_backend::{AuthType, Handler, PostgresBackend},
+    postgres_backend_async::QueryError,
+};
 
 fn make_tcp_pair() -> (TcpStream, TcpStream) {
     let listener = TcpListener::bind("127.0.0.1:0").unwrap();
@@ -105,7 +108,7 @@ fn ssl() {
             &mut self,
             _pgb: &mut PostgresBackend,
             query_string: &str,
-        ) -> anyhow::Result<()> {
+        ) -> Result<(), QueryError> {
             self.got_query = query_string == QUERY;
             Ok(())
         }
@@ -152,7 +155,7 @@ fn no_ssl() {
             &mut self,
             _pgb: &mut PostgresBackend,
             _query_string: &str,
-        ) -> anyhow::Result<()> {
+        ) -> Result<(), QueryError> {
             panic!()
         }
     }
@@ -212,7 +215,7 @@ fn server_forces_ssl() {
             &mut self,
             _pgb: &mut PostgresBackend,
             _query_string: &str,
-        ) -> anyhow::Result<()> {
+        ) -> Result<(), QueryError> {
             panic!()
         }
     }
diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index f123168211..4087a8f90c 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -9,7 +9,7 @@
 //  custom protocol.
 //
 
-use anyhow::{bail, ensure, Context, Result};
+use anyhow::Context;
 use bytes::Buf;
 use bytes::Bytes;
 use futures::{Stream, StreamExt};
@@ -19,6 +19,8 @@ use pageserver_api::models::{
     PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetPageResponse,
     PagestreamNblocksRequest, PagestreamNblocksResponse,
 };
+use pq_proto::ConnectionError;
+use pq_proto::FeStartupPacket;
 use pq_proto::{BeMessage, FeMessage, RowDescriptor};
 use std::io;
 use std::net::TcpListener;
@@ -28,6 +30,7 @@ use std::sync::Arc;
 use std::time::Duration;
 use tracing::*;
 use utils::id::ConnectionId;
+use utils::postgres_backend_async::QueryError;
 use utils::{
     auth::{Claims, JwtAuth, Scope},
     id::{TenantId, TimelineId},
@@ -60,8 +63,8 @@ fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream<Item = io::Result<Byt
                 _ = task_mgr::shutdown_watcher() => {
                     // We were requested to shut down.
                     let msg = format!("pageserver is shutting down");
-                    let _ = pgb.write_message(&BeMessage::ErrorResponse(&msg));
-                    Err(anyhow::anyhow!(msg))
+                    let _ = pgb.write_message(&BeMessage::ErrorResponse(&msg, None));
+                    Err(QueryError::Other(anyhow::anyhow!(msg)))
                 }
 
                 msg = pgb.read_message() => { msg }
@@ -74,14 +77,15 @@ fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream<Item = io::Result<Byt
                         FeMessage::CopyDone => { break },
                         FeMessage::Sync => continue,
                         FeMessage::Terminate => {
-                            let msg = format!("client terminated connection with Terminate message during COPY");
-                            pgb.write_message(&BeMessage::ErrorResponse(&msg))?;
+                            let msg = "client terminated connection with Terminate message during COPY";
+                            let query_error_error = QueryError::Disconnected(ConnectionError::Socket(io::Error::new(io::ErrorKind::ConnectionReset, msg)));
+                            pgb.write_message(&BeMessage::ErrorResponse(msg, Some(query_error_error.pg_error_code())))?;
                             Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
                             break;
                         }
                         m => {
-                            let msg = format!("unexpected message {:?}", m);
-                            pgb.write_message(&BeMessage::ErrorResponse(&msg))?;
+                            let msg = format!("unexpected message {m:?}");
+                            pgb.write_message(&BeMessage::ErrorResponse(&msg, None))?;
                             Err(io::Error::new(io::ErrorKind::Other, msg))?;
                             break;
                         }
@@ -91,12 +95,16 @@ fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream<Item = io::Result<Byt
                 }
                 Ok(None) => {
                     let msg = "client closed connection during COPY";
-                    pgb.write_message(&BeMessage::ErrorResponse(msg))?;
+                    let query_error_error = QueryError::Disconnected(ConnectionError::Socket(io::Error::new(io::ErrorKind::ConnectionReset, msg)));
+                    pgb.write_message(&BeMessage::ErrorResponse(msg, Some(query_error_error.pg_error_code())))?;
                     pgb.flush().await?;
                     Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
                 }
-                Err(e) => {
-                    Err(io::Error::new(io::ErrorKind::Other, e))?;
+                Err(QueryError::Disconnected(ConnectionError::Socket(io_error))) => {
+                    Err(io_error)?;
+                }
+                Err(other) => {
+                    Err(io::Error::new(io::ErrorKind::Other, other))?;
                 }
             };
         }
@@ -194,23 +202,19 @@ async fn page_service_conn_main(
             // we've been requested to shut down
             Ok(())
         }
-        Err(err) => {
-            let root_cause_io_err_kind = err
-                .root_cause()
-                .downcast_ref::<io::Error>()
-                .map(|e| e.kind());
-
+        Err(QueryError::Disconnected(ConnectionError::Socket(io_error))) => {
             // `ConnectionReset` error happens when the Postgres client closes the connection.
             // As this disconnection happens quite often and is expected,
             // we decided to downgrade the logging level to `INFO`.
             // See: https://github.com/neondatabase/neon/issues/1683.
-            if root_cause_io_err_kind == Some(io::ErrorKind::ConnectionReset) {
+            if io_error.kind() == io::ErrorKind::ConnectionReset {
                 info!("Postgres client disconnected");
                 Ok(())
             } else {
-                Err(err)
+                Err(io_error).context("Postgres connection error")
             }
         }
+        other => other.context("Postgres query error"),
     }
 }
 
@@ -312,7 +316,7 @@ impl PageServerHandler {
                 Some(FeMessage::CopyData(bytes)) => bytes,
                 Some(FeMessage::Terminate) => break,
                 Some(m) => {
-                    bail!("unexpected message: {m:?} during COPY");
+                    anyhow::bail!("unexpected message: {m:?} during COPY");
                 }
                 None => break, // client disconnected
             };
@@ -369,7 +373,7 @@ impl PageServerHandler {
         base_lsn: Lsn,
         _end_lsn: Lsn,
         pg_version: u32,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), QueryError> {
         task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
         // Create empty timeline
         info!("creating new timeline");
@@ -423,11 +427,16 @@ impl PageServerHandler {
         timeline_id: TimelineId,
         start_lsn: Lsn,
         end_lsn: Lsn,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), QueryError> {
         task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
 
         let timeline = get_active_timeline_with_timeout(tenant_id, timeline_id).await?;
-        ensure!(timeline.get_last_record_lsn() == start_lsn);
+        let last_record_lsn = timeline.get_last_record_lsn();
+        if last_record_lsn != start_lsn {
+            return Err(QueryError::Other(
+                anyhow::anyhow!("Cannot import WAL from Lsn {start_lsn} because timeline does not start from the same lsn: {last_record_lsn}"))
+            );
+        }
 
         // TODO leave clean state on error. For now you can use detach to clean
         // up broken state from a failed import.
@@ -451,7 +460,11 @@ impl PageServerHandler {
         }
 
         // TODO Does it make sense to overshoot?
-        ensure!(timeline.get_last_record_lsn() >= end_lsn);
+        let reached_lsn = timeline.get_last_record_lsn(); // re-read after the import
+        if reached_lsn < end_lsn {
+            return Err(QueryError::Other(
+                anyhow::anyhow!("Cannot import WAL up to Lsn {end_lsn} because timeline only reached Lsn {reached_lsn}")));
+        }
 
         // Flush data to disk, then upload to s3. No need for a forced checkpoint.
         // We only want to persist the data, and it doesn't matter if it's in the
@@ -480,7 +493,7 @@ impl PageServerHandler {
         mut lsn: Lsn,
         latest: bool,
         latest_gc_cutoff_lsn: &RcuReadGuard<Lsn>,
-    ) -> Result<Lsn> {
+    ) -> anyhow::Result<Lsn> {
         if latest {
             // Latest page version was requested. If LSN is given, it is a hint
             // to the page server that there have been no modifications to the
@@ -511,11 +524,11 @@ impl PageServerHandler {
             }
         } else {
             if lsn == Lsn(0) {
-                bail!("invalid LSN(0) in request");
+                anyhow::bail!("invalid LSN(0) in request");
             }
             timeline.wait_lsn(lsn).await?;
         }
-        ensure!(
+        anyhow::ensure!(
             lsn >= **latest_gc_cutoff_lsn,
             "tried to request a page version that was garbage collected. requested at {} gc cutoff {}",
             lsn, **latest_gc_cutoff_lsn
@@ -528,7 +541,7 @@ impl PageServerHandler {
         &self,
         timeline: &Timeline,
         req: &PagestreamExistsRequest,
-    ) -> Result<PagestreamBeMessage> {
+    ) -> anyhow::Result<PagestreamBeMessage> {
         let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
         let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)
             .await?;
@@ -548,7 +561,7 @@ impl PageServerHandler {
         &self,
         timeline: &Timeline,
         req: &PagestreamNblocksRequest,
-    ) -> Result<PagestreamBeMessage> {
+    ) -> anyhow::Result<PagestreamBeMessage> {
         let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
         let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)
             .await?;
@@ -568,7 +581,7 @@ impl PageServerHandler {
         &self,
         timeline: &Timeline,
         req: &PagestreamDbSizeRequest,
-    ) -> Result<PagestreamBeMessage> {
+    ) -> anyhow::Result<PagestreamBeMessage> {
         let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
         let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)
             .await?;
@@ -589,7 +602,7 @@ impl PageServerHandler {
         &self,
         timeline: &Timeline,
         req: &PagestreamGetPageRequest,
-    ) -> Result<PagestreamBeMessage> {
+    ) -> anyhow::Result<PagestreamBeMessage> {
         let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
         let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)
             .await?;
@@ -656,7 +669,7 @@ impl PageServerHandler {
 
     // when accessing management api supply None as an argument
     // when using to authorize tenant pass corresponding tenant id
-    fn check_permission(&self, tenant_id: Option<TenantId>) -> Result<()> {
+    fn check_permission(&self, tenant_id: Option<TenantId>) -> anyhow::Result<()> {
         if self.auth.is_none() {
             // auth is set to Trust, nothing to check so just return ok
             return Ok(());
@@ -678,20 +691,19 @@ impl postgres_backend_async::Handler for PageServerHandler {
         &mut self,
         _pgb: &mut PostgresBackend,
         jwt_response: &[u8],
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), QueryError> {
         // this unwrap is never triggered, because check_auth_jwt only called when auth_type is NeonJWT
         // which requires auth to be present
         let data = self
             .auth
             .as_ref()
             .unwrap()
-            .decode(str::from_utf8(jwt_response)?)?;
+            .decode(str::from_utf8(jwt_response).context("jwt response is not UTF-8")?)?;
 
-        if matches!(data.claims.scope, Scope::Tenant) {
-            ensure!(
-                data.claims.tenant_id.is_some(),
+        if matches!(data.claims.scope, Scope::Tenant) && data.claims.tenant_id.is_none() {
+            return Err(QueryError::Other(anyhow::anyhow!(
                 "jwt token scope is Tenant, but tenant id is missing"
-            )
+            )));
         }
 
         info!(
@@ -703,22 +715,33 @@ impl postgres_backend_async::Handler for PageServerHandler {
         Ok(())
     }
 
+    fn startup(
+        &mut self,
+        _pgb: &mut PostgresBackend,
+        _sm: &FeStartupPacket,
+    ) -> Result<(), QueryError> {
+        Ok(())
+    }
+
     async fn process_query(
         &mut self,
         pgb: &mut PostgresBackend,
         query_string: &str,
-    ) -> anyhow::Result<()> {
-        debug!("process query {:?}", query_string);
+    ) -> Result<(), QueryError> {
+        debug!("process query {query_string:?}");
 
         if query_string.starts_with("pagestream ") {
             let (_, params_raw) = query_string.split_at("pagestream ".len());
             let params = params_raw.split(' ').collect::<Vec<_>>();
-            ensure!(
-                params.len() == 2,
-                "invalid param number for pagestream command"
-            );
-            let tenant_id = TenantId::from_str(params[0])?;
-            let timeline_id = TimelineId::from_str(params[1])?;
+            if params.len() != 2 {
+                return Err(QueryError::Other(anyhow::anyhow!(
+                    "invalid param number for pagestream command"
+                )));
+            }
+            let tenant_id = TenantId::from_str(params[0])
+                .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
+            let timeline_id = TimelineId::from_str(params[1])
+                .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
 
             self.check_permission(Some(tenant_id))?;
 
@@ -728,18 +751,24 @@ impl postgres_backend_async::Handler for PageServerHandler {
             let (_, params_raw) = query_string.split_at("basebackup ".len());
             let params = params_raw.split_whitespace().collect::<Vec<_>>();
 
-            ensure!(
-                params.len() >= 2,
-                "invalid param number for basebackup command"
-            );
+            if params.len() < 2 {
+                return Err(QueryError::Other(anyhow::anyhow!(
+                    "invalid param number for basebackup command"
+                )));
+            }
 
-            let tenant_id = TenantId::from_str(params[0])?;
-            let timeline_id = TimelineId::from_str(params[1])?;
+            let tenant_id = TenantId::from_str(params[0])
+                .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
+            let timeline_id = TimelineId::from_str(params[1])
+                .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
 
             self.check_permission(Some(tenant_id))?;
 
             let lsn = if params.len() == 3 {
-                Some(Lsn::from_str(params[2])?)
+                Some(
+                    Lsn::from_str(params[2])
+                        .with_context(|| format!("Failed to parse Lsn from {}", params[2]))?,
+                )
             } else {
                 None
             };
@@ -754,13 +783,16 @@ impl postgres_backend_async::Handler for PageServerHandler {
             let (_, params_raw) = query_string.split_at("get_last_record_rlsn ".len());
             let params = params_raw.split_whitespace().collect::<Vec<_>>();
 
-            ensure!(
-                params.len() == 2,
-                "invalid param number for get_last_record_rlsn command"
-            );
+            if params.len() != 2 {
+                return Err(QueryError::Other(anyhow::anyhow!(
+                    "invalid param number for get_last_record_rlsn command"
+                )));
+            }
 
-            let tenant_id = TenantId::from_str(params[0])?;
-            let timeline_id = TimelineId::from_str(params[1])?;
+            let tenant_id = TenantId::from_str(params[0])
+                .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
+            let timeline_id = TimelineId::from_str(params[1])
+                .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
 
             self.check_permission(Some(tenant_id))?;
             let timeline = get_active_timeline_with_timeout(tenant_id, timeline_id).await?;
@@ -782,22 +814,31 @@ impl postgres_backend_async::Handler for PageServerHandler {
             let (_, params_raw) = query_string.split_at("fullbackup ".len());
             let params = params_raw.split_whitespace().collect::<Vec<_>>();
 
-            ensure!(
-                params.len() >= 2,
-                "invalid param number for fullbackup command"
-            );
+            if params.len() < 2 {
+                return Err(QueryError::Other(anyhow::anyhow!(
+                    "invalid param number for fullbackup command"
+                )));
+            }
 
-            let tenant_id = TenantId::from_str(params[0])?;
-            let timeline_id = TimelineId::from_str(params[1])?;
+            let tenant_id = TenantId::from_str(params[0])
+                .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
+            let timeline_id = TimelineId::from_str(params[1])
+                .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
 
             // The caller is responsible for providing correct lsn and prev_lsn.
             let lsn = if params.len() > 2 {
-                Some(Lsn::from_str(params[2])?)
+                Some(
+                    Lsn::from_str(params[2])
+                        .with_context(|| format!("Failed to parse Lsn from {}", params[2]))?,
+                )
             } else {
                 None
             };
             let prev_lsn = if params.len() > 3 {
-                Some(Lsn::from_str(params[3])?)
+                Some(
+                    Lsn::from_str(params[3])
+                        .with_context(|| format!("Failed to parse Lsn from {}", params[3]))?,
+                )
             } else {
                 None
             };
@@ -822,12 +863,21 @@ impl postgres_backend_async::Handler for PageServerHandler {
             //     -c "import basebackup $TENANT $TIMELINE $START_LSN $END_LSN $PG_VERSION"
             let (_, params_raw) = query_string.split_at("import basebackup ".len());
             let params = params_raw.split_whitespace().collect::<Vec<_>>();
-            ensure!(params.len() == 5);
-            let tenant_id = TenantId::from_str(params[0])?;
-            let timeline_id = TimelineId::from_str(params[1])?;
-            let base_lsn = Lsn::from_str(params[2])?;
-            let end_lsn = Lsn::from_str(params[3])?;
-            let pg_version = u32::from_str(params[4])?;
+            if params.len() != 5 {
+                return Err(QueryError::Other(anyhow::anyhow!(
+                    "invalid param number for import basebackup command"
+                )));
+            }
+            let tenant_id = TenantId::from_str(params[0])
+                .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
+            let timeline_id = TimelineId::from_str(params[1])
+                .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
+            let base_lsn = Lsn::from_str(params[2])
+                .with_context(|| format!("Failed to parse Lsn from {}", params[2]))?;
+            let end_lsn = Lsn::from_str(params[3])
+                .with_context(|| format!("Failed to parse Lsn from {}", params[3]))?;
+            let pg_version = u32::from_str(params[4])
+                .with_context(|| format!("Failed to parse pg_version from {}", params[4]))?;
 
             self.check_permission(Some(tenant_id))?;
 
@@ -845,7 +895,10 @@ impl postgres_backend_async::Handler for PageServerHandler {
                 Ok(()) => pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?,
                 Err(e) => {
                     error!("error importing base backup between {base_lsn} and {end_lsn}: {e:?}");
-                    pgb.write_message(&BeMessage::ErrorResponse(&e.to_string()))?
+                    pgb.write_message(&BeMessage::ErrorResponse(
+                        &e.to_string(),
+                        Some(e.pg_error_code()),
+                    ))?
                 }
             };
         } else if query_string.starts_with("import wal ") {
@@ -855,11 +908,19 @@ impl postgres_backend_async::Handler for PageServerHandler {
             // caller should poll the http api to check when that is done.
             let (_, params_raw) = query_string.split_at("import wal ".len());
             let params = params_raw.split_whitespace().collect::<Vec<_>>();
-            ensure!(params.len() == 4);
-            let tenant_id = TenantId::from_str(params[0])?;
-            let timeline_id = TimelineId::from_str(params[1])?;
-            let start_lsn = Lsn::from_str(params[2])?;
-            let end_lsn = Lsn::from_str(params[3])?;
+            if params.len() != 4 {
+                return Err(QueryError::Other(anyhow::anyhow!(
+                    "invalid param number for import wal command"
+                )));
+            }
+            let tenant_id = TenantId::from_str(params[0])
+                .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
+            let timeline_id = TimelineId::from_str(params[1])
+                .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
+            let start_lsn = Lsn::from_str(params[2])
+                .with_context(|| format!("Failed to parse Lsn from {}", params[2]))?;
+            let end_lsn = Lsn::from_str(params[3])
+                .with_context(|| format!("Failed to parse Lsn from {}", params[3]))?;
 
             self.check_permission(Some(tenant_id))?;
 
@@ -870,7 +931,10 @@ impl postgres_backend_async::Handler for PageServerHandler {
                 Ok(()) => pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?,
                 Err(e) => {
                     error!("error importing WAL between {start_lsn} and {end_lsn}: {e:?}");
-                    pgb.write_message(&BeMessage::ErrorResponse(&e.to_string()))?
+                    pgb.write_message(&BeMessage::ErrorResponse(
+                        &e.to_string(),
+                        Some(e.pg_error_code()),
+                    ))?
                 }
             };
         } else if query_string.to_ascii_lowercase().starts_with("set ") {
@@ -881,8 +945,13 @@ impl postgres_backend_async::Handler for PageServerHandler {
             // show <tenant_id>
             let (_, params_raw) = query_string.split_at("show ".len());
             let params = params_raw.split(' ').collect::<Vec<_>>();
-            ensure!(params.len() == 1, "invalid param number for config command");
-            let tenant_id = TenantId::from_str(params[0])?;
+            if params.len() != 1 {
+                return Err(QueryError::Other(anyhow::anyhow!(
+                    "invalid param number for config command"
+                )));
+            }
+            let tenant_id = TenantId::from_str(params[0])
+                .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
 
             self.check_permission(Some(tenant_id))?;
 
@@ -923,7 +992,9 @@ impl postgres_backend_async::Handler for PageServerHandler {
             ]))?
             .write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
         } else {
-            bail!("unknown command");
+            return Err(QueryError::Other(anyhow::anyhow!(
+                "unknown command {query_string}"
+            )));
         }
 
         Ok(())
@@ -935,7 +1006,7 @@ impl postgres_backend_async::Handler for PageServerHandler {
 /// If the tenant is Loading, waits for it to become Active, for up to 30 s. That
 /// ensures that queries don't fail immediately after pageserver startup, because
 /// all tenants are still loading.
-async fn get_active_tenant_with_timeout(tenant_id: TenantId) -> Result<Arc<Tenant>> {
+async fn get_active_tenant_with_timeout(tenant_id: TenantId) -> anyhow::Result<Arc<Tenant>> {
     let tenant = mgr::get_tenant(tenant_id, false).await?;
     match tokio::time::timeout(Duration::from_secs(30), tenant.wait_to_become_active()).await {
         Ok(wait_result) => wait_result
@@ -949,7 +1020,7 @@ async fn get_active_tenant_with_timeout(tenant_id: TenantId) -> Result<Arc<Tenan
 async fn get_active_timeline_with_timeout(
     tenant_id: TenantId,
     timeline_id: TimelineId,
-) -> Result<Arc<Timeline>> {
+) -> anyhow::Result<Arc<Timeline>> {
     get_active_tenant_with_timeout(tenant_id)
         .await
         .and_then(|tenant| tenant.get_timeline(timeline_id, true))
diff --git a/pageserver/src/walreceiver/walreceiver_connection.rs b/pageserver/src/walreceiver/walreceiver_connection.rs
index 06aa132365..aca5e8e019 100644
--- a/pageserver/src/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/walreceiver/walreceiver_connection.rs
@@ -1,6 +1,7 @@
 //! Actual Postgres connection handler to stream WAL to the server.
 
 use std::{
+    error::Error,
     str::FromStr,
     sync::Arc,
     time::{Duration, SystemTime},
@@ -11,7 +12,7 @@ use bytes::BytesMut;
 use chrono::{NaiveDateTime, Utc};
 use fail::fail_point;
 use futures::StreamExt;
-use postgres::{SimpleQueryMessage, SimpleQueryRow};
+use postgres::{error::SqlState, SimpleQueryMessage, SimpleQueryRow};
 use postgres_ffi::v14::xlog_utils::normalize_lsn;
 use postgres_ffi::WAL_SEGMENT_SIZE;
 use postgres_protocol::message::backend::ReplicationMessage;
@@ -32,7 +33,7 @@ use crate::{
 use postgres_connection::PgConnectionConfig;
 use postgres_ffi::waldecoder::WalStreamDecoder;
 use pq_proto::ReplicationFeedback;
-use utils::lsn::Lsn;
+use utils::{lsn::Lsn, postgres_backend_async::is_expected_io_error};
 
 /// Status of the connection.
 #[derive(Debug, Clone, Copy)]
@@ -68,10 +69,17 @@ pub async fn handle_walreceiver_connection(
         let mut config = wal_source_connconf.to_tokio_postgres_config();
         config.application_name("pageserver");
         config.replication_mode(tokio_postgres::config::ReplicationMode::Physical);
-        time::timeout(connect_timeout, config.connect(postgres::NoTls))
-            .await
-            .context("Timed out while waiting for walreceiver connection to open")?
-            .context("Failed to open walreceiver connection")?
+        match time::timeout(connect_timeout, config.connect(postgres::NoTls)).await {
+            Ok(Ok(client_and_conn)) => client_and_conn,
+            Ok(Err(conn_err)) => {
+                let expected_error = ignore_expected_errors(conn_err)?;
+                info!("DB connection stream finished: {expected_error}");
+                return Ok(());
+            }
+            Err(_elapsed) => anyhow::bail!( // `Elapsed` only displays "deadline has elapsed"
+                "Timed out after {connect_timeout:?} while waiting for walreceiver connection to open"
+            ),
+        }
     };
 
     info!("connected!");
@@ -103,10 +111,8 @@ pub async fn handle_walreceiver_connection(
                 connection_result = connection => match connection_result{
                     Ok(()) => info!("Walreceiver db connection closed"),
                     Err(connection_error) => {
-                        if connection_error.is_closed() {
-                            info!("Connection closed regularly: {connection_error}")
-                        } else {
-                            warn!("Connection aborted: {connection_error}")
+                        if let Err(e) = ignore_expected_errors(connection_error) {
+                            warn!("Connection aborted: {e:#}")
                         }
                     }
                 },
@@ -187,14 +193,9 @@ pub async fn handle_walreceiver_connection(
         let replication_message = match replication_message {
             Ok(message) => message,
             Err(replication_error) => {
-                if replication_error.is_closed() {
-                    info!("Replication stream got closed");
-                    return Ok(());
-                } else {
-                    return Err(
-                        anyhow::Error::new(replication_error).context("replication stream error")
-                    );
-                }
+                let expected_error = ignore_expected_errors(replication_error)?;
+                info!("Replication stream finished: {expected_error}");
+                return Ok(());
             }
         };
 
@@ -400,3 +401,32 @@ async fn identify_system(client: &mut Client) -> anyhow::Result<IdentifySystem>
         Err(IdentifyError.into())
     }
 }
+
+/// We don't want to report connectivity problems as real errors towards connection manager because
+/// 1. they happen frequently enough to make server logs hard to read and
+/// 2. the connection manager can retry with another safekeeper.
+///
+/// Returns `Ok(pg_error)` for such an expected error: one for which `is_closed()` is true, one whose source is
+/// an I/O error accepted by `is_expected_io_error`, or a `CONNECTION_FAILURE` DB error whose message contains "end streaming".
+/// The caller should log it at info level and then report to connection manager that we're done handling this connection; connection manager will then handle reconnections.
+///
+/// Any other error is returned as `Err` with "connection error" context; the caller can bubble it up using `?`.
+/// The connection manager will log the error at ERROR level.
+fn ignore_expected_errors(pg_error: postgres::Error) -> anyhow::Result<postgres::Error> {
+    if pg_error.is_closed()
+        || pg_error
+            .source()
+            .and_then(|source| source.downcast_ref::<std::io::Error>())
+            .map(is_expected_io_error)
+            .unwrap_or(false)
+    {
+        return Ok(pg_error);
+    } else if let Some(db_error) = pg_error.as_db_error() {
+        if db_error.code() == &SqlState::CONNECTION_FAILURE
+            && db_error.message().contains("end streaming")
+        {
+            return Ok(pg_error);
+        }
+    }
+    Err(pg_error).context("connection error")
+}
diff --git a/proxy/src/mgmt.rs b/proxy/src/mgmt.rs
index 2e0a502e7f..cf83b48ae0 100644
--- a/proxy/src/mgmt.rs
+++ b/proxy/src/mgmt.rs
@@ -9,7 +9,10 @@ use std::{
     thread,
 };
 use tracing::{error, info, info_span};
-use utils::postgres_backend::{self, AuthType, PostgresBackend};
+use utils::{
+    postgres_backend::{self, AuthType, PostgresBackend},
+    postgres_backend_async::QueryError,
+};
 
 /// Console management API listener thread.
 /// It spawns console response handlers needed for the link auth.
@@ -47,7 +50,7 @@ pub fn thread_main(listener: TcpListener) -> anyhow::Result<()> {
     }
 }
 
-fn handle_connection(socket: TcpStream) -> anyhow::Result<()> {
+fn handle_connection(socket: TcpStream) -> Result<(), QueryError> {
     let pgbackend = PostgresBackend::new(socket, AuthType::Trust, None, true)?;
     pgbackend.run(&mut MgmtHandler)
 }
@@ -58,7 +61,7 @@ pub type ComputeReady = Result<DatabaseInfo, String>;
 // TODO: replace with an http-based protocol.
 struct MgmtHandler;
 impl postgres_backend::Handler for MgmtHandler {
-    fn process_query(&mut self, pgb: &mut PostgresBackend, query: &str) -> anyhow::Result<()> {
+    fn process_query(&mut self, pgb: &mut PostgresBackend, query: &str) -> Result<(), QueryError> {
         try_process_query(pgb, query).map_err(|e| {
             error!("failed to process response: {e:?}");
             e
@@ -66,8 +69,8 @@ impl postgres_backend::Handler for MgmtHandler {
     }
 }
 
-fn try_process_query(pgb: &mut PostgresBackend, query: &str) -> anyhow::Result<()> {
-    let resp: KickSession = serde_json::from_str(query)?;
+fn try_process_query(pgb: &mut PostgresBackend, query: &str) -> Result<(), QueryError> {
+    let resp: KickSession = serde_json::from_str(query).context("Failed to parse query as json")?;
 
     let span = info_span!("event", session_id = resp.session_id);
     let _enter = span.enter();
@@ -81,7 +84,7 @@ fn try_process_query(pgb: &mut PostgresBackend, query: &str) -> anyhow::Result<(
         }
         Err(e) => {
             error!("failed to deliver response to per-client task");
-            pgb.write_message(&BeMessage::ErrorResponse(&e.to_string()))?;
+            pgb.write_message(&BeMessage::ErrorResponse(&e.to_string(), None))?;
         }
     }
 
diff --git a/proxy/src/stream.rs b/proxy/src/stream.rs
index 19e1479068..02a0fabe9a 100644
--- a/proxy/src/stream.rs
+++ b/proxy/src/stream.rs
@@ -2,7 +2,7 @@ use crate::error::UserFacingError;
 use anyhow::bail;
 use bytes::BytesMut;
 use pin_project_lite::pin_project;
-use pq_proto::{BeMessage, FeMessage, FeStartupPacket};
+use pq_proto::{BeMessage, ConnectionError, FeMessage, FeStartupPacket};
 use rustls::ServerConfig;
 use std::pin::Pin;
 use std::sync::Arc;
@@ -47,18 +47,13 @@ fn err_connection() -> io::Error {
     io::Error::new(io::ErrorKind::ConnectionAborted, "connection is lost")
 }
 
-// TODO: change error type of `FeMessage::read_fut`
-fn from_anyhow(e: anyhow::Error) -> io::Error {
-    io::Error::new(io::ErrorKind::Other, e.to_string())
-}
-
 impl<S: AsyncRead + Unpin> PqStream<S> {
     /// Receive [`FeStartupPacket`], which is a first packet sent by a client.
     pub async fn read_startup_packet(&mut self) -> io::Result<FeStartupPacket> {
         // TODO: `FeStartupPacket::read_fut` should return `FeStartupPacket`
         let msg = FeStartupPacket::read_fut(&mut self.stream)
             .await
-            .map_err(from_anyhow)?
+            .map_err(ConnectionError::into_io_error)?
             .ok_or_else(err_connection)?;
 
         match msg {
@@ -80,7 +75,7 @@ impl<S: AsyncRead + Unpin> PqStream<S> {
     async fn read_message(&mut self) -> io::Result<FeMessage> {
         FeMessage::read_fut(&mut self.stream)
             .await
-            .map_err(from_anyhow)?
+            .map_err(ConnectionError::into_io_error)?
             .ok_or_else(err_connection)
     }
 }
@@ -112,7 +107,8 @@ impl<S: AsyncWrite + Unpin> PqStream<S> {
     /// This method exists due to `&str` not implementing `Into<anyhow::Error>`.
     pub async fn throw_error_str<T>(&mut self, error: &'static str) -> anyhow::Result<T> {
         tracing::info!("forwarding error to user: {error}");
-        self.write_message(&BeMessage::ErrorResponse(error)).await?;
+        self.write_message(&BeMessage::ErrorResponse(error, None))
+            .await?;
         bail!(error)
     }
 
@@ -124,7 +120,8 @@ impl<S: AsyncWrite + Unpin> PqStream<S> {
     {
         let msg = error.to_string_client();
         tracing::info!("forwarding error to user: {msg}");
-        self.write_message(&BeMessage::ErrorResponse(&msg)).await?;
+        self.write_message(&BeMessage::ErrorResponse(&msg, None))
+            .await?;
         bail!(error)
     }
 }
diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs
index 394a4815bb..b130ea86bd 100644
--- a/safekeeper/src/bin/safekeeper.rs
+++ b/safekeeper/src/bin/safekeeper.rs
@@ -229,11 +229,7 @@ fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
     let conf_cloned = conf.clone();
     let safekeeper_thread = thread::Builder::new()
         .name("safekeeper thread".into())
-        .spawn(|| {
-            if let Err(e) = wal_service::thread_main(conf_cloned, pg_listener) {
-                info!("safekeeper thread terminated: {e}");
-            }
-        })
+        .spawn(|| wal_service::thread_main(conf_cloned, pg_listener))
         .unwrap();
 
     threads.push(safekeeper_thread);
diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs
index c692e9fc12..60df5dd372 100644
--- a/safekeeper/src/handler.rs
+++ b/safekeeper/src/handler.rs
@@ -8,7 +8,7 @@ use crate::receive_wal::ReceiveWalConn;
 use crate::send_wal::ReplicationConn;
 
 use crate::{GlobalTimelines, SafeKeeperConf};
-use anyhow::{bail, ensure, Context, Result};
+use anyhow::Context;
 
 use postgres_ffi::PG_TLI;
 use regex::Regex;
@@ -17,6 +17,7 @@ use pq_proto::{BeMessage, FeStartupPacket, RowDescriptor, INT4_OID, TEXT_OID};
 use std::str;
 use tracing::info;
 use utils::auth::{Claims, Scope};
+use utils::postgres_backend_async::QueryError;
 use utils::{
     id::{TenantId, TenantTimelineId, TimelineId},
     lsn::Lsn,
@@ -42,7 +43,7 @@ enum SafekeeperPostgresCommand {
     JSONCtrl { cmd: AppendLogicalMessage },
 }
 
-fn parse_cmd(cmd: &str) -> Result<SafekeeperPostgresCommand> {
+fn parse_cmd(cmd: &str) -> anyhow::Result<SafekeeperPostgresCommand> {
     if cmd.starts_with("START_WAL_PUSH") {
         Ok(SafekeeperPostgresCommand::StartWalPush)
     } else if cmd.starts_with("START_REPLICATION") {
@@ -62,13 +63,17 @@ fn parse_cmd(cmd: &str) -> Result<SafekeeperPostgresCommand> {
             cmd: serde_json::from_str(cmd)?,
         })
     } else {
-        bail!("unsupported command {}", cmd);
+        anyhow::bail!("unsupported command {cmd}");
     }
 }
 
 impl postgres_backend::Handler for SafekeeperPostgresHandler {
     // tenant_id and timeline_id are passed in connection string params
-    fn startup(&mut self, _pgb: &mut PostgresBackend, sm: &FeStartupPacket) -> Result<()> {
+    fn startup(
+        &mut self,
+        _pgb: &mut PostgresBackend,
+        sm: &FeStartupPacket,
+    ) -> Result<(), QueryError> {
         if let FeStartupPacket::StartupMessage { params, .. } = sm {
             if let Some(options) = params.options_raw() {
                 for opt in options {
@@ -77,10 +82,14 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler {
                     // https://github.com/neondatabase/neon/pull/2433#discussion_r970005064
                     match opt.split_once('=') {
                         Some(("ztenantid", value)) | Some(("tenant_id", value)) => {
-                            self.tenant_id = Some(value.parse()?);
+                            self.tenant_id = Some(value.parse().with_context(|| {
+                                format!("Failed to parse {value} as tenant id")
+                            })?);
                         }
                         Some(("ztimelineid", value)) | Some(("timeline_id", value)) => {
-                            self.timeline_id = Some(value.parse()?);
+                            self.timeline_id = Some(value.parse().with_context(|| {
+                                format!("Failed to parse {value} as timeline id")
+                            })?);
                         }
                         _ => continue,
                     }
@@ -93,7 +102,9 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler {
 
             Ok(())
         } else {
-            bail!("Safekeeper received unexpected initial message: {:?}", sm);
+            Err(QueryError::Other(anyhow::anyhow!(
+                "Safekeeper received unexpected initial message: {sm:?}"
+            )))
         }
     }
 
@@ -101,7 +112,7 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler {
         &mut self,
         _pgb: &mut PostgresBackend,
         jwt_response: &[u8],
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), QueryError> {
         // this unwrap is never triggered, because check_auth_jwt only called when auth_type is NeonJWT
         // which requires auth to be present
         let data = self
@@ -109,13 +120,12 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler {
             .auth
             .as_ref()
             .unwrap()
-            .decode(str::from_utf8(jwt_response)?)?;
+            .decode(str::from_utf8(jwt_response).context("jwt response is not UTF-8")?)?;
 
-        if matches!(data.claims.scope, Scope::Tenant) {
-            ensure!(
-                data.claims.tenant_id.is_some(),
+        if matches!(data.claims.scope, Scope::Tenant) && data.claims.tenant_id.is_none() {
+            return Err(QueryError::Other(anyhow::anyhow!(
                 "jwt token scope is Tenant, but tenant id is missing"
-            )
+            )));
         }
 
         info!(
@@ -127,7 +137,11 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler {
         Ok(())
     }
 
-    fn process_query(&mut self, pgb: &mut PostgresBackend, query_string: &str) -> Result<()> {
+    fn process_query(
+        &mut self,
+        pgb: &mut PostgresBackend,
+        query_string: &str,
+    ) -> Result<(), QueryError> {
         if query_string
             .to_ascii_lowercase()
             .starts_with("set datestyle to ")
@@ -148,19 +162,26 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler {
         self.check_permission(Some(tenant_id))?;
         self.ttid = TenantTimelineId::new(tenant_id, timeline_id);
 
-        match cmd {
+        let res = match cmd {
             SafekeeperPostgresCommand::StartWalPush => ReceiveWalConn::new(pgb).run(self),
             SafekeeperPostgresCommand::StartReplication { start_lsn } => {
                 ReplicationConn::new(pgb).run(self, pgb, start_lsn)
             }
             SafekeeperPostgresCommand::IdentifySystem => self.handle_identify_system(pgb),
             SafekeeperPostgresCommand::JSONCtrl { ref cmd } => handle_json_ctrl(self, pgb, cmd),
-        }
-        .context(format!(
-            "Failed to process query for timeline {timeline_id}"
-        ))?;
+        };
 
-        Ok(())
+        match res {
+            Ok(()) => Ok(()),
+            Err(QueryError::Disconnected(connection_error)) => {
+                info!("Timeline {tenant_id}/{timeline_id} query failed with connection error: {connection_error}");
+                Err(QueryError::Disconnected(connection_error))
+            }
+            Err(QueryError::Other(e)) => Err(QueryError::Other(e.context(format!(
+                "Failed to process query for timeline {}",
+                self.ttid
+            )))),
+        }
     }
 }
 
@@ -178,7 +199,7 @@ impl SafekeeperPostgresHandler {
 
     // when accessing management api supply None as an argument
     // when using to authorize tenant pass corresponding tenant id
-    fn check_permission(&self, tenant_id: Option<TenantId>) -> Result<()> {
+    fn check_permission(&self, tenant_id: Option<TenantId>) -> anyhow::Result<()> {
         if self.conf.auth.is_none() {
             // auth is set to Trust, nothing to check so just return ok
             return Ok(());
@@ -196,7 +217,7 @@ impl SafekeeperPostgresHandler {
     ///
     /// Handle IDENTIFY_SYSTEM replication command
     ///
-    fn handle_identify_system(&mut self, pgb: &mut PostgresBackend) -> Result<()> {
+    fn handle_identify_system(&mut self, pgb: &mut PostgresBackend) -> Result<(), QueryError> {
         let tli = GlobalTimelines::get(self.ttid)?;
 
         let lsn = if self.is_walproposer_recovery() {
diff --git a/safekeeper/src/json_ctrl.rs b/safekeeper/src/json_ctrl.rs
index 746b4461b7..32a24a4978 100644
--- a/safekeeper/src/json_ctrl.rs
+++ b/safekeeper/src/json_ctrl.rs
@@ -8,11 +8,12 @@
 
 use std::sync::Arc;
 
-use anyhow::Result;
+use anyhow::Context;
 use bytes::Bytes;
 use serde::{Deserialize, Serialize};
 use tracing::*;
 use utils::id::TenantTimelineId;
+use utils::postgres_backend_async::QueryError;
 
 use crate::handler::SafekeeperPostgresHandler;
 use crate::safekeeper::{AcceptorProposerMessage, AppendResponse, ServerInfo};
@@ -47,7 +48,7 @@ pub struct AppendLogicalMessage {
     pg_version: u32,
 }
 
-#[derive(Serialize, Deserialize)]
+#[derive(Debug, Serialize, Deserialize)]
 struct AppendResult {
     // safekeeper state after append
     state: SafeKeeperState,
@@ -62,8 +63,8 @@ pub fn handle_json_ctrl(
     spg: &SafekeeperPostgresHandler,
     pgb: &mut PostgresBackend,
     append_request: &AppendLogicalMessage,
-) -> Result<()> {
-    info!("JSON_CTRL request: {:?}", append_request);
+) -> Result<(), QueryError> {
+    info!("JSON_CTRL request: {append_request:?}");
 
     // need to init safekeeper state before AppendRequest
     let tli = prepare_safekeeper(spg.ttid, append_request.pg_version)?;
@@ -78,7 +79,8 @@ pub fn handle_json_ctrl(
         state: tli.get_state().1,
         inserted_wal,
     };
-    let response_data = serde_json::to_vec(&response)?;
+    let response_data = serde_json::to_vec(&response)
+        .with_context(|| format!("Response {response:?} is not a json array"))?;
 
     pgb.write_message_noflush(&BeMessage::RowDescription(&[RowDescriptor {
         name: b"json",
@@ -93,7 +95,7 @@ pub fn handle_json_ctrl(
 
 /// Prepare safekeeper to process append requests without crashes,
 /// by sending ProposerGreeting with default server.wal_seg_size.
-fn prepare_safekeeper(ttid: TenantTimelineId, pg_version: u32) -> Result<Arc<Timeline>> {
+fn prepare_safekeeper(ttid: TenantTimelineId, pg_version: u32) -> anyhow::Result<Arc<Timeline>> {
     GlobalTimelines::create(
         ttid,
         ServerInfo {
@@ -106,7 +108,7 @@ fn prepare_safekeeper(ttid: TenantTimelineId, pg_version: u32) -> Result<Arc<Tim
     )
 }
 
-fn send_proposer_elected(tli: &Arc<Timeline>, term: Term, lsn: Lsn) -> Result<()> {
+fn send_proposer_elected(tli: &Arc<Timeline>, term: Term, lsn: Lsn) -> anyhow::Result<()> {
     // add new term to existing history
     let history = tli.get_state().1.acceptor_state.term_history;
     let history = history.up_to(lsn.checked_sub(1u64).unwrap());
@@ -125,7 +127,7 @@ fn send_proposer_elected(tli: &Arc<Timeline>, term: Term, lsn: Lsn) -> Result<()
     Ok(())
 }
 
-#[derive(Serialize, Deserialize)]
+#[derive(Debug, Serialize, Deserialize)]
 struct InsertedWAL {
     begin_lsn: Lsn,
     end_lsn: Lsn,
@@ -134,7 +136,10 @@ struct InsertedWAL {
 
 /// Extend local WAL with new LogicalMessage record. To do that,
 /// create AppendRequest with new WAL and pass it to safekeeper.
-fn append_logical_message(tli: &Arc<Timeline>, msg: &AppendLogicalMessage) -> Result<InsertedWAL> {
+fn append_logical_message(
+    tli: &Arc<Timeline>,
+    msg: &AppendLogicalMessage,
+) -> anyhow::Result<InsertedWAL> {
     let wal_data = encode_logical_message(&msg.lm_prefix, &msg.lm_message);
     let sk_state = tli.get_state().1;
 
diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs
index be7f071abb..671e5470a0 100644
--- a/safekeeper/src/receive_wal.rs
+++ b/safekeeper/src/receive_wal.rs
@@ -2,11 +2,13 @@
 //! Gets messages from the network, passes them down to consensus module and
 //! sends replies back.
 
-use anyhow::{anyhow, bail, Result};
+use anyhow::anyhow;
+use anyhow::Context;
 
 use bytes::BytesMut;
 use tracing::*;
 use utils::lsn::Lsn;
+use utils::postgres_backend_async::QueryError;
 
 use crate::safekeeper::ServerInfo;
 use crate::timeline::Timeline;
@@ -43,7 +45,7 @@ impl<'pg> ReceiveWalConn<'pg> {
     }
 
     // Send message to the postgres
-    fn write_msg(&mut self, msg: &AcceptorProposerMessage) -> Result<()> {
+    fn write_msg(&mut self, msg: &AcceptorProposerMessage) -> anyhow::Result<()> {
         let mut buf = BytesMut::with_capacity(128);
         msg.serialize(&mut buf)?;
         self.pg_backend.write_message(&BeMessage::CopyData(&buf))?;
@@ -51,7 +53,7 @@ impl<'pg> ReceiveWalConn<'pg> {
     }
 
     /// Receive WAL from wal_proposer
-    pub fn run(&mut self, spg: &mut SafekeeperPostgresHandler) -> Result<()> {
+    pub fn run(&mut self, spg: &mut SafekeeperPostgresHandler) -> Result<(), QueryError> {
         let _enter = info_span!("WAL acceptor", ttid = %spg.ttid).entered();
 
         // Notify the libpq client that it's allowed to send `CopyData` messages
@@ -79,7 +81,11 @@ impl<'pg> ReceiveWalConn<'pg> {
                 };
                 GlobalTimelines::create(spg.ttid, server_info, Lsn::INVALID, Lsn::INVALID)?
             }
-            _ => bail!("unexpected message {:?} instead of greeting", next_msg),
+            _ => {
+                return Err(QueryError::Other(anyhow::anyhow!(
+                    "unexpected message {next_msg:?} instead of greeting"
+                )))
+            }
         };
 
         let mut next_msg = Some(next_msg);
@@ -134,25 +140,32 @@ impl<'pg> ReceiveWalConn<'pg> {
 
 struct ProposerPollStream {
     msg_rx: Receiver<ProposerAcceptorMessage>,
-    read_thread: Option<thread::JoinHandle<Result<()>>>,
+    read_thread: Option<thread::JoinHandle<Result<(), QueryError>>>,
 }
 
 impl ProposerPollStream {
-    fn new(mut r: ReadStream) -> Result<Self> {
+    fn new(mut r: ReadStream) -> anyhow::Result<Self> {
         let (msg_tx, msg_rx) = channel();
 
         let read_thread = thread::Builder::new()
             .name("Read WAL thread".into())
-            .spawn(move || -> Result<()> {
+            .spawn(move || -> Result<(), QueryError> {
                 loop {
                     let copy_data = match FeMessage::read(&mut r)? {
-                        Some(FeMessage::CopyData(bytes)) => bytes,
-                        Some(msg) => bail!("expected `CopyData` message, found {:?}", msg),
-                        None => bail!("connection closed unexpectedly"),
-                    };
+                        Some(FeMessage::CopyData(bytes)) => Ok(bytes),
+                        Some(msg) => Err(QueryError::Other(anyhow::anyhow!(
+                            "expected `CopyData` message, found {msg:?}"
+                        ))),
+                        None => Err(QueryError::from(std::io::Error::new(
+                            std::io::ErrorKind::ConnectionAborted,
+                            "walproposer closed the connection",
+                        ))),
+                    }?;
 
                     let msg = ProposerAcceptorMessage::parse(copy_data)?;
-                    msg_tx.send(msg)?;
+                    msg_tx
+                        .send(msg)
+                        .context("Failed to send the proposer message")?;
                 }
                 // msg_tx will be dropped here, this will also close msg_rx
             })?;
@@ -163,17 +176,19 @@ impl ProposerPollStream {
         })
     }
 
-    fn recv_msg(&mut self) -> Result<ProposerAcceptorMessage> {
+    fn recv_msg(&mut self) -> Result<ProposerAcceptorMessage, QueryError> {
         self.msg_rx.recv().map_err(|_| {
             // return error from the read thread
             let res = match self.read_thread.take() {
                 Some(thread) => thread.join(),
-                None => return anyhow!("read thread is gone"),
+                None => return QueryError::Other(anyhow::anyhow!("read thread is gone")),
             };
 
             match res {
-                Ok(Ok(())) => anyhow!("unexpected result from read thread"),
-                Err(err) => anyhow!("read thread panicked: {:?}", err),
+                Ok(Ok(())) => {
+                    QueryError::Other(anyhow::anyhow!("unexpected result from read thread"))
+                }
+                Err(err) => QueryError::Other(anyhow::anyhow!("read thread panicked: {err:?}")),
                 Ok(Err(err)) => err,
             }
         })
diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs
index a054b8fe14..20600ab694 100644
--- a/safekeeper/src/send_wal.rs
+++ b/safekeeper/src/send_wal.rs
@@ -5,7 +5,7 @@ use crate::handler::SafekeeperPostgresHandler;
 use crate::timeline::{ReplicaState, Timeline};
 use crate::wal_storage::WalReader;
 use crate::GlobalTimelines;
-use anyhow::{bail, Context, Result};
+use anyhow::Context;
 
 use bytes::Bytes;
 use postgres_ffi::get_current_timestamp;
@@ -15,7 +15,8 @@ use std::cmp::min;
 use std::net::Shutdown;
 use std::sync::Arc;
 use std::time::Duration;
-use std::{str, thread};
+use std::{io, str, thread};
+use utils::postgres_backend_async::QueryError;
 
 use pq_proto::{BeMessage, FeMessage, ReplicationFeedback, WalSndKeepAlive, XLogDataBody};
 use tokio::sync::watch::Receiver;
@@ -91,7 +92,7 @@ impl ReplicationConn {
     fn background_thread(
         mut stream_in: ReadStream,
         replica_guard: Arc<ReplicationConnGuard>,
-    ) -> Result<()> {
+    ) -> anyhow::Result<()> {
         let replica_id = replica_guard.replica;
         let timeline = &replica_guard.timeline;
 
@@ -140,7 +141,7 @@ impl ReplicationConn {
                     // Shutdown the connection, because rust-postgres client cannot be dropped
                     // when connection is alive.
                     let _ = stream_in.shutdown(Shutdown::Both);
-                    bail!("Copy failed");
+                    anyhow::bail!("Copy failed");
                 }
                 _ => {
                     // We only handle `CopyData`, 'Sync', 'CopyFail' messages. Anything else is ignored.
@@ -160,7 +161,7 @@ impl ReplicationConn {
         spg: &mut SafekeeperPostgresHandler,
         pgb: &mut PostgresBackend,
         mut start_pos: Lsn,
-    ) -> Result<()> {
+    ) -> Result<(), QueryError> {
         let _enter = info_span!("WAL sender", ttid = %spg.ttid).entered();
 
         let tli = GlobalTimelines::get(spg.ttid)?;
@@ -256,8 +257,10 @@ impl ReplicationConn {
                         // to right pageserver.
                         if tli.should_walsender_stop(replica_id) {
                             // Shut down, timeline is suspended.
-                            // TODO create proper error type for this
-                            bail!("end streaming to {:?}", spg.appname);
+                            return Err(QueryError::from(io::Error::new(
+                                io::ErrorKind::ConnectionAborted,
+                                format!("end streaming to {:?}", spg.appname),
+                            )));
                         }
 
                         // timeout expired: request pageserver status
@@ -265,8 +268,7 @@ impl ReplicationConn {
                             sent_ptr: end_pos.0,
                             timestamp: get_current_timestamp(),
                             request_reply: true,
-                        }))
-                        .context("Failed to send KeepAlive message")?;
+                        }))?;
                         continue;
                     }
                 }
@@ -301,7 +303,7 @@ impl ReplicationConn {
 const POLL_STATE_TIMEOUT: Duration = Duration::from_secs(1);
 
 // Wait until we have commit_lsn > lsn or timeout expires. Returns latest commit_lsn.
-async fn wait_for_lsn(rx: &mut Receiver<Lsn>, lsn: Lsn) -> Result<Option<Lsn>> {
+async fn wait_for_lsn(rx: &mut Receiver<Lsn>, lsn: Lsn) -> anyhow::Result<Option<Lsn>> {
     let commit_lsn: Lsn = *rx.borrow();
     if commit_lsn > lsn {
         return Ok(Some(commit_lsn));
diff --git a/safekeeper/src/wal_service.rs b/safekeeper/src/wal_service.rs
index 0fea00fe1b..3ca651d060 100644
--- a/safekeeper/src/wal_service.rs
+++ b/safekeeper/src/wal_service.rs
@@ -2,18 +2,18 @@
 //!   WAL service listens for client connections and
 //!   receive WAL from wal_proposer and send it to WAL receivers
 //!
-use anyhow::Result;
 use regex::Regex;
 use std::net::{TcpListener, TcpStream};
 use std::thread;
 use tracing::*;
+use utils::postgres_backend_async::QueryError;
 
 use crate::handler::SafekeeperPostgresHandler;
 use crate::SafeKeeperConf;
 use utils::postgres_backend::{AuthType, PostgresBackend};
 
 /// Accept incoming TCP connections and spawn them into a background thread.
-pub fn thread_main(conf: SafeKeeperConf, listener: TcpListener) -> Result<()> {
+pub fn thread_main(conf: SafeKeeperConf, listener: TcpListener) -> ! {
     loop {
         match listener.accept() {
             Ok((socket, peer_addr)) => {
@@ -44,7 +44,7 @@ fn get_tid() -> u64 {
 
 /// This is run by `thread_main` above, inside a background thread.
 ///
-fn handle_socket(socket: TcpStream, conf: SafeKeeperConf) -> Result<()> {
+fn handle_socket(socket: TcpStream, conf: SafeKeeperConf) -> Result<(), QueryError> {
     let _enter = info_span!("", tid = ?get_tid()).entered();
 
     socket.set_nodelay(true)?;
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 705ab70ab4..eb15278ba7 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1903,13 +1903,15 @@ class NeonPageserver(PgProtocol):
             ".*wal receiver task finished with an error: walreceiver connection handling failure.*",
             ".*Shutdown task error: walreceiver connection handling failure.*",
             ".*wal_connection_manager.*tcp connect error: Connection refused.*",
-            ".*query handler for .* failed: Connection reset by peer.*",
-            ".*serving compute connection task.*exited with error: Broken pipe.*",
-            ".*Connection aborted: error communicating with the server: Broken pipe.*",
-            ".*Connection aborted: error communicating with the server: Transport endpoint is not connected.*",
-            ".*Connection aborted: error communicating with the server: Connection reset by peer.*",
+            ".*query handler for .* failed: Socket IO error: Connection reset by peer.*",
+            ".*serving compute connection task.*exited with error: Postgres connection error.*",
+            ".*serving compute connection task.*exited with error: Connection reset by peer.*",
+            ".*serving compute connection task.*exited with error: Postgres query error.*",
+            ".*Connection aborted: connection error: error communicating with the server: Broken pipe.*",
+            ".*Connection aborted: connection error: error communicating with the server: Transport endpoint is not connected.*",
+            ".*Connection aborted: connection error: error communicating with the server: Connection reset by peer.*",
             ".*kill_and_wait_impl.*: wait successful.*",
-            ".*end streaming to Some.*",
+            ".*Replication stream finished: db error: ERROR: Socket IO error: end streaming to Some.*",
             ".*query handler for 'pagestream.*failed: Broken pipe.*",  # pageserver notices compute shut down
             # safekeeper connection can fail with this, in the window between timeline creation
             # and streaming start
diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py
index 77ec33f8b0..72d27c3aba 100644
--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -1105,7 +1105,6 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
     env.pageserver.allowed_errors.extend(
         [
             ".*Failed to process query for timeline .*: Timeline .* was not found in global map.*",
-            ".*end streaming to Some.*",
         ]
     )
 

From efad64bc7feec51c23ec6bf3a6ea19797ebdb6a0 Mon Sep 17 00:00:00 2001
From: Kirill Bulatov <kirill@neon.tech>
Date: Wed, 4 Jan 2023 12:45:11 +0200
Subject: [PATCH 17/42] Expect compute shutdown test log error (#3262)

https://neon-github-public-dev.s3.amazonaws.com/reports/pr-3261/debug/3833043374/index.html#suites/ffbb7f9930a77115316b58ff32b7c719/1f6ebaedc0a113a1/

Spotted a flaky test that appeared after
https://github.com/neondatabase/neon/pull/3227 changes
---
 test_runner/fixtures/neon_fixtures.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index eb15278ba7..ba2cce3022 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1913,6 +1913,7 @@ class NeonPageserver(PgProtocol):
             ".*kill_and_wait_impl.*: wait successful.*",
             ".*Replication stream finished: db error: ERROR: Socket IO error: end streaming to Some.*",
             ".*query handler for 'pagestream.*failed: Broken pipe.*",  # pageserver notices compute shut down
+            ".*query handler for 'pagestream.*failed: Connection reset by peer (os error 104).*",  # pageserver notices compute shut down
             # safekeeper connection can fail with this, in the window between timeline creation
             # and streaming start
             ".*Failed to process query for timeline .*: state uninitialized, no data to read.*",

From 8932d14d505c1ecc04eeec32243397ffd03ffc1c Mon Sep 17 00:00:00 2001
From: Kirill Bulatov <kirill@neon.tech>
Date: Wed, 4 Jan 2023 17:31:51 +0200
Subject: [PATCH 18/42] Revert "Run Python tests in 8 threads (#3206)" (#3264)

This reverts commit 56a4466d0a85a9498bfd2a78a4ad3a2facb58167.

Seems that flakiness increased after this commit, while the time
decrease was a couple of seconds.
With every regular Python test spawning 1 etcd, 3 safekeepers, 1
pageserver, a few CLI commands and post-run cleanup hooks, it might be
hard to run many such tests in parallel.

We could return to this later, after we consider alternative test
structure and/or CI runner structure.
---
 .github/actions/run-python-test-set/action.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml
index 95167ecf6c..990c7e25a9 100644
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -123,8 +123,8 @@ runs:
           exit 1
         fi
         if [[ "${{ inputs.run_in_parallel }}" == "true" ]]; then
-          # -n8 uses eight processes to run tests via pytest-xdist
-          EXTRA_PARAMS="-n8 $EXTRA_PARAMS"
+          # -n4 uses four processes to run tests via pytest-xdist
+          EXTRA_PARAMS="-n4 $EXTRA_PARAMS"
 
           # --dist=loadgroup points tests marked with @pytest.mark.xdist_group
           # to the same worker to make @pytest.mark.order work with xdist

From f436fb2dfb91292ae59bae7de3a6d41db100683e Mon Sep 17 00:00:00 2001
From: Vadim Kharitonov <vadim@neon.tech>
Date: Tue, 3 Jan 2023 11:42:06 +0100
Subject: [PATCH 19/42] Fix panics at compute_ctl:monitor

---
 compute_tools/src/monitor.rs | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/compute_tools/src/monitor.rs b/compute_tools/src/monitor.rs
index 1588f5d62e..c871422e78 100644
--- a/compute_tools/src/monitor.rs
+++ b/compute_tools/src/monitor.rs
@@ -52,10 +52,16 @@ fn watch_compute_activity(compute: &ComputeNode) {
                     let mut idle_backs: Vec<DateTime<Utc>> = vec![];
 
                     for b in backs.into_iter() {
-                        let state: String = b.get("state");
-                        let change: String = b.get("state_change");
+                        let state: String = match b.try_get("state") {
+                            Ok(state) => state,
+                            Err(_) => continue,
+                        };
 
                         if state == "idle" {
+                            let change: String = match b.try_get("state_change") {
+                                Ok(state_change) => state_change,
+                                Err(_) => continue,
+                            };
                             let change = DateTime::parse_from_rfc3339(&change);
                             match change {
                                 Ok(t) => idle_backs.push(t.with_timezone(&Utc)),

From 8c6e607327d17b98dd6635d5dc6036d28d1efc04 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Wed, 4 Jan 2023 23:03:16 +0200
Subject: [PATCH 20/42] Refactor send_tarball() (#3259)

The Basebackup struct is really just a convenient place to carry the
various parameters around in send_tarball and its subroutines. Make it
internal to the send_tarball function.
---
 pageserver/src/basebackup.rs   | 179 ++++++++++++++++++---------------
 pageserver/src/page_service.rs |   6 +-
 2 files changed, 99 insertions(+), 86 deletions(-)

diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs
index 4052f13875..1978becf83 100644
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -10,11 +10,10 @@
 //! This module is responsible for creation of such tarball
 //! from data stored in object storage.
 //!
-use anyhow::{anyhow, bail, ensure, Context, Result};
+use anyhow::{anyhow, bail, ensure, Context};
 use bytes::{BufMut, BytesMut};
 use fail::fail_point;
 use std::fmt::Write as FmtWrite;
-use std::sync::Arc;
 use std::time::SystemTime;
 use tokio::io;
 use tokio::io::AsyncWrite;
@@ -39,114 +38,130 @@ use postgres_ffi::PG_TLI;
 use postgres_ffi::{BLCKSZ, RELSEG_SIZE, WAL_SEGMENT_SIZE};
 use utils::lsn::Lsn;
 
+/// Create basebackup with non-rel data in it.
+/// Only include relational data if 'full_backup' is true.
+///
+/// Currently we use empty 'req_lsn' in two cases:
+///  * During the basebackup right after timeline creation
+///  * When working without safekeepers. In this situation it is important to match the lsn
+///    we are taking basebackup on with the lsn that is used in pageserver's walreceiver
+///    to start the replication.
+pub async fn send_basebackup_tarball<'a, W>(
+    write: &'a mut W,
+    timeline: &'a Timeline,
+    req_lsn: Option<Lsn>,
+    prev_lsn: Option<Lsn>,
+    full_backup: bool,
+) -> anyhow::Result<()>
+where
+    W: AsyncWrite + Send + Sync + Unpin,
+{
+    // Compute postgres doesn't have any previous WAL files, but the first
+    // record that it's going to write needs to include the LSN of the
+    // previous record (xl_prev). We include prev_record_lsn in the
+    // "zenith.signal" file, so that postgres can read it during startup.
+    //
+    // We don't keep full history of record boundaries in the page server,
+    // however, only the predecessor of the latest record on each
+    // timeline. So we can only provide prev_record_lsn when you take a
+    // base backup at the end of the timeline, i.e. at last_record_lsn.
+    // Even at the end of the timeline, we sometimes don't have a valid
+    // prev_lsn value; that happens if the timeline was just branched from
+    // an old LSN and it doesn't have any WAL of its own yet. We will set
+    // prev_lsn to Lsn(0) if we cannot provide the correct value.
+    let (backup_prev, backup_lsn) = if let Some(req_lsn) = req_lsn {
+        // Backup was requested at a particular LSN. The caller should've
+        // already checked that it's a valid LSN.
+
+        // If the requested point is the end of the timeline, we can
+        // provide prev_lsn. (get_last_record_rlsn() might return it as
+        // zero, though, if no WAL has been generated on this timeline
+        // yet.)
+        let end_of_timeline = timeline.get_last_record_rlsn();
+        if req_lsn == end_of_timeline.last {
+            (end_of_timeline.prev, req_lsn)
+        } else {
+            (Lsn(0), req_lsn)
+        }
+    } else {
+        // Backup was requested at end of the timeline.
+        let end_of_timeline = timeline.get_last_record_rlsn();
+        (end_of_timeline.prev, end_of_timeline.last)
+    };
+
+    // Consolidate the derived and the provided prev_lsn values
+    let prev_lsn = if let Some(provided_prev_lsn) = prev_lsn {
+        if backup_prev != Lsn(0) {
+            ensure!(backup_prev == provided_prev_lsn);
+        }
+        provided_prev_lsn
+    } else {
+        backup_prev
+    };
+
+    info!(
+        "taking basebackup lsn={}, prev_lsn={} (full_backup={})",
+        backup_lsn, prev_lsn, full_backup
+    );
+
+    let basebackup = Basebackup {
+        ar: Builder::new_non_terminated(write),
+        timeline,
+        lsn: backup_lsn,
+        prev_record_lsn: prev_lsn,
+        full_backup,
+    };
+    basebackup
+        .send_tarball()
+        .instrument(info_span!("send_tarball", backup_lsn=%backup_lsn))
+        .await
+}
+
 /// This is short-living object only for the time of tarball creation,
 /// created mostly to avoid passing a lot of parameters between various functions
 /// used for constructing tarball.
-pub struct Basebackup<'a, W>
+struct Basebackup<'a, W>
 where
     W: AsyncWrite + Send + Sync + Unpin,
 {
     ar: Builder<&'a mut W>,
-    timeline: &'a Arc<Timeline>,
-    pub lsn: Lsn,
+    timeline: &'a Timeline,
+    lsn: Lsn,
     prev_record_lsn: Lsn,
     full_backup: bool,
 }
 
-// Create basebackup with non-rel data in it.
-// Only include relational data if 'full_backup' is true.
-//
-// Currently we use empty lsn in two cases:
-//  * During the basebackup right after timeline creation
-//  * When working without safekeepers. In this situation it is important to match the lsn
-//    we are taking basebackup on with the lsn that is used in pageserver's walreceiver
-//    to start the replication.
 impl<'a, W> Basebackup<'a, W>
 where
     W: AsyncWrite + Send + Sync + Unpin,
 {
-    pub fn new(
-        write: &'a mut W,
-        timeline: &'a Arc<Timeline>,
-        req_lsn: Option<Lsn>,
-        prev_lsn: Option<Lsn>,
-        full_backup: bool,
-    ) -> Result<Basebackup<'a, W>> {
-        // Compute postgres doesn't have any previous WAL files, but the first
-        // record that it's going to write needs to include the LSN of the
-        // previous record (xl_prev). We include prev_record_lsn in the
-        // "zenith.signal" file, so that postgres can read it during startup.
-        //
-        // We don't keep full history of record boundaries in the page server,
-        // however, only the predecessor of the latest record on each
-        // timeline. So we can only provide prev_record_lsn when you take a
-        // base backup at the end of the timeline, i.e. at last_record_lsn.
-        // Even at the end of the timeline, we sometimes don't have a valid
-        // prev_lsn value; that happens if the timeline was just branched from
-        // an old LSN and it doesn't have any WAL of its own yet. We will set
-        // prev_lsn to Lsn(0) if we cannot provide the correct value.
-        let (backup_prev, backup_lsn) = if let Some(req_lsn) = req_lsn {
-            // Backup was requested at a particular LSN. The caller should've
-            // already checked that it's a valid LSN.
-
-            // If the requested point is the end of the timeline, we can
-            // provide prev_lsn. (get_last_record_rlsn() might return it as
-            // zero, though, if no WAL has been generated on this timeline
-            // yet.)
-            let end_of_timeline = timeline.get_last_record_rlsn();
-            if req_lsn == end_of_timeline.last {
-                (end_of_timeline.prev, req_lsn)
-            } else {
-                (Lsn(0), req_lsn)
-            }
-        } else {
-            // Backup was requested at end of the timeline.
-            let end_of_timeline = timeline.get_last_record_rlsn();
-            (end_of_timeline.prev, end_of_timeline.last)
-        };
-
-        // Consolidate the derived and the provided prev_lsn values
-        let prev_lsn = if let Some(provided_prev_lsn) = prev_lsn {
-            if backup_prev != Lsn(0) {
-                ensure!(backup_prev == provided_prev_lsn)
-            }
-            provided_prev_lsn
-        } else {
-            backup_prev
-        };
-
-        info!(
-            "taking basebackup lsn={}, prev_lsn={} (full_backup={})",
-            backup_lsn, prev_lsn, full_backup
-        );
-
-        Ok(Basebackup {
-            ar: Builder::new_non_terminated(write),
-            timeline,
-            lsn: backup_lsn,
-            prev_record_lsn: prev_lsn,
-            full_backup,
-        })
-    }
-
-    pub async fn send_tarball(mut self) -> anyhow::Result<()> {
+    async fn send_tarball(mut self) -> anyhow::Result<()> {
         // TODO include checksum
 
         // Create pgdata subdirs structure
         for dir in PGDATA_SUBDIRS.iter() {
             let header = new_tar_header_dir(dir)?;
-            self.ar.append(&header, &mut io::empty()).await?;
+            self.ar
+                .append(&header, &mut io::empty())
+                .await
+                .context("could not add directory to basebackup tarball")?;
         }
 
-        // Send empty config files.
+        // Send config files.
         for filepath in PGDATA_SPECIAL_FILES.iter() {
             if *filepath == "pg_hba.conf" {
                 let data = PG_HBA.as_bytes();
                 let header = new_tar_header(filepath, data.len() as u64)?;
-                self.ar.append(&header, data).await?;
+                self.ar
+                    .append(&header, data)
+                    .await
+                    .context("could not add config file to basebackup tarball")?;
             } else {
                 let header = new_tar_header(filepath, 0)?;
-                self.ar.append(&header, &mut io::empty()).await?;
+                self.ar
+                    .append(&header, &mut io::empty())
+                    .await
+                    .context("could not add config file to basebackup tarball")?;
             }
         }
 
diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index 4087a8f90c..b266a07337 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -654,10 +654,8 @@ impl PageServerHandler {
         /* Send a tarball of the latest layer on the timeline */
         {
             let mut writer = pgb.copyout_writer();
-            let basebackup =
-                basebackup::Basebackup::new(&mut writer, &timeline, lsn, prev_lsn, full_backup)?;
-            tracing::Span::current().record("lsn", basebackup.lsn.to_string().as_str());
-            basebackup.send_tarball().await?;
+            basebackup::send_basebackup_tarball(&mut writer, &timeline, lsn, prev_lsn, full_backup)
+                .await?;
         }
 
         pgb.write_message(&BeMessage::CopyDone)?;

From 6a9d1030a687d6c4ebd415f702441f09d679fab4 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Tue, 3 Jan 2023 17:43:54 +0100
Subject: [PATCH 21/42] use RemoteTimelineClient for downloading index part
 during tenant_attach

Before this change, we would not call `.measure_remote_op` for index part
downloads.

And more generally, it's good to pass not just uploads but also
downloads through RemoteTimelineClient, e.g., if we ever want to
implement some timeline-scoped policies there.

Found this while working on https://github.com/neondatabase/neon/pull/3250
where I add a metric to measure the degree of concurrent downloads.
Layer download was missing in a test that I added there.
---
 pageserver/src/tenant.rs                      | 91 +++++++++++++------
 .../src/tenant/remote_timeline_client.rs      |  6 +-
 .../tenant/remote_timeline_client/download.rs | 43 +++------
 3 files changed, 78 insertions(+), 62 deletions(-)

diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index dcaa8ea268..72404e98cd 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -13,11 +13,13 @@
 
 use anyhow::{bail, Context};
 use bytes::Bytes;
+use futures::FutureExt;
 use futures::Stream;
 use pageserver_api::models::TimelineState;
 use remote_storage::DownloadError;
 use remote_storage::GenericRemoteStorage;
 use tokio::sync::watch;
+use tokio::task::JoinSet;
 use tracing::*;
 use utils::crashsafe::path_with_suffix_extension;
 
@@ -639,26 +641,62 @@ impl Tenant {
             .as_ref()
             .ok_or_else(|| anyhow::anyhow!("cannot attach without remote storage"))?;
 
-        let remote_timelines = remote_timeline_client::list_remote_timelines(
+        let remote_timeline_ids = remote_timeline_client::list_remote_timelines(
             remote_storage,
             self.conf,
             self.tenant_id,
         )
         .await?;
 
-        info!("found {} timelines", remote_timelines.len());
+        info!("found {} timelines", remote_timeline_ids.len());
 
-        let mut timeline_ancestors: HashMap<TimelineId, TimelineMetadata> = HashMap::new();
-        let mut index_parts: HashMap<TimelineId, IndexPart> = HashMap::new();
-        for (timeline_id, index_part) in remote_timelines {
-            let remote_metadata = index_part.parse_metadata().with_context(|| {
-                format!(
-                    "Failed to parse metadata file from remote storage for tenant {} timeline {}",
-                    self.tenant_id, timeline_id
-                )
-            })?;
+        // Download & parse index parts
+        let mut part_downloads = JoinSet::new();
+        for timeline_id in remote_timeline_ids {
+            let client = RemoteTimelineClient::new(
+                remote_storage.clone(),
+                self.conf,
+                self.tenant_id,
+                timeline_id,
+            );
+            part_downloads.spawn(
+                async move {
+                    debug!("starting index part download");
+
+                    let index_part = client
+                        .download_index_file()
+                        .await
+                        .context("download index file")?;
+
+                    let remote_metadata = index_part.parse_metadata().context("parse metadata")?;
+
+                    debug!("finished index part download");
+
+                    Result::<_, anyhow::Error>::Ok((
+                        timeline_id,
+                        client,
+                        index_part,
+                        remote_metadata,
+                    ))
+                }
+                .map(move |res| {
+                    res.with_context(|| format!("download index part for timeline {timeline_id}"))
+                })
+                .instrument(info_span!("download_index_part", timeline=%timeline_id)),
+            );
+        }
+        // Wait for all the download tasks to complete & collect results.
+        let mut remote_clients = HashMap::new();
+        let mut index_parts = HashMap::new();
+        let mut timeline_ancestors = HashMap::new();
+        while let Some(result) = part_downloads.join_next().await {
+            // NB: we already added timeline_id as context to the error
+            let result: Result<_, anyhow::Error> = result.context("joinset task join")?;
+            let (timeline_id, client, index_part, remote_metadata) = result?;
+            debug!("successfully downloaded index part for timeline {timeline_id}");
             timeline_ancestors.insert(timeline_id, remote_metadata);
             index_parts.insert(timeline_id, index_part);
+            remote_clients.insert(timeline_id, client);
         }
 
         // For every timeline, download the metadata file, scan the local directory,
@@ -671,7 +709,7 @@ impl Tenant {
                 timeline_id,
                 index_parts.remove(&timeline_id).unwrap(),
                 remote_metadata,
-                remote_storage.clone(),
+                remote_clients.remove(&timeline_id).unwrap(),
             )
             .await
             .with_context(|| {
@@ -714,22 +752,19 @@ impl Tenant {
         Ok(size)
     }
 
-    #[instrument(skip(self, index_part, remote_metadata, remote_storage), fields(timeline_id=%timeline_id))]
+    #[instrument(skip_all, fields(timeline_id=%timeline_id))]
     async fn load_remote_timeline(
         &self,
         timeline_id: TimelineId,
         index_part: IndexPart,
         remote_metadata: TimelineMetadata,
-        remote_storage: GenericRemoteStorage,
+        remote_client: RemoteTimelineClient,
     ) -> anyhow::Result<()> {
         info!("downloading index file for timeline {}", timeline_id);
         tokio::fs::create_dir_all(self.conf.timeline_path(&timeline_id, &self.tenant_id))
             .await
             .context("Failed to create new timeline directory")?;
 
-        let remote_client =
-            RemoteTimelineClient::new(remote_storage, self.conf, self.tenant_id, timeline_id)?;
-
         let ancestor = if let Some(ancestor_id) = remote_metadata.ancestor_timeline() {
             let timelines = self.timelines.lock().unwrap();
             Some(Arc::clone(timelines.get(&ancestor_id).ok_or_else(
@@ -986,18 +1021,14 @@ impl Tenant {
             None
         };
 
-        let remote_client = self
-            .remote_storage
-            .as_ref()
-            .map(|remote_storage| {
-                RemoteTimelineClient::new(
-                    remote_storage.clone(),
-                    self.conf,
-                    self.tenant_id,
-                    timeline_id,
-                )
-            })
-            .transpose()?;
+        let remote_client = self.remote_storage.as_ref().map(|remote_storage| {
+            RemoteTimelineClient::new(
+                remote_storage.clone(),
+                self.conf,
+                self.tenant_id,
+                timeline_id,
+            )
+        });
 
         let remote_startup_data = match &remote_client {
             Some(remote_client) => match remote_client.download_index_file().await {
@@ -2191,7 +2222,7 @@ impl Tenant {
                 self.conf,
                 tenant_id,
                 new_timeline_id,
-            )?;
+            );
             remote_client.init_upload_queue_for_empty_remote(&new_metadata)?;
             Some(remote_client)
         } else {
diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index 45988ff47a..a9f19a4e1d 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -298,8 +298,8 @@ impl RemoteTimelineClient {
         conf: &'static PageServerConf,
         tenant_id: TenantId,
         timeline_id: TimelineId,
-    ) -> anyhow::Result<RemoteTimelineClient> {
-        Ok(RemoteTimelineClient {
+    ) -> RemoteTimelineClient {
+        RemoteTimelineClient {
             conf,
             runtime: &BACKGROUND_RUNTIME,
             tenant_id,
@@ -307,7 +307,7 @@ impl RemoteTimelineClient {
             storage_impl: remote_storage,
             upload_queue: Mutex::new(UploadQueue::Uninitialized),
             metrics: Arc::new(RemoteTimelineClientMetrics::new(&tenant_id, &timeline_id)),
-        })
+        }
     }
 
     /// Initialize the upload queue for a remote storage that already received
diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs
index 422728d1f3..2e79698087 100644
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -8,10 +8,9 @@ use std::future::Future;
 use std::path::Path;
 
 use anyhow::{anyhow, Context};
-use futures::stream::{FuturesUnordered, StreamExt};
 use tokio::fs;
 use tokio::io::AsyncWriteExt;
-use tracing::{debug, error, info, info_span, warn, Instrument};
+use tracing::{error, info, warn};
 
 use crate::config::PageServerConf;
 use crate::tenant::storage_layer::LayerFileName;
@@ -175,7 +174,7 @@ pub async fn list_remote_timelines<'a>(
     storage: &'a GenericRemoteStorage,
     conf: &'static PageServerConf,
     tenant_id: TenantId,
-) -> anyhow::Result<Vec<(TimelineId, IndexPart)>> {
+) -> anyhow::Result<HashSet<TimelineId>> {
     let tenant_path = conf.timelines_path(&tenant_id);
     let tenant_storage_path = conf.remote_path(&tenant_path)?;
 
@@ -194,7 +193,6 @@ pub async fn list_remote_timelines<'a>(
     }
 
     let mut timeline_ids = HashSet::new();
-    let mut part_downloads = FuturesUnordered::new();
 
     for timeline_remote_storage_key in timelines {
         let object_name = timeline_remote_storage_key.object_name().ok_or_else(|| {
@@ -205,35 +203,22 @@ pub async fn list_remote_timelines<'a>(
             format!("failed to parse object name into timeline id '{object_name}'")
         })?;
 
-        // list_prefixes returns all files with the prefix. If we haven't seen this timeline ID
-        // yet, launch a download task for it.
-        if !timeline_ids.contains(&timeline_id) {
-            timeline_ids.insert(timeline_id);
-            let storage_clone = storage.clone();
-            part_downloads.push(async move {
-                (
-                    timeline_id,
-                    download_index_part(conf, &storage_clone, tenant_id, timeline_id)
-                        .instrument(info_span!("download_index_part", timeline=%timeline_id))
-                        .await,
-                )
-            });
-        }
+        // list_prefixes is assumed to return unique names. Ensure this here.
+        // NB: it's safer to bail out than warn-log this because the pageserver
+        //     needs to absolutely know about _all_ timelines that exist, so that
+        //     GC knows all the branchpoints. If we skipped over a timeline instead,
+        //     GC could delete a layer that's still needed by that timeline.
+        anyhow::ensure!(
+            !timeline_ids.contains(&timeline_id),
+            "list_prefixes contains duplicate timeline id {timeline_id}"
+        );
+        timeline_ids.insert(timeline_id);
     }
 
-    // Wait for all the download tasks to complete.
-    let mut timeline_parts = Vec::new();
-    while let Some((timeline_id, part_upload_result)) = part_downloads.next().await {
-        let index_part = part_upload_result
-            .with_context(|| format!("Failed to fetch index part for timeline {timeline_id}"))?;
-
-        debug!("Successfully fetched index part for timeline {timeline_id}");
-        timeline_parts.push((timeline_id, index_part));
-    }
-    Ok(timeline_parts)
+    Ok(timeline_ids)
 }
 
-pub async fn download_index_part(
+pub(super) async fn download_index_part(
     conf: &'static PageServerConf,
     storage: &GenericRemoteStorage,
     tenant_id: TenantId,

From d7f1e301122f7c9f611165baa53b553318c52fcc Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Thu, 5 Jan 2023 11:50:17 +0100
Subject: [PATCH 22/42] remote_timeline_client: more metrics & metrics-related
 cleanups

- Clean up redundant metric removal in TimelineMetrics::drop.
RemoteTimelineClientMetrics is responsible for cleaning up
REMOTE_OPERATION_TIME and REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS.

- Rename `pageserver_remote_upload_queue_unfinished_tasks` to
`pageserver_remote_timeline_client_calls_unfinished`. The new name
reflects that the metric is with respect to the entire call to remote
timeline client. This includes wait time in the upload queue and hence
it's a longer span than what `pageserver_remote_OPERATION_seconds`
measures.

- Add the `pageserver_remote_timeline_client_calls_started` histogram.
See the metric description for why we need it.

- Add helper functions `call_begin` etc to `RemoteTimelineClientMetrics`
to centralize the logic for updating the metrics above (they relate to
each other, see comments in code).

- Use these constructs to track ongoing downloads in
`pageserver_remote_timeline_client_calls_unfinished`

refs https://github.com/neondatabase/neon/issues/2029
fixes https://github.com/neondatabase/neon/issues/3249
closes https://github.com/neondatabase/neon/pull/3250
---
 pageserver/src/metrics.rs                     | 147 ++++++++++---
 .../src/tenant/remote_timeline_client.rs      |  83 +++++---
 test_runner/fixtures/metrics.py               |   7 +-
 test_runner/fixtures/neon_fixtures.py         |  28 +++
 test_runner/regress/test_remote_storage.py    | 194 ++++++++++++++++--
 5 files changed, 380 insertions(+), 79 deletions(-)

diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index 205ee0ffad..b61e64048b 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -209,15 +209,34 @@ pub static NUM_ONDISK_LAYERS: Lazy<IntGauge> = Lazy::new(|| {
 
 // remote storage metrics
 
-static REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS: Lazy<IntGaugeVec> = Lazy::new(|| {
+/// NB: increment _after_ recording the current value into [`REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST`].
+static REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE: Lazy<IntGaugeVec> = Lazy::new(|| {
     register_int_gauge_vec!(
-        "pageserver_remote_upload_queue_unfinished_tasks",
-        "Number of tasks in the upload queue that are not finished yet.",
+        "pageserver_remote_timeline_client_calls_unfinished",
+        "Number of ongoing calls to remote timeline client. \
+         Used to populate pageserver_remote_timeline_client_calls_started. \
+         This metric is not useful for sampling from Prometheus, but useful in tests.",
         &["tenant_id", "timeline_id", "file_kind", "op_kind"],
     )
     .expect("failed to define a metric")
 });
 
+static REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST: Lazy<HistogramVec> = Lazy::new(|| {
+    register_histogram_vec!(
+        "pageserver_remote_timeline_client_calls_started",
+        "When calling a remote timeline client method, we record the current value \
+         of the calls_unfinished gauge in this histogram. Plot the histogram \
+         over time in a heatmap to visualize how many operations were ongoing \
+         at a given instant. It gives you a better idea of the queue depth \
+         than plotting the gauge directly, since operations may complete faster \
+         than the sampling interval.",
+        &["tenant_id", "timeline_id", "file_kind", "op_kind"],
+        // The calls_unfinished gauge is an integer gauge, hence we have integer buckets.
+        vec![0.0, 1.0, 2.0, 4.0, 6.0, 8.0, 10.0, 15.0, 20.0, 40.0, 60.0, 80.0, 100.0, 500.0],
+    )
+    .expect("failed to define a metric")
+});
+
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 pub enum RemoteOpKind {
     Upload,
@@ -248,15 +267,12 @@ impl RemoteOpFileKind {
     }
 }
 
-pub static REMOTE_OPERATION_KINDS: &[&str] = &["upload", "download", "delete"];
-pub static REMOTE_OPERATION_FILE_KINDS: &[&str] = &["layer", "index"];
-pub static REMOTE_OPERATION_STATUSES: &[&str] = &["success", "failure"];
-
 pub static REMOTE_OPERATION_TIME: Lazy<HistogramVec> = Lazy::new(|| {
     register_histogram_vec!(
         "pageserver_remote_operation_seconds",
         "Time spent on remote storage operations. \
-        Grouped by tenant, timeline, operation_kind and status",
+        Grouped by tenant, timeline, operation_kind and status. \
+        Does not account for time spent waiting in remote timeline client's queues.",
         &["tenant_id", "timeline_id", "file_kind", "op_kind", "status"]
     )
     .expect("failed to define a metric")
@@ -475,21 +491,6 @@ impl Drop for TimelineMetrics {
         for op in SMGR_QUERY_TIME_OPERATIONS {
             let _ = SMGR_QUERY_TIME.remove_label_values(&[op, tenant_id, timeline_id]);
         }
-
-        let _ = REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS.remove_label_values(&[tenant_id, timeline_id]);
-        for file_kind in REMOTE_OPERATION_FILE_KINDS {
-            for op in REMOTE_OPERATION_KINDS {
-                for status in REMOTE_OPERATION_STATUSES {
-                    let _ = REMOTE_OPERATION_TIME.remove_label_values(&[
-                        tenant_id,
-                        timeline_id,
-                        file_kind,
-                        op,
-                        status,
-                    ]);
-                }
-            }
-        }
     }
 }
 
@@ -510,7 +511,8 @@ pub struct RemoteTimelineClientMetrics {
     timeline_id: String,
     remote_physical_size_gauge: Mutex<Option<UIntGauge>>,
     remote_operation_time: Mutex<HashMap<(&'static str, &'static str, &'static str), Histogram>>,
-    unfinished_tasks: Mutex<HashMap<(&'static str, &'static str), IntGauge>>,
+    calls_unfinished_gauge: Mutex<HashMap<(&'static str, &'static str), IntGauge>>,
+    calls_started_hist: Mutex<HashMap<(&'static str, &'static str), Histogram>>,
 }
 
 impl RemoteTimelineClientMetrics {
@@ -519,7 +521,8 @@ impl RemoteTimelineClientMetrics {
             tenant_id: tenant_id.to_string(),
             timeline_id: timeline_id.to_string(),
             remote_operation_time: Mutex::new(HashMap::default()),
-            unfinished_tasks: Mutex::new(HashMap::default()),
+            calls_unfinished_gauge: Mutex::new(HashMap::default()),
+            calls_started_hist: Mutex::new(HashMap::default()),
             remote_physical_size_gauge: Mutex::new(None),
         }
     }
@@ -558,16 +561,37 @@ impl RemoteTimelineClientMetrics {
         });
         metric.clone()
     }
-    pub fn unfinished_tasks(
+    fn calls_unfinished_gauge(
         &self,
         file_kind: &RemoteOpFileKind,
         op_kind: &RemoteOpKind,
     ) -> IntGauge {
         // XXX would be nice to have an upgradable RwLock
-        let mut guard = self.unfinished_tasks.lock().unwrap();
+        let mut guard = self.calls_unfinished_gauge.lock().unwrap();
         let key = (file_kind.as_str(), op_kind.as_str());
         let metric = guard.entry(key).or_insert_with(move || {
-            REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS
+            REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE
+                .get_metric_with_label_values(&[
+                    &self.tenant_id.to_string(),
+                    &self.timeline_id.to_string(),
+                    key.0,
+                    key.1,
+                ])
+                .unwrap()
+        });
+        metric.clone()
+    }
+
+    fn calls_started_hist(
+        &self,
+        file_kind: &RemoteOpFileKind,
+        op_kind: &RemoteOpKind,
+    ) -> Histogram {
+        // XXX would be nice to have an upgradable RwLock
+        let mut guard = self.calls_started_hist.lock().unwrap();
+        let key = (file_kind.as_str(), op_kind.as_str());
+        let metric = guard.entry(key).or_insert_with(move || {
+            REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST
                 .get_metric_with_label_values(&[
                     &self.tenant_id.to_string(),
                     &self.timeline_id.to_string(),
@@ -580,6 +604,58 @@ impl RemoteTimelineClientMetrics {
     }
 }
 
+/// See [`RemoteTimelineClientMetrics::call_begin`].
+#[must_use]
+pub(crate) struct RemoteTimelineClientCallMetricGuard(Option<IntGauge>);
+
+impl RemoteTimelineClientCallMetricGuard {
+    /// Consume this guard object without decrementing the metric.
+    /// The caller vouches to do this manually, so that the prior increment of the gauge will cancel out.
+    pub fn will_decrement_manually(mut self) {
+        self.0 = None; // prevent drop() from decrementing
+    }
+}
+
+impl Drop for RemoteTimelineClientCallMetricGuard {
+    fn drop(&mut self) {
+        if let RemoteTimelineClientCallMetricGuard(Some(guard)) = self {
+            guard.dec();
+        }
+    }
+}
+
+impl RemoteTimelineClientMetrics {
+    /// Record the start of a call to the remote timeline client instance in the call-tracking metrics.
+    ///
+    /// Drop the returned guard object once the operation is finished to decrement the unfinished-calls gauge.
+    /// Or, use [`RemoteTimelineClientCallMetricGuard::will_decrement_manually`] and [`Self::call_end`] if that
+    /// is more suitable.
+    /// Never do both.
+    pub(crate) fn call_begin(
+        &self,
+        file_kind: &RemoteOpFileKind,
+        op_kind: &RemoteOpKind,
+    ) -> RemoteTimelineClientCallMetricGuard {
+        let unfinished_metric = self.calls_unfinished_gauge(file_kind, op_kind);
+        self.calls_started_hist(file_kind, op_kind)
+            .observe(unfinished_metric.get() as f64);
+        unfinished_metric.inc(); // after observe(): the histogram sample excludes this call
+        RemoteTimelineClientCallMetricGuard(Some(unfinished_metric))
+    }
+
+    /// Manually decrement the unfinished-calls gauge instead of using the guard object.
+    /// Using the guard object is generally preferable.
+    /// See [`Self::call_begin`] for more context.
+    pub(crate) fn call_end(&self, file_kind: &RemoteOpFileKind, op_kind: &RemoteOpKind) {
+        let unfinished_metric = self.calls_unfinished_gauge(file_kind, op_kind);
+        debug_assert!(
+            unfinished_metric.get() > 0,
+            "begin and end should cancel out"
+        );
+        unfinished_metric.dec();
+    }
+}
+
 impl Drop for RemoteTimelineClientMetrics {
     fn drop(&mut self) {
         let RemoteTimelineClientMetrics {
@@ -587,13 +663,22 @@ impl Drop for RemoteTimelineClientMetrics {
             timeline_id,
             remote_physical_size_gauge,
             remote_operation_time,
-            unfinished_tasks,
+            calls_unfinished_gauge,
+            calls_started_hist,
         } = self;
         for ((a, b, c), _) in remote_operation_time.get_mut().unwrap().drain() {
             let _ = REMOTE_OPERATION_TIME.remove_label_values(&[tenant_id, timeline_id, a, b, c]);
         }
-        for ((a, b), _) in unfinished_tasks.get_mut().unwrap().drain() {
-            let _ = REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS.remove_label_values(&[
+        for ((a, b), _) in calls_unfinished_gauge.get_mut().unwrap().drain() {
+            let _ = REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE.remove_label_values(&[
+                tenant_id,
+                timeline_id,
+                a,
+                b,
+            ]);
+        }
+        for ((a, b), _) in calls_started_hist.get_mut().unwrap().drain() {
+            let _ = REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST.remove_label_values(&[
                 tenant_id,
                 timeline_id,
                 a,
diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index a9f19a4e1d..1db69d8b73 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -367,6 +367,10 @@ impl RemoteTimelineClient {
 
     /// Download index file
     pub async fn download_index_file(&self) -> Result<IndexPart, DownloadError> {
+        let _unfinished_gauge_guard = self
+            .metrics
+            .call_begin(&RemoteOpFileKind::Index, &RemoteOpKind::Download);
+
         download::download_index_part(
             self.conf,
             &self.storage_impl,
@@ -393,22 +397,27 @@ impl RemoteTimelineClient {
         layer_file_name: &LayerFileName,
         layer_metadata: &LayerFileMetadata,
     ) -> anyhow::Result<u64> {
-        let downloaded_size = download::download_layer_file(
-            self.conf,
-            &self.storage_impl,
-            self.tenant_id,
-            self.timeline_id,
-            layer_file_name,
-            layer_metadata,
-        )
-        .measure_remote_op(
-            self.tenant_id,
-            self.timeline_id,
-            RemoteOpFileKind::Layer,
-            RemoteOpKind::Download,
-            Arc::clone(&self.metrics),
-        )
-        .await?;
+        let downloaded_size = {
+            let _unfinished_gauge_guard = self
+                .metrics
+                .call_begin(&RemoteOpFileKind::Layer, &RemoteOpKind::Download);
+            download::download_layer_file(
+                self.conf,
+                &self.storage_impl,
+                self.tenant_id,
+                self.timeline_id,
+                layer_file_name,
+                layer_metadata,
+            )
+            .measure_remote_op(
+                self.tenant_id,
+                self.timeline_id,
+                RemoteOpFileKind::Layer,
+                RemoteOpKind::Download,
+                Arc::clone(&self.metrics),
+            )
+            .await?
+        };
 
         // Update the metadata for given layer file. The remote index file
         // might be missing some information for the file; this allows us
@@ -517,7 +526,7 @@ impl RemoteTimelineClient {
             metadata_bytes,
         );
         let op = UploadOp::UploadMetadata(index_part, disk_consistent_lsn);
-        self.update_upload_queue_unfinished_metric(1, &op);
+        self.calls_unfinished_metric_begin(&op);
         upload_queue.queued_operations.push_back(op);
         upload_queue.latest_files_changes_since_metadata_upload_scheduled = 0;
 
@@ -549,7 +558,7 @@ impl RemoteTimelineClient {
         upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
 
         let op = UploadOp::UploadLayer(layer_file_name.clone(), layer_metadata.clone());
-        self.update_upload_queue_unfinished_metric(1, &op);
+        self.calls_unfinished_metric_begin(&op);
         upload_queue.queued_operations.push_back(op);
 
         info!(
@@ -601,7 +610,7 @@ impl RemoteTimelineClient {
             // schedule the actual deletions
             for name in names {
                 let op = UploadOp::Delete(RemoteOpFileKind::Layer, name.clone());
-                self.update_upload_queue_unfinished_metric(1, &op);
+                self.calls_unfinished_metric_begin(&op);
                 upload_queue.queued_operations.push_back(op);
                 info!("scheduled layer file deletion {}", name.file_name());
             }
@@ -753,7 +762,7 @@ impl RemoteTimelineClient {
             // upload finishes or times out soon enough.
             if task_mgr::is_shutdown_requested() {
                 info!("upload task cancelled by shutdown request");
-                self.update_upload_queue_unfinished_metric(-1, &task.op);
+                self.calls_unfinished_metric_end(&task.op);
                 self.stop();
                 return;
             }
@@ -901,22 +910,40 @@ impl RemoteTimelineClient {
             // Launch any queued tasks that were unblocked by this one.
             self.launch_queued_tasks(upload_queue);
         }
-        self.update_upload_queue_unfinished_metric(-1, &task.op);
+        self.calls_unfinished_metric_end(&task.op);
     }
 
-    fn update_upload_queue_unfinished_metric(&self, delta: i64, op: &UploadOp) {
-        let (file_kind, op_kind) = match op {
+    fn calls_unfinished_metric_impl(
+        &self,
+        op: &UploadOp,
+    ) -> Option<(RemoteOpFileKind, RemoteOpKind)> {
+        let res = match op {
             UploadOp::UploadLayer(_, _) => (RemoteOpFileKind::Layer, RemoteOpKind::Upload),
             UploadOp::UploadMetadata(_, _) => (RemoteOpFileKind::Index, RemoteOpKind::Upload),
             UploadOp::Delete(file_kind, _) => (*file_kind, RemoteOpKind::Delete),
             UploadOp::Barrier(_) => {
                 // we do not account these
-                return;
+                return None;
             }
         };
-        self.metrics
-            .unfinished_tasks(&file_kind, &op_kind)
-            .add(delta)
+        Some(res)
+    }
+
+    fn calls_unfinished_metric_begin(&self, op: &UploadOp) {
+        let (file_kind, op_kind) = match self.calls_unfinished_metric_impl(op) {
+            Some(x) => x,
+            None => return,
+        };
+        let guard = self.metrics.call_begin(&file_kind, &op_kind);
+        guard.will_decrement_manually(); // decremented in calls_unfinished_metric_end()
+    }
+
+    fn calls_unfinished_metric_end(&self, op: &UploadOp) {
+        let (file_kind, op_kind) = match self.calls_unfinished_metric_impl(op) {
+            Some(x) => x,
+            None => return,
+        };
+        self.metrics.call_end(&file_kind, &op_kind);
     }
 
     fn stop(&self) {
@@ -967,7 +994,7 @@ impl RemoteTimelineClient {
 
                 // Tear down queued ops
                 for op in qi.queued_operations.into_iter() {
-                    self.update_upload_queue_unfinished_metric(-1, &op);
+                    self.calls_unfinished_metric_end(&op);
                     // Dropping UploadOp::Barrier() here will make wait_completion() return with an Err()
                     // which is exactly what we want to happen.
                     drop(op);
diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py
index 9236137d19..8b78e06c22 100644
--- a/test_runner/fixtures/metrics.py
+++ b/test_runner/fixtures/metrics.py
@@ -40,10 +40,9 @@ def parse_metrics(text: str, name: str = "") -> Metrics:
 
 
 PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS: Tuple[str, ...] = (
-    "pageserver_remote_upload_queue_unfinished_tasks",
-    "pageserver_remote_operation_seconds_bucket",
-    "pageserver_remote_operation_seconds_count",
-    "pageserver_remote_operation_seconds_sum",
+    "pageserver_remote_timeline_client_calls_unfinished",
+    *[f"pageserver_remote_timeline_client_calls_started_{x}" for x in ["bucket", "count", "sum"]],
+    *[f"pageserver_remote_operation_seconds_{x}" for x in ["bucket", "count", "sum"]],
     "pageserver_remote_physical_size",
 )
 
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index ba2cce3022..481f46ff55 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -34,6 +34,7 @@ from _pytest.config import Config
 from _pytest.config.argparsing import Parser
 from _pytest.fixtures import FixtureRequest
 from fixtures.log_helper import log
+from fixtures.metrics import parse_metrics
 from fixtures.types import Lsn, TenantId, TimelineId
 from fixtures.utils import (
     ATTACHMENT_NAME_REGEX,
@@ -1409,6 +1410,33 @@ class PageserverHttpClient(requests.Session):
         ]
         return sample.value
 
+    def get_remote_timeline_client_metric(
+        self,
+        metric_name: str,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        file_kind: str,
+        op_kind: str,
+    ) -> Optional[float]:
+        """Return the sample of `metric_name` for the given labels, or None if absent."""
+        metrics = parse_metrics(self.get_metrics(), "pageserver")
+        matches = metrics.query_all(
+            name=metric_name,
+            filter={
+                "tenant_id": str(tenant_id),
+                "timeline_id": str(timeline_id),
+                "file_kind": str(file_kind),
+                "op_kind": str(op_kind),
+            },
+        )
+        # Checked first so every path below binds a return value.
+        assert len(matches) < 2, "above filter should uniquely identify metric"
+        if len(matches) == 0:
+            return None
+        value = matches[0].value
+        assert value is not None
+        return value
+
     def get_metric_value(self, name: str) -> Optional[str]:
         metrics = self.get_metrics()
         relevant = [line for line in metrics.splitlines() if line.startswith(name)]
diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py
index 32c25b2e8c..82bf741a8f 100644
--- a/test_runner/regress/test_remote_storage.py
+++ b/test_runner/regress/test_remote_storage.py
@@ -2,11 +2,11 @@
 # env NEON_PAGESERVER_OVERRIDES="remote_storage={local_path='/tmp/neon_zzz/'}" poetry ......
 
 import os
-import re
 import shutil
 import threading
 import time
 from pathlib import Path
+from typing import Dict, List, Tuple
 
 import pytest
 from fixtures.log_helper import log
@@ -271,14 +271,15 @@ def test_remote_storage_upload_queue_retries(
         wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id)
 
     def get_queued_count(file_kind, op_kind):
-        metrics = client.get_metrics()
-        matches = re.search(
-            f'^pageserver_remote_upload_queue_unfinished_tasks{{file_kind="{file_kind}",op_kind="{op_kind}",tenant_id="{tenant_id}",timeline_id="{timeline_id}"}} (\\S+)$',
-            metrics,
-            re.MULTILINE,
+        val = client.get_remote_timeline_client_metric(
+            "pageserver_remote_timeline_client_calls_unfinished",
+            tenant_id,
+            timeline_id,
+            file_kind,
+            op_kind,
         )
-        assert matches
-        return int(matches[1])
+        assert val is not None, "expecting metric to be present"
+        return int(val)
 
     # create some layers & wait for uploads to finish
     overwrite_data_and_wait_for_it_to_arrive_at_pageserver("a")
@@ -368,6 +369,168 @@ def test_remote_storage_upload_queue_retries(
         assert query_scalar(cur, "SELECT COUNT(*) FROM foo WHERE val = 'd'") == 10000
 
 
+@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
+def test_remote_timeline_client_calls_started_metric(
+    neon_env_builder: NeonEnvBuilder,
+    remote_storage_kind: RemoteStorageKind,
+):
+
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=remote_storage_kind,
+        test_name="test_remote_timeline_client_metrics",
+    )
+
+    env = neon_env_builder.init_start()
+
+    # create tenant with config that will deterministically allow
+    # compaction and gc
+    tenant_id, timeline_id = env.neon_cli.create_tenant(
+        conf={
+            # small checkpointing and compaction targets to ensure we generate many upload operations
+            "checkpoint_distance": f"{128 * 1024}",
+            "compaction_threshold": "1",
+            "compaction_target_size": f"{128 * 1024}",
+            # no PITR horizon, we specify the horizon when we request on-demand GC
+            "pitr_interval": "0s",
+            # disable background compaction and GC. We invoke it manually when we want it to happen.
+            "gc_period": "0s",
+            "compaction_period": "0s",
+            # don't create image layers, that causes just noise
+            "image_creation_threshold": "10000",
+        }
+    )
+
+    client = env.pageserver.http_client()
+
+    pg = env.postgres.create_start("main", tenant_id=tenant_id)
+
+    pg.safe_psql("CREATE TABLE foo (id INTEGER PRIMARY KEY, val text)")
+
+    def overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data):
+        # create initial set of layers & upload them with failpoints configured
+        pg.safe_psql_many(
+            [
+                f"""
+               INSERT INTO foo (id, val)
+               SELECT g, '{data}'
+               FROM generate_series(1, 10000) g
+               ON CONFLICT (id) DO UPDATE
+               SET val = EXCLUDED.val
+               """,
+                # to ensure that GC can actually remove some layers
+                "VACUUM foo",
+            ]
+        )
+        wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id)
+
+    def get_queued_count(file_kind, op_kind):
+        val = client.get_remote_timeline_client_metric(
+            "pageserver_remote_timeline_client_calls_unfinished",
+            tenant_id,
+            timeline_id,
+            file_kind,
+            op_kind,
+        )
+        if val is None:
+            return val
+        return int(val)
+
+    def wait_upload_queue_empty():
+        wait_until(2, 1, lambda: get_queued_count(file_kind="layer", op_kind="upload") == 0)
+        wait_until(2, 1, lambda: get_queued_count(file_kind="index", op_kind="upload") == 0)
+        wait_until(2, 1, lambda: get_queued_count(file_kind="layer", op_kind="delete") == 0)
+
+    calls_started: Dict[Tuple[str, str], List[int]] = {
+        ("layer", "upload"): [0],
+        ("index", "upload"): [0],
+        ("layer", "delete"): [0],
+    }
+
+    def fetch_calls_started():
+        for (file_kind, op_kind), observations in calls_started.items():
+            val = client.get_remote_timeline_client_metric(
+                "pageserver_remote_timeline_client_calls_started_count",
+                tenant_id,
+                timeline_id,
+                file_kind,
+                op_kind,
+            )
+            assert val is not None, f"expecting metric to be present: {file_kind} {op_kind}"
+            val = int(val)
+            observations.append(val)
+
+    def ensure_calls_started_grew():
+        for (file_kind, op_kind), observations in calls_started.items():
+            log.info(f"ensure_calls_started_grew: {file_kind} {op_kind}: {observations}")
+            assert all(
+                x < y for x, y in zip(observations, observations[1:])
+            ), f"observations for {file_kind} {op_kind} did not grow monotonically: {observations}"
+
+    def churn(data_pass1, data_pass2):
+        overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data_pass1)
+        client.timeline_checkpoint(tenant_id, timeline_id)
+        client.timeline_compact(tenant_id, timeline_id)
+        overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data_pass2)
+        client.timeline_checkpoint(tenant_id, timeline_id)
+        client.timeline_compact(tenant_id, timeline_id)
+        gc_result = client.timeline_gc(tenant_id, timeline_id, 0)
+        print_gc_result(gc_result)
+        assert gc_result["layers_removed"] > 0
+
+    # create some layers & wait for uploads to finish
+    churn("a", "b")
+
+    wait_upload_queue_empty()
+
+    # ensure that we updated the calls_started metric
+    fetch_calls_started()
+    ensure_calls_started_grew()
+
+    # more churn to cause more operations
+    churn("c", "d")
+
+    # ensure that the calls_started metric continued to be updated
+    fetch_calls_started()
+    ensure_calls_started_grew()
+
+    ### now we exercise the download path
+    calls_started.clear()
+    calls_started.update(
+        {
+            ("index", "download"): [0],
+            ("layer", "download"): [0],
+        }
+    )
+
+    env.pageserver.stop(immediate=True)
+    env.postgres.stop_all()
+
+    dir_to_clear = Path(env.repo_dir) / "tenants"
+    shutil.rmtree(dir_to_clear)
+    os.mkdir(dir_to_clear)
+
+    env.pageserver.start()
+    client = env.pageserver.http_client()
+
+    client.tenant_attach(tenant_id)
+
+    def tenant_active():
+        all_states = client.tenant_list()
+        [tenant] = [t for t in all_states if TenantId(t["id"]) == tenant_id]
+        assert tenant["state"] == "Active"
+
+    wait_until(30, 1, tenant_active)
+
+    log.info("restarting postgres to validate")
+    pg = env.postgres.create_start("main", tenant_id=tenant_id)
+    with pg.cursor() as cur:
+        assert query_scalar(cur, "SELECT COUNT(*) FROM foo WHERE val = 'd'") == 10000
+
+    # ensure that we updated the calls_started download metric
+    fetch_calls_started()
+    ensure_calls_started_grew()
+
+
 # Test that we correctly handle timeline with layers stuck in upload queue
 @pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
 def test_timeline_deletion_with_files_stuck_in_upload_queue(
@@ -401,15 +564,14 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue(
     client = env.pageserver.http_client()
 
     def get_queued_count(file_kind, op_kind):
-        metrics = client.get_metrics()
-        matches = re.search(
-            f'^pageserver_remote_upload_queue_unfinished_tasks{{file_kind="{file_kind}",op_kind="{op_kind}",tenant_id="{tenant_id}",timeline_id="{timeline_id}"}} (\\S+)$',
-            metrics,
-            re.MULTILINE,
+        val = client.get_remote_timeline_client_metric(
+            "pageserver_remote_timeline_client_calls_unfinished",
+            tenant_id,
+            timeline_id,
+            file_kind,
+            op_kind,
         )
-        if matches is None:
-            return None
-        return int(matches[1])
+        return int(val) if val is not None else val
 
     pg = env.postgres.create_start("main", tenant_id=tenant_id)
 

From 8712e1899e89ab0fd91296371bf4c3ad8c2bf8e8 Mon Sep 17 00:00:00 2001
From: Kirill Bulatov <kirill@neon.tech>
Date: Thu, 5 Jan 2023 17:48:27 +0200
Subject: [PATCH 23/42] Move initial timeline creation into pytest (#3270)

For every Python test, we start the storage first, and expect that
later, in the test, when we start a compute, it will work without
specific timeline and tenant creation or their IDs specified.

For that, we have a concept of "default" branch that was created on the
control plane level first, but that's not needed at all, given that it's
only Python tests that need it: let them create the initial timeline
during set-up.

Before, control plane started and stopped pageserver for timeline
creation, now Python harness runs an extra tenant creation request on
test env init.

I had to adjust the metrics test: it turns out it registered the metrics
from the default tenant after an extra pageserver restart.
The new model does not send the metrics before the collection time happens,
and that was 30s before.
---
 control_plane/src/background_process.rs       | 16 ----
 control_plane/src/bin/neon_local.rs           | 23 +-----
 control_plane/src/pageserver.rs               | 76 +------------------
 libs/utils/src/postgres_backend_async.rs      |  5 +-
 test_runner/fixtures/neon_fixtures.py         | 26 ++++---
 test_runner/regress/test_metric_collection.py | 71 ++++++++---------
 test_runner/regress/test_neon_local_cli.py    |  2 +-
 test_runner/regress/test_recovery.py          |  4 +-
 8 files changed, 66 insertions(+), 157 deletions(-)

diff --git a/control_plane/src/background_process.rs b/control_plane/src/background_process.rs
index 8909e27c94..1f3f8f45ea 100644
--- a/control_plane/src/background_process.rs
+++ b/control_plane/src/background_process.rs
@@ -136,22 +136,6 @@ where
     anyhow::bail!("{process_name} did not start in {RETRY_UNTIL_SECS} seconds");
 }
 
-/// Send SIGTERM to child process
-pub fn send_stop_child_process(child: &std::process::Child) -> anyhow::Result<()> {
-    let pid = child.id();
-    match kill(
-        nix::unistd::Pid::from_raw(pid.try_into().unwrap()),
-        Signal::SIGTERM,
-    ) {
-        Ok(()) => Ok(()),
-        Err(Errno::ESRCH) => {
-            println!("child process with pid {pid} does not exist");
-            Ok(())
-        }
-        Err(e) => anyhow::bail!("Failed to send signal to child process with pid {pid}: {e}"),
-    }
-}
-
 /// Stops the process, using the pid file given. Returns Ok also if the process is already not running.
 pub fn stop_process(immediate: bool, process_name: &str, pid_file: &Path) -> anyhow::Result<()> {
     let pid = match pid_file::read(pid_file)
diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs
index 71de741640..e4d0680c9e 100644
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -284,8 +284,6 @@ fn parse_timeline_id(sub_match: &ArgMatches) -> anyhow::Result<Option<TimelineId
 }
 
 fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
-    let initial_timeline_id_arg = parse_timeline_id(init_match)?;
-
     // Create config file
     let toml_file: String = if let Some(config_path) = init_match.get_one::<PathBuf>("config") {
         // load and parse the file
@@ -309,30 +307,16 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
         LocalEnv::parse_config(&toml_file).context("Failed to create neon configuration")?;
     env.init(pg_version)
         .context("Failed to initialize neon repository")?;
-    let initial_tenant_id = env
-        .default_tenant_id
-        .expect("default_tenant_id should be generated by the `env.init()` call above");
 
     // Initialize pageserver, create initial tenant and timeline.
     let pageserver = PageServerNode::from_env(&env);
-    let initial_timeline_id = pageserver
-        .initialize(
-            Some(initial_tenant_id),
-            initial_timeline_id_arg,
-            &pageserver_config_overrides(init_match),
-            pg_version,
-        )
+    pageserver
+        .initialize(&pageserver_config_overrides(init_match))
         .unwrap_or_else(|e| {
             eprintln!("pageserver init failed: {e:?}");
             exit(1);
         });
 
-    env.register_branch_mapping(
-        DEFAULT_BRANCH_NAME.to_owned(),
-        initial_tenant_id,
-        initial_timeline_id,
-    )?;
-
     Ok(env)
 }
 
@@ -928,9 +912,8 @@ fn cli() -> Command {
         .version(GIT_VERSION)
         .subcommand(
             Command::new("init")
-                .about("Initialize a new Neon repository")
+                .about("Initialize a new Neon repository, preparing configs for services to start with")
                 .arg(pageserver_config_args.clone())
-                .arg(timeline_id_arg.clone().help("Use a specific timeline id when creating a tenant and its initial timeline"))
                 .arg(
                     Arg::new("config")
                         .long("config")
diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs
index 68e94b2fdc..9cebe028e4 100644
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -7,7 +7,7 @@ use std::path::PathBuf;
 use std::process::{Child, Command};
 use std::{io, result};
 
-use anyhow::{bail, ensure, Context};
+use anyhow::{bail, Context};
 use pageserver_api::models::{
     TenantConfigRequest, TenantCreateRequest, TenantInfo, TimelineCreateRequest, TimelineInfo,
 };
@@ -130,83 +130,15 @@ impl PageServerNode {
         overrides
     }
 
-    /// Initializes a pageserver node by creating its config with the overrides provided,
-    /// and creating an initial tenant and timeline afterwards.
-    pub fn initialize(
-        &self,
-        create_tenant: Option<TenantId>,
-        initial_timeline_id: Option<TimelineId>,
-        config_overrides: &[&str],
-        pg_version: u32,
-    ) -> anyhow::Result<TimelineId> {
+    /// Initializes a pageserver node by creating its config with the overrides provided.
+    pub fn initialize(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
         // First, run `pageserver --init` and wait for it to write a config into FS and exit.
         self.pageserver_init(config_overrides).with_context(|| {
             format!(
                 "Failed to run init for pageserver node {}",
                 self.env.pageserver.id,
             )
-        })?;
-
-        // Then, briefly start it fully to run HTTP commands on it,
-        // to create initial tenant and timeline.
-        // We disable the remote storage, since we stop pageserver right after the timeline creation,
-        // hence most of the uploads will either aborted or not started: no point to start them at all.
-        let disabled_remote_storage_override = "remote_storage={}";
-        let mut pageserver_process = self
-            .start_node(
-                &[disabled_remote_storage_override],
-                // Previous overrides will be taken from the config created before, don't overwrite them.
-                false,
-            )
-            .with_context(|| {
-                format!(
-                    "Failed to start a process for pageserver node {}",
-                    self.env.pageserver.id,
-                )
-            })?;
-
-        let init_result = self
-            .try_init_timeline(create_tenant, initial_timeline_id, pg_version)
-            .context("Failed to create initial tenant and timeline for pageserver");
-        match &init_result {
-            Ok(initial_timeline_id) => {
-                println!("Successfully initialized timeline {initial_timeline_id}")
-            }
-            Err(e) => eprintln!("{e:#}"),
-        }
-        background_process::send_stop_child_process(&pageserver_process)?;
-
-        let exit_code = pageserver_process.wait()?;
-        ensure!(
-            exit_code.success(),
-            format!(
-                "pageserver init failed with exit code {:?}",
-                exit_code.code()
-            )
-        );
-        println!(
-            "Stopped pageserver {} process with pid {}",
-            self.env.pageserver.id,
-            pageserver_process.id(),
-        );
-        init_result
-    }
-
-    fn try_init_timeline(
-        &self,
-        new_tenant_id: Option<TenantId>,
-        new_timeline_id: Option<TimelineId>,
-        pg_version: u32,
-    ) -> anyhow::Result<TimelineId> {
-        let initial_tenant_id = self.tenant_create(new_tenant_id, HashMap::new())?;
-        let initial_timeline_info = self.timeline_create(
-            initial_tenant_id,
-            new_timeline_id,
-            None,
-            None,
-            Some(pg_version),
-        )?;
-        Ok(initial_timeline_info.timeline_id)
+        })
     }
 
     pub fn repo_path(&self) -> PathBuf {
diff --git a/libs/utils/src/postgres_backend_async.rs b/libs/utils/src/postgres_backend_async.rs
index a4f523da04..95b7b3fd15 100644
--- a/libs/utils/src/postgres_backend_async.rs
+++ b/libs/utils/src/postgres_backend_async.rs
@@ -20,7 +20,10 @@ use tokio_rustls::TlsAcceptor;
 
 pub fn is_expected_io_error(e: &io::Error) -> bool {
     use io::ErrorKind::*;
-    matches!(e.kind(), ConnectionRefused | ConnectionAborted)
+    matches!(
+        e.kind(),
+        ConnectionRefused | ConnectionAborted | ConnectionReset
+    )
 }
 
 /// An error, occurred during query processing:
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 481f46ff55..97bc694543 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -596,6 +596,7 @@ class NeonEnvBuilder:
         rust_log_override: Optional[str] = None,
         default_branch_name: str = DEFAULT_BRANCH_NAME,
         preserve_database_files: bool = False,
+        initial_tenant: Optional[TenantId] = None,
     ):
         self.repo_dir = repo_dir
         self.rust_log_override = rust_log_override
@@ -618,8 +619,9 @@ class NeonEnvBuilder:
         self.pg_distrib_dir = pg_distrib_dir
         self.pg_version = pg_version
         self.preserve_database_files = preserve_database_files
+        self.initial_tenant = initial_tenant or TenantId.generate()
 
-    def init(self) -> NeonEnv:
+    def init_configs(self) -> NeonEnv:
         # Cannot create more than one environment from one builder
         assert self.env is None, "environment already initialized"
         self.env = NeonEnv(self)
@@ -630,8 +632,17 @@ class NeonEnvBuilder:
         self.env.start()
 
     def init_start(self) -> NeonEnv:
-        env = self.init()
+        env = self.init_configs()
         self.start()
+
+        # Prepare the default branch to start the postgres on later.
+        # Pageserver itself does not create tenants and timelines, until started first and asked via HTTP API.
+        log.info(
+            f"Services started, creating initial tenant {env.initial_tenant} and its initial timeline"
+        )
+        initial_tenant, initial_timeline = env.neon_cli.create_tenant(tenant_id=env.initial_tenant)
+        log.info(f"Initial timeline {initial_tenant}/{initial_timeline} created successfully")
+
         return env
 
     def enable_remote_storage(
@@ -890,12 +901,12 @@ class NeonEnv:
 
         # generate initial tenant ID here instead of letting 'neon init' generate it,
         # so that we don't need to dig it out of the config file afterwards.
-        self.initial_tenant = TenantId.generate()
+        self.initial_tenant = config.initial_tenant
 
         # Create a config file corresponding to the options
         toml = textwrap.dedent(
             f"""
-            default_tenant_id = '{self.initial_tenant}'
+            default_tenant_id = '{config.initial_tenant}'
         """
         )
 
@@ -1724,17 +1735,12 @@ class NeonCli(AbstractNeonCli):
     def init(
         self,
         config_toml: str,
-        initial_timeline_id: Optional[TimelineId] = None,
     ) -> "subprocess.CompletedProcess[str]":
         with tempfile.NamedTemporaryFile(mode="w+") as tmp:
             tmp.write(config_toml)
             tmp.flush()
 
-            cmd = ["init", f"--config={tmp.name}"]
-            if initial_timeline_id:
-                cmd.extend(["--timeline-id", str(initial_timeline_id)])
-
-            cmd.extend(["--pg-version", self.env.pg_version])
+            cmd = ["init", f"--config={tmp.name}", "--pg-version", self.env.pg_version]
 
             append_pageserver_param_overrides(
                 params_to_update=cmd,
diff --git a/test_runner/regress/test_metric_collection.py b/test_runner/regress/test_metric_collection.py
index 0fff86f268..d1fcab7a62 100644
--- a/test_runner/regress/test_metric_collection.py
+++ b/test_runner/regress/test_metric_collection.py
@@ -1,3 +1,5 @@
+import time
+
 import pytest
 from fixtures.log_helper import log
 from fixtures.metrics import parse_metrics
@@ -20,9 +22,19 @@ def httpserver_listen_address(port_distributor: PortDistributor):
     return ("localhost", port)
 
 
-num_metrics_received = 0
+initial_tenant = TenantId.generate()
 remote_uploaded = 0
-first_request = True
+checks = {
+    "written_size": lambda value: value > 0,
+    "resident_size": lambda value: value >= 0,
+    # >= 0 check here is to avoid race condition when we receive metrics before
+    # remote_uploaded is updated
+    "remote_storage_size": lambda value: value > 0 if remote_uploaded > 0 else value >= 0,
+    # logical size may lag behind the actual size, so allow 0 here
+    "timeline_logical_size": lambda value: value >= 0,
+}
+
+metric_kinds_checked = set([])
 
 
 #
@@ -36,38 +48,19 @@ def metrics_handler(request: Request) -> Response:
     log.info("received events:")
     log.info(events)
 
-    checks = {
-        "written_size": lambda value: value > 0,
-        "resident_size": lambda value: value >= 0,
-        # >= 0 check here is to avoid race condition when we receive metrics before
-        # remote_uploaded is updated
-        "remote_storage_size": lambda value: value > 0 if remote_uploaded > 0 else value >= 0,
-        # logical size may lag behind the actual size, so allow 0 here
-        "timeline_logical_size": lambda value: value >= 0,
-    }
-
-    events_received = 0
     for event in events:
-        check = checks.get(event["metric"])
+        assert event["tenant_id"] == str(
+            initial_tenant
+        ), "Expecting metrics only from the initial tenant"
+        metric_name = event["metric"]
+
+        check = checks.get(metric_name)
         # calm down mypy
         if check is not None:
-            assert check(event["value"]), f"{event['metric']} isn't valid"
-            events_received += 1
+            assert check(event["value"]), f"{metric_name} isn't valid"
+            global metric_kinds_checked
+            metric_kinds_checked.add(metric_name)
 
-    global first_request
-    # check that all checks were sent
-    # but only on the first request, because we don't send non-changed metrics
-    if first_request:
-        # we may receive more metrics than we check,
-        # because there are two timelines
-        # and we may receive per-timeline metrics from both
-        # if the test was slow enough for these metrics to be collected
-        # -1 because that is ok to not receive timeline_logical_size
-        assert events_received >= len(checks) - 1
-        first_request = False
-
-    global num_metrics_received
-    num_metrics_received += 1
     return Response(status=200)
 
 
@@ -83,11 +76,14 @@ def test_metric_collection(
     (host, port) = httpserver_listen_address
     metric_collection_endpoint = f"http://{host}:{port}/billing/api/v1/usage_events"
 
+    # Require collecting metrics frequently, since we change
+    # the timeline and want something to be logged about it.
+    #
     # Disable time-based pitr, we will use the manual GC calls
     # to trigger remote storage operations in a controlled way
     neon_env_builder.pageserver_config_override = (
         f"""
-        metric_collection_interval="60s"
+        metric_collection_interval="1s"
         metric_collection_endpoint="{metric_collection_endpoint}"
     """
         + "tenant_config={pitr_interval = '0 sec'}"
@@ -100,6 +96,9 @@ def test_metric_collection(
 
     log.info(f"test_metric_collection endpoint is {metric_collection_endpoint}")
 
+    # Record the test's initial tenant: the only tenant we expect metrics from
+    global initial_tenant
+    initial_tenant = neon_env_builder.initial_tenant
     # mock http server that returns OK for the metrics
     httpserver.expect_request("/billing/api/v1/usage_events", method="POST").respond_with_handler(
         metrics_handler
@@ -154,7 +153,11 @@ def test_metric_collection(
         remote_uploaded = get_num_remote_ops("index", "upload")
         assert remote_uploaded > 0
 
-    # check that all requests are served
+    # wait longer than collecting interval and check that all requests are served
+    time.sleep(3)
     httpserver.check()
-    global num_metrics_received
-    assert num_metrics_received > 0, "no metrics were received"
+    global metric_kinds_checked, checks
+    expected_checks = set(checks.keys())
+    assert len(metric_kinds_checked) == len(
+        checks
+    ), f"Expected to receive and check all kind of metrics, but {expected_checks - metric_kinds_checked} got uncovered"
diff --git a/test_runner/regress/test_neon_local_cli.py b/test_runner/regress/test_neon_local_cli.py
index 6c7cdb6f7f..e8f01ccf55 100644
--- a/test_runner/regress/test_neon_local_cli.py
+++ b/test_runner/regress/test_neon_local_cli.py
@@ -4,7 +4,7 @@ from fixtures.neon_fixtures import NeonEnvBuilder
 # Test that neon cli is able to start and stop all processes with the user defaults.
 # def test_neon_cli_basics(neon_simple_env: NeonEnv):
 def test_neon_cli_basics(neon_env_builder: NeonEnvBuilder):
-    env = neon_env_builder.init()
+    env = neon_env_builder.init_configs()
 
     env.neon_cli.start()
     env.neon_cli.stop()
diff --git a/test_runner/regress/test_recovery.py b/test_runner/regress/test_recovery.py
index 1e93958e98..09644eaaa1 100644
--- a/test_runner/regress/test_recovery.py
+++ b/test_runner/regress/test_recovery.py
@@ -12,11 +12,9 @@ def test_pageserver_recovery(neon_env_builder: NeonEnvBuilder):
     # Override default checkpointer settings to run it more often
     neon_env_builder.pageserver_config_override = "tenant_config={checkpoint_distance = 1048576}"
 
-    env = neon_env_builder.init()
+    env = neon_env_builder.init_start()
     env.pageserver.is_testing_enabled_or_skip()
 
-    neon_env_builder.start()
-
     # These warnings are expected, when the pageserver is restarted abruptly
     env.pageserver.allowed_errors.append(".*found future delta layer.*")
     env.pageserver.allowed_errors.append(".*found future image layer.*")

From c187de1101744c57af041427199eab9630643951 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Thu, 5 Jan 2023 14:00:27 +0200
Subject: [PATCH 24/42] Copy error message before it's freed.

pageserver_disconnect() call invalidates 'pageserver_conn', including
the error message pointer we got from PQerrorMessage(pageserver_conn).
Copy the message to a temporary variable before disconnecting, like
we do in a few other places.

In passing, clear 'pageserver_conn_wes' variable in a few places
where it was free'd. I didn't see any live bug from this, but since
pageserver_disconnect() checks if it's NULL, let's not leave it
dangling to already-free'd memory.
---
 pgxn/neon/libpagestore.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c
index c6199dddc0..0760842627 100644
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -111,6 +111,7 @@ pageserver_connect()
 				PQfinish(pageserver_conn);
 				pageserver_conn = NULL;
 				FreeWaitEventSet(pageserver_conn_wes);
+				pageserver_conn_wes = NULL;
 
 				neon_log(ERROR, "could not complete handshake with pageserver: %s",
 						 msg);
@@ -179,7 +180,10 @@ pageserver_disconnect(void)
 		prefetch_on_ps_disconnect();
 	}
 	if (pageserver_conn_wes != NULL)
+	{
 		FreeWaitEventSet(pageserver_conn_wes);
+		pageserver_conn_wes = NULL;
+	}
 }
 
 static void
@@ -206,7 +210,7 @@ pageserver_send(NeonRequest * request)
 	 */
 	if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0)
 	{
-		char	   *msg = PQerrorMessage(pageserver_conn);
+		char	   *msg = pchomp(PQerrorMessage(pageserver_conn));
 
 		pageserver_disconnect();
 		neon_log(ERROR, "failed to send page request: %s", msg);

From 8b710b9753846515bc1d2dddd0154dcfcf1beaf9 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Thu, 5 Jan 2023 14:45:28 +0200
Subject: [PATCH 25/42] Fix segfault if pageserver connection is lost during
 backend startup.

It's not OK to return early from within a PG_TRY-CATCH block. The
PG_TRY macro sets the global PG_exception_stack variable, and
PG_END_TRY restores it. If we jump out in between with "return NULL",
the PG_exception_stack is left to point to garbage. (I'm surprised the
comments in PG_TRY_CATCH don't warn about this.)

Add test that re-attaches tenant in pageserver while Postgres is
running. If the tenant is detached while compute is connected and
busy running queries, those queries will fail if they try to fetch any
pages. But when the tenant is re-attached, things should start working
again, without disconnecting the client <-> postgres connections.
Without this fix, this reproduced the segfault.

Fixes issue #3231
---
 pgxn/neon/libpagestore.c                  |  36 +++---
 test_runner/regress/test_tenant_detach.py | 149 ++++++++++++++++++++++
 2 files changed, 169 insertions(+), 16 deletions(-)

diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c
index 0760842627..88e3a12d96 100644
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -243,29 +243,33 @@ pageserver_receive(void)
 	PG_TRY();
 	{
 		/* read response */
-		resp_buff.len = call_PQgetCopyData(&resp_buff.data);
-		resp_buff.cursor = 0;
+		int			rc;
 
-		if (resp_buff.len < 0)
+		rc = call_PQgetCopyData(&resp_buff.data);
+		if (rc >= 0)
 		{
-			if (resp_buff.len == -1)
+			resp_buff.len = rc;
+			resp_buff.cursor = 0;
+			resp = nm_unpack_response(&resp_buff);
+			PQfreemem(resp_buff.data);
+
+			if (message_level_is_interesting(PageStoreTrace))
 			{
-				pageserver_disconnect();
-				return NULL;
+				char	   *msg = nm_to_string((NeonMessage *) resp);
+
+				neon_log(PageStoreTrace, "got response: %s", msg);
+				pfree(msg);
 			}
-			else if (resp_buff.len == -2)
-				neon_log(ERROR, "could not read COPY data: %s", PQerrorMessage(pageserver_conn));
 		}
-		resp = nm_unpack_response(&resp_buff);
-		PQfreemem(resp_buff.data);
-
-		if (message_level_is_interesting(PageStoreTrace))
+		else if (rc == -1)
 		{
-			char	   *msg = nm_to_string((NeonMessage *) resp);
-
-			neon_log(PageStoreTrace, "got response: %s", msg);
-			pfree(msg);
+			pageserver_disconnect();
+			resp = NULL;
 		}
+		else if (rc == -2)
+			neon_log(ERROR, "could not read COPY data: %s", PQerrorMessage(pageserver_conn));
+		else
+			neon_log(ERROR, "unexpected PQgetCopyData return value: %d", rc);
 	}
 	PG_CATCH();
 	{
diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py
index 6963a57542..db5bb679f2 100644
--- a/test_runner/regress/test_tenant_detach.py
+++ b/test_runner/regress/test_tenant_detach.py
@@ -1,9 +1,13 @@
+import asyncio
+import random
 import time
 from threading import Thread
 
+import asyncpg
 import pytest
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
+    NeonEnv,
     NeonEnvBuilder,
     PageserverApiException,
     PageserverHttpClient,
@@ -12,6 +16,7 @@ from fixtures.neon_fixtures import (
     available_remote_storages,
     wait_for_last_record_lsn,
     wait_for_upload,
+    wait_until,
     wait_until_tenant_state,
 )
 from fixtures.types import Lsn, TenantId, TimelineId
@@ -84,6 +89,150 @@ def test_tenant_reattach(
     assert env.pageserver.log_contains(".*download.*failed, will retry.*")
 
 
+num_connections = 10
+num_rows = 100000
+updates_to_perform = 0
+
+updates_started = 0
+updates_finished = 0
+
+
+# Run random UPDATEs on test table. On failure, try again.
+async def update_table(pg_conn: asyncpg.Connection):
+    global updates_started, updates_finished, updates_to_perform
+
+    while updates_started < updates_to_perform or updates_to_perform == 0:
+        updates_started += 1
+        id = random.randrange(1, num_rows)
+
+        # Loop to retry until the UPDATE succeeds
+        while True:
+            try:
+                await pg_conn.fetchrow(f"UPDATE t SET counter = counter + 1 WHERE id = {id}")
+                updates_finished += 1
+                if updates_finished % 1000 == 0:
+                    log.info(f"update {updates_finished} / {updates_to_perform}")
+                break
+            except asyncpg.PostgresError as e:
+                # Received error from Postgres. Log it, sleep a little, and continue
+                log.info(f"UPDATE error: {e}")
+                await asyncio.sleep(0.1)
+
+
+async def sleep_and_reattach(pageserver_http: PageserverHttpClient, tenant_id: TenantId):
+    global updates_started, updates_finished, updates_to_perform
+
+    # Wait until we have performed some updates
+    wait_until(20, 0.5, lambda: updates_finished > 500)
+
+    log.info("Detaching tenant")
+    pageserver_http.tenant_detach(tenant_id)
+    await asyncio.sleep(1)
+    log.info("Re-attaching tenant")
+    pageserver_http.tenant_attach(tenant_id)
+    log.info("Re-attach finished")
+
+    # Continue with 5000 more updates
+    updates_to_perform = updates_started + 5000
+
+
+# async guts of test_tenant_reattach_while_busy test
+async def reattach_while_busy(
+    env: NeonEnv, pg: Postgres, pageserver_http: PageserverHttpClient, tenant_id: TenantId
+):
+    workers = []
+    for worker_id in range(num_connections):
+        pg_conn = await pg.connect_async()
+        workers.append(asyncio.create_task(update_table(pg_conn)))
+
+    workers.append(asyncio.create_task(sleep_and_reattach(pageserver_http, tenant_id)))
+    await asyncio.gather(*workers)
+
+    assert updates_finished == updates_to_perform
+
+
+# Detach and re-attach tenant, while compute is busy running queries.
+#
+# Some of the queries may fail, in the window that the tenant has been
+# detached but not yet re-attached. But Postgres itself should keep
+# running, and when we retry the queries, they should start working
+# after the attach has finished.
+
+# FIXME:
+#
+# This is pretty unstable at the moment. I've seen it fail with a warning like this:
+#
+# AssertionError: assert not ['2023-01-05T13:09:40.708303Z  WARN remote_upload{tenant=c3fc41f6cf29a7626b90316e3518cd4b timeline=7978246f85faa71ab03...1282b/000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001716699-0000000001736681"\n']
+#
+# (https://neon-github-public-dev.s3.amazonaws.com/reports/pr-3232/debug/3846817847/index.html#suites/f9eba3cfdb71aa6e2b54f6466222829b/470fc62b5db7d7d7/)
+# I believe that failure happened because there is a race condition
+# between detach and starting remote upload tasks:
+#
+# 1. detach_timeline calls task_mgr::shutdown_tasks(), sending shutdown
+#    signal to all in-progress tasks associated with the tenant.
+# 2. Just after shutdown_tasks() has collected the list of tasks,
+#    a new remote-upload task is spawned.
+#
+# See https://github.com/neondatabase/neon/issues/3273
+#
+#
+# I also saw this failure:
+#
+# test_runner/regress/test_tenant_detach.py:194: in test_tenant_reattach_while_busy
+#     asyncio.run(reattach_while_busy(env, pg, pageserver_http, tenant_id))
+# /home/nonroot/.pyenv/versions/3.9.2/lib/python3.9/asyncio/runners.py:44: in run
+#     return loop.run_until_complete(main)
+# /home/nonroot/.pyenv/versions/3.9.2/lib/python3.9/asyncio/base_events.py:642: in run_until_complete
+#     return future.result()
+# test_runner/regress/test_tenant_detach.py:151: in reattach_while_busy
+#     assert updates_finished == updates_to_perform
+# E   assert 5010 == 10010
+# E     +5010
+# E     -10010
+#
+# I don't know what's causing that...
+@pytest.mark.skip(reason="fixme")
+@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
+def test_tenant_reattach_while_busy(
+    neon_env_builder: NeonEnvBuilder,
+    remote_storage_kind: RemoteStorageKind,
+):
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=remote_storage_kind,
+        test_name="test_tenant_reattach_while_busy",
+    )
+    env = neon_env_builder.init_start()
+
+    # Attempts to connect from compute to pageserver while the tenant is
+    # temporarily detached produces these errors in the pageserver log.
+    env.pageserver.allowed_errors.append(".*Tenant .* not found in the local state.*")
+    env.pageserver.allowed_errors.append(
+        ".*Tenant .* will not become active\\. Current state: Stopping.*"
+    )
+
+    pageserver_http = env.pageserver.http_client()
+
+    # create new tenant
+    tenant_id, timeline_id = env.neon_cli.create_tenant(
+        # Create layers aggressively
+        conf={"checkpoint_distance": "100000"}
+    )
+
+    pg = env.postgres.create_start("main", tenant_id=tenant_id)
+
+    cur = pg.connect().cursor()
+
+    cur.execute("CREATE TABLE t(id int primary key, counter int)")
+    cur.execute(f"INSERT INTO t SELECT generate_series(1,{num_rows}), 0")
+
+    # Run the test
+    asyncio.run(reattach_while_busy(env, pg, pageserver_http, tenant_id))
+
+    # Verify table contents
+    assert query_scalar(cur, "SELECT count(*) FROM t") == num_rows
+    assert query_scalar(cur, "SELECT sum(counter) FROM t") == updates_to_perform
+
+
 def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
     env = neon_env_builder.init_start()
     pageserver_http = env.pageserver.http_client()

From b6237474d245f8c633752ca809732aef87fb3944 Mon Sep 17 00:00:00 2001
From: Kirill Bulatov <kirill@neon.tech>
Date: Fri, 6 Jan 2023 12:26:14 +0200
Subject: [PATCH 26/42] Fix README and basic startup example (#3275)

Follow-up to https://github.com/neondatabase/neon/pull/3270, which broke
an example in the main README.md.

Fixes that by adding a way to specify a default tenant, and modifies
the basic neon_local test to start postgres and check branching.
Not all neon_local commands are implemented, so not all of the README.md
contents are tested yet.
---
 README.md                                  |  11 ++-
 control_plane/src/bin/neon_local.rs        |  20 ++++-
 control_plane/src/local_env.rs             |   5 --
 test_runner/fixtures/neon_fixtures.py      | 100 ++++++++-------------
 test_runner/regress/test_neon_local_cli.py |  13 ++-
 5 files changed, 72 insertions(+), 77 deletions(-)

diff --git a/README.md b/README.md
index fa5c1626e4..7b629e71a5 100644
--- a/README.md
+++ b/README.md
@@ -118,11 +118,8 @@ Python (3.9 or higher), and install python3 packages using `./scripts/pysync` (r
 # Later that would be responsibility of a package install script
 > ./target/debug/neon_local init
 Starting pageserver at '127.0.0.1:64000' in '.neon'.
-pageserver started, pid: 2545906
-Successfully initialized timeline de200bd42b49cc1814412c7e592dd6e9
-Stopped pageserver 1 process with pid 2545906
 
-# start pageserver and safekeeper
+# start pageserver, safekeeper, and broker for their intercommunication
 > ./target/debug/neon_local start
 Starting neon broker at 127.0.0.1:50051
 storage_broker started, pid: 2918372
@@ -131,6 +128,12 @@ pageserver started, pid: 2918386
 Starting safekeeper at '127.0.0.1:5454' in '.neon/safekeepers/sk1'.
 safekeeper 1 started, pid: 2918437
 
+# create initial tenant and use it as a default for every future neon_local invocation
+> ./target/debug/neon_local tenant create --set-default
+tenant 9ef87a5bf0d92544f6fafeeb3239695c successfully created on the pageserver
+Created an initial timeline 'de200bd42b49cc1814412c7e592dd6e9' at Lsn 0/16B5A50 for tenant: 9ef87a5bf0d92544f6fafeeb3239695c
+Setting tenant 9ef87a5bf0d92544f6fafeeb3239695c as a default one
+
 # start postgres compute node
 > ./target/debug/neon_local pg start main
 Starting new postgres (v14) main on timeline de200bd42b49cc1814412c7e592dd6e9 ...
diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs
index e4d0680c9e..4b2aa3c957 100644
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -263,7 +263,7 @@ fn get_tenant_id(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::R
     } else if let Some(default_id) = env.default_tenant_id {
         Ok(default_id)
     } else {
-        bail!("No tenant id. Use --tenant-id, or set 'default_tenant_id' in the config file");
+        anyhow::bail!("No tenant id. Use --tenant-id, or set a default tenant");
     }
 }
 
@@ -372,6 +372,17 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
             println!(
                 "Created an initial timeline '{new_timeline_id}' at Lsn {last_record_lsn} for tenant: {new_tenant_id}",
             );
+
+            if create_match.get_flag("set-default") {
+                println!("Setting tenant {new_tenant_id} as a default one");
+                env.default_tenant_id = Some(new_tenant_id);
+            }
+        }
+        Some(("set-default", set_default_match)) => {
+            let tenant_id =
+                parse_tenant_id(set_default_match)?.context("No tenant id specified")?;
+            println!("Setting tenant {tenant_id} as a default one");
+            env.default_tenant_id = Some(tenant_id);
         }
         Some(("config", create_match)) => {
             let tenant_id = get_tenant_id(create_match, env)?;
@@ -975,11 +986,14 @@ fn cli() -> Command {
                 .arg(timeline_id_arg.clone().help("Use a specific timeline id when creating a tenant and its initial timeline"))
                 .arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false))
                 .arg(pg_version_arg.clone())
+                .arg(Arg::new("set-default").long("set-default").action(ArgAction::SetTrue).required(false)
+                    .help("Use this tenant in future CLI commands where tenant_id is needed, but not specified"))
                 )
+            .subcommand(Command::new("set-default").arg(tenant_id_arg.clone().required(true))
+                .about("Set a particular tenant as default in future CLI commands where tenant_id is needed, but not specified"))
             .subcommand(Command::new("config")
                 .arg(tenant_id_arg.clone())
-                .arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false))
-                )
+                .arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false)))
         )
         .subcommand(
             Command::new("pageserver")
diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs
index ea936640ec..003152c578 100644
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -296,11 +296,6 @@ impl LocalEnv {
             env.neon_distrib_dir = env::current_exe()?.parent().unwrap().to_owned();
         }
 
-        // If no initial tenant ID was given, generate it.
-        if env.default_tenant_id.is_none() {
-            env.default_tenant_id = Some(TenantId::generate());
-        }
-
         env.base_data_dir = base_path();
 
         Ok(env)
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 97bc694543..bdd3dc004e 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -18,6 +18,7 @@ from contextlib import closing, contextmanager
 from dataclasses import dataclass, field
 from enum import Flag, auto
 from functools import cached_property
+from itertools import chain, product
 from pathlib import Path
 from types import TracebackType
 from typing import Any, Dict, Iterator, List, Optional, Tuple, Type, Union, cast
@@ -1567,6 +1568,7 @@ class NeonCli(AbstractNeonCli):
         tenant_id: Optional[TenantId] = None,
         timeline_id: Optional[TimelineId] = None,
         conf: Optional[Dict[str, str]] = None,
+        set_default: bool = False,
     ) -> Tuple[TenantId, TimelineId]:
         """
         Creates a new tenant, returns its id and its initial timeline's id.
@@ -1575,47 +1577,51 @@ class NeonCli(AbstractNeonCli):
             tenant_id = TenantId.generate()
         if timeline_id is None:
             timeline_id = TimelineId.generate()
-        if conf is None:
-            res = self.raw_cli(
-                [
-                    "tenant",
-                    "create",
-                    "--tenant-id",
-                    str(tenant_id),
-                    "--timeline-id",
-                    str(timeline_id),
-                    "--pg-version",
-                    self.env.pg_version,
-                ]
-            )
-        else:
-            res = self.raw_cli(
-                [
-                    "tenant",
-                    "create",
-                    "--tenant-id",
-                    str(tenant_id),
-                    "--timeline-id",
-                    str(timeline_id),
-                    "--pg-version",
-                    self.env.pg_version,
-                ]
-                + sum(list(map(lambda kv: (["-c", kv[0] + ":" + kv[1]]), conf.items())), [])
+
+        args = [
+            "tenant",
+            "create",
+            "--tenant-id",
+            str(tenant_id),
+            "--timeline-id",
+            str(timeline_id),
+            "--pg-version",
+            self.env.pg_version,
+        ]
+        if conf is not None:
+            args.extend(
+                chain.from_iterable(
+                    product(["-c"], (f"{key}:{value}" for key, value in conf.items()))
+                )
             )
+        if set_default:
+            args.append("--set-default")
+
+        res = self.raw_cli(args)
         res.check_returncode()
         return tenant_id, timeline_id
 
+    def set_default(self, tenant_id: TenantId):
+        """
+        Update default tenant for future operations that require tenant_id.
+        """
+        res = self.raw_cli(["tenant", "set-default", "--tenant-id", str(tenant_id)])
+        res.check_returncode()
+
     def config_tenant(self, tenant_id: TenantId, conf: Dict[str, str]):
         """
         Update tenant config.
         """
-        if conf is None:
-            res = self.raw_cli(["tenant", "config", "--tenant-id", str(tenant_id)])
-        else:
-            res = self.raw_cli(
-                ["tenant", "config", "--tenant-id", str(tenant_id)]
-                + sum(list(map(lambda kv: (["-c", kv[0] + ":" + kv[1]]), conf.items())), [])
+
+        args = ["tenant", "config", "--tenant-id", str(tenant_id)]
+        if conf is not None:
+            args.extend(
+                chain.from_iterable(
+                    product(["-c"], (f"{key}:{value}" for key, value in conf.items()))
+                )
             )
+
+        res = self.raw_cli(args)
         res.check_returncode()
 
     def list_tenants(self) -> "subprocess.CompletedProcess[str]":
@@ -1650,36 +1656,6 @@ class NeonCli(AbstractNeonCli):
 
         return TimelineId(str(created_timeline_id))
 
-    def create_root_branch(
-        self,
-        branch_name: str,
-        tenant_id: Optional[TenantId] = None,
-    ):
-        cmd = [
-            "timeline",
-            "create",
-            "--branch-name",
-            branch_name,
-            "--tenant-id",
-            str(tenant_id or self.env.initial_tenant),
-            "--pg-version",
-            self.env.pg_version,
-        ]
-
-        res = self.raw_cli(cmd)
-        res.check_returncode()
-
-        matches = CREATE_TIMELINE_ID_EXTRACTOR.search(res.stdout)
-
-        created_timeline_id = None
-        if matches is not None:
-            created_timeline_id = matches.group("timeline_id")
-
-        if created_timeline_id is None:
-            raise Exception("could not find timeline id after `neon timeline create` invocation")
-        else:
-            return TimelineId(created_timeline_id)
-
     def create_branch(
         self,
         new_branch_name: str = DEFAULT_BRANCH_NAME,
diff --git a/test_runner/regress/test_neon_local_cli.py b/test_runner/regress/test_neon_local_cli.py
index e8f01ccf55..49c063ce44 100644
--- a/test_runner/regress/test_neon_local_cli.py
+++ b/test_runner/regress/test_neon_local_cli.py
@@ -2,9 +2,16 @@ from fixtures.neon_fixtures import NeonEnvBuilder
 
 
 # Test that neon cli is able to start and stop all processes with the user defaults.
-# def test_neon_cli_basics(neon_simple_env: NeonEnv):
+# Repeats the example from README.md as close as it can
 def test_neon_cli_basics(neon_env_builder: NeonEnvBuilder):
     env = neon_env_builder.init_configs()
+    # Skipping the init step that creates a local tenant in Pytest tests
+    try:
+        env.neon_cli.start()
+        env.neon_cli.create_tenant(tenant_id=env.initial_tenant, set_default=True)
+        env.neon_cli.pg_start(node_name="main")
 
-    env.neon_cli.start()
-    env.neon_cli.stop()
+        env.neon_cli.create_branch(new_branch_name="migration_check")
+        env.neon_cli.pg_start(node_name="migration_check")
+    finally:
+        env.neon_cli.stop()

From df42213dbb5958a50cb6a80743b69d9b50836507 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Fri, 6 Jan 2023 14:52:20 +0200
Subject: [PATCH 27/42] Fix missing COMMIT in handle_role_deletions.

There was no COMMIT, so the DROP ROLE commands were always implicitly
rolled back.

Fixes issue #3279.
---
 compute_tools/src/spec.rs | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs
index 58c94d74ae..ce396f4527 100644
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -197,22 +197,18 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
 
 /// Reassign all dependent objects and delete requested roles.
 pub fn handle_role_deletions(node: &ComputeNode, client: &mut Client) -> Result<()> {
-    let spec = &node.spec;
-
-    // First, reassign all dependent objects to db owners.
-    if let Some(ops) = &spec.delta_operations {
+    if let Some(ops) = &node.spec.delta_operations {
+        // First, reassign all dependent objects to db owners.
         info!("reassigning dependent objects of to-be-deleted roles");
         for op in ops {
             if op.action == "delete_role" {
                 reassign_owned_objects(node, &op.name)?;
             }
         }
-    }
 
-    // Second, proceed with role deletions.
-    let mut xact = client.transaction()?;
-    if let Some(ops) = &spec.delta_operations {
+        // Second, proceed with role deletions.
         info!("processing role deletions");
+        let mut xact = client.transaction()?;
         for op in ops {
             // We do not check either role exists or not,
             // Postgres will take care of it for us
@@ -223,6 +219,7 @@ pub fn handle_role_deletions(node: &ComputeNode, client: &mut Client) -> Result<
                 xact.execute(query.as_str(), &[])?;
             }
         }
+        xact.commit()?;
     }
 
     Ok(())

From debd134b15083ebd0587b760232724e0a644af31 Mon Sep 17 00:00:00 2001
From: Arthur Petukhovsky <petuhovskiy@yandex.ru>
Date: Fri, 6 Jan 2023 19:34:18 +0400
Subject: [PATCH 28/42] Implement wss support in proxy (#3247)

This is a hacky implementation of a WebSocket server, embedded into our
postgres proxy. The server is used to allow https://github.com/neondatabase/serverless
to connect to our postgres from browsers and serverless JavaScript functions.

How it will work (general schema):
- browser opens a websocket connection to
`wss://ep-abc-xyz-123.xx-central-1.aws.neon.tech/`
- proxy accepts this connection and terminates TLS (https)
- inside encrypted tunnel (HTTPS), browser initiates plain
(non-encrypted) postgres connection
- proxy performs auth as in usual plain pg connection and forwards
connection to the compute

Related issue: #3225
---
 Cargo.lock                    |  79 ++++++++++
 proxy/Cargo.toml              |   4 +
 proxy/src/auth/backend.rs     |  32 ++++-
 proxy/src/auth/credentials.rs |  23 ++-
 proxy/src/auth/flow.rs        |  23 +++
 proxy/src/http.rs             |   1 +
 proxy/src/http/websocket.rs   | 263 ++++++++++++++++++++++++++++++++++
 proxy/src/main.rs             |  22 ++-
 proxy/src/proxy.rs            |  43 +++++-
 9 files changed, 476 insertions(+), 14 deletions(-)
 create mode 100644 proxy/src/http/websocket.rs

diff --git a/Cargo.lock b/Cargo.lock
index fbf018e1c0..284a111ba7 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1700,6 +1700,19 @@ dependencies = [
  "tokio-io-timeout",
 ]
 
+[[package]]
+name = "hyper-tungstenite"
+version = "0.8.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d62004bcd4f6f85d9e2aa4206f1466ee67031f5ededcb6c6e62d48f9306ad879"
+dependencies = [
+ "hyper",
+ "pin-project",
+ "tokio",
+ "tokio-tungstenite",
+ "tungstenite",
+]
+
 [[package]]
 name = "iana-time-zone"
 version = "0.1.53"
@@ -2658,6 +2671,7 @@ dependencies = [
  "hex",
  "hmac",
  "hyper",
+ "hyper-tungstenite",
  "itertools",
  "md5",
  "metrics",
@@ -2667,6 +2681,7 @@ dependencies = [
  "pq_proto",
  "rand",
  "rcgen",
+ "regex",
  "reqwest",
  "routerify",
  "rstest",
@@ -2678,6 +2693,7 @@ dependencies = [
  "sha2",
  "socket2",
  "thiserror",
+ "tls-listener",
  "tokio",
  "tokio-postgres",
  "tokio-postgres-rustls",
@@ -2687,6 +2703,7 @@ dependencies = [
  "url",
  "utils",
  "uuid",
+ "webpki-roots",
  "workspace_hack",
  "x509-parser",
 ]
@@ -3324,6 +3341,17 @@ dependencies = [
  "syn",
 ]
 
+[[package]]
+name = "sha-1"
+version = "0.10.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f5058ada175748e33390e40e872bd0fe59a19f265d0158daa551c5a88a76009c"
+dependencies = [
+ "cfg-if",
+ "cpufeatures",
+ "digest",
+]
+
 [[package]]
 name = "sha1"
 version = "0.10.5"
@@ -3687,6 +3715,20 @@ version = "0.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c"
 
+[[package]]
+name = "tls-listener"
+version = "0.5.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c9d4ff21187d434ac7709bfc7441ca88f63681247e5ad99f0f08c8c91ddc103d"
+dependencies = [
+ "futures-util",
+ "hyper",
+ "pin-project-lite",
+ "thiserror",
+ "tokio",
+ "tokio-rustls",
+]
+
 [[package]]
 name = "tokio"
 version = "1.21.1"
@@ -3801,6 +3843,18 @@ dependencies = [
  "xattr",
 ]
 
+[[package]]
+name = "tokio-tungstenite"
+version = "0.17.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f714dd15bead90401d77e04243611caec13726c2408afd5b31901dfcdcb3b181"
+dependencies = [
+ "futures-util",
+ "log",
+ "tokio",
+ "tungstenite",
+]
+
 [[package]]
 name = "tokio-util"
 version = "0.7.4"
@@ -4027,6 +4081,25 @@ version = "0.2.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "59547bce71d9c38b83d9c0e92b6066c4253371f15005def0c30d9657f50c7642"
 
+[[package]]
+name = "tungstenite"
+version = "0.17.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e27992fd6a8c29ee7eef28fc78349aa244134e10ad447ce3b9f0ac0ed0fa4ce0"
+dependencies = [
+ "base64 0.13.1",
+ "byteorder",
+ "bytes",
+ "http",
+ "httparse",
+ "log",
+ "rand",
+ "sha-1",
+ "thiserror",
+ "url",
+ "utf-8",
+]
+
 [[package]]
 name = "typenum"
 version = "1.16.0"
@@ -4115,6 +4188,12 @@ version = "2.1.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e8db7427f936968176eaa7cdf81b7f98b980b18495ec28f1b5791ac3bfe3eea9"
 
+[[package]]
+name = "utf-8"
+version = "0.7.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9"
+
 [[package]]
 name = "utils"
 version = "0.1.0"
diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml
index 0bf47c7b88..cbc067093e 100644
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -17,12 +17,14 @@ hashbrown = "0.12"
 hex = "0.4.3"
 hmac = "0.12.1"
 hyper = "0.14"
+hyper-tungstenite = "0.8.1"
 itertools = "0.10.3"
 md5 = "0.7.0"
 once_cell = "1.13.0"
 parking_lot = "0.12"
 pin-project-lite = "0.2.7"
 rand = "0.8.3"
+regex = "1.4.5"
 reqwest = { version = "0.11", default-features = false, features = [ "json", "rustls-tls" ] }
 routerify = "3"
 rustls = "0.20.0"
@@ -36,10 +38,12 @@ thiserror = "1.0.30"
 tokio = { version = "1.17", features = ["macros"] }
 tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
 tokio-rustls = "0.23.0"
+tls-listener = { version = "0.5.1", features = ["rustls", "hyper-h1"] }
 tracing = "0.1.36"
 tracing-subscriber = { version = "0.3", features = ["env-filter"] }
 url = "2.2.2"
 uuid = { version = "1.2", features = ["v4", "serde"] }
+webpki-roots = "0.22.5"
 x509-parser = "0.14"
 
 metrics = { path = "../libs/metrics" }
diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs
index 4adf0ed940..e6a179a040 100644
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -149,7 +149,7 @@ impl BackendType<'_, ClientCredentials<'_>> {
         // If there's no project so far, that entails that client doesn't
         // support SNI or other means of passing the project name.
         // We now expect to see a very specific payload in the place of password.
-        let fetch_magic_payload = async {
+        let fetch_magic_payload = |client| async {
             warn!("project name not specified, resorting to the password hack auth flow");
             let payload = AuthFlow::new(client)
                 .begin(auth::PasswordHack)
@@ -161,10 +161,26 @@ impl BackendType<'_, ClientCredentials<'_>> {
             auth::Result::Ok(payload)
         };
 
+        // If we want to use cleartext password flow, we can read the password
+        // from the client and pretend that it's a magic payload (PasswordHack hack).
+        let fetch_plaintext_password = |client| async {
+            info!("using cleartext password flow");
+            let payload = AuthFlow::new(client)
+                .begin(auth::CleartextPassword)
+                .await?
+                .authenticate()
+                .await?;
+
+            auth::Result::Ok(auth::password_hack::PasswordHackPayload {
+                project: String::new(),
+                password: payload,
+            })
+        };
+
         // TODO: find a proper way to merge those very similar blocks.
         let (mut node, payload) = match self {
             Console(endpoint, creds) if creds.project.is_none() => {
-                let payload = fetch_magic_payload.await?;
+                let payload = fetch_magic_payload(client).await?;
 
                 let mut creds = creds.as_ref();
                 creds.project = Some(payload.project.as_str().into());
@@ -174,8 +190,18 @@ impl BackendType<'_, ClientCredentials<'_>> {
 
                 (node, payload)
             }
+            Console(endpoint, creds) if creds.use_cleartext_password_flow => {
+                // This is a hack to allow cleartext password in secure connections (wss).
+                let payload = fetch_plaintext_password(client).await?;
+                let creds = creds.as_ref();
+                let node = console::Api::new(endpoint, extra, &creds)
+                    .wake_compute()
+                    .await?;
+
+                (node, payload)
+            }
             Postgres(endpoint, creds) if creds.project.is_none() => {
-                let payload = fetch_magic_payload.await?;
+                let payload = fetch_magic_payload(client).await?;
 
                 let mut creds = creds.as_ref();
                 creds.project = Some(payload.project.as_str().into());
diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs
index 0a3b84bb52..3b71bef9aa 100644
--- a/proxy/src/auth/credentials.rs
+++ b/proxy/src/auth/credentials.rs
@@ -34,6 +34,9 @@ pub struct ClientCredentials<'a> {
     pub user: &'a str,
     pub dbname: &'a str,
     pub project: Option<Cow<'a, str>>,
+    /// If `true`, we'll use the old cleartext password flow. This is used for
+    /// websocket connections, which want to minimize the number of round trips.
+    pub use_cleartext_password_flow: bool,
 }
 
 impl ClientCredentials<'_> {
@@ -50,6 +53,7 @@ impl<'a> ClientCredentials<'a> {
             user: self.user,
             dbname: self.dbname,
             project: self.project().map(Cow::Borrowed),
+            use_cleartext_password_flow: self.use_cleartext_password_flow,
         }
     }
 }
@@ -59,6 +63,7 @@ impl<'a> ClientCredentials<'a> {
         params: &'a StartupMessageParams,
         sni: Option<&str>,
         common_name: Option<&str>,
+        use_cleartext_password_flow: bool,
     ) -> Result<Self, ClientCredsParseError> {
         use ClientCredsParseError::*;
 
@@ -108,6 +113,7 @@ impl<'a> ClientCredentials<'a> {
             user = user,
             dbname = dbname,
             project = project.as_deref(),
+            use_cleartext_password_flow = use_cleartext_password_flow,
             "credentials"
         );
 
@@ -115,6 +121,7 @@ impl<'a> ClientCredentials<'a> {
             user,
             dbname,
             project,
+            use_cleartext_password_flow,
         })
     }
 }
@@ -141,7 +148,7 @@ mod tests {
         let options = StartupMessageParams::new([("user", "john_doe")]);
 
         // TODO: check that `creds.dbname` is None.
-        let creds = ClientCredentials::parse(&options, None, None)?;
+        let creds = ClientCredentials::parse(&options, None, None, false)?;
         assert_eq!(creds.user, "john_doe");
 
         Ok(())
@@ -151,7 +158,7 @@ mod tests {
     fn parse_missing_project() -> anyhow::Result<()> {
         let options = StartupMessageParams::new([("user", "john_doe"), ("database", "world")]);
 
-        let creds = ClientCredentials::parse(&options, None, None)?;
+        let creds = ClientCredentials::parse(&options, None, None, false)?;
         assert_eq!(creds.user, "john_doe");
         assert_eq!(creds.dbname, "world");
         assert_eq!(creds.project, None);
@@ -166,7 +173,7 @@ mod tests {
         let sni = Some("foo.localhost");
         let common_name = Some("localhost");
 
-        let creds = ClientCredentials::parse(&options, sni, common_name)?;
+        let creds = ClientCredentials::parse(&options, sni, common_name, false)?;
         assert_eq!(creds.user, "john_doe");
         assert_eq!(creds.dbname, "world");
         assert_eq!(creds.project.as_deref(), Some("foo"));
@@ -182,7 +189,7 @@ mod tests {
             ("options", "-ckey=1 project=bar -c geqo=off"),
         ]);
 
-        let creds = ClientCredentials::parse(&options, None, None)?;
+        let creds = ClientCredentials::parse(&options, None, None, false)?;
         assert_eq!(creds.user, "john_doe");
         assert_eq!(creds.dbname, "world");
         assert_eq!(creds.project.as_deref(), Some("bar"));
@@ -201,7 +208,7 @@ mod tests {
         let sni = Some("baz.localhost");
         let common_name = Some("localhost");
 
-        let creds = ClientCredentials::parse(&options, sni, common_name)?;
+        let creds = ClientCredentials::parse(&options, sni, common_name, false)?;
         assert_eq!(creds.user, "john_doe");
         assert_eq!(creds.dbname, "world");
         assert_eq!(creds.project.as_deref(), Some("baz"));
@@ -220,7 +227,8 @@ mod tests {
         let sni = Some("second.localhost");
         let common_name = Some("localhost");
 
-        let err = ClientCredentials::parse(&options, sni, common_name).expect_err("should fail");
+        let err =
+            ClientCredentials::parse(&options, sni, common_name, false).expect_err("should fail");
         match err {
             InconsistentProjectNames { domain, option } => {
                 assert_eq!(option, "first");
@@ -237,7 +245,8 @@ mod tests {
         let sni = Some("project.localhost");
         let common_name = Some("example.com");
 
-        let err = ClientCredentials::parse(&options, sni, common_name).expect_err("should fail");
+        let err =
+            ClientCredentials::parse(&options, sni, common_name, false).expect_err("should fail");
         match err {
             InconsistentSni { sni, cn } => {
                 assert_eq!(sni, "project.localhost");
diff --git a/proxy/src/auth/flow.rs b/proxy/src/auth/flow.rs
index d9ee50894d..4b982c0c5e 100644
--- a/proxy/src/auth/flow.rs
+++ b/proxy/src/auth/flow.rs
@@ -37,6 +37,17 @@ impl AuthMethod for PasswordHack {
     }
 }
 
+/// Use clear-text password auth called `password` in docs
+/// <https://www.postgresql.org/docs/current/auth-password.html>
+pub struct CleartextPassword;
+
+impl AuthMethod for CleartextPassword {
+    #[inline(always)]
+    fn first_message(&self) -> BeMessage<'_> {
+        Be::AuthenticationCleartextPassword
+    }
+}
+
 /// This wrapper for [`PqStream`] performs client authentication.
 #[must_use]
 pub struct AuthFlow<'a, Stream, State> {
@@ -86,6 +97,18 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, PasswordHack> {
     }
 }
 
+impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, CleartextPassword> {
+    /// Return the password sent by the client, minus its NUL terminator; nothing is verified here.
+    pub async fn authenticate(self) -> super::Result<Vec<u8>> {
+        let msg = self.stream.read_password_message().await?;
+        let password = msg
+            .strip_suffix(&[0])
+            .ok_or(AuthErrorImpl::MalformedPassword("missing terminator"))?;
+
+        Ok(password.to_vec())
+    }
+}
+
 /// Stream wrapper for handling [SCRAM](crate::scram) auth.
 impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, Scram<'_>> {
     /// Perform user authentication. Raise an error in case authentication failed.
diff --git a/proxy/src/http.rs b/proxy/src/http.rs
index 096a33d73d..e847edc8bd 100644
--- a/proxy/src/http.rs
+++ b/proxy/src/http.rs
@@ -1,4 +1,5 @@
 pub mod server;
+pub mod websocket;
 
 use crate::url::ApiUrl;
 
diff --git a/proxy/src/http/websocket.rs b/proxy/src/http/websocket.rs
new file mode 100644
index 0000000000..33c2752307
--- /dev/null
+++ b/proxy/src/http/websocket.rs
@@ -0,0 +1,263 @@
+use bytes::{Buf, Bytes};
+use futures::{Sink, Stream, StreamExt};
+use hyper::server::accept::{self};
+use hyper::server::conn::AddrIncoming;
+use hyper::upgrade::Upgraded;
+use hyper::{Body, Request, Response, StatusCode};
+use hyper_tungstenite::{tungstenite, WebSocketStream};
+use hyper_tungstenite::{tungstenite::Message, HyperWebsocket};
+use pin_project_lite::pin_project;
+use tokio::net::TcpListener;
+
+use std::convert::Infallible;
+use std::future::ready;
+use std::pin::Pin;
+use std::sync::Arc;
+use std::task::{Context, Poll};
+use tls_listener::TlsListener;
+
+use tokio::io::{self, AsyncBufRead, AsyncRead, AsyncWrite, ReadBuf};
+
+use tracing::{error, info, info_span, warn, Instrument};
+use utils::http::{error::ApiError, json::json_response};
+
+use crate::cancellation::CancelMap;
+use crate::config::ProxyConfig;
+use crate::proxy::handle_ws_client;
+
+pin_project! {
+    /// This is a wrapper around a WebSocketStream that implements AsyncRead and AsyncWrite.
+    pub struct WebSocketRW {
+        #[pin]
+        stream: WebSocketStream<Upgraded>,
+        chunk: Option<bytes::Bytes>,
+    }
+}
+
+// FIXME: explain why this is safe or try to remove `unsafe impl`.
+unsafe impl Sync for WebSocketRW {}
+
+impl WebSocketRW {
+    pub fn new(stream: WebSocketStream<Upgraded>) -> Self {
+        Self {
+            stream,
+            chunk: None,
+        }
+    }
+
+    fn has_chunk(&self) -> bool {
+        if let Some(ref chunk) = self.chunk {
+            chunk.remaining() > 0
+        } else {
+            false
+        }
+    }
+}
+
+fn ws_err_into(e: tungstenite::Error) -> io::Error {
+    io::Error::new(io::ErrorKind::Other, e.to_string())
+}
+
+impl AsyncWrite for WebSocketRW {
+    /// Sends `buf` as one binary WebSocket message; reports all of `buf` as written.
+    fn poll_write(
+        self: Pin<&mut Self>,
+        cx: &mut Context<'_>,
+        buf: &[u8],
+    ) -> Poll<Result<usize, io::Error>> {
+        let mut this = self.project();
+        match this.stream.as_mut().poll_ready(cx) {
+            Poll::Ready(Ok(())) => {
+                // The sink is ready, so queue the whole buffer as a single message.
+                let queued = this
+                    .stream
+                    .as_mut()
+                    .start_send(Message::Binary(buf.to_vec()));
+                Poll::Ready(queued.map(|()| buf.len()).map_err(ws_err_into))
+            }
+            Poll::Ready(Err(e)) => Poll::Ready(Err(ws_err_into(e))),
+            // `poll_ready` was handed `cx`, so per the `Sink` contract the task is
+            // already registered for wakeup. Waking ourselves here as well would make
+            // the task busy-loop for as long as the sink is not ready.
+            Poll::Pending => Poll::Pending,
+        }
+    }
+
+    /// Delegates to the sink's `poll_flush`, mapping errors to `io::Error`.
+    fn poll_flush(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Result<(), io::Error>> {
+        self.project().stream.poll_flush(cx).map_err(ws_err_into)
+    }
+
+    /// Delegates to the sink's `poll_close`, mapping errors to `io::Error`.
+    fn poll_shutdown(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Result<(), io::Error>> {
+        self.project().stream.poll_close(cx).map_err(ws_err_into)
+    }
+}
+
+impl AsyncRead for WebSocketRW {
+    fn poll_read(
+        mut self: Pin<&mut Self>,
+        cx: &mut Context<'_>,
+        buf: &mut ReadBuf<'_>,
+    ) -> Poll<io::Result<()>> {
+        if buf.remaining() == 0 {
+            return Poll::Ready(Ok(()));
+        }
+
+        let inner_buf = match self.as_mut().poll_fill_buf(cx) {
+            Poll::Ready(Ok(buf)) => buf,
+            Poll::Ready(Err(err)) => return Poll::Ready(Err(err)),
+            Poll::Pending => return Poll::Pending,
+        };
+        let len = std::cmp::min(inner_buf.len(), buf.remaining());
+        buf.put_slice(&inner_buf[..len]);
+
+        self.consume(len);
+        Poll::Ready(Ok(()))
+    }
+}
+
+impl AsyncBufRead for WebSocketRW {
+    fn poll_fill_buf(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<io::Result<&[u8]>> {
+        loop {
+            if self.as_mut().has_chunk() {
+                let buf = self.project().chunk.as_ref().unwrap().chunk();
+                return Poll::Ready(Ok(buf));
+            } else {
+                match self.as_mut().project().stream.poll_next(cx) {
+                    Poll::Ready(Some(Ok(message))) => match message {
+                        Message::Text(_) => {}
+                        Message::Binary(chunk) => {
+                            *self.as_mut().project().chunk = Some(Bytes::from(chunk));
+                        }
+                        Message::Ping(_) => {
+                            // No need to send a reply: tungstenite takes care of this for you.
+                        }
+                        Message::Pong(_) => {}
+                        Message::Close(_) => {
+                            // No need to send a reply: tungstenite takes care of this for you.
+                            return Poll::Ready(Ok(&[]));
+                        }
+                        Message::Frame(_) => {
+                            unreachable!();
+                        }
+                    },
+                    Poll::Ready(Some(Err(err))) => return Poll::Ready(Err(ws_err_into(err))),
+                    Poll::Ready(None) => return Poll::Ready(Ok(&[])),
+                    Poll::Pending => return Poll::Pending,
+                }
+            }
+        }
+    }
+
+    fn consume(self: Pin<&mut Self>, amt: usize) {
+        if amt > 0 {
+            self.project()
+                .chunk
+                .as_mut()
+                .expect("No chunk present")
+                .advance(amt);
+        }
+    }
+}
+
+async fn serve_websocket(
+    websocket: HyperWebsocket,
+    config: &ProxyConfig,
+    cancel_map: &CancelMap,
+    session_id: uuid::Uuid,
+    hostname: Option<String>,
+) -> anyhow::Result<()> {
+    let websocket = websocket.await?;
+    handle_ws_client(
+        config,
+        cancel_map,
+        session_id,
+        WebSocketRW::new(websocket),
+        hostname,
+    )
+    .await?;
+    Ok(())
+}
+
+async fn ws_handler(
+    mut request: Request<Body>,
+    config: &'static ProxyConfig,
+    cancel_map: Arc<CancelMap>,
+    session_id: uuid::Uuid,
+) -> Result<Response<Body>, ApiError> {
+    let host = request
+        .headers()
+        .get("host")
+        .and_then(|h| h.to_str().ok())
+        .and_then(|h| h.split(':').next())
+        .map(|s| s.to_string());
+
+    // Check if the request is a websocket upgrade request.
+    if hyper_tungstenite::is_upgrade_request(&request) {
+        let (response, websocket) = hyper_tungstenite::upgrade(&mut request, None)
+            .map_err(|e| ApiError::BadRequest(e.into()))?;
+
+        tokio::spawn(async move {
+            if let Err(e) = serve_websocket(websocket, config, &cancel_map, session_id, host).await
+            {
+                error!("error in websocket connection: {:?}", e);
+            }
+        });
+
+        // Return the response so the spawned future can continue.
+        Ok(response)
+    } else {
+        json_response(StatusCode::OK, "Connect with a websocket client")
+    }
+}
+
+pub async fn task_main(
+    ws_listener: TcpListener,
+    config: &'static ProxyConfig,
+) -> anyhow::Result<()> {
+    scopeguard::defer! {
+        info!("websocket server has shut down");
+    }
+
+    let tls_config = config.tls_config.as_ref().map(|cfg| cfg.to_server_config());
+    let tls_acceptor: tokio_rustls::TlsAcceptor = match tls_config {
+        Some(config) => config.into(),
+        None => {
+            warn!("TLS config is missing, WebSocket Secure server will not be started");
+            return Ok(());
+        }
+    };
+
+    let addr_incoming = AddrIncoming::from_listener(ws_listener)?;
+
+    let tls_listener = TlsListener::new(tls_acceptor, addr_incoming).filter(|conn| {
+        if let Err(err) = conn {
+            error!("failed to accept TLS connection for websockets: {:?}", err);
+            ready(false)
+        } else {
+            ready(true)
+        }
+    });
+
+    let make_svc = hyper::service::make_service_fn(|_stream| async move {
+        Ok::<_, Infallible>(hyper::service::service_fn(
+            move |req: Request<Body>| async move {
+                let cancel_map = Arc::new(CancelMap::default());
+                let session_id = uuid::Uuid::new_v4();
+                ws_handler(req, config, cancel_map, session_id)
+                    .instrument(info_span!(
+                        "ws-client",
+                        session = format_args!("{session_id}")
+                    ))
+                    .await
+            },
+        ))
+    });
+
+    hyper::Server::builder(accept::from_stream(tls_listener))
+        .serve(make_svc)
+        .await?;
+
+    Ok(())
+}
diff --git a/proxy/src/main.rs b/proxy/src/main.rs
index 89ea9142a9..aa6766c102 100644
--- a/proxy/src/main.rs
+++ b/proxy/src/main.rs
@@ -110,12 +110,23 @@ async fn main() -> anyhow::Result<()> {
     info!("Starting proxy on {proxy_address}");
     let proxy_listener = TcpListener::bind(proxy_address).await?;
 
-    let tasks = [
+    let mut tasks = vec![
         tokio::spawn(http::server::task_main(http_listener)),
         tokio::spawn(proxy::task_main(config, proxy_listener)),
         tokio::task::spawn_blocking(move || mgmt::thread_main(mgmt_listener)),
-    ]
-    .map(flatten_err);
+    ];
+
+    if let Some(wss_address) = arg_matches.get_one::<String>("wss") {
+        let wss_address: SocketAddr = wss_address.parse()?;
+        info!("Starting wss on {}", wss_address);
+        let wss_listener = TcpListener::bind(wss_address).await?;
+        tasks.push(tokio::spawn(http::websocket::task_main(
+            wss_listener,
+            config,
+        )));
+    }
+
+    let tasks = tasks.into_iter().map(flatten_err);
 
     set_build_info_metric(GIT_VERSION);
     // This will block until all tasks have completed.
@@ -155,6 +166,11 @@ fn cli() -> clap::Command {
                 .help("listen for incoming http connections (metrics, etc) on ip:port")
                 .default_value("127.0.0.1:7001"),
         )
+        .arg(
+            Arg::new("wss")
+                .long("wss")
+                .help("listen for incoming wss connections on ip:port"),
+        )
         .arg(
             Arg::new("uri")
                 .short('u')
diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs
index 382f7cd918..63573d49c0 100644
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -82,6 +82,47 @@ pub async fn task_main(
     }
 }
 
+pub async fn handle_ws_client(
+    config: &ProxyConfig,
+    cancel_map: &CancelMap,
+    session_id: uuid::Uuid,
+    stream: impl AsyncRead + AsyncWrite + Unpin + Send,
+    hostname: Option<String>,
+) -> anyhow::Result<()> {
+    // The `closed` counter will increase when this future is destroyed.
+    NUM_CONNECTIONS_ACCEPTED_COUNTER.inc();
+    scopeguard::defer! {
+        NUM_CONNECTIONS_CLOSED_COUNTER.inc();
+    }
+
+    let tls = config.tls_config.as_ref();
+    let hostname = hostname.as_deref();
+
+    // TLS is None here, because the connection is already encrypted.
+    let do_handshake = handshake(stream, None, cancel_map).instrument(info_span!("handshake"));
+    let (mut stream, params) = match do_handshake.await? {
+        Some(x) => x,
+        None => return Ok(()), // it's a cancellation request
+    };
+
+    // Extract credentials which we're going to use for auth.
+    let creds = {
+        let common_name = tls.and_then(|tls| tls.common_name.as_deref());
+        let result = config
+            .auth_backend
+            .as_ref()
+            .map(|_| auth::ClientCredentials::parse(&params, hostname, common_name, true))
+            .transpose();
+
+        async { result }.or_else(|e| stream.throw_error(e)).await?
+    };
+
+    let client = Client::new(stream, creds, &params, session_id);
+    cancel_map
+        .with_session(|session| client.connect_to_db(session))
+        .await
+}
+
 async fn handle_client(
     config: &ProxyConfig,
     cancel_map: &CancelMap,
@@ -108,7 +149,7 @@ async fn handle_client(
         let result = config
             .auth_backend
             .as_ref()
-            .map(|_| auth::ClientCredentials::parse(&params, sni, common_name))
+            .map(|_| auth::ClientCredentials::parse(&params, sni, common_name, false))
             .transpose();
 
         async { result }.or_else(|e| stream.throw_error(e)).await?

From af9425394ffe0b0ee17908c9840525d43312ca31 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Fri, 6 Jan 2023 17:15:56 +0200
Subject: [PATCH 29/42] Print time taken by CREATE/ALTER DATABASE at compute
 start.

Trying to investigate why the "apply_config" stage is taking longer
than expected. This proves or disproves that it's the CREATE DATABASE
statement.
---
 compute_tools/src/spec.rs | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs
index ce396f4527..81e01fe555 100644
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -1,5 +1,6 @@
 use std::path::Path;
 use std::str::FromStr;
+use std::time::Instant;
 
 use anyhow::Result;
 use log::{info, log_enabled, warn, Level};
@@ -314,6 +315,7 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
         // XXX: with a limited number of databases it is fine, but consider making it a HashMap
         let pg_db = existing_dbs.iter().find(|r| r.name == *name);
 
+        let start_time = Instant::now();
         if let Some(r) = pg_db {
             // XXX: db owner name is returned as quoted string from Postgres,
             // when quoting is needed.
@@ -332,6 +334,8 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
                 info_print!(" -> update");
 
                 client.execute(query.as_str(), &[])?;
+                let elapsed = start_time.elapsed().as_millis();
+                info_print!(" ({} ms)", elapsed);
             }
         } else {
             let mut query: String = format!("CREATE DATABASE {} ", name.pg_quote());
@@ -339,6 +343,9 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
 
             query.push_str(&db.to_pg_options());
             client.execute(query.as_str(), &[])?;
+
+            let elapsed = start_time.elapsed().as_millis();
+            info_print!(" ({} ms)", elapsed);
         }
 
         info_print!("\n");

From 3526323bc470d763d70e85fe2a87f4269f17e5e0 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Fri, 6 Jan 2023 18:42:25 +0100
Subject: [PATCH 30/42] prepare Timeline::get_reconstruct_data for becoming
 async  (#3271)

This patch restructures the code so that PR
https://github.com/neondatabase/neon/pull/3228 can seamlessly
replace the return PageReconstructResult::NeedsDownload with
a download_remote_layer().await.

Background:

PR https://github.com/neondatabase/neon/pull/3228 will turn
get_reconstruct_data() async and do the on-demand
download right in place, instead of returning a
PageReconstructResult::NeedsDownload.

Current rustc requires that the layers lock guard not be in scope
across an await point.

For on-demand download inside get_reconstruct_data(), we need
to do download_remote_layer().await.

Supersedes https://github.com/neondatabase/neon/pull/3260

See my comment there:
https://github.com/neondatabase/neon/pull/3260#issuecomment-1370752407

Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
---
 pageserver/src/tenant/layer_map.rs |   4 +-
 pageserver/src/tenant/timeline.rs  | 210 ++++++++++++++++++-----------
 2 files changed, 134 insertions(+), 80 deletions(-)

diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs
index 4ff2d4b0d8..44bed5959f 100644
--- a/pageserver/src/tenant/layer_map.rs
+++ b/pageserver/src/tenant/layer_map.rs
@@ -260,8 +260,10 @@ where
     /// contain the version, even if it's missing from the returned
     /// layer.
     ///
+    /// NOTE: This only searches the 'historic' layers, *not* the
+    /// 'open' and 'frozen' layers!
+    ///
     pub fn search(&self, key: Key, end_lsn: Lsn) -> Option<SearchResult<L>> {
-        // linear search
         // Find the latest image layer that covers the given key
         let mut latest_img: Option<Arc<L>> = None;
         let mut latest_img_lsn: Option<Lsn> = None;
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 2c22c6694d..477108ec4c 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -1642,8 +1642,7 @@ impl Timeline {
 
         // For debugging purposes, collect the path of layers that we traversed
         // through. It's included in the error message if we fail to find the key.
-        let mut traversal_path =
-            Vec::<(ValueReconstructResult, Lsn, Box<dyn TraversalLayerExt>)>::new();
+        let mut traversal_path = Vec::<TraversalPathItem>::new();
 
         let cached_lsn = if let Some((cached_lsn, _)) = &reconstruct_state.img {
             *cached_lsn
@@ -1708,82 +1707,132 @@ impl Timeline {
                 timeline_owned = ancestor;
                 timeline = &*timeline_owned;
                 prev_lsn = Lsn(u64::MAX);
-                continue;
+                continue 'outer;
             }
 
-            let layers = timeline.layers.read().unwrap();
+            #[allow(unused_labels, clippy::never_loop)] // see comment at bottom of this loop
+            'layer_map_search: loop {
+                let remote_layer = {
+                    let layers = timeline.layers.read().unwrap();
 
-            // Check the open and frozen in-memory layers first, in order from newest
-            // to oldest.
-            if let Some(open_layer) = &layers.open_layer {
-                let start_lsn = open_layer.get_lsn_range().start;
-                if cont_lsn > start_lsn {
-                    //info!("CHECKING for {} at {} on open layer {}", key, cont_lsn, open_layer.filename().display());
-                    // Get all the data needed to reconstruct the page version from this layer.
-                    // But if we have an older cached page image, no need to go past that.
-                    let lsn_floor = max(cached_lsn + 1, start_lsn);
-                    result = match open_layer.get_value_reconstruct_data(
-                        key,
-                        lsn_floor..cont_lsn,
-                        reconstruct_state,
-                    ) {
-                        Ok(result) => result,
-                        Err(e) => return PageReconstructResult::from(e),
-                    };
-                    cont_lsn = lsn_floor;
-                    traversal_path.push((result, cont_lsn, Box::new(open_layer.clone())));
-                    continue;
-                }
-            }
-            for frozen_layer in layers.frozen_layers.iter().rev() {
-                let start_lsn = frozen_layer.get_lsn_range().start;
-                if cont_lsn > start_lsn {
-                    //info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.filename().display());
-                    let lsn_floor = max(cached_lsn + 1, start_lsn);
-                    result = match frozen_layer.get_value_reconstruct_data(
-                        key,
-                        lsn_floor..cont_lsn,
-                        reconstruct_state,
-                    ) {
-                        Ok(result) => result,
-                        Err(e) => return PageReconstructResult::from(e),
-                    };
-                    cont_lsn = lsn_floor;
-                    traversal_path.push((result, cont_lsn, Box::new(frozen_layer.clone())));
-                    continue 'outer;
-                }
-            }
+                    // Check the open and frozen in-memory layers first, in order from newest
+                    // to oldest.
+                    if let Some(open_layer) = &layers.open_layer {
+                        let start_lsn = open_layer.get_lsn_range().start;
+                        if cont_lsn > start_lsn {
+                            //info!("CHECKING for {} at {} on open layer {}", key, cont_lsn, open_layer.filename().display());
+                            // Get all the data needed to reconstruct the page version from this layer.
+                            // But if we have an older cached page image, no need to go past that.
+                            let lsn_floor = max(cached_lsn + 1, start_lsn);
+                            result = match open_layer.get_value_reconstruct_data(
+                                key,
+                                lsn_floor..cont_lsn,
+                                reconstruct_state,
+                            ) {
+                                Ok(result) => result,
+                                Err(e) => return PageReconstructResult::from(e),
+                            };
+                            cont_lsn = lsn_floor;
+                            traversal_path.push((
+                                result,
+                                cont_lsn,
+                                Box::new({
+                                    let open_layer = Arc::clone(open_layer);
+                                    move || open_layer.traversal_id()
+                                }),
+                            ));
+                            continue 'outer;
+                        }
+                    }
+                    for frozen_layer in layers.frozen_layers.iter().rev() {
+                        let start_lsn = frozen_layer.get_lsn_range().start;
+                        if cont_lsn > start_lsn {
+                            //info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.filename().display());
+                            let lsn_floor = max(cached_lsn + 1, start_lsn);
+                            result = match frozen_layer.get_value_reconstruct_data(
+                                key,
+                                lsn_floor..cont_lsn,
+                                reconstruct_state,
+                            ) {
+                                Ok(result) => result,
+                                Err(e) => return PageReconstructResult::from(e),
+                            };
+                            cont_lsn = lsn_floor;
+                            traversal_path.push((
+                                result,
+                                cont_lsn,
+                                Box::new({
+                                    let frozen_layer = Arc::clone(frozen_layer);
+                                    move || frozen_layer.traversal_id()
+                                }),
+                            ));
+                            continue 'outer;
+                        }
+                    }
 
-            if let Some(SearchResult { lsn_floor, layer }) = layers.search(key, cont_lsn) {
-                //info!("CHECKING for {} at {} on historic layer {}", key, cont_lsn, layer.filename().display());
-
-                // If it's a remote layer, the caller can do the download and retry.
-                if let Some(remote_layer) = super::storage_layer::downcast_remote_layer(&layer) {
-                    info!("need remote layer {}", layer.traversal_id());
-                    return PageReconstructResult::NeedsDownload(
-                        Weak::clone(&timeline.myself),
-                        Arc::downgrade(&remote_layer),
-                    );
-                }
-
-                let lsn_floor = max(cached_lsn + 1, lsn_floor);
-                result = match layer.get_value_reconstruct_data(
-                    key,
-                    lsn_floor..cont_lsn,
-                    reconstruct_state,
-                ) {
-                    Ok(result) => result,
-                    Err(e) => return PageReconstructResult::from(e),
+                    if let Some(SearchResult { lsn_floor, layer }) = layers.search(key, cont_lsn) {
+                        // If it's a remote layer, download it and retry.
+                        if let Some(remote_layer) =
+                            super::storage_layer::downcast_remote_layer(&layer)
+                        {
+                            // TODO: push a breadcrumb to 'traversal_path' to record the fact that
+                            // we downloaded / would need to download this layer.
+                            remote_layer // download happens outside the scope of `layers` guard object
+                        } else {
+                            // Get all the data needed to reconstruct the page version from this layer.
+                            // But if we have an older cached page image, no need to go past that.
+                            let lsn_floor = max(cached_lsn + 1, lsn_floor);
+                            result = match layer.get_value_reconstruct_data(
+                                key,
+                                lsn_floor..cont_lsn,
+                                reconstruct_state,
+                            ) {
+                                Ok(result) => result,
+                                Err(e) => return PageReconstructResult::from(e),
+                            };
+                            cont_lsn = lsn_floor;
+                            traversal_path.push((
+                                result,
+                                cont_lsn,
+                                Box::new({
+                                    let layer = Arc::clone(&layer);
+                                    move || layer.traversal_id()
+                                }),
+                            ));
+                            continue 'outer;
+                        }
+                    } else if timeline.ancestor_timeline.is_some() {
+                        // Nothing on this timeline. Traverse to parent
+                        result = ValueReconstructResult::Continue;
+                        cont_lsn = Lsn(timeline.ancestor_lsn.0 + 1);
+                        continue 'outer;
+                    } else {
+                        // Nothing found
+                        result = ValueReconstructResult::Missing;
+                        continue 'outer;
+                    }
                 };
-                cont_lsn = lsn_floor;
-                traversal_path.push((result, cont_lsn, Box::new(layer.clone())));
-            } else if timeline.ancestor_timeline.is_some() {
-                // Nothing on this timeline. Traverse to parent
-                result = ValueReconstructResult::Continue;
-                cont_lsn = Lsn(timeline.ancestor_lsn.0 + 1);
-            } else {
-                // Nothing found
-                result = ValueReconstructResult::Missing;
+                // Indicate to the caller that we need remote_layer replaced with a downloaded
+                // layer in the layer map. The control flow could be a lot simpler, but the point
+                // of this commit is to prepare this function to
+                // 1. become async
+                // 2. do the download right here, using
+                //    ```
+                //    download_remote_layer().await?;
+                //    continue 'layer_map_search;
+                //    ```
+                // For (2), current rustc requires that the layers lock guard is not in scope.
+                // Hence, the complicated control flow.
+                let remote_layer_as_persistent: Arc<dyn PersistentLayer> =
+                    Arc::clone(&remote_layer) as Arc<dyn PersistentLayer>;
+                info!(
+                    "need remote layer {}",
+                    remote_layer_as_persistent.traversal_id()
+                );
+                return PageReconstructResult::NeedsDownload(
+                    Weak::clone(&timeline.myself),
+                    Arc::downgrade(&remote_layer),
+                );
             }
         }
     }
@@ -3358,22 +3407,25 @@ where
     }
 }
 
+type TraversalPathItem = (
+    ValueReconstructResult,
+    Lsn,
+    Box<dyn FnOnce() -> TraversalId>,
+);
+
 /// Helper function for get_reconstruct_data() to add the path of layers traversed
 /// to an error, as anyhow context information.
-fn layer_traversal_error(
-    msg: String,
-    path: Vec<(ValueReconstructResult, Lsn, Box<dyn TraversalLayerExt>)>,
-) -> PageReconstructResult<()> {
+fn layer_traversal_error(msg: String, path: Vec<TraversalPathItem>) -> PageReconstructResult<()> {
     // We want the original 'msg' to be the outermost context. The outermost context
     // is the most high-level information, which also gets propagated to the client.
     let mut msg_iter = path
-        .iter()
+        .into_iter()
         .map(|(r, c, l)| {
             format!(
                 "layer traversal: result {:?}, cont_lsn {}, layer: {}",
                 r,
                 c,
-                l.traversal_id(),
+                l(),
             )
         })
         .chain(std::iter::once(msg));

From 23d5e2bdaa6ca7142e54be0fd223ec2313d224bf Mon Sep 17 00:00:00 2001
From: Kirill Bulatov <kirill@neon.tech>
Date: Sat, 7 Jan 2023 00:46:42 +0200
Subject: [PATCH 31/42] Fix common pg port in the CLI basics test (#3283)

Closes https://github.com/neondatabase/neon/issues/3282
---
 test_runner/regress/test_neon_local_cli.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/test_runner/regress/test_neon_local_cli.py b/test_runner/regress/test_neon_local_cli.py
index 49c063ce44..bd0f550ba5 100644
--- a/test_runner/regress/test_neon_local_cli.py
+++ b/test_runner/regress/test_neon_local_cli.py
@@ -1,17 +1,17 @@
-from fixtures.neon_fixtures import NeonEnvBuilder
+from fixtures.neon_fixtures import NeonEnvBuilder, PortDistributor
 
 
 # Test that neon cli is able to start and stop all processes with the user defaults.
 # Repeats the example from README.md as close as it can
-def test_neon_cli_basics(neon_env_builder: NeonEnvBuilder):
+def test_neon_cli_basics(neon_env_builder: NeonEnvBuilder, port_distributor: PortDistributor):
     env = neon_env_builder.init_configs()
     # Skipping the init step that creates a local tenant in Pytest tests
     try:
         env.neon_cli.start()
         env.neon_cli.create_tenant(tenant_id=env.initial_tenant, set_default=True)
-        env.neon_cli.pg_start(node_name="main")
+        env.neon_cli.pg_start(node_name="main", port=port_distributor.get_port())
 
         env.neon_cli.create_branch(new_branch_name="migration_check")
-        env.neon_cli.pg_start(node_name="migration_check")
+        env.neon_cli.pg_start(node_name="migration_check", port=port_distributor.get_port())
     finally:
         env.neon_cli.stop()

From 7920b39a275a7bd13155d8726c92cd417bf7e2f9 Mon Sep 17 00:00:00 2001
From: Shany Pozin <shany@neon.tech>
Date: Mon, 9 Jan 2023 10:24:50 +0200
Subject: [PATCH 32/42] Adding transition reason to the log when a tenant is
 moved to Broken state  (#3289)

#3160
---
 pageserver/src/tenant.rs     | 16 ++++++++++------
 pageserver/src/tenant/mgr.rs |  2 +-
 2 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 72404e98cd..71cdc6cf1c 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -596,7 +596,7 @@ impl Tenant {
                 match tenant_clone.attach().await {
                     Ok(_) => {}
                     Err(e) => {
-                        tenant_clone.set_broken();
+                        tenant_clone.set_broken(&e.to_string());
                         error!("error attaching tenant: {:?}", e);
                     }
                 }
@@ -860,7 +860,7 @@ impl Tenant {
                 match tenant_clone.load().await {
                     Ok(()) => {}
                     Err(err) => {
-                        tenant_clone.set_broken();
+                        tenant_clone.set_broken(&err.to_string());
                         error!("could not load tenant {tenant_id}: {err:?}");
                     }
                 }
@@ -1496,7 +1496,7 @@ impl Tenant {
         });
     }
 
-    pub fn set_broken(&self) {
+    pub fn set_broken(&self, reason: &str) {
         self.state.send_modify(|current_state| {
             match *current_state {
                 TenantState::Active => {
@@ -1505,18 +1505,22 @@ impl Tenant {
                     // activated should never be marked as broken. We cope with it the best
                     // we can, but it shouldn't happen.
                     *current_state = TenantState::Broken;
-                    warn!("Changing Active tenant to Broken state");
+                    warn!("Changing Active tenant to Broken state, reason: {}", reason);
                 }
                 TenantState::Broken => {
                     // This shouldn't happen either
-                    warn!("Tenant is already broken");
+                    warn!("Tenant is already in Broken state");
                 }
                 TenantState::Stopping => {
                     // This shouldn't happen either
                     *current_state = TenantState::Broken;
-                    warn!("Marking Stopping tenant as Broken");
+                    warn!(
+                        "Marking Stopping tenant as Broken state, reason: {}",
+                        reason
+                    );
                 }
                 TenantState::Loading | TenantState::Attaching => {
+                    info!("Setting tenant as Broken state, reason: {}", reason);
                     *current_state = TenantState::Broken;
                 }
             }
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index 44849de735..af7794490a 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -430,7 +430,7 @@ where
         Err(e) => {
             let tenants_accessor = TENANTS.read().await;
             match tenants_accessor.get(&tenant_id) {
-                Some(tenant) => tenant.set_broken(),
+                Some(tenant) => tenant.set_broken(&e.to_string()),
                 None => warn!("Tenant {tenant_id} got removed from memory"),
             }
             Err(e)

From 93c77b0383add7f4faa1c8cb71490bc08ccc8526 Mon Sep 17 00:00:00 2001
From: Sergey Melnikov <sergey@neon.tech>
Date: Mon, 9 Jan 2023 15:40:14 +0400
Subject: [PATCH 33/42] Use GHA environment for per-region deploy approvals on
 staging (#3293)

Each main deploy will wait for manual approval for each region
---
 .github/workflows/build_and_test.yml | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 2b0b0ba2bf..8123e3cbd4 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -839,7 +839,9 @@ jobs:
         shell: bash
     strategy:
       matrix:
-        target_region: [ us-east-2 ]
+        target_region: [ eu-west-1, us-east-2 ]
+    environment:
+      name: dev-${{ matrix.target_region }}
     steps:
       - name: Checkout
         uses: actions/checkout@v3
@@ -1041,6 +1043,8 @@ jobs:
             target_cluster: dev-eu-west-1-zeta
             deploy_link_proxy: false
             deploy_legacy_scram_proxy: false
+    environment:
+      name: dev-${{ matrix.target_region }}
     steps:
       - name: Checkout
         uses: actions/checkout@v3
@@ -1088,6 +1092,8 @@ jobs:
             target_cluster: dev-us-east-2-beta
           - target_region:  eu-west-1
             target_cluster: dev-eu-west-1-zeta
+    environment:
+      name: dev-${{ matrix.target_region }}
     steps:
       - name: Checkout
         uses: actions/checkout@v3

From 3a22e1335d1a49f79a9d30d323ee2808246ad131 Mon Sep 17 00:00:00 2001
From: Shany Pozin <shany@neon.tech>
Date: Mon, 9 Jan 2023 14:15:53 +0200
Subject: [PATCH 34/42] Adding a PR template (#3288)

## Describe your changes
Added a PR template
## Issue ticket number and link
#3162
## Checklist before requesting a review
- [ ] I have performed a self-review of my code
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? If so, did you add the relevant
metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.
---
 .github/PULL_REQUEST_TEMPLATE/pull_request_template.md | 10 ++++++++++
 1 file changed, 10 insertions(+)
 create mode 100644 .github/PULL_REQUEST_TEMPLATE/pull_request_template.md

diff --git a/.github/PULL_REQUEST_TEMPLATE/pull_request_template.md b/.github/PULL_REQUEST_TEMPLATE/pull_request_template.md
new file mode 100644
index 0000000000..3f32b80ca8
--- /dev/null
+++ b/.github/PULL_REQUEST_TEMPLATE/pull_request_template.md
@@ -0,0 +1,10 @@
+## Describe your changes
+
+## Issue ticket number and link
+
+## Checklist before requesting a review
+- [ ] I have performed a self-review of my code.
+- [ ] If it is a core feature, I have added thorough tests.
+- [ ] Do we need to implement analytics? If so, did you add the relevant metrics to the dashboard?
+- [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section.
+

From a457256fef5819f288b3cf660b04f26d36587b36 Mon Sep 17 00:00:00 2001
From: Kirill Bulatov <kirill@neon.tech>
Date: Mon, 9 Jan 2023 14:25:12 +0200
Subject: [PATCH 35/42] Fix log message matching (#3291)

Spotted
https://neon-github-public-dev.s3.amazonaws.com/reports/main/debug/3871991071/index.html#suites/158be07438eb5188d40b466b6acfaeb3/22966d740e33b677/
failing on `main`, fixes that by using a proper regex match string.

Also removes one clippy lint suppression.
---
 pageserver/src/tenant/timeline.rs     | 4 ++--
 test_runner/fixtures/neon_fixtures.py | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 477108ec4c..0d8a5fc800 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -1710,8 +1710,8 @@ impl Timeline {
                 continue 'outer;
             }
 
-            #[allow(unused_labels, clippy::never_loop)] // see comment at bottom of this loop
-            'layer_map_search: loop {
+            #[allow(clippy::never_loop)] // see comment at bottom of this loop
+            '_layer_map_search: loop {
                 let remote_layer = {
                     let layers = timeline.layers.read().unwrap();
 
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index bdd3dc004e..f284be8753 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1923,7 +1923,7 @@ class NeonPageserver(PgProtocol):
             ".*kill_and_wait_impl.*: wait successful.*",
             ".*Replication stream finished: db error: ERROR: Socket IO error: end streaming to Some.*",
             ".*query handler for 'pagestream.*failed: Broken pipe.*",  # pageserver notices compute shut down
-            ".*query handler for 'pagestream.*failed: Connection reset by peer (os error 104).*",  # pageserver notices compute shut down
+            ".*query handler for 'pagestream.*failed: Connection reset by peer.*",  # pageserver notices compute shut down
             # safekeeper connection can fail with this, in the window between timeline creation
             # and streaming start
             ".*Failed to process query for timeline .*: state uninitialized, no data to read.*",

From d4d0aa6ed6c9f0408723c923df01f7718d42b1b0 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Mon, 2 Jan 2023 18:44:51 +0100
Subject: [PATCH 36/42] gc_iteration_internal: better log message & debug log
 level if nothing to do

fixes https://github.com/neondatabase/neon/issues/3107
---
 pageserver/src/tenant.rs | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 71cdc6cf1c..d74f263f08 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -1874,7 +1874,12 @@ impl Tenant {
 
         utils::failpoint_sleep_millis_async!("gc_iteration_internal_after_getting_gc_timelines");
 
-        info!("starting on {} timelines", gc_timelines.len());
+        // If there is nothing to GC, we don't want any messages in the INFO log.
+        if !gc_timelines.is_empty() {
+            info!("{} timelines need GC", gc_timelines.len());
+        } else {
+            debug!("{} timelines need GC", gc_timelines.len());
+        }
 
         // Perform GC for each timeline.
         //

From 14df37c108f2accc11f6f0cd2c588a64ed48cdfa Mon Sep 17 00:00:00 2001
From: Sergey Melnikov <sergey@neon.tech>
Date: Mon, 9 Jan 2023 20:18:16 +0400
Subject: [PATCH 37/42] Use GHA environments for gradual prod rollout (#3295)

Each release will wait for manual approval for each region
---
 .github/workflows/build_and_test.yml | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 8123e3cbd4..1512c7b9aa 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -794,6 +794,8 @@ jobs:
     strategy:
       matrix:
         include: ${{fromJSON(needs.calculate-deploy-targets.outputs.matrix-include)}}
+    environment:
+      name: prod-old
     steps:
       - name: Checkout
         uses: actions/checkout@v3
@@ -913,6 +915,8 @@ jobs:
     strategy:
       matrix:
         target_region: [ us-east-2, us-west-2, eu-central-1, ap-southeast-1 ]
+    environment:
+      name: prod-${{ matrix.target_region }}
     steps:
       - name: Checkout
         uses: actions/checkout@v3
@@ -952,6 +956,8 @@ jobs:
     strategy:
       matrix:
         include: ${{fromJSON(needs.calculate-deploy-targets.outputs.matrix-include)}}
+    environment:
+      name: prod-old
     env:
       KUBECONFIG: .kubeconfig
     steps:
@@ -995,6 +1001,8 @@ jobs:
     strategy:
       matrix:
         include: ${{fromJSON(needs.calculate-deploy-targets.outputs.matrix-include)}}
+    environment:
+      name: prod-old
     env:
       KUBECONFIG: .kubeconfig
     steps:
@@ -1132,6 +1140,8 @@ jobs:
             target_cluster: prod-eu-central-1-gamma
           - target_region: ap-southeast-1
             target_cluster: prod-ap-southeast-1-epsilon
+    environment:
+      name: prod-${{ matrix.target_region }}
     steps:
       - name: Checkout
         uses: actions/checkout@v3
@@ -1171,6 +1181,8 @@ jobs:
             target_cluster: prod-eu-central-1-gamma
           - target_region: ap-southeast-1
             target_cluster: prod-ap-southeast-1-epsilon
+    environment:
+      name: prod-${{ matrix.target_region }}
     steps:
       - name: Checkout
         uses: actions/checkout@v3

From 8c07ef413ddac2f1bdfd37a078c343b5c4183c73 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Fri, 6 Jan 2023 14:54:30 +0200
Subject: [PATCH 38/42] Minor cleanup of test_ondemand_download_timetravel
 test.

- Fix and improve comments
- Rename 'physical_size' local variable to 'resident_size' for clarity.
- Remove one unnecessary 'wait_for_upload' call. The
  'wait_for_sk_commit_lsn_to_reach_remote_storage' call after shutting
  down compute is sufficient.
---
 test_runner/regress/test_ondemand_download.py | 29 ++++++++++---------
 1 file changed, 16 insertions(+), 13 deletions(-)

diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py
index 352ae4b95c..184dc13888 100644
--- a/test_runner/regress/test_ondemand_download.py
+++ b/test_runner/regress/test_ondemand_download.py
@@ -120,7 +120,7 @@ def test_ondemand_download_large_rel(
 
 
 #
-# If you have a relation with a long history of updates,the pageserver downloads the layer
+# If you have a relation with a long history of updates, the pageserver downloads the layer
 # files containing the history as needed by timetravel queries.
 #
 @pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
@@ -189,13 +189,10 @@ def test_ondemand_download_timetravel(
         # run checkpoint manually to be sure that data landed in remote storage
         client.timeline_checkpoint(tenant_id, timeline_id)
 
-    # wait until pageserver successfully uploaded a checkpoint to remote storage
-    wait_for_upload(client, tenant_id, timeline_id, current_lsn)
-    log.info("uploads have finished")
-
     ##### Stop the first pageserver instance, erase all its data
     env.postgres.stop_all()
 
+    # wait until pageserver has successfully uploaded all the data to remote storage
     wait_for_sk_commit_lsn_to_reach_remote_storage(
         tenant_id, timeline_id, env.safekeepers, env.pageserver
     )
@@ -227,11 +224,15 @@ def test_ondemand_download_timetravel(
 
     wait_until(10, 0.2, lambda: assert_tenant_status(client, tenant_id, "Active"))
 
-    # current_physical_size reports sum of layer file sizes, regardless of local or remote
+    # The current_physical_size reports the sum of layers loaded in the layer
+    # map, regardless of where the layer files are located. So even though we
+    # just removed the local files, they still count towards
+    # current_physical_size because they are loaded as `RemoteLayer`s.
     assert filled_current_physical == get_api_current_physical_size()
 
+    # Run queries at different points in time
     num_layers_downloaded = [0]
-    physical_size = [get_resident_physical_size()]
+    resident_size = [get_resident_physical_size()]
     for (checkpoint_number, lsn) in lsns:
         pg_old = env.postgres.create_start(
             branch_name="main", node_name=f"test_old_lsn_{checkpoint_number}", lsn=lsn
@@ -268,13 +269,15 @@ def test_ondemand_download_timetravel(
         if len(num_layers_downloaded) > 4:
             assert after_downloads > num_layers_downloaded[len(num_layers_downloaded) - 4]
 
-        # Likewise, assert that the physical_size metric grows as layers are downloaded
-        physical_size.append(get_resident_physical_size())
-        log.info(f"physical_size[-1]={physical_size[-1]}")
-        if len(physical_size) > 4:
-            assert physical_size[-1] > physical_size[len(physical_size) - 4]
+        # Likewise, assert that the resident_physical_size metric grows as layers are downloaded
+        resident_size.append(get_resident_physical_size())
+        log.info(f"resident_size[-1]={resident_size[-1]}")
+        if len(resident_size) > 4:
+            assert resident_size[-1] > resident_size[len(resident_size) - 4]
 
-        # current_physical_size reports sum of layer file sizes, regardless of local or remote
+        # current_physical_size reports the total size of all layer files, whether
+        # they are present only in the remote storage, only locally, or both.
+        # It should not change.
         assert filled_current_physical == get_api_current_physical_size()
 
 

From 8eebd5f039f8bf59216f4830aa5a5178eb855e22 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Wed, 4 Jan 2023 14:50:13 +0100
Subject: [PATCH 39/42] run on-demand compaction in a task_mgr task

With this patch, tenant_detach and timeline_delete's
task_mgr::shutdown_tasks() call will wait for on-demand
compaction to finish.
Before this patch, the on-demand compaction would grab the
layer_removal_cs after tenant_detach / timeline_delete had
removed the timeline directory.
This resulted in error

  No such file or directory (os error 2)

NB: I already implemented this pattern for ondemand GC a while back.

fixes https://github.com/neondatabase/neon/issues/3136
---
 pageserver/src/http/routes.rs | 16 +++++------
 pageserver/src/tenant/mgr.rs  | 50 +++++++++++++++++++++++++++++++++++
 2 files changed, 58 insertions(+), 8 deletions(-)

diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 4f4c397abe..1c5eacd362 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -738,17 +738,17 @@ async fn timeline_compact_handler(request: Request<Body>) -> Result<Response<Bod
     let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
     check_permission(&request, Some(tenant_id))?;
 
-    let tenant = mgr::get_tenant(tenant_id, true)
-        .await
-        .map_err(ApiError::NotFound)?;
-    let timeline = tenant
-        .get_timeline(timeline_id, true)
-        .map_err(ApiError::NotFound)?;
-    timeline
-        .compact()
+    let result_receiver = mgr::immediate_compact(tenant_id, timeline_id)
         .await
+        .context("spawn compaction task")
         .map_err(ApiError::InternalServerError)?;
 
+    let result: anyhow::Result<()> = result_receiver
+        .await
+        .context("receive compaction result")
+        .map_err(ApiError::InternalServerError)?;
+    result.map_err(ApiError::InternalServerError)?;
+
     json_response(StatusCode::OK, ())
 }
 
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index af7794490a..dce7cd8bae 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -492,3 +492,53 @@ pub async fn immediate_gc(
 
     Ok(wait_task_done)
 }
+
+#[cfg(feature = "testing")]
+pub async fn immediate_compact(
+    tenant_id: TenantId,
+    timeline_id: TimelineId,
+) -> Result<tokio::sync::oneshot::Receiver<anyhow::Result<()>>, ApiError> {
+    let guard = TENANTS.read().await;
+
+    let tenant = guard
+        .get(&tenant_id)
+        .map(Arc::clone)
+        .with_context(|| format!("Tenant {tenant_id} not found"))
+        .map_err(ApiError::NotFound)?;
+
+    let timeline = tenant
+        .get_timeline(timeline_id, true)
+        .map_err(ApiError::NotFound)?;
+
+    // Run in task_mgr to avoid race with detach operation
+    let (task_done, wait_task_done) = tokio::sync::oneshot::channel();
+    task_mgr::spawn(
+        &tokio::runtime::Handle::current(),
+        TaskKind::Compaction,
+        Some(tenant_id),
+        Some(timeline_id),
+        &format!(
+            "timeline_compact_handler compaction run for tenant {tenant_id} timeline {timeline_id}"
+        ),
+        false,
+        async move {
+            let result = timeline
+                .compact()
+                .instrument(
+                    info_span!("manual_compact", tenant = %tenant_id, timeline = %timeline_id),
+                )
+                .await;
+
+            match task_done.send(result) {
+                Ok(_) => (),
+                Err(result) => error!("failed to send compaction result: {result:?}"),
+            }
+            Ok(())
+        },
+    );
+
+    // hold the guard until after we've spawned the task so that timeline shutdown will wait for the task
+    drop(guard);
+
+    Ok(wait_task_done)
+}

From 0807522a6433e9697ba9311bbe3a5a22f4ed1b59 Mon Sep 17 00:00:00 2001
From: Arthur Petukhovsky <petuhovskiy@yandex.ru>
Date: Mon, 9 Jan 2023 23:56:12 +0400
Subject: [PATCH 40/42] Enable wss proxy in all regions (#3292)

Follow-up to https://github.com/neondatabase/helm-charts/pull/24 and
#3247
---
 .github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml    | 2 ++
 .../helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml | 2 ++
 .github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml    | 2 ++
 .../prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml           | 2 ++
 .../helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml   | 2 ++
 .github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml  | 2 ++
 .github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml    | 2 ++
 .github/helm-values/production.proxy-scram.yaml                 | 2 ++
 8 files changed, 16 insertions(+)

diff --git a/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml b/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml
index ae9c1f2e40..08304503c5 100644
--- a/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml
+++ b/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml
@@ -9,6 +9,7 @@ settings:
   authEndpoint: "http://console-staging.local/management/api/v2"
   domain: "*.eu-west-1.aws.neon.build"
   sentryEnvironment: "development"
+  wssPort: 8443
 
 # -- Additional labels for neon-proxy pods
 podLabels:
@@ -23,6 +24,7 @@ exposedService:
     service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
     service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
     external-dns.alpha.kubernetes.io/hostname: eu-west-1.aws.neon.build
+  httpsPort: 443
 
 #metrics:
 #  enabled: true
diff --git a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml
index a2f932e4fb..be0fc329c9 100644
--- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml
+++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml
@@ -9,6 +9,7 @@ settings:
   authEndpoint: "http://console-staging.local/management/api/v2"
   domain: "*.cloud.stage.neon.tech"
   sentryEnvironment: "development"
+  wssPort: 8443
 
 # -- Additional labels for neon-proxy pods
 podLabels:
@@ -23,6 +24,7 @@ exposedService:
     service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
     service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
     external-dns.alpha.kubernetes.io/hostname: neon-proxy-scram-legacy.beta.us-east-2.aws.neon.build
+  httpsPort: 443
 
 #metrics:
 #  enabled: true
diff --git a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml
index 1138536e94..b7f712585b 100644
--- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml
+++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml
@@ -9,6 +9,7 @@ settings:
   authEndpoint: "http://console-staging.local/management/api/v2"
   domain: "*.us-east-2.aws.neon.build"
   sentryEnvironment: "development"
+  wssPort: 8443
 
 # -- Additional labels for neon-proxy pods
 podLabels:
@@ -23,6 +24,7 @@ exposedService:
     service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
     service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
     external-dns.alpha.kubernetes.io/hostname: us-east-2.aws.neon.build
+  httpsPort: 443
 
 #metrics:
 #  enabled: true
diff --git a/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml b/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml
index 4e4aff1f9e..e9e89aff7c 100644
--- a/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml
+++ b/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml
@@ -9,6 +9,7 @@ settings:
   authEndpoint: "http://console-release.local/management/api/v2"
   domain: "*.ap-southeast-1.aws.neon.tech"
   sentryEnvironment: "production"
+  wssPort: 8443
 
 # -- Additional labels for neon-proxy pods
 podLabels:
@@ -23,6 +24,7 @@ exposedService:
     service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
     service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
     external-dns.alpha.kubernetes.io/hostname: ap-southeast-1.aws.neon.tech
+  httpsPort: 443
 
 #metrics:
 #  enabled: true
diff --git a/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml b/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml
index 94290a87e1..5366ba4ae5 100644
--- a/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml
+++ b/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml
@@ -9,6 +9,7 @@ settings:
   authEndpoint: "http://console-release.local/management/api/v2"
   domain: "*.eu-central-1.aws.neon.tech"
   sentryEnvironment: "production"
+  wssPort: 8443
 
 # -- Additional labels for neon-proxy pods
 podLabels:
@@ -23,6 +24,7 @@ exposedService:
     service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
     service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
     external-dns.alpha.kubernetes.io/hostname: eu-central-1.aws.neon.tech
+  httpsPort: 443
 
 #metrics:
 #  enabled: true
diff --git a/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml b/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml
index 1a4023708b..e71e457f13 100644
--- a/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml
+++ b/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml
@@ -9,6 +9,7 @@ settings:
   authEndpoint: "http://console-release.local/management/api/v2"
   domain: "*.us-east-2.aws.neon.tech"
   sentryEnvironment: "production"
+  wssPort: 8443
 
 # -- Additional labels for neon-proxy pods
 podLabels:
@@ -23,6 +24,7 @@ exposedService:
     service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
     service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
     external-dns.alpha.kubernetes.io/hostname: us-east-2.aws.neon.tech
+  httpsPort: 443
 
 #metrics:
 #  enabled: true
diff --git a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml
index 2942d6a2aa..9afe94edd1 100644
--- a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml
+++ b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml
@@ -9,6 +9,7 @@ settings:
   authEndpoint: "http://console-release.local/management/api/v2"
   domain: "*.us-west-2.aws.neon.tech"
   sentryEnvironment: "production"
+  wssPort: 8443
 
 # -- Additional labels for neon-proxy pods
 podLabels:
@@ -23,6 +24,7 @@ exposedService:
     service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
     service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
     external-dns.alpha.kubernetes.io/hostname: us-west-2.aws.neon.tech
+  httpsPort: 443
 
 #metrics:
 #  enabled: true
diff --git a/.github/helm-values/production.proxy-scram.yaml b/.github/helm-values/production.proxy-scram.yaml
index c7143cd61a..8143f7e575 100644
--- a/.github/helm-values/production.proxy-scram.yaml
+++ b/.github/helm-values/production.proxy-scram.yaml
@@ -3,6 +3,7 @@ settings:
   authEndpoint: "http://console-release.local/management/api/v2"
   domain: "*.cloud.neon.tech"
   sentryEnvironment: "production"
+  wssPort: 8443
 
 podLabels:
   zenith_service: proxy-scram
@@ -16,6 +17,7 @@ exposedService:
     service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
     service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
     external-dns.alpha.kubernetes.io/hostname: '*.cloud.neon.tech'
+  httpsPort: 443
 
 metrics:
   enabled: true

From 80d4afab0c78883a77bb927cc867677fc93b5a44 Mon Sep 17 00:00:00 2001
From: Vadim Kharitonov <vadim@neon.tech>
Date: Mon, 9 Jan 2023 22:28:23 +0100
Subject: [PATCH 41/42] Update tokio version (RUSTSEC-2023-0001)

---
 Cargo.lock                | 7 +++----
 workspace_hack/Cargo.toml | 2 +-
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 284a111ba7..1649e28faa 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3731,9 +3731,9 @@ dependencies = [
 
 [[package]]
 name = "tokio"
-version = "1.21.1"
+version = "1.24.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0020c875007ad96677dcc890298f4b942882c5d4eb7cc8f439fc3bf813dc9c95"
+checksum = "1d9f76183f91ecfb55e1d7d5602bd1d979e38a3a522fe900241cf195624d67ae"
 dependencies = [
  "autocfg",
  "bytes",
@@ -3741,12 +3741,11 @@ dependencies = [
  "memchr",
  "mio",
  "num_cpus",
- "once_cell",
  "pin-project-lite",
  "signal-hook-registry",
  "socket2",
  "tokio-macros",
- "winapi",
+ "windows-sys 0.42.0",
 ]
 
 [[package]]
diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml
index 989cc9202e..3aff839b81 100644
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -40,7 +40,7 @@ scopeguard = { version = "1", features = ["use_std"] }
 serde = { version = "1", features = ["alloc", "derive", "serde_derive", "std"] }
 serde_json = { version = "1", features = ["raw_value", "std"] }
 socket2 = { version = "0.4", default-features = false, features = ["all"] }
-tokio = { version = "1", features = ["bytes", "fs", "io-std", "io-util", "libc", "macros", "memchr", "mio", "net", "num_cpus", "once_cell", "process", "rt", "rt-multi-thread", "signal-hook-registry", "socket2", "sync", "time", "tokio-macros"] }
+tokio = { version = "1", features = ["bytes", "fs", "io-std", "io-util", "libc", "macros", "memchr", "mio", "net", "num_cpus", "process", "rt", "rt-multi-thread", "signal-hook-registry", "socket2", "sync", "time", "tokio-macros"] }
 tokio-util = { version = "0.7", features = ["codec", "io", "io-util", "tracing"] }
 tower = { version = "0.4", features = ["__common", "balance", "buffer", "discover", "futures-core", "futures-util", "indexmap", "limit", "load", "log", "make", "pin-project", "pin-project-lite", "rand", "ready-cache", "retry", "slab", "timeout", "tokio", "tokio-util", "tracing", "util"] }
 tracing = { version = "0.1", features = ["attributes", "log", "std", "tracing-attributes"] }

From 95bf19b85a06b27a7fc3118dee03d48648efab15 Mon Sep 17 00:00:00 2001
From: Sergey Melnikov <sergey@neon.tech>
Date: Tue, 10 Jan 2023 14:05:27 +0400
Subject: [PATCH 42/42] Add --atomic to all helm upgrade operations (#3299)

When the number of GitHub Actions workers is changed, some jobs get killed.
When helm is killed during the upgrade, the release gets stuck in the
pending-upgrade state. --atomic should initiate an automatic rollback in this case.
---
 .github/workflows/build_and_test.yml | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 1512c7b9aa..1bbba8e3fd 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -983,8 +983,8 @@ jobs:
       - name: Re-deploy proxy
         run: |
           DOCKER_TAG=${{needs.tag.outputs.build-tag}}
-          helm upgrade ${{ matrix.proxy_job }}       neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}.yaml       --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
-          helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
+          helm upgrade ${{ matrix.proxy_job }}       neondatabase/neon-proxy --namespace neon-proxy --install --atomic -f .github/helm-values/${{ matrix.proxy_config }}.yaml       --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
+          helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace neon-proxy --install --atomic -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
 
   deploy-storage-broker:
     name: deploy storage broker on old staging and old prod
@@ -1068,19 +1068,19 @@ jobs:
       - name: Re-deploy scram proxy
         run: |
           DOCKER_TAG=${{needs.tag.outputs.build-tag}}
-          helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
+          helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
 
       - name: Re-deploy link proxy
         if: matrix.deploy_link_proxy
         run: |
           DOCKER_TAG=${{needs.tag.outputs.build-tag}}
-          helm upgrade neon-proxy-link neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-link.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
+          helm upgrade neon-proxy-link neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-link.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
 
       - name: Re-deploy legacy scram proxy
         if: matrix.deploy_legacy_scram_proxy
         run: |
           DOCKER_TAG=${{needs.tag.outputs.build-tag}}
-          helm upgrade neon-proxy-scram-legacy neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram-legacy.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
+          helm upgrade neon-proxy-scram-legacy neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram-legacy.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
 
   deploy-storage-broker-dev-new:
     runs-on: [ self-hosted, dev, x64 ]
@@ -1157,7 +1157,7 @@ jobs:
       - name: Re-deploy proxy
         run: |
           DOCKER_TAG=${{needs.tag.outputs.build-tag}}
-          helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
+          helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
 
   deploy-storage-broker-prod-new:
     runs-on: prod