From 4957bb2d4879f44c80b24dad1df3fd0457acb670 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Mon, 3 Jul 2023 13:53:57 +0300
Subject: [PATCH 01/32] fix(proxy): stray span enter globbers up logs (#4612)

Prod logs have deep accidential span nesting. Introduced in #3759 and
has been untouched since, maybe no one watches proxy logs? :) I found it
by accident when looking to see if proxy logs have ansi colors with
`{neon_service="proxy"}`.

The solution is to mostly stop using `Span::enter` or `Span::entered` in
async code. Kept on `Span::entered` in cancel on shutdown related path.
---
 proxy/src/console/mgmt.rs | 37 ++++++++++++++++++++++++-------------
 1 file changed, 24 insertions(+), 13 deletions(-)
diff --git a/proxy/src/console/mgmt.rs b/proxy/src/console/mgmt.rs
index 30364be6f4..35d1ff59b7 100644
--- a/proxy/src/console/mgmt.rs
+++ b/proxy/src/console/mgmt.rs
@@ -8,7 +8,7 @@ use postgres_backend::{self, AuthType, PostgresBackend, PostgresBackendTCP, Quer
 use pq_proto::{BeMessage, SINGLE_COL_ROWDESC};
 use std::future;
 use tokio::net::{TcpListener, TcpStream};
-use tracing::{error, info, info_span};
+use tracing::{error, info, info_span, Instrument};
 
 static CPLANE_WAITERS: Lazy<Waiters<ComputeReady>> = Lazy::new(Default::default);
 
@@ -44,19 +44,30 @@ pub async fn task_main(listener: TcpListener) -> anyhow::Result<()> {
             .set_nodelay(true)
             .context("failed to set client socket option")?;
 
-        tokio::task::spawn(async move {
-            let span = info_span!("mgmt", peer = %peer_addr);
-            let _enter = span.enter();
+        let span = info_span!("mgmt", peer = %peer_addr);
 
-            info!("started a new console management API thread");
-            scopeguard::defer! {
-                info!("console management API thread is about to finish");
-            }
+        tokio::task::spawn(
+            async move {
+                info!("serving a new console management API connection");
 
-            if let Err(e) = handle_connection(socket).await {
-                error!("thread failed with an error: {e}");
+                // these might be long running connections, have a separate logging for cancelling
+                // on shutdown and other ways of stopping.
+                let cancelled = scopeguard::guard(tracing::Span::current(), |span| {
+                    let _e = span.entered();
+                    info!("console management API task cancelled");
+                });
+
+                if let Err(e) = handle_connection(socket).await {
+                    error!("serving failed with an error: {e}");
+                } else {
+                    info!("serving completed");
+                }
+
+                // we can no longer get dropped
+                scopeguard::ScopeGuard::into_inner(cancelled);
             }
-        });
+            .instrument(span),
+        );
     }
 }
 
@@ -77,14 +88,14 @@ impl postgres_backend::Handler<tokio::net::TcpStream> for MgmtHandler {
         pgb: &mut PostgresBackendTCP,
         query: &str,
     ) -> Result<(), QueryError> {
-        try_process_query(pgb, query).await.map_err(|e| {
+        try_process_query(pgb, query).map_err(|e| {
             error!("failed to process response: {e:?}");
             e
         })
     }
 }
 
-async fn try_process_query(pgb: &mut PostgresBackendTCP, query: &str) -> Result<(), QueryError> {
+fn try_process_query(pgb: &mut PostgresBackendTCP, query: &str) -> Result<(), QueryError> {
     let resp: KickSession = serde_json::from_str(query).context("Failed to parse query as json")?;
 
     let span = info_span!("event", session_id = resp.session_id);

From 86604b3b7dd4a3f51bee4a8b07bd44b74682c0fb Mon Sep 17 00:00:00 2001
From: Alek Westover <alek.westover@gmail.com>
Date: Mon, 3 Jul 2023 07:37:30 -0400
Subject: [PATCH 02/32] Delete Unnecessary files in Extension Bucket (#4606)

Co-authored-by: Alexander Bayandin <alexander@neon.tech>
---
 .github/workflows/build_and_test.yml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 068e34420f..cd4906579e 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -967,6 +967,10 @@ jobs:
           rm -rf ./extensions-to-upload/share/extension/neon*
           rm -rf ./extensions-to-upload/lib/neon*
 
+          # Delete leftovers from the extension build step
+          rm -rf ./extensions-to-upload/lib/pgxs
+          rm -rf ./extensions-to-upload/lib/pkgconfig
+
           docker cp ${{ steps.create-container.outputs.EID }}:/extensions ./custom-extensions
           for EXT_NAME in $(ls ./custom-extensions); do
             mkdir -p ./extensions-to-upload/${EXT_NAME}/share

From 26828560a82dadb47585c8aabfd8b569acd44448 Mon Sep 17 00:00:00 2001
From: Shany Pozin <shany@neon.tech>
Date: Mon, 3 Jul 2023 15:20:49 +0300
Subject: [PATCH 03/32]  Add timeouts and retries to consumption metrics
 reporting client (#4563)

## Problem
#4528
## Summary of changes
Add a 60 seconds default timeout to the reqwest client
Add retries for up to 3 times to call into the metric consumption
endpoint

---------

Co-authored-by: Christian Schwarz <christian@neon.tech>
---
 Cargo.lock                            | 94 ++++++++++++++++++++++++---
 Cargo.toml                            |  1 +
 pageserver/src/consumption_metrics.rs | 74 ++++++++++++++-------
 proxy/Cargo.toml                      |  1 +
 proxy/src/http.rs                     | 21 ++++++
 proxy/src/metrics.rs                  | 58 +++++++++--------
 6 files changed, 190 insertions(+), 59 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 4be74614c2..c3dc0243dd 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1140,7 +1140,7 @@ dependencies = [
  "crossterm_winapi",
  "libc",
  "mio",
- "parking_lot",
+ "parking_lot 0.12.1",
  "signal-hook",
  "signal-hook-mio",
  "winapi",
@@ -1210,7 +1210,7 @@ dependencies = [
  "hashbrown 0.12.3",
  "lock_api",
  "once_cell",
- "parking_lot_core",
+ "parking_lot_core 0.9.7",
 ]
 
 [[package]]
@@ -1939,6 +1939,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c"
 dependencies = [
  "cfg-if",
+ "js-sys",
+ "wasm-bindgen",
+ "web-sys",
 ]
 
 [[package]]
@@ -2629,6 +2632,17 @@ dependencies = [
  "workspace_hack",
 ]
 
+[[package]]
+name = "parking_lot"
+version = "0.11.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7d17b78036a60663b797adeaee46f5c9dfebb86948d1255007a1d6be0271ff99"
+dependencies = [
+ "instant",
+ "lock_api",
+ "parking_lot_core 0.8.6",
+]
+
 [[package]]
 name = "parking_lot"
 version = "0.12.1"
@@ -2636,7 +2650,21 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f"
 dependencies = [
  "lock_api",
- "parking_lot_core",
+ "parking_lot_core 0.9.7",
+]
+
+[[package]]
+name = "parking_lot_core"
+version = "0.8.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "60a2cfe6f0ad2bfc16aefa463b497d5c7a5ecd44a23efa72aa342d90177356dc"
+dependencies = [
+ "cfg-if",
+ "instant",
+ "libc",
+ "redox_syscall 0.2.16",
+ "smallvec",
+ "winapi",
 ]
 
 [[package]]
@@ -2957,7 +2985,7 @@ dependencies = [
  "lazy_static",
  "libc",
  "memchr",
- "parking_lot",
+ "parking_lot 0.12.1",
  "procfs",
  "thiserror",
 ]
@@ -3045,7 +3073,7 @@ dependencies = [
  "native-tls",
  "once_cell",
  "opentelemetry",
- "parking_lot",
+ "parking_lot 0.12.1",
  "pin-project-lite",
  "postgres-native-tls",
  "postgres_backend",
@@ -3056,6 +3084,7 @@ dependencies = [
  "regex",
  "reqwest",
  "reqwest-middleware",
+ "reqwest-retry",
  "reqwest-tracing",
  "routerify",
  "rstest",
@@ -3291,6 +3320,29 @@ dependencies = [
  "thiserror",
 ]
 
+[[package]]
+name = "reqwest-retry"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "48d0fd6ef4c6d23790399fe15efc8d12cd9f3d4133958f9bd7801ee5cbaec6c4"
+dependencies = [
+ "anyhow",
+ "async-trait",
+ "chrono",
+ "futures",
+ "getrandom",
+ "http",
+ "hyper",
+ "parking_lot 0.11.2",
+ "reqwest",
+ "reqwest-middleware",
+ "retry-policies",
+ "task-local-extensions",
+ "tokio",
+ "tracing",
+ "wasm-timer",
+]
+
 [[package]]
 name = "reqwest-tracing"
 version = "0.4.4"
@@ -3309,6 +3361,17 @@ dependencies = [
  "tracing-opentelemetry",
 ]
 
+[[package]]
+name = "retry-policies"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e09bbcb5003282bcb688f0bae741b278e9c7e8f378f561522c9806c58e075d9b"
+dependencies = [
+ "anyhow",
+ "chrono",
+ "rand",
+]
+
 [[package]]
 name = "ring"
 version = "0.16.20"
@@ -3518,7 +3581,7 @@ dependencies = [
  "hyper",
  "metrics",
  "once_cell",
- "parking_lot",
+ "parking_lot 0.12.1",
  "postgres",
  "postgres-protocol",
  "postgres_backend",
@@ -3947,7 +4010,7 @@ dependencies = [
  "hyper",
  "metrics",
  "once_cell",
- "parking_lot",
+ "parking_lot 0.12.1",
  "prost",
  "tokio",
  "tokio-stream",
@@ -4281,7 +4344,7 @@ dependencies = [
  "futures-channel",
  "futures-util",
  "log",
- "parking_lot",
+ "parking_lot 0.12.1",
  "percent-encoding",
  "phf",
  "pin-project-lite",
@@ -4991,6 +5054,21 @@ version = "0.2.86"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ed9d5b4305409d1fc9482fee2d7f9bcbf24b3972bf59817ef757e23982242a93"
 
+[[package]]
+name = "wasm-timer"
+version = "0.2.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "be0ecb0db480561e9a7642b5d3e4187c128914e58aa84330b9493e3eb68c5e7f"
+dependencies = [
+ "futures",
+ "js-sys",
+ "parking_lot 0.11.2",
+ "pin-utils",
+ "wasm-bindgen",
+ "wasm-bindgen-futures",
+ "web-sys",
+]
+
 [[package]]
 name = "web-sys"
 version = "0.3.63"
diff --git a/Cargo.toml b/Cargo.toml
index 551a9dc783..6d327f5802 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -95,6 +95,7 @@ regex = "1.4"
 reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] }
 reqwest-tracing = { version = "0.4.0", features = ["opentelemetry_0_18"] }
 reqwest-middleware = "0.2.0"
+reqwest-retry = "0.2.2"
 routerify = "3"
 rpds = "0.13"
 rustls = "0.20"
diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs
index ca7b9650e8..f7592afc12 100644
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -24,6 +24,8 @@ const RESIDENT_SIZE: &str = "resident_size";
 const REMOTE_STORAGE_SIZE: &str = "remote_storage_size";
 const TIMELINE_LOGICAL_SIZE: &str = "timeline_logical_size";
 
+const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60);
+
 #[serde_as]
 #[derive(Serialize, Debug)]
 struct Ids {
@@ -73,7 +75,10 @@ pub async fn collect_metrics(
     );
 
     // define client here to reuse it for all requests
-    let client = reqwest::Client::new();
+    let client = reqwest::ClientBuilder::new()
+        .timeout(DEFAULT_HTTP_REPORTING_TIMEOUT)
+        .build()
+        .expect("Failed to create http client with timeout");
     let mut cached_metrics: HashMap<PageserverConsumptionMetricsKey, u64> = HashMap::new();
     let mut prev_iteration_time: std::time::Instant = std::time::Instant::now();
 
@@ -83,7 +88,7 @@ pub async fn collect_metrics(
                 info!("collect_metrics received cancellation request");
                 return Ok(());
             },
-            _ = ticker.tick() => {
+            tick_at = ticker.tick() => {
 
                 // send cached metrics every cached_metric_collection_interval
                 let send_cached = prev_iteration_time.elapsed() >= cached_metric_collection_interval;
@@ -93,6 +98,12 @@ pub async fn collect_metrics(
                 }
 
                 collect_metrics_iteration(&client, &mut cached_metrics, metric_collection_endpoint, node_id, &ctx, send_cached).await;
+
+                crate::tenant::tasks::warn_when_period_overrun(
+                    tick_at.elapsed(),
+                    metric_collection_interval,
+                    "consumption_metrics_collect_metrics",
+                );
             }
         }
     }
@@ -273,31 +284,42 @@ pub async fn collect_metrics_iteration(
         })
         .expect("PageserverConsumptionMetric should not fail serialization");
 
-        let res = client
-            .post(metric_collection_endpoint.clone())
-            .json(&chunk_json)
-            .send()
-            .await;
+        const MAX_RETRIES: u32 = 3;
 
-        match res {
-            Ok(res) => {
-                if res.status().is_success() {
-                    // update cached metrics after they were sent successfully
-                    for (curr_key, curr_val) in chunk.iter() {
-                        cached_metrics.insert(curr_key.clone(), *curr_val);
-                    }
-                } else {
-                    error!("metrics endpoint refused the sent metrics: {:?}", res);
-                    for metric in chunk_to_send.iter() {
-                        // Report if the metric value is suspiciously large
-                        if metric.value > (1u64 << 40) {
+        for attempt in 0..MAX_RETRIES {
+            let res = client
+                .post(metric_collection_endpoint.clone())
+                .json(&chunk_json)
+                .send()
+                .await;
+
+            match res {
+                Ok(res) => {
+                    if res.status().is_success() {
+                        // update cached metrics after they were sent successfully
+                        for (curr_key, curr_val) in chunk.iter() {
+                            cached_metrics.insert(curr_key.clone(), *curr_val);
+                        }
+                    } else {
+                        error!("metrics endpoint refused the sent metrics: {:?}", res);
+                        for metric in chunk_to_send
+                            .iter()
+                            .filter(|metric| metric.value > (1u64 << 40))
+                        {
+                            // Report if the metric value is suspiciously large
                             error!("potentially abnormal metric value: {:?}", metric);
                         }
                     }
+                    break;
+                }
+                Err(err) if err.is_timeout() => {
+                    error!(attempt, "timeout sending metrics, retrying immediately");
+                    continue;
+                }
+                Err(err) => {
+                    error!(attempt, ?err, "failed to send metrics");
+                    break;
                 }
-            }
-            Err(err) => {
-                error!("failed to send metrics: {:?}", err);
             }
         }
     }
@@ -317,7 +339,7 @@ pub async fn calculate_synthetic_size_worker(
             _ = task_mgr::shutdown_watcher() => {
                 return Ok(());
             },
-        _ = ticker.tick() => {
+        tick_at = ticker.tick() => {
 
                 let tenants = match mgr::list_tenants().await {
                     Ok(tenants) => tenants,
@@ -343,6 +365,12 @@ pub async fn calculate_synthetic_size_worker(
                     }
 
                 }
+
+                crate::tenant::tasks::warn_when_period_overrun(
+                    tick_at.elapsed(),
+                    synthetic_size_calculation_interval,
+                    "consumption_metrics_synthetic_size_worker",
+                );
             }
         }
     }
diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml
index e7a4fd236e..d43ea714b1 100644
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -38,6 +38,7 @@ rand.workspace = true
 regex.workspace = true
 reqwest = { workspace = true, features = ["json"] }
 reqwest-middleware.workspace = true
+reqwest-retry.workspace = true
 reqwest-tracing.workspace = true
 routerify.workspace = true
 rustls-pemfile.workspace = true
diff --git a/proxy/src/http.rs b/proxy/src/http.rs
index 5cf49b669c..153cdc02e5 100644
--- a/proxy/src/http.rs
+++ b/proxy/src/http.rs
@@ -6,8 +6,11 @@ pub mod server;
 pub mod sql_over_http;
 pub mod websocket;
 
+use std::time::Duration;
+
 pub use reqwest::{Request, Response, StatusCode};
 pub use reqwest_middleware::{ClientWithMiddleware, Error};
+pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware};
 
 use crate::url::ApiUrl;
 use reqwest_middleware::RequestBuilder;
@@ -21,6 +24,24 @@ pub fn new_client() -> ClientWithMiddleware {
         .build()
 }
 
+pub fn new_client_with_timeout(default_timout: Duration) -> ClientWithMiddleware {
+    let timeout_client = reqwest::ClientBuilder::new()
+        .timeout(default_timout)
+        .build()
+        .expect("Failed to create http client with timeout");
+
+    let retry_policy =
+        ExponentialBackoff::builder().build_with_total_retry_duration(default_timout);
+
+    reqwest_middleware::ClientBuilder::new(timeout_client)
+        .with(reqwest_tracing::TracingMiddleware::default())
+        // As per docs, "This middleware always errors when given requests with streaming bodies".
+        // That's all right because we only use this client to send `serde_json::RawValue`, which
+        // is not a stream.
+        .with(RetryTransientMiddleware::new_with_policy(retry_policy))
+        .build()
+}
+
 /// Thin convenience wrapper for an API provided by an http endpoint.
 #[derive(Debug, Clone)]
 pub struct Endpoint {
diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs
index 6ae1e3a447..00fd7f0405 100644
--- a/proxy/src/metrics.rs
+++ b/proxy/src/metrics.rs
@@ -4,11 +4,13 @@ use crate::{config::MetricCollectionConfig, http};
 use chrono::{DateTime, Utc};
 use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE};
 use serde::Serialize;
-use std::collections::HashMap;
+use std::{collections::HashMap, time::Duration};
 use tracing::{error, info, instrument, trace, warn};
 
 const PROXY_IO_BYTES_PER_CLIENT: &str = "proxy_io_bytes_per_client";
 
+const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60);
+
 ///
 /// Key that uniquely identifies the object, this metric describes.
 /// Currently, endpoint_id is enough, but this may change later,
@@ -30,7 +32,7 @@ pub async fn task_main(config: &MetricCollectionConfig) -> anyhow::Result<()> {
         info!("metrics collector has shut down");
     }
 
-    let http_client = http::new_client();
+    let http_client = http::new_client_with_timeout(DEFAULT_HTTP_REPORTING_TIMEOUT);
     let mut cached_metrics: HashMap<Ids, (u64, DateTime<Utc>)> = HashMap::new();
     let hostname = hostname::get()?.as_os_str().to_string_lossy().into_owned();
 
@@ -182,36 +184,36 @@ async fn collect_metrics_iteration(
             }
         };
 
-        if res.status().is_success() {
-            // update cached metrics after they were sent successfully
-            for send_metric in chunk {
-                let stop_time = match send_metric.kind {
-                    EventType::Incremental { stop_time, .. } => stop_time,
-                    _ => unreachable!(),
-                };
-
-                cached_metrics
-                    .entry(Ids {
-                        endpoint_id: send_metric.extra.endpoint_id.clone(),
-                        branch_id: send_metric.extra.branch_id.clone(),
-                    })
-                    // update cached value (add delta) and time
-                    .and_modify(|e| {
-                        e.0 = e.0.saturating_add(send_metric.value);
-                        e.1 = stop_time
-                    })
-                    // cache new metric
-                    .or_insert((send_metric.value, stop_time));
-            }
-        } else {
+        if !res.status().is_success() {
             error!("metrics endpoint refused the sent metrics: {:?}", res);
-            for metric in chunk.iter() {
+            for metric in chunk.iter().filter(|metric| metric.value > (1u64 << 40)) {
                 // Report if the metric value is suspiciously large
-                if metric.value > (1u64 << 40) {
-                    error!("potentially abnormal metric value: {:?}", metric);
-                }
+                error!("potentially abnormal metric value: {:?}", metric);
             }
         }
+        // update cached metrics after they were sent
+        // (to avoid sending the same metrics twice)
+        // see the relevant discussion on why to do so even if the status is not success:
+        // https://github.com/neondatabase/neon/pull/4563#discussion_r1246710956
+        for send_metric in chunk {
+            let stop_time = match send_metric.kind {
+                EventType::Incremental { stop_time, .. } => stop_time,
+                _ => unreachable!(),
+            };
+
+            cached_metrics
+                .entry(Ids {
+                    endpoint_id: send_metric.extra.endpoint_id.clone(),
+                    branch_id: send_metric.extra.branch_id.clone(),
+                })
+                // update cached value (add delta) and time
+                .and_modify(|e| {
+                    e.0 = e.0.saturating_add(send_metric.value);
+                    e.1 = stop_time
+                })
+                // cache new metric
+                .or_insert((send_metric.value, stop_time));
+        }
     }
     Ok(())
 }

From 24eaa3b7ca818558c8255343e8fa134d876449fe Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Mon, 3 Jul 2023 14:21:10 +0200
Subject: [PATCH 04/32] timeline creation: reflect failures due to ancestor LSN
 issues in status code (#4600)

Before, it was a `500` and control plane would retry, wereas it actually
should have stopped retrying.

(Stacked on top of https://github.com/neondatabase/neon/pull/4597 )

fixes https://github.com/neondatabase/neon/issues/4595
part of https://github.com/neondatabase/cloud/issues/5626

---------

Co-authored-by: Shany Pozin <shany@neon.tech>
---
 pageserver/src/http/openapi_spec.yml      |  6 ++++
 pageserver/src/http/routes.rs             |  5 +++
 pageserver/src/tenant.rs                  | 23 +++++++++----
 test_runner/fixtures/pageserver/http.py   | 19 +++++++++--
 test_runner/regress/test_branch_and_gc.py | 10 +++++-
 test_runner/regress/test_branch_behind.py | 41 +++++++++++++++++++----
 6 files changed, 87 insertions(+), 17 deletions(-)

diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml
index 33a2e32141..290492203a 100644
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -722,6 +722,12 @@ paths:
             application/json:
               schema:
                 $ref: "#/components/schemas/ForbiddenError"
+        "406":
+          description: Permanently unsatisfiable request, don't retry.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/Error"
         "409":
           description: Timeline already exists, creation skipped
           content:
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 08d3fcf8df..58dcbb2aac 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -338,6 +338,11 @@ async fn timeline_create_handler(
             Err(tenant::CreateTimelineError::AlreadyExists) => {
                 json_response(StatusCode::CONFLICT, ())
             }
+            Err(tenant::CreateTimelineError::AncestorLsn(err)) => {
+                json_response(StatusCode::NOT_ACCEPTABLE, HttpErrorBody::from_msg(
+                    format!("{err:#}")
+                ))
+            }
             Err(tenant::CreateTimelineError::Other(err)) => Err(ApiError::InternalServerError(err)),
         }
     }
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 339291149f..3fa2a4bab4 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -506,6 +506,8 @@ pub enum CreateTimelineError {
     #[error("a timeline with the given ID already exists")]
     AlreadyExists,
     #[error(transparent)]
+    AncestorLsn(anyhow::Error),
+    #[error(transparent)]
     Other(#[from] anyhow::Error),
 }
 
@@ -1432,7 +1434,7 @@ impl Tenant {
                     let ancestor_ancestor_lsn = ancestor_timeline.get_ancestor_lsn();
                     if ancestor_ancestor_lsn > *lsn {
                         // can we safely just branch from the ancestor instead?
-                        return Err(CreateTimelineError::Other(anyhow::anyhow!(
+                        return Err(CreateTimelineError::AncestorLsn(anyhow::anyhow!(
                             "invalid start lsn {} for ancestor timeline {}: less than timeline ancestor lsn {}",
                             lsn,
                             ancestor_timeline_id,
@@ -2715,7 +2717,7 @@ impl Tenant {
         dst_id: TimelineId,
         start_lsn: Option<Lsn>,
         ctx: &RequestContext,
-    ) -> anyhow::Result<Arc<Timeline>> {
+    ) -> Result<Arc<Timeline>, CreateTimelineError> {
         let tl = self
             .branch_timeline_impl(src_timeline, dst_id, start_lsn, ctx)
             .await?;
@@ -2732,7 +2734,7 @@ impl Tenant {
         dst_id: TimelineId,
         start_lsn: Option<Lsn>,
         ctx: &RequestContext,
-    ) -> anyhow::Result<Arc<Timeline>> {
+    ) -> Result<Arc<Timeline>, CreateTimelineError> {
         self.branch_timeline_impl(src_timeline, dst_id, start_lsn, ctx)
             .await
     }
@@ -2743,7 +2745,7 @@ impl Tenant {
         dst_id: TimelineId,
         start_lsn: Option<Lsn>,
         _ctx: &RequestContext,
-    ) -> anyhow::Result<Arc<Timeline>> {
+    ) -> Result<Arc<Timeline>, CreateTimelineError> {
         let src_id = src_timeline.timeline_id;
 
         // If no start LSN is specified, we branch the new timeline from the source timeline's last record LSN
@@ -2783,16 +2785,17 @@ impl Tenant {
             .context(format!(
                 "invalid branch start lsn: less than latest GC cutoff {}",
                 *latest_gc_cutoff_lsn,
-            ))?;
+            ))
+            .map_err(CreateTimelineError::AncestorLsn)?;
 
         // and then the planned GC cutoff
         {
             let gc_info = src_timeline.gc_info.read().unwrap();
             let cutoff = min(gc_info.pitr_cutoff, gc_info.horizon_cutoff);
             if start_lsn < cutoff {
-                bail!(format!(
+                return Err(CreateTimelineError::AncestorLsn(anyhow::anyhow!(
                     "invalid branch start lsn: less than planned GC cutoff {cutoff}"
-                ));
+                )));
             }
         }
 
@@ -3825,6 +3828,9 @@ mod tests {
         {
             Ok(_) => panic!("branching should have failed"),
             Err(err) => {
+                let CreateTimelineError::AncestorLsn(err) = err else {
+                    panic!("wrong error type")
+                };
                 assert!(err.to_string().contains("invalid branch start lsn"));
                 assert!(err
                     .source()
@@ -3854,6 +3860,9 @@ mod tests {
         {
             Ok(_) => panic!("branching should have failed"),
             Err(err) => {
+                let CreateTimelineError::AncestorLsn(err) = err else {
+                    panic!("wrong error type");
+                };
                 assert!(&err.to_string().contains("invalid branch start lsn"));
                 assert!(&err
                     .source()
diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py
index 5c4f5177d0..824d11cb17 100644
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -21,6 +21,18 @@ class PageserverApiException(Exception):
         self.status_code = status_code
 
 
+class TimelineCreate406(PageserverApiException):
+    def __init__(self, res: requests.Response):
+        assert res.status_code == 406
+        super().__init__(res.json()["msg"], res.status_code)
+
+
+class TimelineCreate409(PageserverApiException):
+    def __init__(self, res: requests.Response):
+        assert res.status_code == 409
+        super().__init__("", res.status_code)
+
+
 @dataclass
 class InMemoryLayerInfo:
     kind: str
@@ -309,9 +321,12 @@ class PageserverHttpClient(requests.Session):
         res = self.post(
             f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline", json=body, **kwargs
         )
-        self.verbose_error(res)
         if res.status_code == 409:
-            raise Exception(f"could not create timeline: already exists for id {new_timeline_id}")
+            raise TimelineCreate409(res)
+        if res.status_code == 406:
+            raise TimelineCreate406(res)
+
+        self.verbose_error(res)
 
         res_json = res.json()
         assert isinstance(res_json, dict)
diff --git a/test_runner/regress/test_branch_and_gc.py b/test_runner/regress/test_branch_and_gc.py
index 4a03421fcf..53e67b1592 100644
--- a/test_runner/regress/test_branch_and_gc.py
+++ b/test_runner/regress/test_branch_and_gc.py
@@ -4,7 +4,8 @@ import time
 import pytest
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnv
-from fixtures.types import Lsn
+from fixtures.pageserver.http import TimelineCreate406
+from fixtures.types import Lsn, TimelineId
 from fixtures.utils import query_scalar
 
 
@@ -173,5 +174,12 @@ def test_branch_creation_before_gc(neon_simple_env: NeonEnv):
     # The starting LSN is invalid as the corresponding record is scheduled to be removed by in-queue GC.
     with pytest.raises(Exception, match="invalid branch start lsn: .*"):
         env.neon_cli.create_branch("b1", "b0", tenant_id=tenant, ancestor_start_lsn=lsn)
+    # retry the same with the HTTP API, so that we can inspect the status code
+    with pytest.raises(TimelineCreate406):
+        new_timeline_id = TimelineId.generate()
+        log.info(
+            f"Expecting failure for branch behind gc'ing LSN, new_timeline_id={new_timeline_id}"
+        )
+        pageserver_http_client.timeline_create(env.pg_version, tenant, new_timeline_id, b0, lsn)
 
     thread.join()
diff --git a/test_runner/regress/test_branch_behind.py b/test_runner/regress/test_branch_behind.py
index 3f7d49ab03..a19b2862f8 100644
--- a/test_runner/regress/test_branch_behind.py
+++ b/test_runner/regress/test_branch_behind.py
@@ -1,6 +1,7 @@
 import pytest
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnvBuilder
+from fixtures.pageserver.http import TimelineCreate406
 from fixtures.types import Lsn, TimelineId
 from fixtures.utils import print_gc_result, query_scalar
 
@@ -17,7 +18,7 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder):
     env.pageserver.allowed_errors.append(".*invalid start lsn .* for ancestor timeline.*")
 
     # Branch at the point where only 100 rows were inserted
-    env.neon_cli.create_branch("test_branch_behind")
+    branch_behind_timeline_id = env.neon_cli.create_branch("test_branch_behind")
     endpoint_main = env.endpoints.create_start("test_branch_behind")
     log.info("postgres is running on 'test_branch_behind' branch")
 
@@ -89,6 +90,7 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder):
     assert query_scalar(main_cur, "SELECT count(*) FROM foo") == 400100
 
     # Check bad lsn's for branching
+    pageserver_http = env.pageserver.http_client()
 
     # branch at segment boundary
     env.neon_cli.create_branch(
@@ -97,27 +99,52 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder):
     endpoint = env.endpoints.create_start("test_branch_segment_boundary")
     assert endpoint.safe_psql("SELECT 1")[0][0] == 1
 
-    # branch at pre-initdb lsn
+    # branch at pre-initdb lsn (from main branch)
     with pytest.raises(Exception, match="invalid branch start lsn: .*"):
         env.neon_cli.create_branch("test_branch_preinitdb", ancestor_start_lsn=Lsn("0/42"))
+    # retry the same with the HTTP API, so that we can inspect the status code
+    with pytest.raises(TimelineCreate406):
+        new_timeline_id = TimelineId.generate()
+        log.info(f"Expecting failure for branch pre-initdb LSN, new_timeline_id={new_timeline_id}")
+        pageserver_http.timeline_create(
+            env.pg_version, env.initial_tenant, new_timeline_id, env.initial_timeline, Lsn("0/42")
+        )
 
     # branch at pre-ancestor lsn
     with pytest.raises(Exception, match="less than timeline ancestor lsn"):
         env.neon_cli.create_branch(
             "test_branch_preinitdb", "test_branch_behind", ancestor_start_lsn=Lsn("0/42")
         )
+    # retry the same with the HTTP API, so that we can inspect the status code
+    with pytest.raises(TimelineCreate406):
+        new_timeline_id = TimelineId.generate()
+        log.info(
+            f"Expecting failure for branch pre-ancestor LSN, new_timeline_id={new_timeline_id}"
+        )
+        pageserver_http.timeline_create(
+            env.pg_version,
+            env.initial_tenant,
+            new_timeline_id,
+            branch_behind_timeline_id,
+            Lsn("0/42"),
+        )
 
     # check that we cannot create branch based on garbage collected data
-    with env.pageserver.http_client() as pageserver_http:
-        pageserver_http.timeline_checkpoint(env.initial_tenant, timeline)
-        gc_result = pageserver_http.timeline_gc(env.initial_tenant, timeline, 0)
-        print_gc_result(gc_result)
-
+    pageserver_http.timeline_checkpoint(env.initial_tenant, timeline)
+    gc_result = pageserver_http.timeline_gc(env.initial_tenant, timeline, 0)
+    print_gc_result(gc_result)
     with pytest.raises(Exception, match="invalid branch start lsn: .*"):
         # this gced_lsn is pretty random, so if gc is disabled this woudln't fail
         env.neon_cli.create_branch(
             "test_branch_create_fail", "test_branch_behind", ancestor_start_lsn=gced_lsn
         )
+    # retry the same with the HTTP API, so that we can inspect the status code
+    with pytest.raises(TimelineCreate406):
+        new_timeline_id = TimelineId.generate()
+        log.info(f"Expecting failure for branch behind gc'd LSN, new_timeline_id={new_timeline_id}")
+        pageserver_http.timeline_create(
+            env.pg_version, env.initial_tenant, new_timeline_id, branch_behind_timeline_id, gced_lsn
+        )
 
     # check that after gc everything is still there
     assert query_scalar(hundred_cur, "SELECT count(*) FROM foo") == 100

From 78a7f68902b6abc577759abb53edcc7c7c316fcd Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Mon, 3 Jul 2023 13:51:40 +0100
Subject: [PATCH 05/32] Make pg_version and build_type regular parameters
 (#4311)

## Problem

All tests have already been parametrised by Postgres version and build
type (to have them distinguishable in the Allure report), but despite
it, it's anyway required to have DEFAULT_PG_VERSION and BUILD_TYPE env
vars set to corresponding values, for example to
run`test_timeline_deletion_with_files_stuck_in_upload_queue[release-pg14-local_fs]`
test it's required to set `DEFAULT_PG_VERSION=14` and
`BUILD_TYPE=release`.
This PR makes the test framework pick up parameters from the test name
itself.

## Summary of changes
- Postgres version and build type related fixtures now are
function-scoped (instead of being sessions scoped before)
- Deprecate `--pg-version` argument in favour of DEFAULT_PG_VERSION env
variable (it's easier to parse)
- GitHub autocomment now includes only one command with all the failed
tests + runs them in parallel
---
 scripts/comment-test-report.js        | 26 ++++++++------
 test_runner/conftest.py               |  2 +-
 test_runner/fixtures/allure.py        | 25 --------------
 test_runner/fixtures/neon_fixtures.py | 27 +++++++++++----
 test_runner/fixtures/parametrize.py   | 50 +++++++++++++++++++++++++++
 test_runner/fixtures/pg_version.py    | 22 ++++--------
 6 files changed, 94 insertions(+), 58 deletions(-)
 delete mode 100644 test_runner/fixtures/allure.py
 create mode 100644 test_runner/fixtures/parametrize.py

diff --git a/scripts/comment-test-report.js b/scripts/comment-test-report.js
index 432c78d1af..dd60d42a37 100755
--- a/scripts/comment-test-report.js
+++ b/scripts/comment-test-report.js
@@ -144,17 +144,24 @@ const reportSummary = async (params) => {
                 }
                 summary += `- \`${testName}\`: ${links.join(", ")}\n`
             }
-
-            const testsToRerun = Object.values(failedTests[pgVersion]).map(x => x[0].name)
-            const command = `DEFAULT_PG_VERSION=${pgVersion} scripts/pytest -k "${testsToRerun.join(" or ")}"`
-
-            summary += "```\n"
-            summary += `# Run failed on Postgres ${pgVersion} tests locally:\n`
-            summary += `${command}\n`
-            summary += "```\n"
         }
     }
 
+    if (failedTestsCount > 0) {
+        const testsToRerun = []
+        for (const pgVersion of Object.keys(failedTests)) {
+            for (const testName of Object.keys(failedTests[pgVersion])) {
+                testsToRerun.push(...failedTests[pgVersion][testName].map(test => test.name))
+            }
+        }
+        const command = `scripts/pytest -vv -n $(nproc) -k "${testsToRerun.join(" or ")}"`
+
+        summary += "```\n"
+        summary += `# Run all failed tests locally:\n`
+        summary += `${command}\n`
+        summary += "```\n"
+    }
+
     if (flakyTestsCount > 0) {
         summary += `<details>\n<summary>Flaky tests (${flakyTestsCount})</summary>\n\n`
         for (const pgVersion of Array.from(pgVersions).sort().reverse()) {
@@ -164,8 +171,7 @@ const reportSummary = async (params) => {
                     const links = []
                     for (const test of tests) {
                         const allureLink = `${reportUrl}#suites/${test.parentUid}/${test.uid}/retries`
-                        const status = test.status === "passed" ? ":white_check_mark:" : ":x:"
-                        links.push(`[${status} ${test.buildType}](${allureLink})`)
+                        links.push(`[${test.buildType}](${allureLink})`)
                     }
                     summary += `- \`${testName}\`: ${links.join(", ")}\n`
                 }
diff --git a/test_runner/conftest.py b/test_runner/conftest.py
index 4e649e111a..1c36c1ed02 100644
--- a/test_runner/conftest.py
+++ b/test_runner/conftest.py
@@ -1,6 +1,6 @@
 pytest_plugins = (
     "fixtures.pg_version",
-    "fixtures.allure",
+    "fixtures.parametrize",
     "fixtures.neon_fixtures",
     "fixtures.benchmark_fixture",
     "fixtures.pg_stats",
diff --git a/test_runner/fixtures/allure.py b/test_runner/fixtures/allure.py
deleted file mode 100644
index 6f40bd2aa2..0000000000
--- a/test_runner/fixtures/allure.py
+++ /dev/null
@@ -1,25 +0,0 @@
-import os
-
-import pytest
-
-from fixtures.pg_version import DEFAULT_VERSION, PgVersion
-
-"""
-Set of utilities to make Allure report more informative.
-
-- It adds BUILD_TYPE and DEFAULT_PG_VERSION to the test names (only in test_runner/regress)
-to make tests distinguishable in Allure report.
-"""
-
-
-@pytest.fixture(scope="function", autouse=True)
-def allure_noop():
-    pass
-
-
-def pytest_generate_tests(metafunc):
-    if "test_runner/regress" in metafunc.definition._nodeid:
-        build_type = os.environ.get("BUILD_TYPE", "DEBUG").lower()
-        pg_version = PgVersion(os.environ.get("DEFAULT_PG_VERSION", DEFAULT_VERSION))
-
-        metafunc.parametrize("allure_noop", [f"{build_type}-pg{pg_version}"])
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index e56bf78019..c3e9853978 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -102,8 +102,8 @@ def base_dir() -> Iterator[Path]:
     yield base_dir
 
 
-@pytest.fixture(scope="session")
-def neon_binpath(base_dir: Path) -> Iterator[Path]:
+@pytest.fixture(scope="function")
+def neon_binpath(base_dir: Path, build_type: str) -> Iterator[Path]:
     if os.getenv("REMOTE_ENV"):
         # we are in remote env and do not have neon binaries locally
         # this is the case for benchmarks run on self-hosted runner
@@ -113,7 +113,6 @@ def neon_binpath(base_dir: Path) -> Iterator[Path]:
     if env_neon_bin := os.environ.get("NEON_BIN"):
         binpath = Path(env_neon_bin)
     else:
-        build_type = os.environ.get("BUILD_TYPE", "debug")
         binpath = base_dir / "target" / build_type
     log.info(f"neon_binpath is {binpath}")
 
@@ -123,7 +122,7 @@ def neon_binpath(base_dir: Path) -> Iterator[Path]:
     yield binpath
 
 
-@pytest.fixture(scope="session")
+@pytest.fixture(scope="function")
 def pg_distrib_dir(base_dir: Path) -> Iterator[Path]:
     if env_postgres_bin := os.environ.get("POSTGRES_DISTRIB_DIR"):
         distrib_dir = Path(env_postgres_bin).resolve()
@@ -147,7 +146,7 @@ def top_output_dir(base_dir: Path) -> Iterator[Path]:
     yield output_dir
 
 
-@pytest.fixture(scope="session")
+@pytest.fixture(scope="function")
 def versioned_pg_distrib_dir(pg_distrib_dir: Path, pg_version: PgVersion) -> Iterator[Path]:
     versioned_dir = pg_distrib_dir / pg_version.v_prefixed
 
@@ -174,7 +173,23 @@ def shareable_scope(fixture_name: str, config: Config) -> Literal["session", "fu
     def myfixture(...)
        ...
     """
-    return "function" if os.environ.get("TEST_SHARED_FIXTURES") is None else "session"
+    scope: Literal["session", "function"]
+
+    if os.environ.get("TEST_SHARED_FIXTURES") is None:
+        # Create the environment in the per-test output directory
+        scope = "function"
+    elif (
+        os.environ.get("BUILD_TYPE") is not None
+        and os.environ.get("DEFAULT_PG_VERSION") is not None
+    ):
+        scope = "session"
+    else:
+        pytest.fail(
+            "Shared environment(TEST_SHARED_FIXTURES) requires BUILD_TYPE and DEFAULT_PG_VERSION to be set",
+            pytrace=False,
+        )
+
+    return scope
 
 
 @pytest.fixture(scope="session")
diff --git a/test_runner/fixtures/parametrize.py b/test_runner/fixtures/parametrize.py
new file mode 100644
index 0000000000..53350138dd
--- /dev/null
+++ b/test_runner/fixtures/parametrize.py
@@ -0,0 +1,50 @@
+import os
+from typing import Optional
+
+import pytest
+from _pytest.fixtures import FixtureRequest
+from _pytest.python import Metafunc
+
+from fixtures.pg_version import PgVersion
+
+"""
+Dynamically parametrize tests by Postgres version and build type (debug/release/remote)
+"""
+
+
+@pytest.fixture(scope="function", autouse=True)
+def pg_version(request: FixtureRequest) -> Optional[PgVersion]:
+    # Do not parametrize performance tests yet, we need to prepare grafana charts first
+    if "test_runner/performance" in str(request.node.path):
+        v = os.environ.get("DEFAULT_PG_VERSION")
+        return PgVersion(v)
+
+    return None
+
+
+@pytest.fixture(scope="function", autouse=True)
+def build_type(request: FixtureRequest) -> Optional[str]:
+    # Do not parametrize performance tests yet, we need to prepare grafana charts first
+    if "test_runner/performance" in str(request.node.path):
+        return os.environ.get("BUILD_TYPE", "").lower()
+
+    return None
+
+
+def pytest_generate_tests(metafunc: Metafunc):
+    # Do not parametrize performance tests yet, we need to prepare grafana charts first
+    if "test_runner/performance" in metafunc.definition._nodeid:
+        return
+
+    if (v := os.environ.get("DEFAULT_PG_VERSION")) is None:
+        pg_versions = [version for version in PgVersion if version != PgVersion.NOT_SET]
+    else:
+        pg_versions = [PgVersion(v)]
+
+    if (bt := os.environ.get("BUILD_TYPE")) is None:
+        build_types = ["debug", "release"]
+    else:
+        build_types = [bt.lower()]
+
+    metafunc.parametrize("build_type", build_types)
+    metafunc.parametrize("pg_version", pg_versions, ids=map(lambda v: f"pg{v}", pg_versions))
diff --git a/test_runner/fixtures/pg_version.py b/test_runner/fixtures/pg_version.py
index 14ae88cc2c..b61f52be3c 100644
--- a/test_runner/fixtures/pg_version.py
+++ b/test_runner/fixtures/pg_version.py
@@ -1,12 +1,10 @@
 import enum
 import os
-from typing import Iterator, Optional
+from typing import Optional
 
 import pytest
+from _pytest.config import Config
 from _pytest.config.argparsing import Parser
-from pytest import FixtureRequest
-
-from fixtures.log_helper import log
 
 """
 This fixture is used to determine which version of Postgres to use for tests.
@@ -75,18 +73,10 @@ def pytest_addoption(parser: Parser):
         "--pg-version",
         action="store",
         type=PgVersion,
-        help="Postgres version to use for tests",
+        help="DEPRECATED: Postgres version to use for tests",
     )
 
 
-@pytest.fixture(scope="session")
-def pg_version(request: FixtureRequest) -> Iterator[PgVersion]:
-    if v := request.config.getoption("--pg-version"):
-        version, source = v, "from --pg-version command-line argument"
-    elif v := os.environ.get("DEFAULT_PG_VERSION"):
-        version, source = PgVersion(v), "from DEFAULT_PG_VERSION environment variable"
-    else:
-        version, source = DEFAULT_VERSION, "default version"
-
-    log.info(f"pg_version is {version} ({source})")
-    yield version
+def pytest_configure(config: Config):
+    if config.getoption("--pg-version"):
+        raise Exception("--pg-version is deprecated, use DEFAULT_PG_VERSION env var instead")

From cff7ae0b0defcdc15200579f85b938c8d3e4976a Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Mon, 3 Jul 2023 16:37:02 +0300
Subject: [PATCH 06/32] fix: no more ansi colored logs (#4613)

Allure does not support ansi colored logs, yet `compute_ctl` has them.

Upgrade criterion to get rid of atty dependency, disable ansi colors,
remove atty dependency and disable ansi feature of tracing-subscriber.

This is a heavy-handed approach. I am not aware of a workflow where
you'd want to connect a terminal directly to for example `compute_ctl`,
usually you find the logs in a file. If someone had been using colors,
they will now need to:
- turn the `tracing-subscriber.default-features` to `true`
- edit their wanted project to have colors

I decided to explicitly disable ansi colors in case we would have in
future a dependency accidentally enabling the feature on
`tracing-subscriber`, which would be quite surprising but not
unimagineable.

By getting rid of `atty` from dependencies we get rid of
<https://github.com/advisories/GHSA-g98v-hv3f-hcfr>.
---
 Cargo.lock                  | 104 ++++++------------------------------
 Cargo.toml                  |   5 +-
 compute_tools/src/logger.rs |   1 +
 libs/utils/Cargo.toml       |   1 -
 libs/utils/src/logging.rs   |   2 +-
 proxy/Cargo.toml            |   1 -
 proxy/src/logging.rs        |   2 +-
 workspace_hack/Cargo.toml   |   2 +-
 8 files changed, 22 insertions(+), 96 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index c3dc0243dd..0c0ed530fb 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -200,17 +200,6 @@ dependencies = [
  "critical-section",
 ]
 
-[[package]]
-name = "atty"
-version = "0.2.14"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8"
-dependencies = [
- "hermit-abi 0.1.19",
- "libc",
- "winapi",
-]
-
 [[package]]
 name = "autocfg"
 version = "1.1.0"
@@ -805,18 +794,6 @@ dependencies = [
  "libloading",
 ]
 
-[[package]]
-name = "clap"
-version = "3.2.25"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4ea181bf566f71cb9a5d17a59e1871af638180a18fb0035c92ae62b705207123"
-dependencies = [
- "bitflags",
- "clap_lex 0.2.4",
- "indexmap",
- "textwrap",
-]
-
 [[package]]
 name = "clap"
 version = "4.3.0"
@@ -837,7 +814,7 @@ dependencies = [
  "anstream",
  "anstyle",
  "bitflags",
- "clap_lex 0.5.0",
+ "clap_lex",
  "strsim",
 ]
 
@@ -853,15 +830,6 @@ dependencies = [
  "syn 2.0.16",
 ]
 
-[[package]]
-name = "clap_lex"
-version = "0.2.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2850f2f5a82cbf437dd5af4d49848fbdfc27c157c3d010345776f952765261c5"
-dependencies = [
- "os_str_bytes",
-]
-
 [[package]]
 name = "clap_lex"
 version = "0.5.0"
@@ -915,7 +883,7 @@ version = "0.1.0"
 dependencies = [
  "anyhow",
  "chrono",
- "clap 4.3.0",
+ "clap",
  "compute_api",
  "futures",
  "hyper",
@@ -977,7 +945,7 @@ name = "control_plane"
 version = "0.1.0"
 dependencies = [
  "anyhow",
- "clap 4.3.0",
+ "clap",
  "comfy-table",
  "compute_api",
  "git-version",
@@ -1047,19 +1015,19 @@ dependencies = [
 
 [[package]]
 name = "criterion"
-version = "0.4.0"
+version = "0.5.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e7c76e09c1aae2bc52b3d2f29e13c6572553b30c4aa1b8a49fd70de6412654cb"
+checksum = "f2b12d017a929603d80db1831cd3a24082f8137ce19c69e6447f54f5fc8d692f"
 dependencies = [
  "anes",
- "atty",
  "cast",
  "ciborium",
- "clap 3.2.25",
+ "clap",
  "criterion-plot",
+ "is-terminal",
  "itertools",
- "lazy_static",
  "num-traits",
+ "once_cell",
  "oorandom",
  "plotters",
  "rayon",
@@ -1676,15 +1644,6 @@ version = "0.4.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8"
 
-[[package]]
-name = "hermit-abi"
-version = "0.1.19"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33"
-dependencies = [
- "libc",
-]
-
 [[package]]
 name = "hermit-abi"
 version = "0.2.6"
@@ -2270,16 +2229,6 @@ dependencies = [
  "windows-sys 0.45.0",
 ]
 
-[[package]]
-name = "nu-ansi-term"
-version = "0.46.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84"
-dependencies = [
- "overload",
- "winapi",
-]
-
 [[package]]
 name = "num-bigint"
 version = "0.4.3"
@@ -2507,31 +2456,19 @@ dependencies = [
  "winapi",
 ]
 
-[[package]]
-name = "os_str_bytes"
-version = "6.5.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ceedf44fb00f2d1984b0bc98102627ce622e083e49a5bacdb3e514fa4238e267"
-
 [[package]]
 name = "outref"
 version = "0.5.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4030760ffd992bef45b0ae3f10ce1aba99e33464c90d14dd7c039884963ddc7a"
 
-[[package]]
-name = "overload"
-version = "0.1.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39"
-
 [[package]]
 name = "pagectl"
 version = "0.1.0"
 dependencies = [
  "anyhow",
  "bytes",
- "clap 4.3.0",
+ "clap",
  "git-version",
  "pageserver",
  "postgres_ffi",
@@ -2550,7 +2487,7 @@ dependencies = [
  "byteorder",
  "bytes",
  "chrono",
- "clap 4.3.0",
+ "clap",
  "close_fds",
  "const_format",
  "consumption_metrics",
@@ -3050,12 +2987,11 @@ version = "0.1.0"
 dependencies = [
  "anyhow",
  "async-trait",
- "atty",
  "base64 0.13.1",
  "bstr",
  "bytes",
  "chrono",
- "clap 4.3.0",
+ "clap",
  "consumption_metrics",
  "futures",
  "git-version",
@@ -3570,7 +3506,7 @@ dependencies = [
  "byteorder",
  "bytes",
  "chrono",
- "clap 4.3.0",
+ "clap",
  "const_format",
  "crc32c",
  "fs2",
@@ -4000,7 +3936,7 @@ dependencies = [
  "anyhow",
  "async-stream",
  "bytes",
- "clap 4.3.0",
+ "clap",
  "const_format",
  "futures",
  "futures-core",
@@ -4181,12 +4117,6 @@ dependencies = [
  "syn 1.0.109",
 ]
 
-[[package]]
-name = "textwrap"
-version = "0.16.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "222a222a5bfe1bba4a77b45ec488a741b3cb8872e5e499451fd7d0129c9c7c3d"
-
 [[package]]
 name = "thiserror"
 version = "1.0.40"
@@ -4602,7 +4532,7 @@ name = "trace"
 version = "0.1.0"
 dependencies = [
  "anyhow",
- "clap 4.3.0",
+ "clap",
  "pageserver_api",
  "utils",
  "workspace_hack",
@@ -4704,7 +4634,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "30a651bc37f915e81f087d86e62a18eec5f79550c7faff886f7090b4ea757c77"
 dependencies = [
  "matchers",
- "nu-ansi-term",
  "once_cell",
  "regex",
  "serde",
@@ -4873,7 +4802,6 @@ version = "0.1.0"
 dependencies = [
  "anyhow",
  "async-trait",
- "atty",
  "bincode",
  "byteorder",
  "bytes",
@@ -4950,7 +4878,7 @@ name = "wal_craft"
 version = "0.1.0"
 dependencies = [
  "anyhow",
- "clap 4.3.0",
+ "clap",
  "env_logger",
  "log",
  "once_cell",
@@ -5330,7 +5258,7 @@ dependencies = [
  "anyhow",
  "bytes",
  "chrono",
- "clap 4.3.0",
+ "clap",
  "clap_builder",
  "crossbeam-utils",
  "either",
diff --git a/Cargo.toml b/Cargo.toml
index 6d327f5802..4b9bd8b961 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -34,7 +34,6 @@ license = "Apache-2.0"
 anyhow = { version = "1.0", features = ["backtrace"] }
 async-stream = "0.3"
 async-trait = "0.1"
-atty = "0.2.14"
 aws-config = { version = "0.55", default-features = false, features=["rustls"] }
 aws-sdk-s3 = "0.27"
 aws-smithy-http = "0.55"
@@ -129,7 +128,7 @@ tonic = {version = "0.9", features = ["tls", "tls-roots"]}
 tracing = "0.1"
 tracing-error = "0.2.0"
 tracing-opentelemetry = "0.18.0"
-tracing-subscriber = { version = "0.3", features = ["env-filter"] }
+tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter"] }
 url = "2.2"
 uuid = { version = "1.2", features = ["v4", "serde"] }
 walkdir = "2.3.2"
@@ -171,7 +170,7 @@ utils = { version = "0.1", path = "./libs/utils/" }
 workspace_hack = { version = "0.1", path = "./workspace_hack/" }
 
 ## Build dependencies
-criterion = "0.4"
+criterion = "0.5.1"
 rcgen = "0.10"
 rstest = "0.17"
 tempfile = "3.4"
diff --git a/compute_tools/src/logger.rs b/compute_tools/src/logger.rs
index f6fc882968..3ae68de8ef 100644
--- a/compute_tools/src/logger.rs
+++ b/compute_tools/src/logger.rs
@@ -18,6 +18,7 @@ pub fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result<()> {
         .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new(default_log_level));
 
     let fmt_layer = tracing_subscriber::fmt::layer()
+        .with_ansi(false)
         .with_target(false)
         .with_writer(std::io::stderr);
 
diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml
index 8239ffff57..87b0082356 100644
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -5,7 +5,6 @@ edition.workspace = true
 license.workspace = true
 
 [dependencies]
-atty.workspace = true
 sentry.workspace = true
 async-trait.workspace = true
 anyhow.workspace = true
diff --git a/libs/utils/src/logging.rs b/libs/utils/src/logging.rs
index 2b8c852d86..7b32bf2841 100644
--- a/libs/utils/src/logging.rs
+++ b/libs/utils/src/logging.rs
@@ -84,7 +84,7 @@ pub fn init(
     let r = r.with({
         let log_layer = tracing_subscriber::fmt::layer()
             .with_target(false)
-            .with_ansi(atty::is(atty::Stream::Stdout))
+            .with_ansi(false)
             .with_writer(std::io::stdout);
         let log_layer = match log_format {
             LogFormat::Json => log_layer.json().boxed(),
diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml
index d43ea714b1..c8b0cc7cf0 100644
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -7,7 +7,6 @@ license.workspace = true
 [dependencies]
 anyhow.workspace = true
 async-trait.workspace = true
-atty.workspace = true
 base64.workspace = true
 bstr.workspace = true
 bytes = { workspace = true, features = ["serde"] }
diff --git a/proxy/src/logging.rs b/proxy/src/logging.rs
index 0c8c2858b9..3405b8cbc6 100644
--- a/proxy/src/logging.rs
+++ b/proxy/src/logging.rs
@@ -18,7 +18,7 @@ pub async fn init() -> anyhow::Result<LoggingGuard> {
         .from_env_lossy();
 
     let fmt_layer = tracing_subscriber::fmt::layer()
-        .with_ansi(atty::is(atty::Stream::Stderr))
+        .with_ansi(false)
         .with_writer(std::io::stderr)
         .with_target(false);
 
diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml
index 677b59f453..63a65d3889 100644
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -54,7 +54,7 @@ toml_edit = { version = "0.19", features = ["serde"] }
 tower = { version = "0.4", features = ["balance", "buffer", "limit", "retry", "timeout", "util"] }
 tracing = { version = "0.1", features = ["log"] }
 tracing-core = { version = "0.1" }
-tracing-subscriber = { version = "0.3", features = ["env-filter", "json"] }
+tracing-subscriber = { version = "0.3", default-features = false, features = ["env-filter", "fmt", "json", "smallvec", "tracing-log"] }
 url = { version = "2", features = ["serde"] }
 
 [build-dependencies]

From 10110bee69d323b234875e1355ce169bf054f0bf Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Mon, 3 Jul 2023 16:10:44 +0100
Subject: [PATCH 07/32] fix setup instructions (#4615)

## Problem

1. The local endpoints provision 2 ports (postgres and HTTP) which means
the migration_check endpoint has a different port than what the setup
implies
2. psycopg2-binary 2.9.3 has a deprecated poetry config and doesn't
install.

## Summary of changes

Update psycopg2-binary and update the endpoint ports in the readme

---------

Co-authored-by: Alexander Bayandin <alexander@neon.tech>
---
 README.md      |  16 +++----
 poetry.lock    | 123 +++++++++++++++++++++++++------------------------
 pyproject.toml |   2 +-
 3 files changed, 71 insertions(+), 70 deletions(-)

diff --git a/README.md b/README.md
index efa714e5be..1c2452f435 100644
--- a/README.md
+++ b/README.md
@@ -132,13 +132,13 @@ Python (3.9 or higher), and install python3 packages using `./scripts/pysync` (r
 # Create repository in .neon with proper paths to binaries and data
 # Later that would be responsibility of a package install script
 > cargo neon init
-Starting pageserver at '127.0.0.1:64000' in '.neon'.
+Initializing pageserver node 1 at '127.0.0.1:64000' in ".neon"
 
 # start pageserver, safekeeper, and broker for their intercommunication
 > cargo neon start
-Starting neon broker at 127.0.0.1:50051
+Starting neon broker at 127.0.0.1:50051.
 storage_broker started, pid: 2918372
-Starting pageserver at '127.0.0.1:64000' in '.neon'.
+Starting pageserver node 1 at '127.0.0.1:64000' in ".neon".
 pageserver started, pid: 2918386
 Starting safekeeper at '127.0.0.1:5454' in '.neon/safekeepers/sk1'.
 safekeeper 1 started, pid: 2918437
@@ -152,8 +152,7 @@ Setting tenant 9ef87a5bf0d92544f6fafeeb3239695c as a default one
 # start postgres compute node
 > cargo neon endpoint start main
 Starting new endpoint main (PostgreSQL v14) on timeline de200bd42b49cc1814412c7e592dd6e9 ...
-Extracting base backup to create postgres instance: path=.neon/pgdatadirs/tenants/9ef87a5bf0d92544f6fafeeb3239695c/main port=55432
-Starting postgres at 'host=127.0.0.1 port=55432 user=cloud_admin dbname=postgres'
+Starting postgres at 'postgresql://cloud_admin@127.0.0.1:55432/postgres'
 
 # check list of running postgres instances
 > cargo neon endpoint list
@@ -189,18 +188,17 @@ Created timeline 'b3b863fa45fa9e57e615f9f2d944e601' at Lsn 0/16F9A00 for tenant:
 # start postgres on that branch
 > cargo neon endpoint start migration_check --branch-name migration_check
 Starting new endpoint migration_check (PostgreSQL v14) on timeline b3b863fa45fa9e57e615f9f2d944e601 ...
-Extracting base backup to create postgres instance: path=.neon/pgdatadirs/tenants/9ef87a5bf0d92544f6fafeeb3239695c/migration_check port=55433
-Starting postgres at 'host=127.0.0.1 port=55433 user=cloud_admin dbname=postgres'
+Starting postgres at 'postgresql://cloud_admin@127.0.0.1:55434/postgres'
 
 # check the new list of running postgres instances
 > cargo neon endpoint list
  ENDPOINT         ADDRESS          TIMELINE                          BRANCH NAME      LSN        STATUS
  main             127.0.0.1:55432  de200bd42b49cc1814412c7e592dd6e9  main             0/16F9A38  running
- migration_check  127.0.0.1:55433  b3b863fa45fa9e57e615f9f2d944e601  migration_check  0/16F9A70  running
+ migration_check  127.0.0.1:55434  b3b863fa45fa9e57e615f9f2d944e601  migration_check  0/16F9A70  running
 
 # this new postgres instance will have all the data from 'main' postgres,
 # but all modifications would not affect data in original postgres
-> psql -p55433 -h 127.0.0.1 -U cloud_admin postgres
+> psql -p55434 -h 127.0.0.1 -U cloud_admin postgres
 postgres=# select * from t;
  key | value
 -----+-------
diff --git a/poetry.lock b/poetry.lock
index 8dc45f68b8..aa76ac6dbd 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1654,71 +1654,74 @@ test = ["enum34", "ipaddress", "mock", "pywin32", "wmi"]
 
 [[package]]
 name = "psycopg2-binary"
-version = "2.9.3"
+version = "2.9.6"
 description = "psycopg2 - Python-PostgreSQL Database Adapter"
 category = "main"
 optional = false
 python-versions = ">=3.6"
 files = [
-    {file = "psycopg2-binary-2.9.3.tar.gz", hash = "sha256:761df5313dc15da1502b21453642d7599d26be88bff659382f8f9747c7ebea4e"},
-    {file = "psycopg2_binary-2.9.3-cp310-cp310-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:539b28661b71da7c0e428692438efbcd048ca21ea81af618d845e06ebfd29478"},
-    {file = "psycopg2_binary-2.9.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2f2534ab7dc7e776a263b463a16e189eb30e85ec9bbe1bff9e78dae802608932"},
-    {file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6e82d38390a03da28c7985b394ec3f56873174e2c88130e6966cb1c946508e65"},
-    {file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:57804fc02ca3ce0dbfbef35c4b3a4a774da66d66ea20f4bda601294ad2ea6092"},
-    {file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_24_aarch64.whl", hash = "sha256:083a55275f09a62b8ca4902dd11f4b33075b743cf0d360419e2051a8a5d5ff76"},
-    {file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_24_ppc64le.whl", hash = "sha256:0a29729145aaaf1ad8bafe663131890e2111f13416b60e460dae0a96af5905c9"},
-    {file = "psycopg2_binary-2.9.3-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:3a79d622f5206d695d7824cbf609a4f5b88ea6d6dab5f7c147fc6d333a8787e4"},
-    {file = "psycopg2_binary-2.9.3-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:090f3348c0ab2cceb6dfbe6bf721ef61262ddf518cd6cc6ecc7d334996d64efa"},
-    {file = "psycopg2_binary-2.9.3-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:a9e1f75f96ea388fbcef36c70640c4efbe4650658f3d6a2967b4cc70e907352e"},
-    {file = "psycopg2_binary-2.9.3-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:c3ae8e75eb7160851e59adc77b3a19a976e50622e44fd4fd47b8b18208189d42"},
-    {file = "psycopg2_binary-2.9.3-cp310-cp310-win32.whl", hash = "sha256:7b1e9b80afca7b7a386ef087db614faebbf8839b7f4db5eb107d0f1a53225029"},
-    {file = "psycopg2_binary-2.9.3-cp310-cp310-win_amd64.whl", hash = "sha256:8b344adbb9a862de0c635f4f0425b7958bf5a4b927c8594e6e8d261775796d53"},
-    {file = "psycopg2_binary-2.9.3-cp36-cp36m-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:e847774f8ffd5b398a75bc1c18fbb56564cda3d629fe68fd81971fece2d3c67e"},
-    {file = "psycopg2_binary-2.9.3-cp36-cp36m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:68641a34023d306be959101b345732360fc2ea4938982309b786f7be1b43a4a1"},
-    {file = "psycopg2_binary-2.9.3-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3303f8807f342641851578ee7ed1f3efc9802d00a6f83c101d21c608cb864460"},
-    {file = "psycopg2_binary-2.9.3-cp36-cp36m-manylinux_2_24_aarch64.whl", hash = "sha256:e3699852e22aa68c10de06524a3721ade969abf382da95884e6a10ff798f9281"},
-    {file = "psycopg2_binary-2.9.3-cp36-cp36m-manylinux_2_24_ppc64le.whl", hash = "sha256:526ea0378246d9b080148f2d6681229f4b5964543c170dd10bf4faaab6e0d27f"},
-    {file = "psycopg2_binary-2.9.3-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:b1c8068513f5b158cf7e29c43a77eb34b407db29aca749d3eb9293ee0d3103ca"},
-    {file = "psycopg2_binary-2.9.3-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:15803fa813ea05bef089fa78835118b5434204f3a17cb9f1e5dbfd0b9deea5af"},
-    {file = "psycopg2_binary-2.9.3-cp36-cp36m-musllinux_1_1_ppc64le.whl", hash = "sha256:152f09f57417b831418304c7f30d727dc83a12761627bb826951692cc6491e57"},
-    {file = "psycopg2_binary-2.9.3-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:404224e5fef3b193f892abdbf8961ce20e0b6642886cfe1fe1923f41aaa75c9d"},
-    {file = "psycopg2_binary-2.9.3-cp36-cp36m-win32.whl", hash = "sha256:1f6b813106a3abdf7b03640d36e24669234120c72e91d5cbaeb87c5f7c36c65b"},
-    {file = "psycopg2_binary-2.9.3-cp36-cp36m-win_amd64.whl", hash = "sha256:2d872e3c9d5d075a2e104540965a1cf898b52274a5923936e5bfddb58c59c7c2"},
-    {file = "psycopg2_binary-2.9.3-cp37-cp37m-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:10bb90fb4d523a2aa67773d4ff2b833ec00857f5912bafcfd5f5414e45280fb1"},
-    {file = "psycopg2_binary-2.9.3-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:874a52ecab70af13e899f7847b3e074eeb16ebac5615665db33bce8a1009cf33"},
-    {file = "psycopg2_binary-2.9.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a29b3ca4ec9defec6d42bf5feb36bb5817ba3c0230dd83b4edf4bf02684cd0ae"},
-    {file = "psycopg2_binary-2.9.3-cp37-cp37m-manylinux_2_24_aarch64.whl", hash = "sha256:12b11322ea00ad8db8c46f18b7dfc47ae215e4df55b46c67a94b4effbaec7094"},
-    {file = "psycopg2_binary-2.9.3-cp37-cp37m-manylinux_2_24_ppc64le.whl", hash = "sha256:53293533fcbb94c202b7c800a12c873cfe24599656b341f56e71dd2b557be063"},
-    {file = "psycopg2_binary-2.9.3-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:c381bda330ddf2fccbafab789d83ebc6c53db126e4383e73794c74eedce855ef"},
-    {file = "psycopg2_binary-2.9.3-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:9d29409b625a143649d03d0fd7b57e4b92e0ecad9726ba682244b73be91d2fdb"},
-    {file = "psycopg2_binary-2.9.3-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:183a517a3a63503f70f808b58bfbf962f23d73b6dccddae5aa56152ef2bcb232"},
-    {file = "psycopg2_binary-2.9.3-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:15c4e4cfa45f5a60599d9cec5f46cd7b1b29d86a6390ec23e8eebaae84e64554"},
-    {file = "psycopg2_binary-2.9.3-cp37-cp37m-win32.whl", hash = "sha256:adf20d9a67e0b6393eac162eb81fb10bc9130a80540f4df7e7355c2dd4af9fba"},
-    {file = "psycopg2_binary-2.9.3-cp37-cp37m-win_amd64.whl", hash = "sha256:2f9ffd643bc7349eeb664eba8864d9e01f057880f510e4681ba40a6532f93c71"},
-    {file = "psycopg2_binary-2.9.3-cp38-cp38-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:def68d7c21984b0f8218e8a15d514f714d96904265164f75f8d3a70f9c295667"},
-    {file = "psycopg2_binary-2.9.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:e6aa71ae45f952a2205377773e76f4e3f27951df38e69a4c95440c779e013560"},
-    {file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dffc08ca91c9ac09008870c9eb77b00a46b3378719584059c034b8945e26b272"},
-    {file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:280b0bb5cbfe8039205c7981cceb006156a675362a00fe29b16fbc264e242834"},
-    {file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_24_aarch64.whl", hash = "sha256:af9813db73395fb1fc211bac696faea4ca9ef53f32dc0cfa27e4e7cf766dcf24"},
-    {file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_24_ppc64le.whl", hash = "sha256:63638d875be8c2784cfc952c9ac34e2b50e43f9f0a0660b65e2a87d656b3116c"},
-    {file = "psycopg2_binary-2.9.3-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:ffb7a888a047696e7f8240d649b43fb3644f14f0ee229077e7f6b9f9081635bd"},
-    {file = "psycopg2_binary-2.9.3-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:0c9d5450c566c80c396b7402895c4369a410cab5a82707b11aee1e624da7d004"},
-    {file = "psycopg2_binary-2.9.3-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:d1c1b569ecafe3a69380a94e6ae09a4789bbb23666f3d3a08d06bbd2451f5ef1"},
-    {file = "psycopg2_binary-2.9.3-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:8fc53f9af09426a61db9ba357865c77f26076d48669f2e1bb24d85a22fb52307"},
-    {file = "psycopg2_binary-2.9.3-cp38-cp38-win32.whl", hash = "sha256:6472a178e291b59e7f16ab49ec8b4f3bdada0a879c68d3817ff0963e722a82ce"},
-    {file = "psycopg2_binary-2.9.3-cp38-cp38-win_amd64.whl", hash = "sha256:35168209c9d51b145e459e05c31a9eaeffa9a6b0fd61689b48e07464ffd1a83e"},
-    {file = "psycopg2_binary-2.9.3-cp39-cp39-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:47133f3f872faf28c1e87d4357220e809dfd3fa7c64295a4a148bcd1e6e34ec9"},
-    {file = "psycopg2_binary-2.9.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:b3a24a1982ae56461cc24f6680604fffa2c1b818e9dc55680da038792e004d18"},
-    {file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:91920527dea30175cc02a1099f331aa8c1ba39bf8b7762b7b56cbf54bc5cce42"},
-    {file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:887dd9aac71765ac0d0bac1d0d4b4f2c99d5f5c1382d8b770404f0f3d0ce8a39"},
-    {file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_24_aarch64.whl", hash = "sha256:1f14c8b0942714eb3c74e1e71700cbbcb415acbc311c730370e70c578a44a25c"},
-    {file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_24_ppc64le.whl", hash = "sha256:7af0dd86ddb2f8af5da57a976d27cd2cd15510518d582b478fbb2292428710b4"},
-    {file = "psycopg2_binary-2.9.3-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:93cd1967a18aa0edd4b95b1dfd554cf15af657cb606280996d393dadc88c3c35"},
-    {file = "psycopg2_binary-2.9.3-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:bda845b664bb6c91446ca9609fc69f7db6c334ec5e4adc87571c34e4f47b7ddb"},
-    {file = "psycopg2_binary-2.9.3-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:01310cf4cf26db9aea5158c217caa92d291f0500051a6469ac52166e1a16f5b7"},
-    {file = "psycopg2_binary-2.9.3-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:99485cab9ba0fa9b84f1f9e1fef106f44a46ef6afdeec8885e0b88d0772b49e8"},
-    {file = "psycopg2_binary-2.9.3-cp39-cp39-win32.whl", hash = "sha256:46f0e0a6b5fa5851bbd9ab1bc805eef362d3a230fbdfbc209f4a236d0a7a990d"},
-    {file = "psycopg2_binary-2.9.3-cp39-cp39-win_amd64.whl", hash = "sha256:accfe7e982411da3178ec690baaceaad3c278652998b2c45828aaac66cd8285f"},
+    {file = "psycopg2-binary-2.9.6.tar.gz", hash = "sha256:1f64dcfb8f6e0c014c7f55e51c9759f024f70ea572fbdef123f85318c297947c"},
+    {file = "psycopg2_binary-2.9.6-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d26e0342183c762de3276cca7a530d574d4e25121ca7d6e4a98e4f05cb8e4df7"},
+    {file = "psycopg2_binary-2.9.6-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c48d8f2db17f27d41fb0e2ecd703ea41984ee19362cbce52c097963b3a1b4365"},
+    {file = "psycopg2_binary-2.9.6-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ffe9dc0a884a8848075e576c1de0290d85a533a9f6e9c4e564f19adf8f6e54a7"},
+    {file = "psycopg2_binary-2.9.6-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8a76e027f87753f9bd1ab5f7c9cb8c7628d1077ef927f5e2446477153a602f2c"},
+    {file = "psycopg2_binary-2.9.6-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6460c7a99fc939b849431f1e73e013d54aa54293f30f1109019c56a0b2b2ec2f"},
+    {file = "psycopg2_binary-2.9.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ae102a98c547ee2288637af07393dd33f440c25e5cd79556b04e3fca13325e5f"},
+    {file = "psycopg2_binary-2.9.6-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:9972aad21f965599ed0106f65334230ce826e5ae69fda7cbd688d24fa922415e"},
+    {file = "psycopg2_binary-2.9.6-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:7a40c00dbe17c0af5bdd55aafd6ff6679f94a9be9513a4c7e071baf3d7d22a70"},
+    {file = "psycopg2_binary-2.9.6-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:cacbdc5839bdff804dfebc058fe25684cae322987f7a38b0168bc1b2df703fb1"},
+    {file = "psycopg2_binary-2.9.6-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:7f0438fa20fb6c7e202863e0d5ab02c246d35efb1d164e052f2f3bfe2b152bd0"},
+    {file = "psycopg2_binary-2.9.6-cp310-cp310-win32.whl", hash = "sha256:b6c8288bb8a84b47e07013bb4850f50538aa913d487579e1921724631d02ea1b"},
+    {file = "psycopg2_binary-2.9.6-cp310-cp310-win_amd64.whl", hash = "sha256:61b047a0537bbc3afae10f134dc6393823882eb263088c271331602b672e52e9"},
+    {file = "psycopg2_binary-2.9.6-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:964b4dfb7c1c1965ac4c1978b0f755cc4bd698e8aa2b7667c575fb5f04ebe06b"},
+    {file = "psycopg2_binary-2.9.6-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:afe64e9b8ea66866a771996f6ff14447e8082ea26e675a295ad3bdbffdd72afb"},
+    {file = "psycopg2_binary-2.9.6-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:15e2ee79e7cf29582ef770de7dab3d286431b01c3bb598f8e05e09601b890081"},
+    {file = "psycopg2_binary-2.9.6-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dfa74c903a3c1f0d9b1c7e7b53ed2d929a4910e272add6700c38f365a6002820"},
+    {file = "psycopg2_binary-2.9.6-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b83456c2d4979e08ff56180a76429263ea254c3f6552cd14ada95cff1dec9bb8"},
+    {file = "psycopg2_binary-2.9.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0645376d399bfd64da57148694d78e1f431b1e1ee1054872a5713125681cf1be"},
+    {file = "psycopg2_binary-2.9.6-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:e99e34c82309dd78959ba3c1590975b5d3c862d6f279f843d47d26ff89d7d7e1"},
+    {file = "psycopg2_binary-2.9.6-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:4ea29fc3ad9d91162c52b578f211ff1c931d8a38e1f58e684c45aa470adf19e2"},
+    {file = "psycopg2_binary-2.9.6-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:4ac30da8b4f57187dbf449294d23b808f8f53cad6b1fc3623fa8a6c11d176dd0"},
+    {file = "psycopg2_binary-2.9.6-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e78e6e2a00c223e164c417628572a90093c031ed724492c763721c2e0bc2a8df"},
+    {file = "psycopg2_binary-2.9.6-cp311-cp311-win32.whl", hash = "sha256:1876843d8e31c89c399e31b97d4b9725a3575bb9c2af92038464231ec40f9edb"},
+    {file = "psycopg2_binary-2.9.6-cp311-cp311-win_amd64.whl", hash = "sha256:b4b24f75d16a89cc6b4cdff0eb6a910a966ecd476d1e73f7ce5985ff1328e9a6"},
+    {file = "psycopg2_binary-2.9.6-cp36-cp36m-win32.whl", hash = "sha256:498807b927ca2510baea1b05cc91d7da4718a0f53cb766c154c417a39f1820a0"},
+    {file = "psycopg2_binary-2.9.6-cp36-cp36m-win_amd64.whl", hash = "sha256:0d236c2825fa656a2d98bbb0e52370a2e852e5a0ec45fc4f402977313329174d"},
+    {file = "psycopg2_binary-2.9.6-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:34b9ccdf210cbbb1303c7c4db2905fa0319391bd5904d32689e6dd5c963d2ea8"},
+    {file = "psycopg2_binary-2.9.6-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:84d2222e61f313c4848ff05353653bf5f5cf6ce34df540e4274516880d9c3763"},
+    {file = "psycopg2_binary-2.9.6-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:30637a20623e2a2eacc420059be11527f4458ef54352d870b8181a4c3020ae6b"},
+    {file = "psycopg2_binary-2.9.6-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8122cfc7cae0da9a3077216528b8bb3629c43b25053284cc868744bfe71eb141"},
+    {file = "psycopg2_binary-2.9.6-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:38601cbbfe600362c43714482f43b7c110b20cb0f8172422c616b09b85a750c5"},
+    {file = "psycopg2_binary-2.9.6-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:c7e62ab8b332147a7593a385d4f368874d5fe4ad4e341770d4983442d89603e3"},
+    {file = "psycopg2_binary-2.9.6-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:2ab652e729ff4ad76d400df2624d223d6e265ef81bb8aa17fbd63607878ecbee"},
+    {file = "psycopg2_binary-2.9.6-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:c83a74b68270028dc8ee74d38ecfaf9c90eed23c8959fca95bd703d25b82c88e"},
+    {file = "psycopg2_binary-2.9.6-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:d4e6036decf4b72d6425d5b29bbd3e8f0ff1059cda7ac7b96d6ac5ed34ffbacd"},
+    {file = "psycopg2_binary-2.9.6-cp37-cp37m-win32.whl", hash = "sha256:a8c28fd40a4226b4a84bdf2d2b5b37d2c7bd49486b5adcc200e8c7ec991dfa7e"},
+    {file = "psycopg2_binary-2.9.6-cp37-cp37m-win_amd64.whl", hash = "sha256:51537e3d299be0db9137b321dfb6a5022caaab275775680e0c3d281feefaca6b"},
+    {file = "psycopg2_binary-2.9.6-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:cf4499e0a83b7b7edcb8dabecbd8501d0d3a5ef66457200f77bde3d210d5debb"},
+    {file = "psycopg2_binary-2.9.6-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:7e13a5a2c01151f1208d5207e42f33ba86d561b7a89fca67c700b9486a06d0e2"},
+    {file = "psycopg2_binary-2.9.6-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0e0f754d27fddcfd74006455b6e04e6705d6c31a612ec69ddc040a5468e44b4e"},
+    {file = "psycopg2_binary-2.9.6-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d57c3fd55d9058645d26ae37d76e61156a27722097229d32a9e73ed54819982a"},
+    {file = "psycopg2_binary-2.9.6-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:71f14375d6f73b62800530b581aed3ada394039877818b2d5f7fc77e3bb6894d"},
+    {file = "psycopg2_binary-2.9.6-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:441cc2f8869a4f0f4bb408475e5ae0ee1f3b55b33f350406150277f7f35384fc"},
+    {file = "psycopg2_binary-2.9.6-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:65bee1e49fa6f9cf327ce0e01c4c10f39165ee76d35c846ade7cb0ec6683e303"},
+    {file = "psycopg2_binary-2.9.6-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:af335bac6b666cc6aea16f11d486c3b794029d9df029967f9938a4bed59b6a19"},
+    {file = "psycopg2_binary-2.9.6-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:cfec476887aa231b8548ece2e06d28edc87c1397ebd83922299af2e051cf2827"},
+    {file = "psycopg2_binary-2.9.6-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:65c07febd1936d63bfde78948b76cd4c2a411572a44ac50719ead41947d0f26b"},
+    {file = "psycopg2_binary-2.9.6-cp38-cp38-win32.whl", hash = "sha256:4dfb4be774c4436a4526d0c554af0cc2e02082c38303852a36f6456ece7b3503"},
+    {file = "psycopg2_binary-2.9.6-cp38-cp38-win_amd64.whl", hash = "sha256:02c6e3cf3439e213e4ee930308dc122d6fb4d4bea9aef4a12535fbd605d1a2fe"},
+    {file = "psycopg2_binary-2.9.6-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:e9182eb20f41417ea1dd8e8f7888c4d7c6e805f8a7c98c1081778a3da2bee3e4"},
+    {file = "psycopg2_binary-2.9.6-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:8a6979cf527e2603d349a91060f428bcb135aea2be3201dff794813256c274f1"},
+    {file = "psycopg2_binary-2.9.6-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8338a271cb71d8da40b023a35d9c1e919eba6cbd8fa20a54b748a332c355d896"},
+    {file = "psycopg2_binary-2.9.6-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e3ed340d2b858d6e6fb5083f87c09996506af483227735de6964a6100b4e6a54"},
+    {file = "psycopg2_binary-2.9.6-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f81e65376e52f03422e1fb475c9514185669943798ed019ac50410fb4c4df232"},
+    {file = "psycopg2_binary-2.9.6-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bfb13af3c5dd3a9588000910178de17010ebcccd37b4f9794b00595e3a8ddad3"},
+    {file = "psycopg2_binary-2.9.6-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:4c727b597c6444a16e9119386b59388f8a424223302d0c06c676ec8b4bc1f963"},
+    {file = "psycopg2_binary-2.9.6-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:4d67fbdaf177da06374473ef6f7ed8cc0a9dc640b01abfe9e8a2ccb1b1402c1f"},
+    {file = "psycopg2_binary-2.9.6-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:0892ef645c2fabb0c75ec32d79f4252542d0caec1d5d949630e7d242ca4681a3"},
+    {file = "psycopg2_binary-2.9.6-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:02c0f3757a4300cf379eb49f543fb7ac527fb00144d39246ee40e1df684ab514"},
+    {file = "psycopg2_binary-2.9.6-cp39-cp39-win32.whl", hash = "sha256:c3dba7dab16709a33a847e5cd756767271697041fbe3fe97c215b1fc1f5c9848"},
+    {file = "psycopg2_binary-2.9.6-cp39-cp39-win_amd64.whl", hash = "sha256:f6a88f384335bb27812293fdb11ac6aee2ca3f51d3c7820fe03de0a304ab6249"},
 ]
 
 [[package]]
diff --git a/pyproject.toml b/pyproject.toml
index 2c21af6982..ac4e8fa2dd 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -7,7 +7,7 @@ authors = []
 [tool.poetry.dependencies]
 python = "^3.9"
 pytest = "^7.3.1"
-psycopg2-binary = "^2.9.1"
+psycopg2-binary = "^2.9.6"
 typing-extensions = "^4.6.1"
 PyJWT = {version = "^2.1.0", extras = ["crypto"]}
 requests = "^2.31.0"

From 9c8c55e8191fed0c07efebc01cff286d58e6c32d Mon Sep 17 00:00:00 2001
From: arpad-m <arpad-m@users.noreply.github.com>
Date: Mon, 3 Jul 2023 19:34:07 +0200
Subject: [PATCH 08/32] Add _cached and _bytes to
 pageserver_tenant_synthetic_size metric name (#4616)

This renames the `pageserver_tenant_synthetic_size` metric to
`pageserver_tenant_synthetic_cached_size_bytes`, as was requested on
slack (link in the linked issue).

* `_cached` to hint that it is not incrementally calculated
* `_bytes` to indicate the unit the size is measured in

Fixes #3748
---
 pageserver/src/metrics.rs | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index 00745143bd..8765aa6aff 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -204,11 +204,11 @@ pub static TENANT_STATE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
 
 pub static TENANT_SYNTHETIC_SIZE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
     register_uint_gauge_vec!(
-        "pageserver_tenant_synthetic_size",
-        "Synthetic size of each tenant",
+        "pageserver_tenant_synthetic_cached_size_bytes",
+        "Synthetic size of each tenant in bytes",
         &["tenant_id"]
     )
-    .expect("Failed to register pageserver_tenant_synthetic_size metric")
+    .expect("Failed to register pageserver_tenant_synthetic_cached_size_bytes metric")
 });
 
 // Metrics for cloud upload. These metrics reflect data uploaded to cloud storage,

From ab2ea8cfa53cd32b9075f7c1af32325137cddccf Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Tue, 4 Jul 2023 14:54:59 +0100
Subject: [PATCH 09/32] use pbkdf2 crate (#4626)

## Problem

While pbkdf2 is a simple algorithm, we should probably use a well tested
implementation

## Summary of changes

* Use pbkdf2 crate
* Use arrays like the hmac comment says

## Checklist before requesting a review

- [X] I have performed a self-review of my code.
- [X] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.
---
 Cargo.lock                  | 11 ++++++
 Cargo.toml                  |  1 +
 proxy/Cargo.toml            |  1 +
 proxy/src/scram.rs          | 71 +++++++++++++++++++++++++++++++++----
 proxy/src/scram/password.rs | 52 ++++++++++++++++++++-------
 5 files changed, 116 insertions(+), 20 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 0c0ed530fb..b163d4fe46 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2617,6 +2617,16 @@ dependencies = [
  "windows-sys 0.45.0",
 ]
 
+[[package]]
+name = "pbkdf2"
+version = "0.12.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f0ca0b5a68607598bf3bad68f32227a8164f6254833f84eafaac409cd6746c31"
+dependencies = [
+ "digest",
+ "hmac",
+]
+
 [[package]]
 name = "peeking_take_while"
 version = "0.1.2"
@@ -3010,6 +3020,7 @@ dependencies = [
  "once_cell",
  "opentelemetry",
  "parking_lot 0.12.1",
+ "pbkdf2",
  "pin-project-lite",
  "postgres-native-tls",
  "postgres_backend",
diff --git a/Cargo.toml b/Cargo.toml
index 4b9bd8b961..f36e8f6569 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -86,6 +86,7 @@ opentelemetry = "0.18.0"
 opentelemetry-otlp = { version = "0.11.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
 opentelemetry-semantic-conventions = "0.10.0"
 parking_lot = "0.12"
+pbkdf2 = "0.12.1"
 pin-project-lite = "0.2"
 prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
 prost = "0.11"
diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml
index c8b0cc7cf0..438dd62315 100644
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -29,6 +29,7 @@ metrics.workspace = true
 once_cell.workspace = true
 opentelemetry.workspace = true
 parking_lot.workspace = true
+pbkdf2.workspace = true
 pin-project-lite.workspace = true
 postgres_backend.workspace = true
 pq_proto.workspace = true
diff --git a/proxy/src/scram.rs b/proxy/src/scram.rs
index 7cc4191435..85854427ed 100644
--- a/proxy/src/scram.rs
+++ b/proxy/src/scram.rs
@@ -45,17 +45,74 @@ fn hmac_sha256<'a>(key: &[u8], parts: impl IntoIterator<Item = &'a [u8]>) -> [u8
     let mut mac = Hmac::<Sha256>::new_from_slice(key).expect("bad key size");
     parts.into_iter().for_each(|s| mac.update(s));
 
-    // TODO: maybe newer `hmac` et al already migrated to regular arrays?
-    let mut result = [0u8; 32];
-    result.copy_from_slice(mac.finalize().into_bytes().as_slice());
-    result
+    mac.finalize().into_bytes().into()
 }
 
 fn sha256<'a>(parts: impl IntoIterator<Item = &'a [u8]>) -> [u8; 32] {
     let mut hasher = Sha256::new();
     parts.into_iter().for_each(|s| hasher.update(s));
 
-    let mut result = [0u8; 32];
-    result.copy_from_slice(hasher.finalize().as_slice());
-    result
+    hasher.finalize().into()
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::sasl::{Mechanism, Step};
+
+    use super::{password::SaltedPassword, Exchange, ServerSecret};
+
+    #[test]
+    fn happy_path() {
+        let iterations = 4096;
+        let salt_base64 = "QSXCR+Q6sek8bf92";
+        let pw = SaltedPassword::new(
+            b"pencil",
+            base64::decode(salt_base64).unwrap().as_slice(),
+            iterations,
+        );
+
+        let secret = ServerSecret {
+            iterations,
+            salt_base64: salt_base64.to_owned(),
+            stored_key: pw.client_key().sha256(),
+            server_key: pw.server_key(),
+            doomed: false,
+        };
+        const NONCE: [u8; 18] = [
+            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
+        ];
+        let mut exchange = Exchange::new(&secret, || NONCE, None);
+
+        let client_first = "n,,n=user,r=rOprNGfwEbeRWgbNEkqO";
+        let client_final = "c=biws,r=rOprNGfwEbeRWgbNEkqOAQIDBAUGBwgJCgsMDQ4PEBES,p=rw1r5Kph5ThxmaUBC2GAQ6MfXbPnNkFiTIvdb/Rear0=";
+        let server_first =
+            "r=rOprNGfwEbeRWgbNEkqOAQIDBAUGBwgJCgsMDQ4PEBES,s=QSXCR+Q6sek8bf92,i=4096";
+        let server_final = "v=qtUDIofVnIhM7tKn93EQUUt5vgMOldcDVu1HC+OH0o0=";
+
+        exchange = match exchange.exchange(client_first).unwrap() {
+            Step::Continue(exchange, message) => {
+                assert_eq!(message, server_first);
+                exchange
+            }
+            Step::Success(_, _) => panic!("expected continue, got success"),
+            Step::Failure(f) => panic!("{f}"),
+        };
+
+        let key = match exchange.exchange(client_final).unwrap() {
+            Step::Success(key, message) => {
+                assert_eq!(message, server_final);
+                key
+            }
+            Step::Continue(_, _) => panic!("expected success, got continue"),
+            Step::Failure(f) => panic!("{f}"),
+        };
+
+        assert_eq!(
+            key.as_bytes(),
+            [
+                74, 103, 1, 132, 12, 31, 200, 48, 28, 54, 82, 232, 207, 12, 138, 189, 40, 32, 134,
+                27, 125, 170, 232, 35, 171, 167, 166, 41, 70, 228, 182, 112,
+            ]
+        );
+    }
 }
diff --git a/proxy/src/scram/password.rs b/proxy/src/scram/password.rs
index 656780d853..022f2842dd 100644
--- a/proxy/src/scram/password.rs
+++ b/proxy/src/scram/password.rs
@@ -14,19 +14,7 @@ impl SaltedPassword {
     /// See `scram-common.c : scram_SaltedPassword` for details.
     /// Further reading: <https://datatracker.ietf.org/doc/html/rfc2898> (see `PBKDF2`).
     pub fn new(password: &[u8], salt: &[u8], iterations: u32) -> SaltedPassword {
-        let one = 1_u32.to_be_bytes(); // magic
-
-        let mut current = super::hmac_sha256(password, [salt, &one]);
-        let mut result = current;
-        for _ in 1..iterations {
-            current = super::hmac_sha256(password, [current.as_ref()]);
-            // TODO: result = current.zip(result).map(|(x, y)| x ^ y), issue #80094
-            for (i, x) in current.iter().enumerate() {
-                result[i] ^= x;
-            }
-        }
-
-        result.into()
+        pbkdf2::pbkdf2_hmac_array::<sha2::Sha256, 32>(password, salt, iterations).into()
     }
 
     /// Derive `ClientKey` from a salted hashed password.
@@ -46,3 +34,41 @@ impl From<[u8; SALTED_PASSWORD_LEN]> for SaltedPassword {
         Self { bytes }
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::SaltedPassword;
+
+    fn legacy_pbkdf2_impl(password: &[u8], salt: &[u8], iterations: u32) -> SaltedPassword {
+        let one = 1_u32.to_be_bytes(); // magic
+
+        let mut current = super::super::hmac_sha256(password, [salt, &one]);
+        let mut result = current;
+        for _ in 1..iterations {
+            current = super::super::hmac_sha256(password, [current.as_ref()]);
+            // TODO: result = current.zip(result).map(|(x, y)| x ^ y), issue #80094
+            for (i, x) in current.iter().enumerate() {
+                result[i] ^= x;
+            }
+        }
+
+        result.into()
+    }
+
+    #[test]
+    fn pbkdf2() {
+        let password = "a-very-secure-password";
+        let salt = "such-a-random-salt";
+        let iterations = 4096;
+        let output = [
+            203, 18, 206, 81, 4, 154, 193, 100, 147, 41, 211, 217, 177, 203, 69, 210, 194, 211,
+            101, 1, 248, 156, 96, 0, 8, 223, 30, 87, 158, 41, 20, 42,
+        ];
+
+        let actual = SaltedPassword::new(password.as_bytes(), salt.as_bytes(), iterations);
+        let expected = legacy_pbkdf2_impl(password.as_bytes(), salt.as_bytes(), iterations);
+
+        assert_eq!(actual.bytes, output);
+        assert_eq!(actual.bytes, expected.bytes);
+    }
+}

From 10aba174c999c795eb3b2170cfca5597530a7eab Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Tue, 4 Jul 2023 17:40:51 +0300
Subject: [PATCH 10/32] metrics: Remove comments regarding upgradeable rwlocks
 (#4622)

Closes #4001 by removing the comments alluding towards
upgradeable/downgradeable RwLocks.
---
 pageserver/src/metrics.rs      | 5 -----
 pageserver/src/virtual_file.rs | 9 ---------
 2 files changed, 14 deletions(-)

diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index 8765aa6aff..8f85488aec 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -968,7 +968,6 @@ impl RemoteTimelineClientMetrics {
         op_kind: &RemoteOpKind,
         status: &'static str,
     ) -> Histogram {
-        // XXX would be nice to have an upgradable RwLock
         let mut guard = self.remote_operation_time.lock().unwrap();
         let key = (file_kind.as_str(), op_kind.as_str(), status);
         let metric = guard.entry(key).or_insert_with(move || {
@@ -990,7 +989,6 @@ impl RemoteTimelineClientMetrics {
         file_kind: &RemoteOpFileKind,
         op_kind: &RemoteOpKind,
     ) -> IntGauge {
-        // XXX would be nice to have an upgradable RwLock
         let mut guard = self.calls_unfinished_gauge.lock().unwrap();
         let key = (file_kind.as_str(), op_kind.as_str());
         let metric = guard.entry(key).or_insert_with(move || {
@@ -1011,7 +1009,6 @@ impl RemoteTimelineClientMetrics {
         file_kind: &RemoteOpFileKind,
         op_kind: &RemoteOpKind,
     ) -> Histogram {
-        // XXX would be nice to have an upgradable RwLock
         let mut guard = self.calls_started_hist.lock().unwrap();
         let key = (file_kind.as_str(), op_kind.as_str());
         let metric = guard.entry(key).or_insert_with(move || {
@@ -1032,7 +1029,6 @@ impl RemoteTimelineClientMetrics {
         file_kind: &RemoteOpFileKind,
         op_kind: &RemoteOpKind,
     ) -> IntCounter {
-        // XXX would be nice to have an upgradable RwLock
         let mut guard = self.bytes_started_counter.lock().unwrap();
         let key = (file_kind.as_str(), op_kind.as_str());
         let metric = guard.entry(key).or_insert_with(move || {
@@ -1053,7 +1049,6 @@ impl RemoteTimelineClientMetrics {
         file_kind: &RemoteOpFileKind,
         op_kind: &RemoteOpKind,
     ) -> IntCounter {
-        // XXX would be nice to have an upgradable RwLock
         let mut guard = self.bytes_finished_counter.lock().unwrap();
         let key = (file_kind.as_str(), op_kind.as_str());
         let metric = guard.entry(key).or_insert_with(move || {
diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs
index fb216123c1..b86d14f158 100644
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -302,15 +302,6 @@ impl VirtualFile {
             .observe_closure_duration(|| self.open_options.open(&self.path))?;
 
         // Perform the requested operation on it
-        //
-        // TODO: We could downgrade the locks to read mode before calling
-        // 'func', to allow a little bit more concurrency, but the standard
-        // library RwLock doesn't allow downgrading without releasing the lock,
-        // and that doesn't seem worth the trouble.
-        //
-        // XXX: `parking_lot::RwLock` can enable such downgrades, yet its implementation is fair and
-        // may deadlock on subsequent read calls.
-        // Simply replacing all `RwLock` in project causes deadlocks, so use it sparingly.
         let result = STORAGE_IO_TIME
             .with_label_values(&[op, &self.tenant_id, &self.timeline_id])
             .observe_closure_duration(|| func(&file));

From cbf9a408898d7c5b598842cb2d9b91dd1de9a0a8 Mon Sep 17 00:00:00 2001
From: Stas Kelvich <stas.kelvich@gmail.com>
Date: Tue, 4 Jul 2023 14:28:27 +0300
Subject: [PATCH 11/32] Set a shorter timeout for the initial connection
 attempti in proxy.

In case we try to connect to an outdated address that is no longer valid, the
default behavior of Kubernetes is to drop the packets, causing us to wait for
the entire timeout period. We want to fail fast in such cases.

A specific case to consider is when we have cached compute node information
with a 5-minute TTL (Time To Live), but the user has executed a `/suspend` API
call, resulting in the nonexistence of the compute node.
---
 proxy/src/compute.rs | 13 +++++++------
 proxy/src/config.rs  |  2 +-
 proxy/src/proxy.rs   | 30 +++++++++++++++++++++++++++---
 3 files changed, 35 insertions(+), 10 deletions(-)

diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs
index 480acb88d9..70b29679b9 100644
--- a/proxy/src/compute.rs
+++ b/proxy/src/compute.rs
@@ -136,18 +136,17 @@ impl Default for ConnCfg {
 
 impl ConnCfg {
     /// Establish a raw TCP connection to the compute node.
-    async fn connect_raw(&self) -> io::Result<(SocketAddr, TcpStream, &str)> {
+    async fn connect_raw(&self, timeout: Duration) -> io::Result<(SocketAddr, TcpStream, &str)> {
         use tokio_postgres::config::Host;
 
         // wrap TcpStream::connect with timeout
         let connect_with_timeout = |host, port| {
-            let connection_timeout = Duration::from_millis(10000);
-            tokio::time::timeout(connection_timeout, TcpStream::connect((host, port))).map(
+            tokio::time::timeout(timeout, TcpStream::connect((host, port))).map(
                 move |res| match res {
                     Ok(tcpstream_connect_res) => tcpstream_connect_res,
                     Err(_) => Err(io::Error::new(
                         io::ErrorKind::TimedOut,
-                        format!("exceeded connection timeout {connection_timeout:?}"),
+                        format!("exceeded connection timeout {timeout:?}"),
                     )),
                 },
             )
@@ -223,8 +222,9 @@ impl ConnCfg {
     async fn do_connect(
         &self,
         allow_self_signed_compute: bool,
+        timeout: Duration,
     ) -> Result<PostgresConnection, ConnectionError> {
-        let (socket_addr, stream, host) = self.connect_raw().await?;
+        let (socket_addr, stream, host) = self.connect_raw(timeout).await?;
 
         let tls_connector = native_tls::TlsConnector::builder()
             .danger_accept_invalid_certs(allow_self_signed_compute)
@@ -264,8 +264,9 @@ impl ConnCfg {
     pub async fn connect(
         &self,
         allow_self_signed_compute: bool,
+        timeout: Duration,
     ) -> Result<PostgresConnection, ConnectionError> {
-        self.do_connect(allow_self_signed_compute)
+        self.do_connect(allow_self_signed_compute, timeout)
             .inspect_err(|err| {
                 // Immediately log the error we have at our disposal.
                 error!("couldn't connect to compute node: {err}");
diff --git a/proxy/src/config.rs b/proxy/src/config.rs
index 6a26cea78e..00f561fcf2 100644
--- a/proxy/src/config.rs
+++ b/proxy/src/config.rs
@@ -212,7 +212,7 @@ pub struct CacheOptions {
 
 impl CacheOptions {
     /// Default options for [`crate::auth::caches::NodeInfoCache`].
-    pub const DEFAULT_OPTIONS_NODE_INFO: &str = "size=4000,ttl=5m";
+    pub const DEFAULT_OPTIONS_NODE_INFO: &str = "size=4000,ttl=4m";
 
     /// Parse cache options passed via cmdline.
     /// Example: [`Self::DEFAULT_OPTIONS_NODE_INFO`].
diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs
index 8efb7005c8..2433412c4c 100644
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -16,7 +16,10 @@ use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCou
 use once_cell::sync::Lazy;
 use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams};
 use std::sync::Arc;
-use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt};
+use tokio::{
+    io::{AsyncRead, AsyncWrite, AsyncWriteExt},
+    time,
+};
 use tokio_util::sync::CancellationToken;
 use tracing::{error, info, warn};
 use utils::measured_stream::MeasuredStream;
@@ -305,12 +308,13 @@ pub fn invalidate_cache(node_info: &console::CachedNodeInfo) {
 #[tracing::instrument(name = "connect_once", skip_all)]
 async fn connect_to_compute_once(
     node_info: &console::CachedNodeInfo,
+    timeout: time::Duration,
 ) -> Result<PostgresConnection, compute::ConnectionError> {
     let allow_self_signed_compute = node_info.allow_self_signed_compute;
 
     node_info
         .config
-        .connect(allow_self_signed_compute)
+        .connect(allow_self_signed_compute, timeout)
         .inspect_err(|_: &compute::ConnectionError| invalidate_cache(node_info))
         .await
 }
@@ -328,7 +332,27 @@ async fn connect_to_compute(
     loop {
         // Apply startup params to the (possibly, cached) compute node info.
         node_info.config.set_startup_params(params);
-        match connect_to_compute_once(node_info).await {
+
+        // Set a shorter timeout for the initial connection attempt.
+        //
+        // In case we try to connect to an outdated address that is no longer valid, the
+        // default behavior of Kubernetes is to drop the packets, causing us to wait for
+        // the entire timeout period. We want to fail fast in such cases.
+        //
+        // A specific case to consider is when we have cached compute node information
+        // with a 4-minute TTL (Time To Live), but the user has executed a `/suspend` API
+        // call, resulting in the nonexistence of the compute node.
+        //
+        // We only use caching in case of scram proxy backed by the console, so reduce
+        // the timeout only in that case.
+        let is_scram_proxy = matches!(creds, auth::BackendType::Console(_, _));
+        let timeout = if is_scram_proxy && num_retries == NUM_RETRIES_WAKE_COMPUTE {
+            time::Duration::from_secs(2)
+        } else {
+            time::Duration::from_secs(10)
+        };
+
+        match connect_to_compute_once(node_info, timeout).await {
             Err(e) if num_retries > 0 => {
                 info!("compute node's state has changed; requesting a wake-up");
                 match creds.wake_compute(extra).map_err(io_error).await? {

From c7143dbde6db30c14807de0c5f70a0599f268e97 Mon Sep 17 00:00:00 2001
From: bojanserafimov <bojan.serafimov7@gmail.com>
Date: Tue, 4 Jul 2023 19:07:36 -0400
Subject: [PATCH 12/32] compute_ctl: Fix misleading metric (#4608)

---
 compute_tools/src/compute.rs            | 9 +++++++--
 libs/compute_api/src/responses.rs       | 1 +
 test_runner/performance/test_startup.py | 1 +
 3 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs
index 70d83a7b47..aec4e49725 100644
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -516,9 +516,9 @@ impl ComputeNode {
         self.prepare_pgdata(&compute_state)?;
 
         let start_time = Utc::now();
-
         let pg = self.start_postgres(pspec.storage_auth_token.clone())?;
 
+        let config_time = Utc::now();
         if pspec.spec.mode == ComputeMode::Primary && !pspec.spec.skip_pg_catalog_updates {
             self.apply_config(&compute_state)?;
         }
@@ -526,11 +526,16 @@ impl ComputeNode {
         let startup_end_time = Utc::now();
         {
             let mut state = self.state.lock().unwrap();
-            state.metrics.config_ms = startup_end_time
+            state.metrics.start_postgres_ms = config_time
                 .signed_duration_since(start_time)
                 .to_std()
                 .unwrap()
                 .as_millis() as u64;
+            state.metrics.config_ms = startup_end_time
+                .signed_duration_since(config_time)
+                .to_std()
+                .unwrap()
+                .as_millis() as u64;
             state.metrics.total_startup_ms = startup_end_time
                 .signed_duration_since(compute_state.start_time)
                 .to_std()
diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs
index ce73dda08a..80e5341216 100644
--- a/libs/compute_api/src/responses.rs
+++ b/libs/compute_api/src/responses.rs
@@ -71,6 +71,7 @@ pub struct ComputeMetrics {
     pub wait_for_spec_ms: u64,
     pub sync_safekeepers_ms: u64,
     pub basebackup_ms: u64,
+    pub start_postgres_ms: u64,
     pub config_ms: u64,
     pub total_startup_ms: u64,
 }
diff --git a/test_runner/performance/test_startup.py b/test_runner/performance/test_startup.py
index 8babbbe132..4744c1ed2e 100644
--- a/test_runner/performance/test_startup.py
+++ b/test_runner/performance/test_startup.py
@@ -52,6 +52,7 @@ def test_startup_simple(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenc
             "wait_for_spec_ms": f"{i}_wait_for_spec",
             "sync_safekeepers_ms": f"{i}_sync_safekeepers",
             "basebackup_ms": f"{i}_basebackup",
+            "start_postgres_ms": f"{i}_start_postgres",
             "config_ms": f"{i}_config",
             "total_startup_ms": f"{i}_total_startup",
         }

From 3f9defbfb4428bb087b2d899e8b609f89bc01b7d Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Wed, 5 Jul 2023 10:38:32 +0200
Subject: [PATCH 13/32] page cache: add access & hit rate metrics (#4628)

Co-authored-by: Dmitry Rodionov <dmitry@neon.tech>
---
 pageserver/src/metrics.rs       | 73 +++++++++++++++++++++++++++++++++
 pageserver/src/page_cache.rs    | 41 +++++++++++++++++-
 test_runner/fixtures/metrics.py |  2 +
 3 files changed, 114 insertions(+), 2 deletions(-)

diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index 8f85488aec..db5bccdbba 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -130,6 +130,79 @@ pub static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounter> = Lazy::new(|| {
     .expect("failed to define a metric")
 });
 
+pub struct PageCacheMetrics {
+    pub read_accesses_materialized_page: IntCounter,
+    pub read_accesses_ephemeral: IntCounter,
+    pub read_accesses_immutable: IntCounter,
+
+    pub read_hits_ephemeral: IntCounter,
+    pub read_hits_immutable: IntCounter,
+    pub read_hits_materialized_page_exact: IntCounter,
+    pub read_hits_materialized_page_older_lsn: IntCounter,
+}
+
+static PAGE_CACHE_READ_HITS: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "pageserver_page_cache_read_hits_total",
+        "Number of read accesses to the page cache that hit",
+        &["key_kind", "hit_kind"]
+    )
+    .expect("failed to define a metric")
+});
+
+static PAGE_CACHE_READ_ACCESSES: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "pageserver_page_cache_read_accesses_total",
+        "Number of read accesses to the page cache",
+        &["key_kind"]
+    )
+    .expect("failed to define a metric")
+});
+
+pub static PAGE_CACHE: Lazy<PageCacheMetrics> = Lazy::new(|| PageCacheMetrics {
+    read_accesses_materialized_page: {
+        PAGE_CACHE_READ_ACCESSES
+            .get_metric_with_label_values(&["materialized_page"])
+            .unwrap()
+    },
+
+    read_accesses_ephemeral: {
+        PAGE_CACHE_READ_ACCESSES
+            .get_metric_with_label_values(&["ephemeral"])
+            .unwrap()
+    },
+
+    read_accesses_immutable: {
+        PAGE_CACHE_READ_ACCESSES
+            .get_metric_with_label_values(&["immutable"])
+            .unwrap()
+    },
+
+    read_hits_ephemeral: {
+        PAGE_CACHE_READ_HITS
+            .get_metric_with_label_values(&["ephemeral", "-"])
+            .unwrap()
+    },
+
+    read_hits_immutable: {
+        PAGE_CACHE_READ_HITS
+            .get_metric_with_label_values(&["immutable", "-"])
+            .unwrap()
+    },
+
+    read_hits_materialized_page_exact: {
+        PAGE_CACHE_READ_HITS
+            .get_metric_with_label_values(&["materialized_page", "exact"])
+            .unwrap()
+    },
+
+    read_hits_materialized_page_older_lsn: {
+        PAGE_CACHE_READ_HITS
+            .get_metric_with_label_values(&["materialized_page", "older_lsn"])
+            .unwrap()
+    },
+});
+
 static WAIT_LSN_TIME: Lazy<HistogramVec> = Lazy::new(|| {
     register_histogram_vec!(
         "pageserver_wait_lsn_seconds",
diff --git a/pageserver/src/page_cache.rs b/pageserver/src/page_cache.rs
index d2fe06697e..ef0e748d10 100644
--- a/pageserver/src/page_cache.rs
+++ b/pageserver/src/page_cache.rs
@@ -313,6 +313,10 @@ impl PageCache {
         key: &Key,
         lsn: Lsn,
     ) -> Option<(Lsn, PageReadGuard)> {
+        crate::metrics::PAGE_CACHE
+            .read_accesses_materialized_page
+            .inc();
+
         let mut cache_key = CacheKey::MaterializedPage {
             hash_key: MaterializedPageHashKey {
                 tenant_id,
@@ -323,8 +327,21 @@ impl PageCache {
         };
 
         if let Some(guard) = self.try_lock_for_read(&mut cache_key) {
-            if let CacheKey::MaterializedPage { hash_key: _, lsn } = cache_key {
-                Some((lsn, guard))
+            if let CacheKey::MaterializedPage {
+                hash_key: _,
+                lsn: available_lsn,
+            } = cache_key
+            {
+                if available_lsn == lsn {
+                    crate::metrics::PAGE_CACHE
+                        .read_hits_materialized_page_exact
+                        .inc();
+                } else {
+                    crate::metrics::PAGE_CACHE
+                        .read_hits_materialized_page_older_lsn
+                        .inc();
+                }
+                Some((available_lsn, guard))
             } else {
                 panic!("unexpected key type in slot");
             }
@@ -499,11 +516,31 @@ impl PageCache {
     /// ```
     ///
     fn lock_for_read(&self, cache_key: &mut CacheKey) -> anyhow::Result<ReadBufResult> {
+        let (read_access, hit) = match cache_key {
+            CacheKey::MaterializedPage { .. } => {
+                unreachable!("Materialized pages use lookup_materialized_page")
+            }
+            CacheKey::EphemeralPage { .. } => (
+                &crate::metrics::PAGE_CACHE.read_accesses_ephemeral,
+                &crate::metrics::PAGE_CACHE.read_hits_ephemeral,
+            ),
+            CacheKey::ImmutableFilePage { .. } => (
+                &crate::metrics::PAGE_CACHE.read_accesses_immutable,
+                &crate::metrics::PAGE_CACHE.read_hits_immutable,
+            ),
+        };
+        read_access.inc();
+
+        let mut is_first_iteration = true;
         loop {
             // First check if the key already exists in the cache.
             if let Some(read_guard) = self.try_lock_for_read(cache_key) {
+                if is_first_iteration {
+                    hit.inc();
+                }
                 return Ok(ReadBufResult::Found(read_guard));
             }
+            is_first_iteration = false;
 
             // Not found. Find a victim buffer
             let (slot_idx, mut inner) =
diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py
index 7ee3c33f92..a2bc2e28e5 100644
--- a/test_runner/fixtures/metrics.py
+++ b/test_runner/fixtures/metrics.py
@@ -59,6 +59,8 @@ PAGESERVER_GLOBAL_METRICS: Tuple[str, ...] = (
     "libmetrics_tracing_event_count_total",
     "pageserver_materialized_cache_hits_total",
     "pageserver_materialized_cache_hits_direct_total",
+    "pageserver_page_cache_read_hits_total",
+    "pageserver_page_cache_read_accesses_total",
     "pageserver_getpage_reconstruct_seconds_bucket",
     "pageserver_getpage_reconstruct_seconds_count",
     "pageserver_getpage_reconstruct_seconds_sum",

From f1db87ac361a43e4758a25c982ba4008ba290ce4 Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik <knizhnik@garret.ru>
Date: Wed, 5 Jul 2023 11:40:38 +0300
Subject: [PATCH 14/32] Check if there is enough memory for HNSW index (#4602)

## Problem

HNSW index is created in memory.
Try to prevent OOM by checking of available RAM.


## Summary of changes

## Checklist before requesting a review

- [ ] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.

## Checklist before merging

- [ ] Do not forget to reformat commit message to not include the above
checklist

---------

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
---
 pgxn/hnsw/hnsw.c       | 39 +++++++++++++++++++++++++++++++++++++++
 pgxn/hnsw/hnsw.control |  2 +-
 2 files changed, 40 insertions(+), 1 deletion(-)

diff --git a/pgxn/hnsw/hnsw.c b/pgxn/hnsw/hnsw.c
index 434f4986f8..45bf78ed3b 100644
--- a/pgxn/hnsw/hnsw.c
+++ b/pgxn/hnsw/hnsw.c
@@ -122,6 +122,43 @@ hnsw_populate(HierarchicalNSW* hnsw, Relation indexRel, Relation heapRel)
 						   true, true, hnsw_build_callback, (void *) hnsw, NULL);
 }
 
+#ifdef __APPLE__
+
+#include <sys/types.h>
+#include <sys/sysctl.h>
+
+static void
+hnsw_check_available_memory(Size requested)
+{
+	size_t total;
+	if (sysctlbyname("hw.memsize", NULL, &total, NULL, 0) < 0)
+		elog(ERROR, "Failed to get amount of RAM: %m");
+
+	if ((Size)NBuffers*BLCKSZ + requested >= total)
+		elog(ERROR, "HNSW index requeries %ld bytes while only %ld are available",
+			requested, total - (Size)NBuffers*BLCKSZ);
+}
+
+#else
+
+#include <sys/sysinfo.h>
+
+static void
+hnsw_check_available_memory(Size requested)
+{
+	struct sysinfo si;
+	Size total;
+	if (sysinfo(&si) < 0)
+		elog(ERROR, "Failed to get amount of RAM: %n");
+
+	total = si.totalram*si.mem_unit;
+	if ((Size)NBuffers*BLCKSZ + requested >= total)
+		elog(ERROR, "HNSW index requeries %ld bytes while only %ld are available",
+			requested, total - (Size)NBuffers*BLCKSZ);
+}
+
+#endif
+
 static HierarchicalNSW*
 hnsw_get_index(Relation indexRel, Relation heapRel)
 {
@@ -156,6 +193,8 @@ hnsw_get_index(Relation indexRel, Relation heapRel)
 		size_data_per_element = size_links_level0 + data_size + sizeof(label_t);
 		shmem_size =  hnsw_sizeof() + maxelements * size_data_per_element;
 
+		hnsw_check_available_memory(shmem_size);
+
 		/* first try to attach to existed index */
 		if (!dsm_impl_op(DSM_OP_ATTACH, handle, 0, &impl_private,
 						 &mapped_address, &mapped_size, DEBUG1))
diff --git a/pgxn/hnsw/hnsw.control b/pgxn/hnsw/hnsw.control
index b292b96026..8b75c350a8 100644
--- a/pgxn/hnsw/hnsw.control
+++ b/pgxn/hnsw/hnsw.control
@@ -1,4 +1,4 @@
-comment = 'hNsw index'
+comment = 'hnsw index'
 default_version = '0.1.0'
 module_pathname = '$libdir/hnsw'
 relocatable = true

From dbf88cf2d72c2298ee6606bf89c5b6bf854fc195 Mon Sep 17 00:00:00 2001
From: Stas Kelvich <stas.kelvich@gmail.com>
Date: Tue, 4 Jul 2023 13:27:08 +0300
Subject: [PATCH 15/32] Minimalistic pool for http endpoint compute connections
 (under opt-in flag)

Cache up to 20 connections per endpoint. Once all pooled connections
are used current implementation can open an extra connection, so the
maximum number of simultaneous connections is not enforced.

There are more things to do here, especially with background clean-up
of closed connections, and checks for transaction state. But current
implementation allows to check for smaller coonection latencies that
this cache should bring.
---
 proxy/src/http.rs               |   1 +
 proxy/src/http/conn_pool.rs     | 278 ++++++++++++++++++++++++++++++++
 proxy/src/http/sql_over_http.rs | 131 +++------------
 proxy/src/http/websocket.rs     |  12 +-
 4 files changed, 307 insertions(+), 115 deletions(-)
 create mode 100644 proxy/src/http/conn_pool.rs

diff --git a/proxy/src/http.rs b/proxy/src/http.rs
index 153cdc02e5..e281240380 100644
--- a/proxy/src/http.rs
+++ b/proxy/src/http.rs
@@ -2,6 +2,7 @@
 //! Other modules should use stuff from this module instead of
 //! directly relying on deps like `reqwest` (think loose coupling).
 
+pub mod conn_pool;
 pub mod server;
 pub mod sql_over_http;
 pub mod websocket;
diff --git a/proxy/src/http/conn_pool.rs b/proxy/src/http/conn_pool.rs
new file mode 100644
index 0000000000..52c1e2f2ce
--- /dev/null
+++ b/proxy/src/http/conn_pool.rs
@@ -0,0 +1,278 @@
+use parking_lot::Mutex;
+use pq_proto::StartupMessageParams;
+use std::fmt;
+use std::{collections::HashMap, sync::Arc};
+
+use futures::TryFutureExt;
+
+use crate::config;
+use crate::{auth, console};
+
+use super::sql_over_http::MAX_RESPONSE_SIZE;
+
+use crate::proxy::invalidate_cache;
+use crate::proxy::NUM_RETRIES_WAKE_COMPUTE;
+
+use tracing::error;
+use tracing::info;
+
+pub const APP_NAME: &str = "sql_over_http";
+const MAX_CONNS_PER_ENDPOINT: usize = 20;
+
+#[derive(Debug)]
+pub struct ConnInfo {
+    pub username: String,
+    pub dbname: String,
+    pub hostname: String,
+    pub password: String,
+}
+
+impl ConnInfo {
+    // hm, change to hasher to avoid cloning?
+    pub fn db_and_user(&self) -> (String, String) {
+        (self.dbname.clone(), self.username.clone())
+    }
+}
+
+impl fmt::Display for ConnInfo {
+    // use custom display to avoid logging password
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(f, "{}@{}/{}", self.username, self.hostname, self.dbname)
+    }
+}
+
+struct ConnPoolEntry {
+    conn: tokio_postgres::Client,
+    _last_access: std::time::Instant,
+}
+
+// Per-endpoint connection pool, (dbname, username) -> Vec<ConnPoolEntry>
+// Number of open connections is limited by the `max_conns_per_endpoint`.
+pub struct EndpointConnPool {
+    pools: HashMap<(String, String), Vec<ConnPoolEntry>>,
+    total_conns: usize,
+}
+
+pub struct GlobalConnPool {
+    // endpoint -> per-endpoint connection pool
+    //
+    // That should be a fairly conteded map, so return reference to the per-endpoint
+    // pool as early as possible and release the lock.
+    global_pool: Mutex<HashMap<String, Arc<Mutex<EndpointConnPool>>>>,
+
+    // Maximum number of connections per one endpoint.
+    // Can mix different (dbname, username) connections.
+    // When running out of free slots for a particular endpoint,
+    // falls back to opening a new connection for each request.
+    max_conns_per_endpoint: usize,
+
+    proxy_config: &'static crate::config::ProxyConfig,
+}
+
+impl GlobalConnPool {
+    pub fn new(config: &'static crate::config::ProxyConfig) -> Arc<Self> {
+        Arc::new(Self {
+            global_pool: Mutex::new(HashMap::new()),
+            max_conns_per_endpoint: MAX_CONNS_PER_ENDPOINT,
+            proxy_config: config,
+        })
+    }
+
+    pub async fn get(
+        &self,
+        conn_info: &ConnInfo,
+        force_new: bool,
+    ) -> anyhow::Result<tokio_postgres::Client> {
+        let mut client: Option<tokio_postgres::Client> = None;
+
+        if !force_new {
+            let pool = self.get_endpoint_pool(&conn_info.hostname).await;
+
+            // find a pool entry by (dbname, username) if exists
+            let mut pool = pool.lock();
+            let pool_entries = pool.pools.get_mut(&conn_info.db_and_user());
+            if let Some(pool_entries) = pool_entries {
+                if let Some(entry) = pool_entries.pop() {
+                    client = Some(entry.conn);
+                    pool.total_conns -= 1;
+                }
+            }
+        }
+
+        // ok return cached connection if found and establish a new one otherwise
+        if let Some(client) = client {
+            if client.is_closed() {
+                info!("pool: cached connection '{conn_info}' is closed, opening a new one");
+                connect_to_compute(self.proxy_config, conn_info).await
+            } else {
+                info!("pool: reusing connection '{conn_info}'");
+                Ok(client)
+            }
+        } else {
+            info!("pool: opening a new connection '{conn_info}'");
+            connect_to_compute(self.proxy_config, conn_info).await
+        }
+    }
+
+    pub async fn put(
+        &self,
+        conn_info: &ConnInfo,
+        client: tokio_postgres::Client,
+    ) -> anyhow::Result<()> {
+        let pool = self.get_endpoint_pool(&conn_info.hostname).await;
+
+        // return connection to the pool
+        let mut total_conns;
+        let mut returned = false;
+        let mut per_db_size = 0;
+        {
+            let mut pool = pool.lock();
+            total_conns = pool.total_conns;
+
+            let pool_entries: &mut Vec<ConnPoolEntry> = pool
+                .pools
+                .entry(conn_info.db_and_user())
+                .or_insert_with(|| Vec::with_capacity(1));
+            if total_conns < self.max_conns_per_endpoint {
+                pool_entries.push(ConnPoolEntry {
+                    conn: client,
+                    _last_access: std::time::Instant::now(),
+                });
+
+                total_conns += 1;
+                returned = true;
+                per_db_size = pool_entries.len();
+
+                pool.total_conns += 1;
+            }
+        }
+
+        // do logging outside of the mutex
+        if returned {
+            info!("pool: returning connection '{conn_info}' back to the pool, total_conns={total_conns}, for this (db, user)={per_db_size}");
+        } else {
+            info!("pool: throwing away connection '{conn_info}' because pool is full, total_conns={total_conns}");
+        }
+
+        Ok(())
+    }
+
+    async fn get_endpoint_pool(&self, endpoint: &String) -> Arc<Mutex<EndpointConnPool>> {
+        // find or create a pool for this endpoint
+        let mut created = false;
+        let mut global_pool = self.global_pool.lock();
+        let pool = global_pool
+            .entry(endpoint.clone())
+            .or_insert_with(|| {
+                created = true;
+                Arc::new(Mutex::new(EndpointConnPool {
+                    pools: HashMap::new(),
+                    total_conns: 0,
+                }))
+            })
+            .clone();
+        let global_pool_size = global_pool.len();
+        drop(global_pool);
+
+        // log new global pool size
+        if created {
+            info!(
+                "pool: created new pool for '{endpoint}', global pool size now {global_pool_size}"
+            );
+        }
+
+        pool
+    }
+}
+
+//
+// Wake up the destination if needed. Code here is a bit involved because
+// we reuse the code from the usual proxy and we need to prepare few structures
+// that this code expects.
+//
+async fn connect_to_compute(
+    config: &config::ProxyConfig,
+    conn_info: &ConnInfo,
+) -> anyhow::Result<tokio_postgres::Client> {
+    let tls = config.tls_config.as_ref();
+    let common_names = tls.and_then(|tls| tls.common_names.clone());
+
+    let credential_params = StartupMessageParams::new([
+        ("user", &conn_info.username),
+        ("database", &conn_info.dbname),
+        ("application_name", APP_NAME),
+    ]);
+
+    let creds = config
+        .auth_backend
+        .as_ref()
+        .map(|_| {
+            auth::ClientCredentials::parse(
+                &credential_params,
+                Some(&conn_info.hostname),
+                common_names,
+            )
+        })
+        .transpose()?;
+    let extra = console::ConsoleReqExtra {
+        session_id: uuid::Uuid::new_v4(),
+        application_name: Some(APP_NAME),
+    };
+
+    let node_info = &mut creds.wake_compute(&extra).await?.expect("msg");
+
+    // This code is a copy of `connect_to_compute` from `src/proxy.rs` with
+    // the difference that it uses `tokio_postgres` for the connection.
+    let mut num_retries: usize = NUM_RETRIES_WAKE_COMPUTE;
+    loop {
+        match connect_to_compute_once(node_info, conn_info).await {
+            Err(e) if num_retries > 0 => {
+                info!("compute node's state has changed; requesting a wake-up");
+                match creds.wake_compute(&extra).await? {
+                    // Update `node_info` and try one more time.
+                    Some(new) => {
+                        *node_info = new;
+                    }
+                    // Link auth doesn't work that way, so we just exit.
+                    None => return Err(e),
+                }
+            }
+            other => return other,
+        }
+
+        num_retries -= 1;
+        info!("retrying after wake-up ({num_retries} attempts left)");
+    }
+}
+
+async fn connect_to_compute_once(
+    node_info: &console::CachedNodeInfo,
+    conn_info: &ConnInfo,
+) -> anyhow::Result<tokio_postgres::Client> {
+    let mut config = (*node_info.config).clone();
+
+    let (client, connection) = config
+        .user(&conn_info.username)
+        .password(&conn_info.password)
+        .dbname(&conn_info.dbname)
+        .max_backend_message_size(MAX_RESPONSE_SIZE)
+        .connect(tokio_postgres::NoTls)
+        .inspect_err(|e: &tokio_postgres::Error| {
+            error!(
+                "failed to connect to compute node hosts={:?} ports={:?}: {}",
+                node_info.config.get_hosts(),
+                node_info.config.get_ports(),
+                e
+            );
+            invalidate_cache(node_info)
+        })
+        .await?;
+
+    tokio::spawn(async move {
+        if let Err(e) = connection.await {
+            error!("connection error: {}", e);
+        }
+    });
+
+    Ok(client)
+}
diff --git a/proxy/src/http/sql_over_http.rs b/proxy/src/http/sql_over_http.rs
index e8ad2d04f3..adf7252f72 100644
--- a/proxy/src/http/sql_over_http.rs
+++ b/proxy/src/http/sql_over_http.rs
@@ -1,25 +1,21 @@
+use std::sync::Arc;
+
 use futures::pin_mut;
 use futures::StreamExt;
-use futures::TryFutureExt;
 use hyper::body::HttpBody;
 use hyper::http::HeaderName;
 use hyper::http::HeaderValue;
 use hyper::{Body, HeaderMap, Request};
-use pq_proto::StartupMessageParams;
 use serde_json::json;
 use serde_json::Map;
 use serde_json::Value;
 use tokio_postgres::types::Kind;
 use tokio_postgres::types::Type;
 use tokio_postgres::Row;
-use tracing::error;
-use tracing::info;
-use tracing::instrument;
 use url::Url;
 
-use crate::proxy::invalidate_cache;
-use crate::proxy::NUM_RETRIES_WAKE_COMPUTE;
-use crate::{auth, config::ProxyConfig, console};
+use super::conn_pool::ConnInfo;
+use super::conn_pool::GlobalConnPool;
 
 #[derive(serde::Deserialize)]
 struct QueryData {
@@ -27,12 +23,13 @@ struct QueryData {
     params: Vec<serde_json::Value>,
 }
 
-const APP_NAME: &str = "sql_over_http";
-const MAX_RESPONSE_SIZE: usize = 1024 * 1024; // 1 MB
+pub const MAX_RESPONSE_SIZE: usize = 1024 * 1024; // 1 MB
 const MAX_REQUEST_SIZE: u64 = 1024 * 1024; // 1 MB
 
 static RAW_TEXT_OUTPUT: HeaderName = HeaderName::from_static("neon-raw-text-output");
 static ARRAY_MODE: HeaderName = HeaderName::from_static("neon-array-mode");
+static ALLOW_POOL: HeaderName = HeaderName::from_static("neon-pool-opt-in");
+
 static HEADER_VALUE_TRUE: HeaderValue = HeaderValue::from_static("true");
 
 //
@@ -96,13 +93,6 @@ fn json_array_to_pg_array(value: &Value) -> Result<Option<String>, serde_json::E
     }
 }
 
-struct ConnInfo {
-    username: String,
-    dbname: String,
-    hostname: String,
-    password: String,
-}
-
 fn get_conn_info(
     headers: &HeaderMap,
     sni_hostname: Option<String>,
@@ -169,50 +159,23 @@ fn get_conn_info(
 
 // TODO: return different http error codes
 pub async fn handle(
-    config: &'static ProxyConfig,
     request: Request<Body>,
     sni_hostname: Option<String>,
+    conn_pool: Arc<GlobalConnPool>,
 ) -> anyhow::Result<Value> {
     //
     // Determine the destination and connection params
     //
     let headers = request.headers();
     let conn_info = get_conn_info(headers, sni_hostname)?;
-    let credential_params = StartupMessageParams::new([
-        ("user", &conn_info.username),
-        ("database", &conn_info.dbname),
-        ("application_name", APP_NAME),
-    ]);
 
     // Determine the output options. Default behaviour is 'false'. Anything that is not
     // strictly 'true' assumed to be false.
     let raw_output = headers.get(&RAW_TEXT_OUTPUT) == Some(&HEADER_VALUE_TRUE);
     let array_mode = headers.get(&ARRAY_MODE) == Some(&HEADER_VALUE_TRUE);
 
-    //
-    // Wake up the destination if needed. Code here is a bit involved because
-    // we reuse the code from the usual proxy and we need to prepare few structures
-    // that this code expects.
-    //
-    let tls = config.tls_config.as_ref();
-    let common_names = tls.and_then(|tls| tls.common_names.clone());
-    let creds = config
-        .auth_backend
-        .as_ref()
-        .map(|_| {
-            auth::ClientCredentials::parse(
-                &credential_params,
-                Some(&conn_info.hostname),
-                common_names,
-            )
-        })
-        .transpose()?;
-    let extra = console::ConsoleReqExtra {
-        session_id: uuid::Uuid::new_v4(),
-        application_name: Some(APP_NAME),
-    };
-
-    let mut node_info = creds.wake_compute(&extra).await?.expect("msg");
+    // Allow connection pooling only if explicitly requested
+    let allow_pool = headers.get(&ALLOW_POOL) == Some(&HEADER_VALUE_TRUE);
 
     let request_content_length = match request.body().size_hint().upper() {
         Some(v) => v,
@@ -235,7 +198,8 @@ pub async fn handle(
     //
     // Now execute the query and return the result
     //
-    let client = connect_to_compute(&mut node_info, &extra, &creds, &conn_info).await?;
+    let client = conn_pool.get(&conn_info, !allow_pool).await?;
+
     let row_stream = client.query_raw_txt(query, query_params).await?;
 
     // Manually drain the stream into a vector to leave row_stream hanging
@@ -292,6 +256,13 @@ pub async fn handle(
         .map(|row| pg_text_row_to_json(row, raw_output, array_mode))
         .collect::<Result<Vec<_>, _>>()?;
 
+    if allow_pool {
+        // return connection to the pool
+        tokio::task::spawn(async move {
+            let _ = conn_pool.put(&conn_info, client).await;
+        });
+    }
+
     // resulting JSON format is based on the format of node-postgres result
     Ok(json!({
         "command": command_tag_name,
@@ -302,70 +273,6 @@ pub async fn handle(
     }))
 }
 
-/// This function is a copy of `connect_to_compute` from `src/proxy.rs` with
-/// the difference that it uses `tokio_postgres` for the connection.
-#[instrument(skip_all)]
-async fn connect_to_compute(
-    node_info: &mut console::CachedNodeInfo,
-    extra: &console::ConsoleReqExtra<'_>,
-    creds: &auth::BackendType<'_, auth::ClientCredentials<'_>>,
-    conn_info: &ConnInfo,
-) -> anyhow::Result<tokio_postgres::Client> {
-    let mut num_retries: usize = NUM_RETRIES_WAKE_COMPUTE;
-
-    loop {
-        match connect_to_compute_once(node_info, conn_info).await {
-            Err(e) if num_retries > 0 => {
-                info!("compute node's state has changed; requesting a wake-up");
-                match creds.wake_compute(extra).await? {
-                    // Update `node_info` and try one more time.
-                    Some(new) => {
-                        *node_info = new;
-                    }
-                    // Link auth doesn't work that way, so we just exit.
-                    None => return Err(e),
-                }
-            }
-            other => return other,
-        }
-
-        num_retries -= 1;
-        info!("retrying after wake-up ({num_retries} attempts left)");
-    }
-}
-
-async fn connect_to_compute_once(
-    node_info: &console::CachedNodeInfo,
-    conn_info: &ConnInfo,
-) -> anyhow::Result<tokio_postgres::Client> {
-    let mut config = (*node_info.config).clone();
-
-    let (client, connection) = config
-        .user(&conn_info.username)
-        .password(&conn_info.password)
-        .dbname(&conn_info.dbname)
-        .max_backend_message_size(MAX_RESPONSE_SIZE)
-        .connect(tokio_postgres::NoTls)
-        .inspect_err(|e: &tokio_postgres::Error| {
-            error!(
-                "failed to connect to compute node hosts={:?} ports={:?}: {}",
-                node_info.config.get_hosts(),
-                node_info.config.get_ports(),
-                e
-            );
-            invalidate_cache(node_info)
-        })
-        .await?;
-
-    tokio::spawn(async move {
-        if let Err(e) = connection.await {
-            error!("connection error: {}", e);
-        }
-    });
-
-    Ok(client)
-}
-
 //
 // Convert postgres row with text-encoded values to JSON object
 //
diff --git a/proxy/src/http/websocket.rs b/proxy/src/http/websocket.rs
index 9f467aceb7..83ba034e57 100644
--- a/proxy/src/http/websocket.rs
+++ b/proxy/src/http/websocket.rs
@@ -35,7 +35,7 @@ use utils::http::{error::ApiError, json::json_response};
 // Tracking issue: https://github.com/rust-lang/rust/issues/98407.
 use sync_wrapper::SyncWrapper;
 
-use super::sql_over_http;
+use super::{conn_pool::GlobalConnPool, sql_over_http};
 
 pin_project! {
     /// This is a wrapper around a [`WebSocketStream`] that
@@ -164,6 +164,7 @@ async fn serve_websocket(
 async fn ws_handler(
     mut request: Request<Body>,
     config: &'static ProxyConfig,
+    conn_pool: Arc<GlobalConnPool>,
     cancel_map: Arc<CancelMap>,
     session_id: uuid::Uuid,
     sni_hostname: Option<String>,
@@ -192,7 +193,7 @@ async fn ws_handler(
     // TODO: that deserves a refactor as now this function also handles http json client besides websockets.
     // Right now I don't want to blow up sql-over-http patch with file renames and do that as a follow up instead.
     } else if request.uri().path() == "/sql" && request.method() == Method::POST {
-        let result = sql_over_http::handle(config, request, sni_hostname)
+        let result = sql_over_http::handle(request, sni_hostname, conn_pool)
             .instrument(info_span!("sql-over-http"))
             .await;
         let status_code = match result {
@@ -234,6 +235,8 @@ pub async fn task_main(
         info!("websocket server has shut down");
     }
 
+    let conn_pool: Arc<GlobalConnPool> = GlobalConnPool::new(config);
+
     let tls_config = config.tls_config.as_ref().map(|cfg| cfg.to_server_config());
     let tls_acceptor: tokio_rustls::TlsAcceptor = match tls_config {
         Some(config) => config.into(),
@@ -258,15 +261,18 @@ pub async fn task_main(
     let make_svc =
         hyper::service::make_service_fn(|stream: &tokio_rustls::server::TlsStream<AddrStream>| {
             let sni_name = stream.get_ref().1.sni_hostname().map(|s| s.to_string());
+            let conn_pool = conn_pool.clone();
 
             async move {
                 Ok::<_, Infallible>(hyper::service::service_fn(move |req: Request<Body>| {
                     let sni_name = sni_name.clone();
+                    let conn_pool = conn_pool.clone();
+
                     async move {
                         let cancel_map = Arc::new(CancelMap::default());
                         let session_id = uuid::Uuid::new_v4();
 
-                        ws_handler(req, config, cancel_map, session_id, sni_name)
+                        ws_handler(req, config, conn_pool, cancel_map, session_id, sni_name)
                             .instrument(info_span!(
                                 "ws-client",
                                 session = format_args!("{session_id}")

From c92b7543b5fe70b48d6f71cb9d29b16651154e0c Mon Sep 17 00:00:00 2001
From: Vadim Kharitonov <vadim2404@users.noreply.github.com>
Date: Wed, 5 Jul 2023 12:39:51 +0200
Subject: [PATCH 16/32] Update `pgvector` to 0.4.4 (#4632)

After announcing `hnsw`, there is a hypothesis that the community will
start comparing it with `pgvector` by themselves. Therefore, let's have
an actual version of `pgvector` in Neon.
---
 Dockerfile.compute-node | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node
index 21877f6f24..310e4c32a3 100644
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -189,8 +189,8 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -
 FROM build-deps AS vector-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
-RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.4.0.tar.gz -O pgvector.tar.gz && \
-    echo "b76cf84ddad452cc880a6c8c661d137ddd8679c000a16332f4f03ecf6e10bcc8 pgvector.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.4.4.tar.gz -O pgvector.tar.gz && \
+    echo "1cb70a63f8928e396474796c22a20be9f7285a8a013009deb8152445b61b72e6 pgvector.tar.gz" | sha256sum --check && \
     mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \
     make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
     make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \

From 7d7cd8375c188bbade6f6761666cd17e2f16bdc4 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Wed, 5 Jul 2023 14:04:05 +0200
Subject: [PATCH 17/32] callers of task_mgr::spawn: some top-level async blocks
 were missing tenant/timeline id (#4283)

Looking at logs from staging and prod, I found there are a bunch of log
lines without tenant / timeline context.

Manully walk through all task_mgr::spawn lines and fix that using the
least amount of work required.

While doing it, remove some redundant `shutting down` messages.

refs https://github.com/neondatabase/neon/issues/4222
---
 pageserver/src/disk_usage_eviction_task.rs                 | 7 ++++---
 pageserver/src/tenant/timeline/eviction_task.rs            | 6 +++---
 .../tenant/timeline/walreceiver/walreceiver_connection.rs  | 5 +++++
 3 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs
index 61cbd5066f..d09b661da5 100644
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -110,7 +110,6 @@ pub fn launch_disk_usage_global_eviction_task(
 
             disk_usage_eviction_task(&state, task_config, storage, &conf.tenants_path(), cancel)
                 .await;
-            info!("disk usage based eviction task finishing");
             Ok(())
         },
     );
@@ -126,13 +125,16 @@ async fn disk_usage_eviction_task(
     tenants_dir: &Path,
     cancel: CancellationToken,
 ) {
+    scopeguard::defer! {
+        info!("disk usage based eviction task finishing");
+    };
+
     use crate::tenant::tasks::random_init_delay;
     {
         if random_init_delay(task_config.period, &cancel)
             .await
             .is_err()
         {
-            info!("shutting down");
             return;
         }
     }
@@ -167,7 +169,6 @@ async fn disk_usage_eviction_task(
         tokio::select! {
             _ = tokio::time::sleep_until(sleep_until) => {},
             _ = cancel.cancelled() => {
-                info!("shutting down");
                 break
             }
         }
diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs
index 03cf2d89ad..f8b10d906f 100644
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -70,7 +70,6 @@ impl Timeline {
                 };
 
                 self_clone.eviction_task(cancel).await;
-                info!("eviction task finishing");
                 Ok(())
             },
         );
@@ -78,6 +77,9 @@ impl Timeline {
 
     #[instrument(skip_all, fields(tenant_id = %self.tenant_id, timeline_id = %self.timeline_id))]
     async fn eviction_task(self: Arc<Self>, cancel: CancellationToken) {
+        scopeguard::defer! {
+            info!("eviction task finishing");
+        }
         use crate::tenant::tasks::random_init_delay;
         {
             let policy = self.get_eviction_policy();
@@ -86,7 +88,6 @@ impl Timeline {
                 EvictionPolicy::NoEviction => Duration::from_secs(10),
             };
             if random_init_delay(period, &cancel).await.is_err() {
-                info!("shutting down");
                 return;
             }
         }
@@ -101,7 +102,6 @@ impl Timeline {
                 ControlFlow::Continue(sleep_until) => {
                     tokio::select! {
                         _ = cancel.cancelled() => {
-                            info!("shutting down");
                             break;
                         }
                         _ = tokio::time::sleep_until(sleep_until) => { }
diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
index 1c1fe87305..817a30247e 100644
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -71,6 +71,8 @@ pub(super) async fn handle_walreceiver_connection(
     ctx: RequestContext,
     node: NodeId,
 ) -> anyhow::Result<()> {
+    debug_assert_current_span_has_tenant_and_timeline_id();
+
     WALRECEIVER_STARTED_CONNECTIONS.inc();
 
     // Connect to the database in replication mode.
@@ -140,6 +142,9 @@ pub(super) async fn handle_walreceiver_connection(
             }
             Ok(())
         }
+        // Enrich the log lines emitted by this closure with meaningful context.
+        // TODO: technically, this task outlives the surrounding function, so, the
+        // spans won't be properly nested.
         .instrument(tracing::info_span!("poller")),
     );
 

From 1c516906e73b7a18dbf64bc655cad2cd3ce52862 Mon Sep 17 00:00:00 2001
From: arpad-m <arpad-m@users.noreply.github.com>
Date: Wed, 5 Jul 2023 14:27:50 +0200
Subject: [PATCH 18/32] Impl Display for LayerFileName and require it for Layer
 (#4630)

Does three things:

* add a `Display` impl for `LayerFileName` equal to the `short_id`
* based on that, replace the `Layer::short_id` function by a requirement
for a `Display` impl
* use that `Display` impl in the places where the `short_id` and `file_name()` functions were used instead

Fixes #4145
---
 pageserver/src/disk_usage_eviction_task.rs    |  2 +-
 .../src/tenant/remote_timeline_client.rs      |  9 +--
 pageserver/src/tenant/storage_layer.rs        | 13 ++--
 .../src/tenant/storage_layer/delta_layer.rs   |  9 +--
 .../src/tenant/storage_layer/filename.rs      | 10 +++-
 .../src/tenant/storage_layer/image_layer.rs   |  8 ++-
 .../tenant/storage_layer/inmemory_layer.rs    | 16 ++---
 .../src/tenant/storage_layer/layer_desc.rs    |  5 +-
 .../src/tenant/storage_layer/remote_layer.rs  | 13 ++--
 pageserver/src/tenant/timeline.rs             | 60 +++++++------------
 .../src/tenant/timeline/eviction_task.rs      |  2 +-
 11 files changed, 70 insertions(+), 77 deletions(-)

diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs
index d09b661da5..3cb89ab9e9 100644
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -315,7 +315,7 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
             partition,
             candidate.layer.get_tenant_id(),
             candidate.layer.get_timeline_id(),
-            candidate.layer.filename().file_name(),
+            candidate.layer,
         );
     }
 
diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index 190512f48f..964a3ebe3f 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -608,10 +608,7 @@ impl RemoteTimelineClient {
         self.calls_unfinished_metric_begin(&op);
         upload_queue.queued_operations.push_back(op);
 
-        info!(
-            "scheduled layer file upload {}",
-            layer_file_name.file_name()
-        );
+        info!("scheduled layer file upload {layer_file_name}");
 
         // Launch the task immediately, if possible
         self.launch_queued_tasks(upload_queue);
@@ -664,7 +661,7 @@ impl RemoteTimelineClient {
                 });
                 self.calls_unfinished_metric_begin(&op);
                 upload_queue.queued_operations.push_back(op);
-                info!("scheduled layer file deletion {}", name.file_name());
+                info!("scheduled layer file deletion {name}");
             }
 
             // Launch the tasks immediately, if possible
@@ -828,7 +825,7 @@ impl RemoteTimelineClient {
                     .queued_operations
                     .push_back(op);
 
-                info!("scheduled layer file deletion {}", name.file_name());
+                info!("scheduled layer file deletion {name}");
                 deletions_queued += 1;
             }
 
diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs
index 7bc513b3a1..d35eccfc72 100644
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -335,7 +335,7 @@ impl LayerAccessStats {
 /// All layers should implement a minimal `std::fmt::Debug` without tenant or
 /// timeline names, because those are known in the context of which the layers
 /// are used in (timeline).
-pub trait Layer: std::fmt::Debug + Send + Sync {
+pub trait Layer: std::fmt::Debug + std::fmt::Display + Send + Sync {
     /// Range of keys that this layer covers
     fn get_key_range(&self) -> Range<Key>;
 
@@ -373,9 +373,6 @@ pub trait Layer: std::fmt::Debug + Send + Sync {
         ctx: &RequestContext,
     ) -> Result<ValueReconstructResult>;
 
-    /// A short ID string that uniquely identifies the given layer within a [`LayerMap`].
-    fn short_id(&self) -> String;
-
     /// Dump summary of the contents of the layer to stdout
     fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()>;
 }
@@ -512,10 +509,12 @@ pub mod tests {
         fn is_incremental(&self) -> bool {
             self.layer_desc().is_incremental
         }
+    }
 
-        /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
-        fn short_id(&self) -> String {
-            self.layer_desc().short_id()
+    /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
+    impl std::fmt::Display for LayerDescriptor {
+        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+            write!(f, "{}", self.layer_desc().short_id())
         }
     }
 
diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs
index 6e14663121..b57af5ac46 100644
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -394,10 +394,11 @@ impl Layer for DeltaLayer {
     fn is_incremental(&self) -> bool {
         self.layer_desc().is_incremental
     }
-
-    /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
-    fn short_id(&self) -> String {
-        self.layer_desc().short_id()
+}
+/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
+impl std::fmt::Display for DeltaLayer {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}", self.layer_desc().short_id())
     }
 }
 
diff --git a/pageserver/src/tenant/storage_layer/filename.rs b/pageserver/src/tenant/storage_layer/filename.rs
index 5dcd54689e..073a0588e8 100644
--- a/pageserver/src/tenant/storage_layer/filename.rs
+++ b/pageserver/src/tenant/storage_layer/filename.rs
@@ -210,9 +210,15 @@ pub enum LayerFileName {
 
 impl LayerFileName {
     pub fn file_name(&self) -> String {
+        self.to_string()
+    }
+}
+
+impl fmt::Display for LayerFileName {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         match self {
-            Self::Image(fname) => fname.to_string(),
-            Self::Delta(fname) => fname.to_string(),
+            Self::Image(fname) => write!(f, "{fname}"),
+            Self::Delta(fname) => write!(f, "{fname}"),
         }
     }
 }
diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs
index 07a16a7de2..8d31943462 100644
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -230,10 +230,12 @@ impl Layer for ImageLayer {
     fn is_incremental(&self) -> bool {
         self.layer_desc().is_incremental
     }
+}
 
-    /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
-    fn short_id(&self) -> String {
-        self.layer_desc().short_id()
+/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
+impl std::fmt::Display for ImageLayer {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}", self.layer_desc().short_id())
     }
 }
 
diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
index 78bcfdafc0..77778822cf 100644
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -131,13 +131,6 @@ impl Layer for InMemoryLayer {
         true
     }
 
-    fn short_id(&self) -> String {
-        let inner = self.inner.read().unwrap();
-
-        let end_lsn = inner.end_lsn.unwrap_or(Lsn(u64::MAX));
-        format!("inmem-{:016X}-{:016X}", self.start_lsn.0, end_lsn.0)
-    }
-
     /// debugging function to print out the contents of the layer
     fn dump(&self, verbose: bool, _ctx: &RequestContext) -> Result<()> {
         let inner = self.inner.read().unwrap();
@@ -240,6 +233,15 @@ impl Layer for InMemoryLayer {
     }
 }
 
+impl std::fmt::Display for InMemoryLayer {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let inner = self.inner.read().unwrap();
+
+        let end_lsn = inner.end_lsn.unwrap_or(Lsn(u64::MAX));
+        write!(f, "inmem-{:016X}-{:016X}", self.start_lsn.0, end_lsn.0)
+    }
+}
+
 impl InMemoryLayer {
     ///
     /// Get layer size on the disk
diff --git a/pageserver/src/tenant/storage_layer/layer_desc.rs b/pageserver/src/tenant/storage_layer/layer_desc.rs
index 5ed548909e..a86c037fd8 100644
--- a/pageserver/src/tenant/storage_layer/layer_desc.rs
+++ b/pageserver/src/tenant/storage_layer/layer_desc.rs
@@ -1,4 +1,5 @@
 use anyhow::Result;
+use core::fmt::Display;
 use std::ops::Range;
 use utils::{
     id::{TenantId, TimelineId},
@@ -48,8 +49,8 @@ impl PersistentLayerDesc {
         }
     }
 
-    pub fn short_id(&self) -> String {
-        self.filename().file_name()
+    pub fn short_id(&self) -> impl Display {
+        self.filename()
     }
 
     #[cfg(test)]
diff --git a/pageserver/src/tenant/storage_layer/remote_layer.rs b/pageserver/src/tenant/storage_layer/remote_layer.rs
index 9d423ed815..5bf16fe6fe 100644
--- a/pageserver/src/tenant/storage_layer/remote_layer.rs
+++ b/pageserver/src/tenant/storage_layer/remote_layer.rs
@@ -71,10 +71,7 @@ impl Layer for RemoteLayer {
         _reconstruct_state: &mut ValueReconstructState,
         _ctx: &RequestContext,
     ) -> Result<ValueReconstructResult> {
-        bail!(
-            "layer {} needs to be downloaded",
-            self.filename().file_name()
-        );
+        bail!("layer {self} needs to be downloaded");
     }
 
     /// debugging function to print out the contents of the layer
@@ -106,10 +103,12 @@ impl Layer for RemoteLayer {
     fn is_incremental(&self) -> bool {
         self.layer_desc().is_incremental
     }
+}
 
-    /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
-    fn short_id(&self) -> String {
-        self.layer_desc().short_id()
+/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
+impl std::fmt::Display for RemoteLayer {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}", self.layer_desc().short_id())
     }
 }
 
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 39c72a7e47..a977189980 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -128,7 +128,7 @@ impl LayerFileManager {
         // A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor.
         self.0
             .get(&desc.key())
-            .with_context(|| format!("get layer from desc: {}", desc.filename().file_name()))
+            .with_context(|| format!("get layer from desc: {}", desc.filename()))
             .expect("not found")
             .clone()
     }
@@ -1381,9 +1381,9 @@ impl Timeline {
                         .read()
                         .unwrap()
                         .observe(delta);
-                    info!(layer=%local_layer.short_id(), residence_millis=delta.as_millis(), "evicted layer after known residence period");
+                    info!(layer=%local_layer, residence_millis=delta.as_millis(), "evicted layer after known residence period");
                 } else {
-                    info!(layer=%local_layer.short_id(), "evicted layer after unknown residence period");
+                    info!(layer=%local_layer, "evicted layer after unknown residence period");
                 }
 
                 true
@@ -2462,11 +2462,7 @@ impl TraversalLayerExt for Arc<dyn PersistentLayer> {
                 format!("{}", local_path.display())
             }
             None => {
-                format!(
-                    "remote {}/{}",
-                    self.get_timeline_id(),
-                    self.filename().file_name()
-                )
+                format!("remote {}/{self}", self.get_timeline_id())
             }
         }
     }
@@ -2474,11 +2470,7 @@ impl TraversalLayerExt for Arc<dyn PersistentLayer> {
 
 impl TraversalLayerExt for Arc<InMemoryLayer> {
     fn traversal_id(&self) -> TraversalId {
-        format!(
-            "timeline {} in-memory {}",
-            self.get_timeline_id(),
-            self.short_id()
-        )
+        format!("timeline {} in-memory {self}", self.get_timeline_id())
     }
 }
 
@@ -2998,7 +2990,7 @@ impl Timeline {
     }
 
     /// Flush one frozen in-memory layer to disk, as a new delta layer.
-    #[instrument(skip_all, fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id, layer=%frozen_layer.short_id()))]
+    #[instrument(skip_all, fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id, layer=%frozen_layer))]
     async fn flush_frozen_layer(
         self: &Arc<Self>,
         frozen_layer: Arc<InMemoryLayer>,
@@ -3677,7 +3669,7 @@ impl Timeline {
         let remotes = deltas_to_compact
             .iter()
             .filter(|l| l.is_remote_layer())
-            .inspect(|l| info!("compact requires download of {}", l.filename().file_name()))
+            .inspect(|l| info!("compact requires download of {l}"))
             .map(|l| {
                 l.clone()
                     .downcast_remote_layer()
@@ -3701,7 +3693,7 @@ impl Timeline {
         );
 
         for l in deltas_to_compact.iter() {
-            info!("compact includes {}", l.filename().file_name());
+            info!("compact includes {l}");
         }
 
         // We don't need the original list of layers anymore. Drop it so that
@@ -4316,8 +4308,8 @@ impl Timeline {
             if l.get_lsn_range().end > horizon_cutoff {
                 debug!(
                     "keeping {} because it's newer than horizon_cutoff {}",
-                    l.filename().file_name(),
-                    horizon_cutoff
+                    l.filename(),
+                    horizon_cutoff,
                 );
                 result.layers_needed_by_cutoff += 1;
                 continue 'outer;
@@ -4327,8 +4319,8 @@ impl Timeline {
             if l.get_lsn_range().end > pitr_cutoff {
                 debug!(
                     "keeping {} because it's newer than pitr_cutoff {}",
-                    l.filename().file_name(),
-                    pitr_cutoff
+                    l.filename(),
+                    pitr_cutoff,
                 );
                 result.layers_needed_by_pitr += 1;
                 continue 'outer;
@@ -4346,7 +4338,7 @@ impl Timeline {
                 if &l.get_lsn_range().start <= retain_lsn {
                     debug!(
                         "keeping {} because it's still might be referenced by child branch forked at {} is_dropped: xx is_incremental: {}",
-                        l.filename().file_name(),
+                        l.filename(),
                         retain_lsn,
                         l.is_incremental(),
                     );
@@ -4377,10 +4369,7 @@ impl Timeline {
             if !layers
                 .image_layer_exists(&l.get_key_range(), &(l.get_lsn_range().end..new_gc_cutoff))?
             {
-                debug!(
-                    "keeping {} because it is the latest layer",
-                    l.filename().file_name()
-                );
+                debug!("keeping {} because it is the latest layer", l.filename());
                 // Collect delta key ranges that need image layers to allow garbage
                 // collecting the layers.
                 // It is not so obvious whether we need to propagate information only about
@@ -4397,7 +4386,7 @@ impl Timeline {
             // We didn't find any reason to keep this file, so remove it.
             debug!(
                 "garbage collecting {} is_dropped: xx is_incremental: {}",
-                l.filename().file_name(),
+                l.filename(),
                 l.is_incremental(),
             );
             layers_to_remove.push(Arc::clone(&l));
@@ -4551,7 +4540,7 @@ impl Timeline {
     /// If the caller has a deadline or needs a timeout, they can simply stop polling:
     /// we're **cancellation-safe** because the download happens in a separate task_mgr task.
     /// So, the current download attempt will run to completion even if we stop polling.
-    #[instrument(skip_all, fields(layer=%remote_layer.short_id()))]
+    #[instrument(skip_all, fields(layer=%remote_layer))]
     pub async fn download_remote_layer(
         &self,
         remote_layer: Arc<RemoteLayer>,
@@ -4589,7 +4578,7 @@ impl Timeline {
             TaskKind::RemoteDownloadTask,
             Some(self.tenant_id),
             Some(self.timeline_id),
-            &format!("download layer {}", remote_layer.short_id()),
+            &format!("download layer {}", remote_layer),
             false,
             async move {
                 let remote_client = self_clone.remote_client.as_ref().unwrap();
@@ -4865,15 +4854,12 @@ impl Timeline {
                 continue;
             }
 
-            let last_activity_ts = l
-                .access_stats()
-                .latest_activity()
-                .unwrap_or_else(|| {
-                    // We only use this fallback if there's an implementation error.
-                    // `latest_activity` already does rate-limited warn!() log.
-                    debug!(layer=%l.filename().file_name(), "last_activity returns None, using SystemTime::now");
-                    SystemTime::now()
-                });
+            let last_activity_ts = l.access_stats().latest_activity().unwrap_or_else(|| {
+                // We only use this fallback if there's an implementation error.
+                // `latest_activity` already does rate-limited warn!() log.
+                debug!(layer=%l, "last_activity returns None, using SystemTime::now");
+                SystemTime::now()
+            });
 
             resident_layers.push(LocalLayerInfoForDiskUsageEviction {
                 layer: l,
diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs
index f8b10d906f..cd6a2d10cc 100644
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -209,7 +209,7 @@ impl Timeline {
                 let last_activity_ts = hist_layer.access_stats().latest_activity().unwrap_or_else(|| {
                     // We only use this fallback if there's an implementation error.
                     // `latest_activity` already does rate-limited warn!() log.
-                    debug!(layer=%hist_layer.filename().file_name(), "last_activity returns None, using SystemTime::now");
+                    debug!(layer=%hist_layer, "last_activity returns None, using SystemTime::now");
                     SystemTime::now()
                 });
 

From 505aa242ace380fe7641935cc3ce98fd6b4c3411 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Wed, 5 Jul 2023 14:36:42 +0200
Subject: [PATCH 19/32] page cache: add size metrics (#4629)

Make them a member of `struct PageCache` to prepare for a future
where there's no global state.
---
 pageserver/src/metrics.rs       | 49 +++++++++++++++++++++++++++++++--
 pageserver/src/page_cache.rs    | 44 ++++++++++++++++++++++++++++-
 test_runner/fixtures/metrics.py |  2 ++
 3 files changed, 91 insertions(+), 4 deletions(-)

diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index db5bccdbba..96d23e220f 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -1,9 +1,9 @@
 use metrics::metric_vec_duration::DurationResultObserver;
 use metrics::{
     register_counter_vec, register_histogram, register_histogram_vec, register_int_counter,
-    register_int_counter_vec, register_int_gauge, register_int_gauge_vec, register_uint_gauge_vec,
-    Counter, CounterVec, Histogram, HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec,
-    UIntGauge, UIntGaugeVec,
+    register_int_counter_vec, register_int_gauge, register_int_gauge_vec, register_uint_gauge,
+    register_uint_gauge_vec, Counter, CounterVec, Histogram, HistogramVec, IntCounter,
+    IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
 };
 use once_cell::sync::Lazy;
 use pageserver_api::models::TenantState;
@@ -203,6 +203,49 @@ pub static PAGE_CACHE: Lazy<PageCacheMetrics> = Lazy::new(|| PageCacheMetrics {
     },
 });
 
+pub struct PageCacheSizeMetrics {
+    pub max_bytes: UIntGauge,
+
+    pub current_bytes_ephemeral: UIntGauge,
+    pub current_bytes_immutable: UIntGauge,
+    pub current_bytes_materialized_page: UIntGauge,
+}
+
+static PAGE_CACHE_SIZE_CURRENT_BYTES: Lazy<UIntGaugeVec> = Lazy::new(|| {
+    register_uint_gauge_vec!(
+        "pageserver_page_cache_size_current_bytes",
+        "Current size of the page cache in bytes, by key kind",
+        &["key_kind"]
+    )
+    .expect("failed to define a metric")
+});
+
+pub static PAGE_CACHE_SIZE: Lazy<PageCacheSizeMetrics> = Lazy::new(|| PageCacheSizeMetrics {
+    max_bytes: {
+        register_uint_gauge!(
+            "pageserver_page_cache_size_max_bytes",
+            "Maximum size of the page cache in bytes"
+        )
+        .expect("failed to define a metric")
+    },
+
+    current_bytes_ephemeral: {
+        PAGE_CACHE_SIZE_CURRENT_BYTES
+            .get_metric_with_label_values(&["ephemeral"])
+            .unwrap()
+    },
+    current_bytes_immutable: {
+        PAGE_CACHE_SIZE_CURRENT_BYTES
+            .get_metric_with_label_values(&["immutable"])
+            .unwrap()
+    },
+    current_bytes_materialized_page: {
+        PAGE_CACHE_SIZE_CURRENT_BYTES
+            .get_metric_with_label_values(&["materialized_page"])
+            .unwrap()
+    },
+});
+
 static WAIT_LSN_TIME: Lazy<HistogramVec> = Lazy::new(|| {
     register_histogram_vec!(
         "pageserver_wait_lsn_seconds",
diff --git a/pageserver/src/page_cache.rs b/pageserver/src/page_cache.rs
index ef0e748d10..e29eb1d197 100644
--- a/pageserver/src/page_cache.rs
+++ b/pageserver/src/page_cache.rs
@@ -53,8 +53,8 @@ use utils::{
     lsn::Lsn,
 };
 
-use crate::repository::Key;
 use crate::tenant::writeback_ephemeral_file;
+use crate::{metrics::PageCacheSizeMetrics, repository::Key};
 
 static PAGE_CACHE: OnceCell<PageCache> = OnceCell::new();
 const TEST_PAGE_CACHE_SIZE: usize = 50;
@@ -187,6 +187,8 @@ pub struct PageCache {
     /// Index of the next candidate to evict, for the Clock replacement algorithm.
     /// This is interpreted modulo the page cache size.
     next_evict_slot: AtomicUsize,
+
+    size_metrics: &'static PageCacheSizeMetrics,
 }
 
 ///
@@ -718,6 +720,9 @@ impl PageCache {
 
                     if let Ok(version_idx) = versions.binary_search_by_key(old_lsn, |v| v.lsn) {
                         versions.remove(version_idx);
+                        self.size_metrics
+                            .current_bytes_materialized_page
+                            .sub_page_sz(1);
                         if versions.is_empty() {
                             old_entry.remove_entry();
                         }
@@ -730,11 +735,13 @@ impl PageCache {
                 let mut map = self.ephemeral_page_map.write().unwrap();
                 map.remove(&(*file_id, *blkno))
                     .expect("could not find old key in mapping");
+                self.size_metrics.current_bytes_ephemeral.sub_page_sz(1);
             }
             CacheKey::ImmutableFilePage { file_id, blkno } => {
                 let mut map = self.immutable_page_map.write().unwrap();
                 map.remove(&(*file_id, *blkno))
                     .expect("could not find old key in mapping");
+                self.size_metrics.current_bytes_immutable.sub_page_sz(1);
             }
         }
     }
@@ -762,6 +769,9 @@ impl PageCache {
                                 slot_idx,
                             },
                         );
+                        self.size_metrics
+                            .current_bytes_materialized_page
+                            .add_page_sz(1);
                         None
                     }
                 }
@@ -772,6 +782,7 @@ impl PageCache {
                     Entry::Occupied(entry) => Some(*entry.get()),
                     Entry::Vacant(entry) => {
                         entry.insert(slot_idx);
+                        self.size_metrics.current_bytes_ephemeral.add_page_sz(1);
                         None
                     }
                 }
@@ -782,6 +793,7 @@ impl PageCache {
                     Entry::Occupied(entry) => Some(*entry.get()),
                     Entry::Vacant(entry) => {
                         entry.insert(slot_idx);
+                        self.size_metrics.current_bytes_immutable.add_page_sz(1);
                         None
                     }
                 }
@@ -881,6 +893,12 @@ impl PageCache {
 
         let page_buffer = Box::leak(vec![0u8; num_pages * PAGE_SZ].into_boxed_slice());
 
+        let size_metrics = &crate::metrics::PAGE_CACHE_SIZE;
+        size_metrics.max_bytes.set_page_sz(num_pages);
+        size_metrics.current_bytes_ephemeral.set_page_sz(0);
+        size_metrics.current_bytes_immutable.set_page_sz(0);
+        size_metrics.current_bytes_materialized_page.set_page_sz(0);
+
         let slots = page_buffer
             .chunks_exact_mut(PAGE_SZ)
             .map(|chunk| {
@@ -903,6 +921,30 @@ impl PageCache {
             immutable_page_map: Default::default(),
             slots,
             next_evict_slot: AtomicUsize::new(0),
+            size_metrics,
         }
     }
 }
+
+trait PageSzBytesMetric {
+    fn set_page_sz(&self, count: usize);
+    fn add_page_sz(&self, count: usize);
+    fn sub_page_sz(&self, count: usize);
+}
+
+#[inline(always)]
+fn count_times_page_sz(count: usize) -> u64 {
+    u64::try_from(count).unwrap() * u64::try_from(PAGE_SZ).unwrap()
+}
+
+impl PageSzBytesMetric for metrics::UIntGauge {
+    fn set_page_sz(&self, count: usize) {
+        self.set(count_times_page_sz(count));
+    }
+    fn add_page_sz(&self, count: usize) {
+        self.add(count_times_page_sz(count));
+    }
+    fn sub_page_sz(&self, count: usize) {
+        self.sub(count_times_page_sz(count));
+    }
+}
diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py
index a2bc2e28e5..3f87aa10a3 100644
--- a/test_runner/fixtures/metrics.py
+++ b/test_runner/fixtures/metrics.py
@@ -61,6 +61,8 @@ PAGESERVER_GLOBAL_METRICS: Tuple[str, ...] = (
     "pageserver_materialized_cache_hits_direct_total",
     "pageserver_page_cache_read_hits_total",
     "pageserver_page_cache_read_accesses_total",
+    "pageserver_page_cache_size_current_bytes",
+    "pageserver_page_cache_size_max_bytes",
     "pageserver_getpage_reconstruct_seconds_bucket",
     "pageserver_getpage_reconstruct_seconds_count",
     "pageserver_getpage_reconstruct_seconds_sum",

From 6fb7edf4947adebb2e430f38076ed64def6cc876 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Wed, 5 Jul 2023 18:40:25 +0100
Subject: [PATCH 20/32] Compile `pg_embedding` extension (#4634)

```
CREATE EXTENSION embedding;
CREATE TABLE t (val real[]);
INSERT INTO t (val) VALUES ('{0,0,0}'), ('{1,2,3}'), ('{1,1,1}'), (NULL);
CREATE INDEX ON t USING hnsw (val) WITH (maxelements = 10, dims=3, m=3);
INSERT INTO t (val) VALUES (array[1,2,4]);

SELECT * FROM t ORDER BY val <-> array[3,3,3];
   val
---------
 {1,2,3}
 {1,2,4}
 {1,1,1}
 {0,0,0}

(5 rows)
```
---
 Dockerfile.compute-node | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node
index 310e4c32a3..7208024d63 100644
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -515,6 +515,25 @@ RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4
     make -j $(getconf _NPROCESSORS_ONLN) install && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/roaringbitmap.control
 
+#########################################################################################
+#
+# Layer "pg-embedding-pg-build"
+# compile pg_embedding extension
+#
+#########################################################################################
+FROM build-deps AS pg-embedding-pg-build
+COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
+
+ENV PATH "/usr/local/pgsql/bin/:$PATH"
+# 2465f831ea1f8d49c1d74f8959adb7fc277d70cd made on 05/07/2023
+# There is no release tag yet
+RUN wget https://github.com/neondatabase/pg_embedding/archive/2465f831ea1f8d49c1d74f8959adb7fc277d70cd.tar.gz -O pg_embedding.tar.gz && \
+    echo "047af2b1f664a1e6e37867bd4eeaf5934fa27d6ba3d6c4461efa388ddf7cd1d5 pg_embedding.tar.gz" | sha256sum --check && \
+    mkdir pg_embedding-src && cd pg_embedding-src && tar xvzf ../pg_embedding.tar.gz --strip-components=1 -C . && \
+    make -j $(getconf _NPROCESSORS_ONLN) && \
+    make -j $(getconf _NPROCESSORS_ONLN) install && \
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/embedding.control
+
 #########################################################################################
 #
 # Layer "pg-anon-pg-build"
@@ -671,6 +690,7 @@ COPY --from=pg-pgx-ulid-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=rdkit-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-uuidv7-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-roaringbitmap-pg-build /usr/local/pgsql/ /usr/local/pgsql/
+COPY --from=pg-embedding-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY pgxn/ pgxn/
 
 RUN make -j $(getconf _NPROCESSORS_ONLN) \

From 434eaadbe394898d2cbef0f534fa7117f3c6e70d Mon Sep 17 00:00:00 2001
From: Dmitry Rodionov <dmitry@neon.tech>
Date: Thu, 6 Jul 2023 00:15:55 +0300
Subject: [PATCH 21/32] Move uninitialized timeline from tenant.rs to
 timeline/uninit.rs

---
 pageserver/src/tenant.rs                 | 209 +--------------------
 pageserver/src/tenant/timeline.rs        |   1 +
 pageserver/src/tenant/timeline/uninit.rs | 219 +++++++++++++++++++++++
 3 files changed, 229 insertions(+), 200 deletions(-)
 create mode 100644 pageserver/src/tenant/timeline/uninit.rs

diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 3fa2a4bab4..a4ea6b0035 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -11,7 +11,7 @@
 //! parent timeline, and the last LSN that has been written to disk.
 //!
 
-use anyhow::{bail, ensure, Context};
+use anyhow::{bail, Context};
 use futures::FutureExt;
 use pageserver_api::models::TimelineState;
 use remote_storage::DownloadError;
@@ -49,6 +49,8 @@ use std::time::{Duration, Instant};
 use self::config::TenantConf;
 use self::metadata::TimelineMetadata;
 use self::remote_timeline_client::RemoteTimelineClient;
+use self::timeline::uninit::TimelineUninitMark;
+use self::timeline::uninit::UninitializedTimeline;
 use self::timeline::EvictionTaskTenantState;
 use crate::config::PageServerConf;
 use crate::context::{DownloadBehavior, RequestContext};
@@ -68,6 +70,7 @@ use crate::tenant::storage_layer::ImageLayer;
 use crate::tenant::storage_layer::Layer;
 use crate::InitializationOrder;
 
+use crate::tenant::timeline::uninit::cleanup_timeline_directory;
 use crate::virtual_file::VirtualFile;
 use crate::walredo::PostgresRedoManager;
 use crate::walredo::WalRedoManager;
@@ -161,200 +164,6 @@ pub struct Tenant {
     eviction_task_tenant_state: tokio::sync::Mutex<EvictionTaskTenantState>,
 }
 
-/// A timeline with some of its files on disk, being initialized.
-/// This struct ensures the atomicity of the timeline init: it's either properly created and inserted into pageserver's memory, or
-/// its local files are removed. In the worst case of a crash, an uninit mark file is left behind, which causes the directory
-/// to be removed on next restart.
-///
-/// The caller is responsible for proper timeline data filling before the final init.
-#[must_use]
-pub struct UninitializedTimeline<'t> {
-    owning_tenant: &'t Tenant,
-    timeline_id: TimelineId,
-    raw_timeline: Option<(Arc<Timeline>, TimelineUninitMark)>,
-}
-
-/// An uninit mark file, created along the timeline dir to ensure the timeline either gets fully initialized and loaded into pageserver's memory,
-/// or gets removed eventually.
-///
-/// XXX: it's important to create it near the timeline dir, not inside it to ensure timeline dir gets removed first.
-#[must_use]
-struct TimelineUninitMark {
-    uninit_mark_deleted: bool,
-    uninit_mark_path: PathBuf,
-    timeline_path: PathBuf,
-}
-
-impl UninitializedTimeline<'_> {
-    /// Finish timeline creation: insert it into the Tenant's timelines map and remove the
-    /// uninit mark file.
-    ///
-    /// This function launches the flush loop if not already done.
-    ///
-    /// The caller is responsible for activating the timeline (function `.activate()`).
-    fn finish_creation(mut self) -> anyhow::Result<Arc<Timeline>> {
-        let timeline_id = self.timeline_id;
-        let tenant_id = self.owning_tenant.tenant_id;
-
-        let (new_timeline, uninit_mark) = self.raw_timeline.take().with_context(|| {
-            format!("No timeline for initalization found for {tenant_id}/{timeline_id}")
-        })?;
-
-        // Check that the caller initialized disk_consistent_lsn
-        let new_disk_consistent_lsn = new_timeline.get_disk_consistent_lsn();
-        ensure!(
-            new_disk_consistent_lsn.is_valid(),
-            "new timeline {tenant_id}/{timeline_id} has invalid disk_consistent_lsn"
-        );
-
-        let mut timelines = self.owning_tenant.timelines.lock().unwrap();
-        match timelines.entry(timeline_id) {
-            Entry::Occupied(_) => anyhow::bail!(
-                "Found freshly initialized timeline {tenant_id}/{timeline_id} in the tenant map"
-            ),
-            Entry::Vacant(v) => {
-                uninit_mark.remove_uninit_mark().with_context(|| {
-                    format!(
-                        "Failed to remove uninit mark file for timeline {tenant_id}/{timeline_id}"
-                    )
-                })?;
-                v.insert(Arc::clone(&new_timeline));
-
-                new_timeline.maybe_spawn_flush_loop();
-            }
-        }
-
-        Ok(new_timeline)
-    }
-
-    /// Prepares timeline data by loading it from the basebackup archive.
-    pub async fn import_basebackup_from_tar(
-        self,
-        copyin_read: &mut (impl tokio::io::AsyncRead + Send + Sync + Unpin),
-        base_lsn: Lsn,
-        broker_client: storage_broker::BrokerClientChannel,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<Arc<Timeline>> {
-        let raw_timeline = self.raw_timeline()?;
-
-        import_datadir::import_basebackup_from_tar(raw_timeline, copyin_read, base_lsn, ctx)
-            .await
-            .context("Failed to import basebackup")?;
-
-        // Flush the new layer files to disk, before we make the timeline as available to
-        // the outside world.
-        //
-        // Flush loop needs to be spawned in order to be able to flush.
-        raw_timeline.maybe_spawn_flush_loop();
-
-        fail::fail_point!("before-checkpoint-new-timeline", |_| {
-            bail!("failpoint before-checkpoint-new-timeline");
-        });
-
-        raw_timeline
-            .freeze_and_flush()
-            .await
-            .context("Failed to flush after basebackup import")?;
-
-        // All the data has been imported. Insert the Timeline into the tenant's timelines
-        // map and remove the uninit mark file.
-        let tl = self.finish_creation()?;
-        tl.activate(broker_client, None, ctx);
-        Ok(tl)
-    }
-
-    fn raw_timeline(&self) -> anyhow::Result<&Arc<Timeline>> {
-        Ok(&self
-            .raw_timeline
-            .as_ref()
-            .with_context(|| {
-                format!(
-                    "No raw timeline {}/{} found",
-                    self.owning_tenant.tenant_id, self.timeline_id
-                )
-            })?
-            .0)
-    }
-}
-
-impl Drop for UninitializedTimeline<'_> {
-    fn drop(&mut self) {
-        if let Some((_, uninit_mark)) = self.raw_timeline.take() {
-            let _entered = info_span!("drop_uninitialized_timeline", tenant = %self.owning_tenant.tenant_id, timeline = %self.timeline_id).entered();
-            error!("Timeline got dropped without initializing, cleaning its files");
-            cleanup_timeline_directory(uninit_mark);
-        }
-    }
-}
-
-fn cleanup_timeline_directory(uninit_mark: TimelineUninitMark) {
-    let timeline_path = &uninit_mark.timeline_path;
-    match ignore_absent_files(|| fs::remove_dir_all(timeline_path)) {
-        Ok(()) => {
-            info!("Timeline dir {timeline_path:?} removed successfully, removing the uninit mark")
-        }
-        Err(e) => {
-            error!("Failed to clean up uninitialized timeline directory {timeline_path:?}: {e:?}")
-        }
-    }
-    drop(uninit_mark); // mark handles its deletion on drop, gets retained if timeline dir exists
-}
-
-impl TimelineUninitMark {
-    fn new(uninit_mark_path: PathBuf, timeline_path: PathBuf) -> Self {
-        Self {
-            uninit_mark_deleted: false,
-            uninit_mark_path,
-            timeline_path,
-        }
-    }
-
-    fn remove_uninit_mark(mut self) -> anyhow::Result<()> {
-        if !self.uninit_mark_deleted {
-            self.delete_mark_file_if_present()?;
-        }
-
-        Ok(())
-    }
-
-    fn delete_mark_file_if_present(&mut self) -> anyhow::Result<()> {
-        let uninit_mark_file = &self.uninit_mark_path;
-        let uninit_mark_parent = uninit_mark_file
-            .parent()
-            .with_context(|| format!("Uninit mark file {uninit_mark_file:?} has no parent"))?;
-        ignore_absent_files(|| fs::remove_file(uninit_mark_file)).with_context(|| {
-            format!("Failed to remove uninit mark file at path {uninit_mark_file:?}")
-        })?;
-        crashsafe::fsync(uninit_mark_parent).context("Failed to fsync uninit mark parent")?;
-        self.uninit_mark_deleted = true;
-
-        Ok(())
-    }
-}
-
-impl Drop for TimelineUninitMark {
-    fn drop(&mut self) {
-        if !self.uninit_mark_deleted {
-            if self.timeline_path.exists() {
-                error!(
-                    "Uninit mark {} is not removed, timeline {} stays uninitialized",
-                    self.uninit_mark_path.display(),
-                    self.timeline_path.display()
-                )
-            } else {
-                // unblock later timeline creation attempts
-                warn!(
-                    "Removing intermediate uninit mark file {}",
-                    self.uninit_mark_path.display()
-                );
-                if let Err(e) = self.delete_mark_file_if_present() {
-                    error!("Failed to remove the uninit mark file: {e}")
-                }
-            }
-        }
-    }
-}
-
 // We should not blindly overwrite local metadata with remote one.
 // For example, consider the following case:
 //     Image layer is flushed to disk as a new delta layer, we update local metadata and start upload task but after that
@@ -3012,11 +2821,11 @@ impl Tenant {
 
         debug!("Successfully created initial files for timeline {tenant_id}/{new_timeline_id}");
 
-        Ok(UninitializedTimeline {
-            owning_tenant: self,
-            timeline_id: new_timeline_id,
-            raw_timeline: Some((timeline_struct, uninit_mark)),
-        })
+        Ok(UninitializedTimeline::new(
+            self,
+            new_timeline_id,
+            Some((timeline_struct, uninit_mark)),
+        ))
     }
 
     fn create_timeline_files(
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index a977189980..577ba5d867 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -1,6 +1,7 @@
 //!
 
 mod eviction_task;
+pub mod uninit;
 mod walreceiver;
 
 use anyhow::{anyhow, bail, ensure, Context, Result};
diff --git a/pageserver/src/tenant/timeline/uninit.rs b/pageserver/src/tenant/timeline/uninit.rs
new file mode 100644
index 0000000000..27d43a8b24
--- /dev/null
+++ b/pageserver/src/tenant/timeline/uninit.rs
@@ -0,0 +1,219 @@
+use std::{collections::hash_map::Entry, fs, path::PathBuf, sync::Arc};
+
+use anyhow::Context;
+use tracing::{error, info, info_span, warn};
+use utils::{crashsafe, id::TimelineId, lsn::Lsn};
+
+use crate::{
+    context::RequestContext,
+    import_datadir,
+    tenant::{ignore_absent_files, Tenant},
+};
+
+use super::Timeline;
+
+/// A timeline with some of its files on disk, being initialized.
+/// This struct ensures the atomicity of the timeline init: it's either properly created and inserted into pageserver's memory, or
+/// its local files are removed. In the worst case of a crash, an uninit mark file is left behind, which causes the directory
+/// to be removed on next restart.
+///
+/// The caller is responsible for proper timeline data filling before the final init.
+#[must_use]
+pub struct UninitializedTimeline<'t> {
+    pub(crate) owning_tenant: &'t Tenant,
+    timeline_id: TimelineId,
+    raw_timeline: Option<(Arc<Timeline>, TimelineUninitMark)>,
+}
+
+impl<'t> UninitializedTimeline<'t> {
+    pub(crate) fn new(
+        owning_tenant: &'t Tenant,
+        timeline_id: TimelineId,
+        raw_timeline: Option<(Arc<Timeline>, TimelineUninitMark)>,
+    ) -> Self {
+        Self {
+            owning_tenant,
+            timeline_id,
+            raw_timeline,
+        }
+    }
+
+    /// Finish timeline creation: insert it into the Tenant's timelines map and remove the
+    /// uninit mark file.
+    ///
+    /// This function launches the flush loop if not already done.
+    ///
+    /// The caller is responsible for activating the timeline (function `.activate()`).
+    pub(crate) fn finish_creation(mut self) -> anyhow::Result<Arc<Timeline>> {
+        let timeline_id = self.timeline_id;
+        let tenant_id = self.owning_tenant.tenant_id;
+
+        let (new_timeline, uninit_mark) = self.raw_timeline.take().with_context(|| {
+            format!("No timeline for initalization found for {tenant_id}/{timeline_id}")
+        })?;
+
+        // Check that the caller initialized disk_consistent_lsn
+        let new_disk_consistent_lsn = new_timeline.get_disk_consistent_lsn();
+        anyhow::ensure!(
+            new_disk_consistent_lsn.is_valid(),
+            "new timeline {tenant_id}/{timeline_id} has invalid disk_consistent_lsn"
+        );
+
+        let mut timelines = self.owning_tenant.timelines.lock().unwrap();
+        match timelines.entry(timeline_id) {
+            Entry::Occupied(_) => anyhow::bail!(
+                "Found freshly initialized timeline {tenant_id}/{timeline_id} in the tenant map"
+            ),
+            Entry::Vacant(v) => {
+                uninit_mark.remove_uninit_mark().with_context(|| {
+                    format!(
+                        "Failed to remove uninit mark file for timeline {tenant_id}/{timeline_id}"
+                    )
+                })?;
+                v.insert(Arc::clone(&new_timeline));
+
+                new_timeline.maybe_spawn_flush_loop();
+            }
+        }
+
+        Ok(new_timeline)
+    }
+
+    /// Prepares timeline data by loading it from the basebackup archive.
+    pub(crate) async fn import_basebackup_from_tar(
+        self,
+        copyin_read: &mut (impl tokio::io::AsyncRead + Send + Sync + Unpin),
+        base_lsn: Lsn,
+        broker_client: storage_broker::BrokerClientChannel,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<Arc<Timeline>> {
+        let raw_timeline = self.raw_timeline()?;
+
+        import_datadir::import_basebackup_from_tar(raw_timeline, copyin_read, base_lsn, ctx)
+            .await
+            .context("Failed to import basebackup")?;
+
+        // Flush the new layer files to disk, before we make the timeline as available to
+        // the outside world.
+        //
+        // Flush loop needs to be spawned in order to be able to flush.
+        raw_timeline.maybe_spawn_flush_loop();
+
+        fail::fail_point!("before-checkpoint-new-timeline", |_| {
+            anyhow::bail!("failpoint before-checkpoint-new-timeline");
+        });
+
+        raw_timeline
+            .freeze_and_flush()
+            .await
+            .context("Failed to flush after basebackup import")?;
+
+        // All the data has been imported. Insert the Timeline into the tenant's timelines
+        // map and remove the uninit mark file.
+        let tl = self.finish_creation()?;
+        tl.activate(broker_client, None, ctx);
+        Ok(tl)
+    }
+
+    pub(crate) fn raw_timeline(&self) -> anyhow::Result<&Arc<Timeline>> {
+        Ok(&self
+            .raw_timeline
+            .as_ref()
+            .with_context(|| {
+                format!(
+                    "No raw timeline {}/{} found",
+                    self.owning_tenant.tenant_id, self.timeline_id
+                )
+            })?
+            .0)
+    }
+}
+
+impl Drop for UninitializedTimeline<'_> {
+    fn drop(&mut self) {
+        if let Some((_, uninit_mark)) = self.raw_timeline.take() {
+            let _entered = info_span!("drop_uninitialized_timeline", tenant = %self.owning_tenant.tenant_id, timeline = %self.timeline_id).entered();
+            error!("Timeline got dropped without initializing, cleaning its files");
+            cleanup_timeline_directory(uninit_mark);
+        }
+    }
+}
+
+pub(crate) fn cleanup_timeline_directory(uninit_mark: TimelineUninitMark) {
+    let timeline_path = &uninit_mark.timeline_path;
+    match ignore_absent_files(|| fs::remove_dir_all(timeline_path)) {
+        Ok(()) => {
+            info!("Timeline dir {timeline_path:?} removed successfully, removing the uninit mark")
+        }
+        Err(e) => {
+            error!("Failed to clean up uninitialized timeline directory {timeline_path:?}: {e:?}")
+        }
+    }
+    drop(uninit_mark); // mark handles its deletion on drop, gets retained if timeline dir exists
+}
+
+/// An uninit mark file, created along the timeline dir to ensure the timeline either gets fully initialized and loaded into pageserver's memory,
+/// or gets removed eventually.
+///
+/// XXX: it's important to create it near the timeline dir, not inside it to ensure timeline dir gets removed first.
+#[must_use]
+pub(crate) struct TimelineUninitMark {
+    uninit_mark_deleted: bool,
+    uninit_mark_path: PathBuf,
+    pub(crate) timeline_path: PathBuf,
+}
+
+impl TimelineUninitMark {
+    pub(crate) fn new(uninit_mark_path: PathBuf, timeline_path: PathBuf) -> Self {
+        Self {
+            uninit_mark_deleted: false,
+            uninit_mark_path,
+            timeline_path,
+        }
+    }
+
+    fn remove_uninit_mark(mut self) -> anyhow::Result<()> {
+        if !self.uninit_mark_deleted {
+            self.delete_mark_file_if_present()?;
+        }
+
+        Ok(())
+    }
+
+    fn delete_mark_file_if_present(&mut self) -> anyhow::Result<()> {
+        let uninit_mark_file = &self.uninit_mark_path;
+        let uninit_mark_parent = uninit_mark_file
+            .parent()
+            .with_context(|| format!("Uninit mark file {uninit_mark_file:?} has no parent"))?;
+        ignore_absent_files(|| fs::remove_file(uninit_mark_file)).with_context(|| {
+            format!("Failed to remove uninit mark file at path {uninit_mark_file:?}")
+        })?;
+        crashsafe::fsync(uninit_mark_parent).context("Failed to fsync uninit mark parent")?;
+        self.uninit_mark_deleted = true;
+
+        Ok(())
+    }
+}
+
+impl Drop for TimelineUninitMark {
+    fn drop(&mut self) {
+        if !self.uninit_mark_deleted {
+            if self.timeline_path.exists() {
+                error!(
+                    "Uninit mark {} is not removed, timeline {} stays uninitialized",
+                    self.uninit_mark_path.display(),
+                    self.timeline_path.display()
+                )
+            } else {
+                // unblock later timeline creation attempts
+                warn!(
+                    "Removing intermediate uninit mark file {}",
+                    self.uninit_mark_path.display()
+                );
+                if let Err(e) = self.delete_mark_file_if_present() {
+                    error!("Failed to remove the uninit mark file: {e}")
+                }
+            }
+        }
+    }
+}

From e418fc6dc3e1617b9f2a130962d77fc63ceb8be9 Mon Sep 17 00:00:00 2001
From: Dmitry Rodionov <dmitry@neon.tech>
Date: Thu, 6 Jul 2023 00:31:55 +0300
Subject: [PATCH 22/32] move some tracing related assertions to separate module
 for tenant and timeline

---
 pageserver/src/tenant.rs                      | 44 +++++--------------
 .../tenant/remote_timeline_client/download.rs |  2 +-
 pageserver/src/tenant/span.rs                 | 24 ++++++++++
 pageserver/src/tenant/timeline.rs             | 32 ++------------
 pageserver/src/tenant/timeline/span.rs        | 26 +++++++++++
 5 files changed, 64 insertions(+), 64 deletions(-)
 create mode 100644 pageserver/src/tenant/span.rs
 create mode 100644 pageserver/src/tenant/timeline/span.rs

diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index a4ea6b0035..1650b267f1 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -90,6 +90,7 @@ pub mod disk_btree;
 pub(crate) mod ephemeral_file;
 pub mod layer_map;
 pub mod manifest;
+mod span;
 
 pub mod metadata;
 mod par_fsync;
@@ -105,7 +106,7 @@ mod timeline;
 
 pub mod size;
 
-pub(crate) use timeline::debug_assert_current_span_has_tenant_and_timeline_id;
+pub(crate) use timeline::span::debug_assert_current_span_has_tenant_and_timeline_id;
 pub use timeline::{
     LocalLayerInfoForDiskUsageEviction, LogicalSizeCalculationCause, PageReconstructError, Timeline,
 };
@@ -504,7 +505,7 @@ impl Tenant {
     /// No background tasks are started as part of this routine.
     ///
     async fn attach(self: &Arc<Tenant>, ctx: &RequestContext) -> anyhow::Result<()> {
-        debug_assert_current_span_has_tenant_id();
+        span::debug_assert_current_span_has_tenant_id();
 
         let marker_file = self.conf.tenant_attaching_mark_file_path(&self.tenant_id);
         if !tokio::fs::try_exists(&marker_file)
@@ -642,7 +643,7 @@ impl Tenant {
         remote_client: RemoteTimelineClient,
         ctx: &RequestContext,
     ) -> anyhow::Result<()> {
-        debug_assert_current_span_has_tenant_id();
+        span::debug_assert_current_span_has_tenant_id();
 
         info!("downloading index file for timeline {}", timeline_id);
         tokio::fs::create_dir_all(self.conf.timeline_path(&timeline_id, &self.tenant_id))
@@ -721,7 +722,7 @@ impl Tenant {
         init_order: Option<InitializationOrder>,
         ctx: &RequestContext,
     ) -> Arc<Tenant> {
-        debug_assert_current_span_has_tenant_id();
+        span::debug_assert_current_span_has_tenant_id();
 
         let tenant_conf = match Self::load_tenant_config(conf, tenant_id) {
             Ok(conf) => conf,
@@ -907,7 +908,7 @@ impl Tenant {
         init_order: Option<&InitializationOrder>,
         ctx: &RequestContext,
     ) -> anyhow::Result<()> {
-        debug_assert_current_span_has_tenant_id();
+        span::debug_assert_current_span_has_tenant_id();
 
         debug!("loading tenant task");
 
@@ -953,7 +954,7 @@ impl Tenant {
         init_order: Option<&InitializationOrder>,
         ctx: &RequestContext,
     ) -> anyhow::Result<()> {
-        debug_assert_current_span_has_tenant_id();
+        span::debug_assert_current_span_has_tenant_id();
 
         let remote_client = self.remote_storage.as_ref().map(|remote_storage| {
             RemoteTimelineClient::new(
@@ -1544,7 +1545,7 @@ impl Tenant {
         timeline_id: TimelineId,
         _ctx: &RequestContext,
     ) -> Result<(), DeleteTimelineError> {
-        timeline::debug_assert_current_span_has_tenant_and_timeline_id();
+        debug_assert_current_span_has_tenant_and_timeline_id();
 
         // Transition the timeline into TimelineState::Stopping.
         // This should prevent new operations from starting.
@@ -1708,7 +1709,7 @@ impl Tenant {
         background_jobs_can_start: Option<&completion::Barrier>,
         ctx: &RequestContext,
     ) {
-        debug_assert_current_span_has_tenant_id();
+        span::debug_assert_current_span_has_tenant_id();
 
         let mut activating = false;
         self.state.send_modify(|current_state| {
@@ -1779,7 +1780,7 @@ impl Tenant {
     ///
     /// This will attempt to shutdown even if tenant is broken.
     pub(crate) async fn shutdown(&self, freeze_and_flush: bool) -> Result<(), ShutdownError> {
-        debug_assert_current_span_has_tenant_id();
+        span::debug_assert_current_span_has_tenant_id();
         // Set tenant (and its timlines) to Stoppping state.
         //
         // Since we can only transition into Stopping state after activation is complete,
@@ -4380,28 +4381,3 @@ mod tests {
         Ok(())
     }
 }
-
-#[cfg(not(debug_assertions))]
-#[inline]
-pub(crate) fn debug_assert_current_span_has_tenant_id() {}
-
-#[cfg(debug_assertions)]
-pub static TENANT_ID_EXTRACTOR: once_cell::sync::Lazy<
-    utils::tracing_span_assert::MultiNameExtractor<2>,
-> = once_cell::sync::Lazy::new(|| {
-    utils::tracing_span_assert::MultiNameExtractor::new("TenantId", ["tenant_id", "tenant"])
-});
-
-#[cfg(debug_assertions)]
-#[inline]
-pub(crate) fn debug_assert_current_span_has_tenant_id() {
-    use utils::tracing_span_assert;
-
-    match tracing_span_assert::check_fields_present([&*TENANT_ID_EXTRACTOR]) {
-        Ok(()) => (),
-        Err(missing) => panic!(
-            "missing extractors: {:?}",
-            missing.into_iter().map(|e| e.name()).collect::<Vec<_>>()
-        ),
-    }
-}
diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs
index a0d8c0193a..87b1026a76 100644
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -16,7 +16,7 @@ use tracing::{info, warn};
 
 use crate::config::PageServerConf;
 use crate::tenant::storage_layer::LayerFileName;
-use crate::tenant::timeline::debug_assert_current_span_has_tenant_and_timeline_id;
+use crate::tenant::timeline::span::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS};
 use remote_storage::{DownloadError, GenericRemoteStorage};
 use utils::crashsafe::path_with_suffix_extension;
diff --git a/pageserver/src/tenant/span.rs b/pageserver/src/tenant/span.rs
new file mode 100644
index 0000000000..1ee8aac31e
--- /dev/null
+++ b/pageserver/src/tenant/span.rs
@@ -0,0 +1,24 @@
+#[cfg(not(debug_assertions))]
+#[inline]
+pub(crate) fn debug_assert_current_span_has_tenant_id() {}
+
+#[cfg(debug_assertions)]
+pub static TENANT_ID_EXTRACTOR: once_cell::sync::Lazy<
+    utils::tracing_span_assert::MultiNameExtractor<2>,
+> = once_cell::sync::Lazy::new(|| {
+    utils::tracing_span_assert::MultiNameExtractor::new("TenantId", ["tenant_id", "tenant"])
+});
+
+#[cfg(debug_assertions)]
+#[inline]
+pub(crate) fn debug_assert_current_span_has_tenant_id() {
+    use utils::tracing_span_assert;
+
+    match tracing_span_assert::check_fields_present([&*TENANT_ID_EXTRACTOR]) {
+        Ok(()) => (),
+        Err(missing) => panic!(
+            "missing extractors: {:?}",
+            missing.into_iter().map(|e| e.name()).collect::<Vec<_>>()
+        ),
+    }
+}
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 577ba5d867..7a8d4ced6f 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -1,6 +1,7 @@
 //!
 
 mod eviction_task;
+pub mod span;
 pub mod uninit;
 mod walreceiver;
 
@@ -2240,7 +2241,7 @@ impl Timeline {
         ctx: &RequestContext,
         cancel: CancellationToken,
     ) -> Result<u64, CalculateLogicalSizeError> {
-        debug_assert_current_span_has_tenant_and_timeline_id();
+        span::debug_assert_current_span_has_tenant_and_timeline_id();
 
         let mut timeline_state_updates = self.subscribe_for_state_updates();
         let self_calculation = Arc::clone(self);
@@ -4546,7 +4547,7 @@ impl Timeline {
         &self,
         remote_layer: Arc<RemoteLayer>,
     ) -> anyhow::Result<()> {
-        debug_assert_current_span_has_tenant_and_timeline_id();
+        span::debug_assert_current_span_has_tenant_and_timeline_id();
 
         use std::sync::atomic::Ordering::Relaxed;
 
@@ -4980,33 +4981,6 @@ fn rename_to_backup(path: &Path) -> anyhow::Result<()> {
     bail!("couldn't find an unused backup number for {:?}", path)
 }
 
-#[cfg(not(debug_assertions))]
-#[inline]
-pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() {}
-
-#[cfg(debug_assertions)]
-#[inline]
-pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() {
-    use utils::tracing_span_assert;
-
-    pub static TIMELINE_ID_EXTRACTOR: once_cell::sync::Lazy<
-        tracing_span_assert::MultiNameExtractor<2>,
-    > = once_cell::sync::Lazy::new(|| {
-        tracing_span_assert::MultiNameExtractor::new("TimelineId", ["timeline_id", "timeline"])
-    });
-
-    match tracing_span_assert::check_fields_present([
-        &*super::TENANT_ID_EXTRACTOR,
-        &*TIMELINE_ID_EXTRACTOR,
-    ]) {
-        Ok(()) => (),
-        Err(missing) => panic!(
-            "missing extractors: {:?}",
-            missing.into_iter().map(|e| e.name()).collect::<Vec<_>>()
-        ),
-    }
-}
-
 /// Similar to `Arc::ptr_eq`, but only compares the object pointers, not vtables.
 ///
 /// Returns `true` if the two `Arc` point to the same layer, false otherwise.
diff --git a/pageserver/src/tenant/timeline/span.rs b/pageserver/src/tenant/timeline/span.rs
new file mode 100644
index 0000000000..3cd405dc31
--- /dev/null
+++ b/pageserver/src/tenant/timeline/span.rs
@@ -0,0 +1,26 @@
+#[cfg(not(debug_assertions))]
+#[inline]
+pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() {}
+
+#[cfg(debug_assertions)]
+#[inline]
+pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() {
+    use utils::tracing_span_assert;
+
+    pub static TIMELINE_ID_EXTRACTOR: once_cell::sync::Lazy<
+        tracing_span_assert::MultiNameExtractor<2>,
+    > = once_cell::sync::Lazy::new(|| {
+        tracing_span_assert::MultiNameExtractor::new("TimelineId", ["timeline_id", "timeline"])
+    });
+
+    match tracing_span_assert::check_fields_present([
+        &*crate::tenant::span::TENANT_ID_EXTRACTOR,
+        &*TIMELINE_ID_EXTRACTOR,
+    ]) {
+        Ok(()) => (),
+        Err(missing) => panic!(
+            "missing extractors: {:?}",
+            missing.into_iter().map(|e| e.name()).collect::<Vec<_>>()
+        ),
+    }
+}

From b2635108669f75f3d77f8bb920f44e39c450e4b8 Mon Sep 17 00:00:00 2001
From: Dmitry Rodionov <dmitry@neon.tech>
Date: Thu, 6 Jul 2023 00:47:04 +0300
Subject: [PATCH 23/32] move some logical size bits to separate logical_size.rs

---
 pageserver/src/tenant/timeline.rs             | 128 +-----------------
 .../src/tenant/timeline/logical_size.rs       | 128 ++++++++++++++++++
 2 files changed, 133 insertions(+), 123 deletions(-)
 create mode 100644 pageserver/src/tenant/timeline/logical_size.rs

diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 7a8d4ced6f..4a80708b4a 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -1,6 +1,7 @@
 //!
 
 mod eviction_task;
+mod logical_size;
 pub mod span;
 pub mod uninit;
 mod walreceiver;
@@ -10,7 +11,6 @@ use bytes::Bytes;
 use fail::fail_point;
 use futures::StreamExt;
 use itertools::Itertools;
-use once_cell::sync::OnceCell;
 use pageserver_api::models::{
     DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest,
     DownloadRemoteLayersTaskState, LayerMapInfo, LayerResidenceEventReason, LayerResidenceStatus,
@@ -19,7 +19,7 @@ use pageserver_api::models::{
 use remote_storage::GenericRemoteStorage;
 use serde_with::serde_as;
 use storage_broker::BrokerClientChannel;
-use tokio::sync::{oneshot, watch, Semaphore, TryAcquireError};
+use tokio::sync::{oneshot, watch, TryAcquireError};
 use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::id::TenantTimelineId;
@@ -30,7 +30,7 @@ use std::fs;
 use std::ops::{Deref, Range};
 use std::path::{Path, PathBuf};
 use std::pin::pin;
-use std::sync::atomic::{AtomicI64, Ordering as AtomicOrdering};
+use std::sync::atomic::Ordering as AtomicOrdering;
 use std::sync::{Arc, Mutex, RwLock, Weak};
 use std::time::{Duration, Instant, SystemTime};
 
@@ -40,6 +40,7 @@ use crate::tenant::storage_layer::{
     DeltaFileName, DeltaLayerWriter, ImageFileName, ImageLayerWriter, InMemoryLayer,
     LayerAccessStats, LayerFileName, RemoteLayer,
 };
+use crate::tenant::timeline::logical_size::CurrentLogicalSize;
 use crate::tenant::{
     ephemeral_file::is_ephemeral_file,
     layer_map::{LayerMap, SearchResult},
@@ -81,6 +82,7 @@ use crate::{is_temporary, task_mgr};
 
 pub(super) use self::eviction_task::EvictionTaskTenantState;
 use self::eviction_task::EvictionTaskTimelineState;
+use self::logical_size::LogicalSize;
 use self::walreceiver::{WalReceiver, WalReceiverConf};
 
 use super::config::TenantConf;
@@ -367,126 +369,6 @@ pub struct Timeline {
     initial_logical_size_attempt: Mutex<Option<completion::Completion>>,
 }
 
-/// Internal structure to hold all data needed for logical size calculation.
-///
-/// Calculation consists of two stages:
-///
-/// 1. Initial size calculation. That might take a long time, because it requires
-/// reading all layers containing relation sizes at `initial_part_end`.
-///
-/// 2. Collecting an incremental part and adding that to the initial size.
-/// Increments are appended on walreceiver writing new timeline data,
-/// which result in increase or decrease of the logical size.
-struct LogicalSize {
-    /// Size, potentially slow to compute. Calculating this might require reading multiple
-    /// layers, and even ancestor's layers.
-    ///
-    /// NOTE: size at a given LSN is constant, but after a restart we will calculate
-    /// the initial size at a different LSN.
-    initial_logical_size: OnceCell<u64>,
-
-    /// Semaphore to track ongoing calculation of `initial_logical_size`.
-    initial_size_computation: Arc<tokio::sync::Semaphore>,
-
-    /// Latest Lsn that has its size uncalculated, could be absent for freshly created timelines.
-    initial_part_end: Option<Lsn>,
-
-    /// All other size changes after startup, combined together.
-    ///
-    /// Size shouldn't ever be negative, but this is signed for two reasons:
-    ///
-    /// 1. If we initialized the "baseline" size lazily, while we already
-    /// process incoming WAL, the incoming WAL records could decrement the
-    /// variable and temporarily make it negative. (This is just future-proofing;
-    /// the initialization is currently not done lazily.)
-    ///
-    /// 2. If there is a bug and we e.g. forget to increment it in some cases
-    /// when size grows, but remember to decrement it when it shrinks again, the
-    /// variable could go negative. In that case, it seems better to at least
-    /// try to keep tracking it, rather than clamp or overflow it. Note that
-    /// get_current_logical_size() will clamp the returned value to zero if it's
-    /// negative, and log an error. Could set it permanently to zero or some
-    /// special value to indicate "broken" instead, but this will do for now.
-    ///
-    /// Note that we also expose a copy of this value as a prometheus metric,
-    /// see `current_logical_size_gauge`. Use the `update_current_logical_size`
-    /// to modify this, it will also keep the prometheus metric in sync.
-    size_added_after_initial: AtomicI64,
-}
-
-/// Normalized current size, that the data in pageserver occupies.
-#[derive(Debug, Clone, Copy)]
-enum CurrentLogicalSize {
-    /// The size is not yet calculated to the end, this is an intermediate result,
-    /// constructed from walreceiver increments and normalized: logical data could delete some objects, hence be negative,
-    /// yet total logical size cannot be below 0.
-    Approximate(u64),
-    // Fully calculated logical size, only other future walreceiver increments are changing it, and those changes are
-    // available for observation without any calculations.
-    Exact(u64),
-}
-
-impl CurrentLogicalSize {
-    fn size(&self) -> u64 {
-        *match self {
-            Self::Approximate(size) => size,
-            Self::Exact(size) => size,
-        }
-    }
-}
-
-impl LogicalSize {
-    fn empty_initial() -> Self {
-        Self {
-            initial_logical_size: OnceCell::with_value(0),
-            //  initial_logical_size already computed, so, don't admit any calculations
-            initial_size_computation: Arc::new(Semaphore::new(0)),
-            initial_part_end: None,
-            size_added_after_initial: AtomicI64::new(0),
-        }
-    }
-
-    fn deferred_initial(compute_to: Lsn) -> Self {
-        Self {
-            initial_logical_size: OnceCell::new(),
-            initial_size_computation: Arc::new(Semaphore::new(1)),
-            initial_part_end: Some(compute_to),
-            size_added_after_initial: AtomicI64::new(0),
-        }
-    }
-
-    fn current_size(&self) -> anyhow::Result<CurrentLogicalSize> {
-        let size_increment: i64 = self.size_added_after_initial.load(AtomicOrdering::Acquire);
-        //                  ^^^ keep this type explicit so that the casts in this function break if
-        //                  we change the type.
-        match self.initial_logical_size.get() {
-            Some(initial_size) => {
-                initial_size.checked_add_signed(size_increment)
-                    .with_context(|| format!("Overflow during logical size calculation, initial_size: {initial_size}, size_increment: {size_increment}"))
-                    .map(CurrentLogicalSize::Exact)
-            }
-            None => {
-                let non_negative_size_increment = u64::try_from(size_increment).unwrap_or(0);
-                Ok(CurrentLogicalSize::Approximate(non_negative_size_increment))
-            }
-        }
-    }
-
-    fn increment_size(&self, delta: i64) {
-        self.size_added_after_initial
-            .fetch_add(delta, AtomicOrdering::SeqCst);
-    }
-
-    /// Make the value computed by initial logical size computation
-    /// available for re-use. This doesn't contain the incremental part.
-    fn initialized_size(&self, lsn: Lsn) -> Option<u64> {
-        match self.initial_part_end {
-            Some(v) if v == lsn => self.initial_logical_size.get().copied(),
-            _ => None,
-        }
-    }
-}
-
 pub struct WalReceiverInfo {
     pub wal_source_connconf: PgConnectionConfig,
     pub last_received_msg_lsn: Lsn,
diff --git a/pageserver/src/tenant/timeline/logical_size.rs b/pageserver/src/tenant/timeline/logical_size.rs
new file mode 100644
index 0000000000..d9c2bc4cb9
--- /dev/null
+++ b/pageserver/src/tenant/timeline/logical_size.rs
@@ -0,0 +1,128 @@
+use anyhow::Context;
+use once_cell::sync::OnceCell;
+
+use tokio::sync::Semaphore;
+use utils::lsn::Lsn;
+
+use std::sync::atomic::{AtomicI64, Ordering as AtomicOrdering};
+use std::sync::Arc;
+
+/// Internal structure to hold all data needed for logical size calculation.
+///
+/// Calculation consists of two stages:
+///
+/// 1. Initial size calculation. That might take a long time, because it requires
+/// reading all layers containing relation sizes at `initial_part_end`.
+///
+/// 2. Collecting an incremental part and adding that to the initial size.
+/// Increments are appended on walreceiver writing new timeline data,
+/// which result in increase or decrease of the logical size.
+pub(super) struct LogicalSize {
+    /// Size, potentially slow to compute. Calculating this might require reading multiple
+    /// layers, and even ancestor's layers.
+    ///
+    /// NOTE: size at a given LSN is constant, but after a restart we will calculate
+    /// the initial size at a different LSN.
+    pub initial_logical_size: OnceCell<u64>,
+
+    /// Semaphore to track ongoing calculation of `initial_logical_size`.
+    pub initial_size_computation: Arc<tokio::sync::Semaphore>,
+
+    /// Latest Lsn that has its size uncalculated, could be absent for freshly created timelines.
+    pub initial_part_end: Option<Lsn>,
+
+    /// All other size changes after startup, combined together.
+    ///
+    /// Size shouldn't ever be negative, but this is signed for two reasons:
+    ///
+    /// 1. If we initialized the "baseline" size lazily, while we already
+    /// process incoming WAL, the incoming WAL records could decrement the
+    /// variable and temporarily make it negative. (This is just future-proofing;
+    /// the initialization is currently not done lazily.)
+    ///
+    /// 2. If there is a bug and we e.g. forget to increment it in some cases
+    /// when size grows, but remember to decrement it when it shrinks again, the
+    /// variable could go negative. In that case, it seems better to at least
+    /// try to keep tracking it, rather than clamp or overflow it. Note that
+    /// get_current_logical_size() will clamp the returned value to zero if it's
+    /// negative, and log an error. Could set it permanently to zero or some
+    /// special value to indicate "broken" instead, but this will do for now.
+    ///
+    /// Note that we also expose a copy of this value as a prometheus metric,
+    /// see `current_logical_size_gauge`. Use the `update_current_logical_size`
+    /// to modify this, it will also keep the prometheus metric in sync.
+    pub size_added_after_initial: AtomicI64,
+}
+
+/// Normalized current size, that the data in pageserver occupies.
+#[derive(Debug, Clone, Copy)]
+pub(super) enum CurrentLogicalSize {
+    /// The size is not yet calculated to the end, this is an intermediate result,
+    /// constructed from walreceiver increments and normalized: logical data could delete some objects, hence be negative,
+    /// yet total logical size cannot be below 0.
+    Approximate(u64),
+    // Fully calculated logical size, only other future walreceiver increments are changing it, and those changes are
+    // available for observation without any calculations.
+    Exact(u64),
+}
+
+impl CurrentLogicalSize {
+    pub(super) fn size(&self) -> u64 {
+        *match self {
+            Self::Approximate(size) => size,
+            Self::Exact(size) => size,
+        }
+    }
+}
+
+impl LogicalSize {
+    pub(super) fn empty_initial() -> Self {
+        Self {
+            initial_logical_size: OnceCell::with_value(0),
+            //  initial_logical_size already computed, so, don't admit any calculations
+            initial_size_computation: Arc::new(Semaphore::new(0)),
+            initial_part_end: None,
+            size_added_after_initial: AtomicI64::new(0),
+        }
+    }
+
+    pub(super) fn deferred_initial(compute_to: Lsn) -> Self {
+        Self {
+            initial_logical_size: OnceCell::new(),
+            initial_size_computation: Arc::new(Semaphore::new(1)),
+            initial_part_end: Some(compute_to),
+            size_added_after_initial: AtomicI64::new(0),
+        }
+    }
+
+    pub(super) fn current_size(&self) -> anyhow::Result<CurrentLogicalSize> {
+        let size_increment: i64 = self.size_added_after_initial.load(AtomicOrdering::Acquire);
+        //                  ^^^ keep this type explicit so that the casts in this function break if
+        //                  we change the type.
+        match self.initial_logical_size.get() {
+            Some(initial_size) => {
+                initial_size.checked_add_signed(size_increment)
+                    .with_context(|| format!("Overflow during logical size calculation, initial_size: {initial_size}, size_increment: {size_increment}"))
+                    .map(CurrentLogicalSize::Exact)
+            }
+            None => {
+                let non_negative_size_increment = u64::try_from(size_increment).unwrap_or(0);
+                Ok(CurrentLogicalSize::Approximate(non_negative_size_increment))
+            }
+        }
+    }
+
+    pub(super) fn increment_size(&self, delta: i64) {
+        self.size_added_after_initial
+            .fetch_add(delta, AtomicOrdering::SeqCst);
+    }
+
+    /// Make the value computed by initial logical size computation
+    /// available for re-use. This doesn't contain the incremental part.
+    pub(super) fn initialized_size(&self, lsn: Lsn) -> Option<u64> {
+        match self.initial_part_end {
+            Some(v) if v == lsn => self.initial_logical_size.get().copied(),
+            _ => None,
+        }
+    }
+}

From 91435006bda021d1460bd47e4b1b398321dffe75 Mon Sep 17 00:00:00 2001
From: Tomoka Hayashi <b1016016@gmail.com>
Date: Thu, 6 Jul 2023 18:11:24 +0900
Subject: [PATCH 24/32] Fix docker-compose file and document (#4621)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Problem

- Running the command according to docker.md gives warning and error.
- Warning `permissions should be u=rw (0600) or less` is output when
executing `psql -h localhost -p 55433 -U cloud_admin`.
- `FATAL: password authentication failed for user "root”` is output in
compute logs.

## Summary of changes

- Add `$ chmod 600 ~/.pgpass` in docker.md to avoid warning.
- Add username (cloud_admin) to pg_isready command in docker-compose.yml
to avoid error.

---------

Co-authored-by: Tomoka Hayashi <tomoka.hayashi@ntt.com>
---
 docker-compose/docker-compose.yml | 2 +-
 docs/docker.md                    | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/docker-compose/docker-compose.yml b/docker-compose/docker-compose.yml
index 4926dad932..9777d1fdd2 100644
--- a/docker-compose/docker-compose.yml
+++ b/docker-compose/docker-compose.yml
@@ -189,7 +189,7 @@ services:
       - "/bin/bash"
       - "-c"
     command:
-      - "until pg_isready -h compute -p 55433 ; do
+      - "until pg_isready -h compute -p 55433 -U cloud_admin ; do
             echo 'Waiting to start compute...' && sleep 1;
          done"
     depends_on:
diff --git a/docs/docker.md b/docs/docker.md
index 704044377f..9761cc4346 100644
--- a/docs/docker.md
+++ b/docs/docker.md
@@ -48,6 +48,7 @@ Creating docker-compose_storage_broker_1       ... done
 2. connect compute node
 ```
 $ echo "localhost:55433:postgres:cloud_admin:cloud_admin" >> ~/.pgpass
+$ chmod 600 ~/.pgpass
 $ psql -h localhost -p 55433 -U cloud_admin
 postgres=# CREATE TABLE t(key int primary key, value text);
 CREATE TABLE

From 269e20aeab677f9ce672bece8441b10eaeb29c9d Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Thu, 6 Jul 2023 12:32:34 +0300
Subject: [PATCH 25/32] fix: filter out zero synthetic sizes (#4639)

Apparently sending the synthetic_size == 0 is causing problems, so
filter out sending zeros.

Slack discussion:
https://neondb.slack.com/archives/C03F5SM1N02/p1688574285718989?thread_ts=1687874910.681049&cid=C03F5SM1N02
---
 pageserver/src/consumption_metrics.rs | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs
index f7592afc12..df44300fce 100644
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -234,14 +234,18 @@ pub async fn collect_metrics_iteration(
         // Note that this metric is calculated in a separate bgworker
         // Here we only use cached value, which may lag behind the real latest one
         let tenant_synthetic_size = tenant.get_cached_synthetic_size();
-        current_metrics.push((
-            PageserverConsumptionMetricsKey {
-                tenant_id,
-                timeline_id: None,
-                metric: SYNTHETIC_STORAGE_SIZE,
-            },
-            tenant_synthetic_size,
-        ));
+
+        if tenant_synthetic_size != 0 {
+            // only send non-zeroes because otherwise these show up as errors in logs
+            current_metrics.push((
+                PageserverConsumptionMetricsKey {
+                    tenant_id,
+                    timeline_id: None,
+                    metric: SYNTHETIC_STORAGE_SIZE,
+                },
+                tenant_synthetic_size,
+            ));
+        }
     }
 
     // Filter metrics, unless we want to send all metrics, including cached ones.

From 1741edf933574c2ed1078aaf2474622ea8bd13b4 Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik <knizhnik@garret.ru>
Date: Thu, 6 Jul 2023 12:47:43 +0300
Subject: [PATCH 26/32] Pageserver reconnect v2 (#4519)

## Problem

Compute is not always able to reconnect to pages server.
First of all it caused by long time of restart of pageserver.
So number of attempts is increased from 5 (hardcoded) to 60 (GUC).
Also we do not perform flush after each command to increase performance
(it is especially critical for prefetch).
Unfortunately such pending flush makes it not possible to transparently
reconnect to restarted pageserver.
What we can do is to try to minimzie such probabilty.
Most likely broken connection will be detected in first sens command
after some idle period.
This is why max_flush_delay parameter is added which force flush to be
performed after first request after some idle period.

See #4497

## Summary of changes

Add neon.max_reconnect_attempts and neon.max_glush_delay GUCs which
contol when flush has to be done
and when it is possible to try to reconnect to page server


## Checklist before requesting a review

- [ ] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.

## Checklist before merging

- [ ] Do not forget to reformat commit message to not include the above
checklist
---
 pgxn/neon/libpagestore.c     | 117 +++++++++++++++++++----------------
 pgxn/neon/pagestore_client.h |   4 +-
 pgxn/neon/pagestore_smgr.c   |  42 +++++++------
 3 files changed, 88 insertions(+), 75 deletions(-)

diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c
index 606af9741f..4fdc7f8c82 100644
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -34,7 +34,6 @@
 
 #define PageStoreTrace DEBUG5
 
-#define MAX_RECONNECT_ATTEMPTS 5
 #define RECONNECT_INTERVAL_USEC 1000000
 
 bool		connected = false;
@@ -55,13 +54,15 @@ int32		max_cluster_size;
 char	   *page_server_connstring;
 char	   *neon_auth_token;
 
-int			n_unflushed_requests = 0;
-int			flush_every_n_requests = 8;
 int			readahead_buffer_size = 128;
+int			flush_every_n_requests = 8;
+
+int			n_reconnect_attempts = 0;
+int			max_reconnect_attempts = 60;
 
 bool	(*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id) = NULL;
 
-static void pageserver_flush(void);
+static bool pageserver_flush(void);
 
 static bool
 pageserver_connect(int elevel)
@@ -232,16 +233,17 @@ pageserver_disconnect(void)
 	}
 }
 
-static void
+static bool
 pageserver_send(NeonRequest * request)
 {
 	StringInfoData req_buff;
-	int n_reconnect_attempts = 0;
 
 	/* If the connection was lost for some reason, reconnect */
 	if (connected && PQstatus(pageserver_conn) == CONNECTION_BAD)
+	{
+		neon_log(LOG, "pageserver_send disconnect bad connection");
 		pageserver_disconnect();
-
+	}
 
 	req_buff = nm_pack_request(request);
 
@@ -252,53 +254,36 @@ pageserver_send(NeonRequest * request)
 	 * See https://github.com/neondatabase/neon/issues/1138
 	 * So try to reestablish connection in case of failure.
 	 */
-	while (true)
+	if (!connected)
 	{
-		if (!connected)
+		while (!pageserver_connect(n_reconnect_attempts < max_reconnect_attempts ? LOG : ERROR))
 		{
-			if (!pageserver_connect(n_reconnect_attempts < MAX_RECONNECT_ATTEMPTS ? LOG : ERROR))
-			{
-				n_reconnect_attempts += 1;
-				pg_usleep(RECONNECT_INTERVAL_USEC);
-				continue;
-			}
+			n_reconnect_attempts += 1;
+			pg_usleep(RECONNECT_INTERVAL_USEC);
 		}
+		n_reconnect_attempts = 0;
+	}
 
-		/*
-		 * Send request.
-		 *
-		 * In principle, this could block if the output buffer is full, and we
-		 * should use async mode and check for interrupts while waiting. In
-		 * practice, our requests are small enough to always fit in the output and
-		 * TCP buffer.
-		 */
-		if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0)
-		{
-			char	   *msg = pchomp(PQerrorMessage(pageserver_conn));
-			if (n_reconnect_attempts < MAX_RECONNECT_ATTEMPTS)
-			{
-				neon_log(LOG, "failed to send page request (try to reconnect): %s", msg);
-				if (n_reconnect_attempts != 0) /* do not sleep before first reconnect attempt, assuming that pageserver is already restarted */
-					pg_usleep(RECONNECT_INTERVAL_USEC);
-				n_reconnect_attempts += 1;
-				continue;
-			}
-			else
-			{
-				pageserver_disconnect();
-				neon_log(ERROR, "failed to send page request: %s", msg);
-			}
-		}
-		break;
+	/*
+	 * Send request.
+	 *
+	 * In principle, this could block if the output buffer is full, and we
+	 * should use async mode and check for interrupts while waiting. In
+	 * practice, our requests are small enough to always fit in the output and
+	 * TCP buffer.
+	 */
+	if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0)
+	{
+		char	   *msg = pchomp(PQerrorMessage(pageserver_conn));
+		pageserver_disconnect();
+		neon_log(LOG, "pageserver_send disconnect because failed to send page request (try to reconnect): %s", msg);
+		pfree(msg);
+		pfree(req_buff.data);
+		return false;
 	}
 
 	pfree(req_buff.data);
 
-	n_unflushed_requests++;
-
-	if (flush_every_n_requests > 0 && n_unflushed_requests >= flush_every_n_requests)
-		pageserver_flush();
-
 	if (message_level_is_interesting(PageStoreTrace))
 	{
 		char	   *msg = nm_to_string((NeonMessage *) request);
@@ -306,6 +291,7 @@ pageserver_send(NeonRequest * request)
 		neon_log(PageStoreTrace, "sent request: %s", msg);
 		pfree(msg);
 	}
+	return true;
 }
 
 static NeonResponse *
@@ -340,16 +326,25 @@ pageserver_receive(void)
 		}
 		else if (rc == -1)
 		{
+			neon_log(LOG, "pageserver_receive disconnect because call_PQgetCopyData returns -1: %s", pchomp(PQerrorMessage(pageserver_conn)));
 			pageserver_disconnect();
 			resp = NULL;
 		}
 		else if (rc == -2)
-			neon_log(ERROR, "could not read COPY data: %s", pchomp(PQerrorMessage(pageserver_conn)));
+		{
+			char* msg = pchomp(PQerrorMessage(pageserver_conn));
+			pageserver_disconnect();
+			neon_log(ERROR, "pageserver_receive disconnect because could not read COPY data: %s", msg);
+		}
 		else
-			neon_log(ERROR, "unexpected PQgetCopyData return value: %d", rc);
+		{
+			pageserver_disconnect();
+			neon_log(ERROR, "pageserver_receive disconnect because unexpected PQgetCopyData return value: %d", rc);
+		}
 	}
 	PG_CATCH();
 	{
+		neon_log(LOG, "pageserver_receive disconnect due to caught exception");
 		pageserver_disconnect();
 		PG_RE_THROW();
 	}
@@ -359,21 +354,25 @@ pageserver_receive(void)
 }
 
 
-static void
+static bool
 pageserver_flush(void)
 {
 	if (!connected)
 	{
 		neon_log(WARNING, "Tried to flush while disconnected");
 	}
-	else if (PQflush(pageserver_conn))
+	else
 	{
-		char	   *msg = pchomp(PQerrorMessage(pageserver_conn));
-
-		pageserver_disconnect();
-		neon_log(ERROR, "failed to flush page requests: %s", msg);
+		if (PQflush(pageserver_conn))
+		{
+			char	   *msg = pchomp(PQerrorMessage(pageserver_conn));
+			pageserver_disconnect();
+			neon_log(LOG, "pageserver_flush disconnect because failed to flush page requests: %s", msg);
+			pfree(msg);
+			return false;
+		}
 	}
-	n_unflushed_requests = 0;
+	return true;
 }
 
 page_server_api api = {
@@ -439,6 +438,14 @@ pg_init_libpagestore(void)
 							PGC_USERSET,
 							0,	/* no flags required */
 							NULL, NULL, NULL);
+	DefineCustomIntVariable("neon.max_reconnect_attempts",
+							"Maximal attempts to reconnect to pages server (with 1 second timeout)",
+							NULL,
+							&max_reconnect_attempts,
+							10, 0, INT_MAX,
+							PGC_USERSET,
+							0,
+							NULL, NULL, NULL);
 	DefineCustomIntVariable("neon.readahead_buffer_size",
 							"number of prefetches to buffer",
 							"This buffer is used to hold and manage prefetched "
diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h
index 8257b90ac3..2889db49bc 100644
--- a/pgxn/neon/pagestore_client.h
+++ b/pgxn/neon/pagestore_client.h
@@ -145,9 +145,9 @@ extern char *nm_to_string(NeonMessage * msg);
 
 typedef struct
 {
-	void		(*send) (NeonRequest * request);
+	bool		(*send) (NeonRequest * request);
 	NeonResponse *(*receive) (void);
-	void		(*flush) (void);
+	bool		(*flush) (void);
 }			page_server_api;
 
 extern void prefetch_on_ps_disconnect(void);
diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c
index 79dec0881d..76d71dd94b 100644
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -489,7 +489,8 @@ prefetch_wait_for(uint64 ring_index)
 	if (MyPState->ring_flush <= ring_index &&
 		MyPState->ring_unused > MyPState->ring_flush)
 	{
-		page_server->flush();
+		if (!page_server->flush())
+			return false;
 		MyPState->ring_flush = MyPState->ring_unused;
 	}
 
@@ -666,7 +667,7 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force
 		 * smaller than the current WAL insert/redo pointer, which is already
 		 * larger than this prefetch_lsn. So in any case, that would
 		 * invalidate this cache.
-		 * 
+		 *
 		 * The best LSN to use for effective_request_lsn would be
 		 * XLogCtl->Insert.RedoRecPtr, but that's expensive to access.
 		 */
@@ -677,7 +678,8 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force
 
 	Assert(slot->response == NULL);
 	Assert(slot->my_ring_index == MyPState->ring_unused);
-	page_server->send((NeonRequest *) &request);
+
+	while (!page_server->send((NeonRequest *) &request));
 
 	/* update prefetch state */
 	MyPState->n_requests_inflight += 1;
@@ -687,6 +689,7 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force
 	/* update slot state */
 	slot->status = PRFS_REQUESTED;
 
+
 	prfh_insert(MyPState->prf_hash, slot, &found);
 	Assert(!found);
 }
@@ -743,6 +746,7 @@ prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_ls
 					prefetch_set_unused(ring_index);
 					entry = NULL;
 				}
+
 			}
 			/* if we don't want the latest version, only accept requests with the exact same LSN */
 			else
@@ -756,20 +760,23 @@ prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_ls
 			}
 		}
 
-		/*
-		 * We received a prefetch for a page that was recently read and
-		 * removed from the buffers. Remove that request from the buffers.
-		 */
-		else if (slot->status == PRFS_TAG_REMAINS)
+		if (entry != NULL)
 		{
-			prefetch_set_unused(ring_index);
-			entry = NULL;
-		}
-		else
-		{
-			/* The buffered request is good enough, return that index */
-			pgBufferUsage.prefetch.duplicates++;
-			return ring_index;
+			/*
+			 * We received a prefetch for a page that was recently read and
+			 * removed from the buffers. Remove that request from the buffers.
+			 */
+			if (slot->status == PRFS_TAG_REMAINS)
+			{
+				prefetch_set_unused(ring_index);
+				entry = NULL;
+			}
+			else
+			{
+				/* The buffered request is good enough, return that index */
+				pgBufferUsage.prefetch.duplicates++;
+				return ring_index;
+			}
 		}
 	}
 
@@ -859,8 +866,7 @@ page_server_request(void const *req)
 {
 	NeonResponse* resp;
 	do {
-		page_server->send((NeonRequest *) req);
-		page_server->flush();
+		while (!page_server->send((NeonRequest *) req) || !page_server->flush());
 		MyPState->ring_flush = MyPState->ring_unused;
 		consume_prefetch_responses();
 		resp = page_server->receive();

From d340cf3721f0adb8f309f115d86986b2230e4537 Mon Sep 17 00:00:00 2001
From: Alex Chi Z <iskyzh@gmail.com>
Date: Thu, 6 Jul 2023 11:21:45 -0400
Subject: [PATCH 27/32] dump more info in layer map (#4567)

A simple commit extracted from
https://github.com/neondatabase/neon/pull/4539

This PR adds more info for layer dumps (is_delta, is_incremental, size).

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 pageserver/src/tenant/storage_layer/delta_layer.rs  | 5 +++--
 pageserver/src/tenant/storage_layer/image_layer.rs  | 6 ++++--
 pageserver/src/tenant/storage_layer/layer_desc.rs   | 7 +++++--
 pageserver/src/tenant/storage_layer/remote_layer.rs | 7 +++++--
 4 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs
index b57af5ac46..2591beef3e 100644
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -222,13 +222,14 @@ impl Layer for DeltaLayer {
     /// debugging function to print out the contents of the layer
     fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
         println!(
-            "----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} ----",
+            "----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} size {} ----",
             self.desc.tenant_id,
             self.desc.timeline_id,
             self.desc.key_range.start,
             self.desc.key_range.end,
             self.desc.lsn_range.start,
-            self.desc.lsn_range.end
+            self.desc.lsn_range.end,
+            self.desc.file_size,
         );
 
         if !verbose {
diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs
index 8d31943462..30a7d2f179 100644
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -153,12 +153,14 @@ impl Layer for ImageLayer {
     /// debugging function to print out the contents of the layer
     fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
         println!(
-            "----- image layer for ten {} tli {} key {}-{} at {} ----",
+            "----- image layer for ten {} tli {} key {}-{} at {} is_incremental {} size {} ----",
             self.desc.tenant_id,
             self.desc.timeline_id,
             self.desc.key_range.start,
             self.desc.key_range.end,
-            self.lsn
+            self.lsn,
+            self.desc.is_incremental,
+            self.desc.file_size
         );
 
         if !verbose {
diff --git a/pageserver/src/tenant/storage_layer/layer_desc.rs b/pageserver/src/tenant/storage_layer/layer_desc.rs
index a86c037fd8..42c3925b73 100644
--- a/pageserver/src/tenant/storage_layer/layer_desc.rs
+++ b/pageserver/src/tenant/storage_layer/layer_desc.rs
@@ -174,13 +174,16 @@ impl PersistentLayerDesc {
 
     pub fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
         println!(
-            "----- layer for ten {} tli {} keys {}-{} lsn {}-{} ----",
+            "----- layer for ten {} tli {} keys {}-{} lsn {}-{} is_delta {} is_incremental {} size {} ----",
             self.tenant_id,
             self.timeline_id,
             self.key_range.start,
             self.key_range.end,
             self.lsn_range.start,
-            self.lsn_range.end
+            self.lsn_range.end,
+            self.is_delta,
+            self.is_incremental,
+            self.file_size,
         );
 
         Ok(())
diff --git a/pageserver/src/tenant/storage_layer/remote_layer.rs b/pageserver/src/tenant/storage_layer/remote_layer.rs
index 5bf16fe6fe..14975629a9 100644
--- a/pageserver/src/tenant/storage_layer/remote_layer.rs
+++ b/pageserver/src/tenant/storage_layer/remote_layer.rs
@@ -77,13 +77,16 @@ impl Layer for RemoteLayer {
     /// debugging function to print out the contents of the layer
     fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
         println!(
-            "----- remote layer for ten {} tli {} keys {}-{} lsn {}-{} ----",
+            "----- remote layer for ten {} tli {} keys {}-{} lsn {}-{} is_delta {} is_incremental {} size {} ----",
             self.desc.tenant_id,
             self.desc.timeline_id,
             self.desc.key_range.start,
             self.desc.key_range.end,
             self.desc.lsn_range.start,
-            self.desc.lsn_range.end
+            self.desc.lsn_range.end,
+            self.desc.is_delta,
+            self.desc.is_incremental,
+            self.desc.file_size,
         );
 
         Ok(())

From 634be4f4e06f67444524aca33c1f1b5b29b8d16c Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Thu, 6 Jul 2023 18:11:43 +0400
Subject: [PATCH 28/32] Fix async write in safekeepers.

General Rust Write trait semantics (as well as its async brother) is that write
definitely happens only after Write::flush(). This wasn't needed in sync where
rust write calls the syscall directly, but is required in async.

Also fix setting initial end_pos in walsender, sometimes it was from the future.

fixes https://github.com/neondatabase/neon/issues/4518
---
 safekeeper/src/control_file.rs  |  6 ++++++
 safekeeper/src/pull_timeline.rs |  1 +
 safekeeper/src/send_wal.rs      | 14 +++++++++-----
 safekeeper/src/wal_storage.rs   |  5 +++++
 4 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/safekeeper/src/control_file.rs b/safekeeper/src/control_file.rs
index 6c4ad24323..653e938bb7 100644
--- a/safekeeper/src/control_file.rs
+++ b/safekeeper/src/control_file.rs
@@ -191,6 +191,12 @@ impl Storage for FileStorage {
                 control_partial_path.display()
             )
         })?;
+        control_partial.flush().await.with_context(|| {
+            format!(
+                "failed to flush safekeeper state into control file at: {}",
+                control_partial_path.display()
+            )
+        })?;
 
         // fsync the file
         if !self.conf.no_sync {
diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs
index 61ba37efaa..8d8ef6192c 100644
--- a/safekeeper/src/pull_timeline.rs
+++ b/safekeeper/src/pull_timeline.rs
@@ -188,6 +188,7 @@ async fn pull_timeline(status: TimelineStatus, host: String) -> Result<Response>
         let mut response = client.get(&http_url).send().await?;
         while let Some(chunk) = response.chunk().await? {
             file.write_all(&chunk).await?;
+            file.flush().await?;
         }
     }
 
diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs
index abca0a86b1..92a7bb703a 100644
--- a/safekeeper/src/send_wal.rs
+++ b/safekeeper/src/send_wal.rs
@@ -403,16 +403,18 @@ impl SafekeeperPostgresHandler {
         };
 
         // take the latest commit_lsn if don't have stop_pos
-        let mut end_pos = stop_pos.unwrap_or(*commit_lsn_watch_rx.borrow());
+        let end_pos = stop_pos.unwrap_or(*commit_lsn_watch_rx.borrow());
 
         if end_pos < start_pos {
-            warn!("start_pos {} is ahead of end_pos {}", start_pos, end_pos);
-            end_pos = start_pos;
+            warn!(
+                "requested start_pos {} is ahead of available WAL end_pos {}",
+                start_pos, end_pos
+            );
         }
 
         info!(
-            "starting streaming from {:?} till {:?}",
-            start_pos, stop_pos
+            "starting streaming from {:?} till {:?}, available WAL ends at {}",
+            start_pos, stop_pos, end_pos
         );
 
         // switch to copy
@@ -547,12 +549,14 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> WalSender<'_, IO> {
             self.end_pos = *self.commit_lsn_watch_rx.borrow();
             if self.end_pos > self.start_pos {
                 // We have something to send.
+                trace!("got end_pos {:?}, streaming", self.end_pos);
                 return Ok(());
             }
 
             // Wait for WAL to appear, now self.end_pos == self.start_pos.
             if let Some(lsn) = wait_for_lsn(&mut self.commit_lsn_watch_rx, self.start_pos).await? {
                 self.end_pos = lsn;
+                trace!("got end_pos {:?}, streaming", self.end_pos);
                 return Ok(());
             }
 
diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs
index 687e1ba6b6..61270a8d0b 100644
--- a/safekeeper/src/wal_storage.rs
+++ b/safekeeper/src/wal_storage.rs
@@ -248,6 +248,10 @@ impl PhysicalStorage {
         };
 
         file.write_all(buf).await?;
+        // Note: flush just ensures write above reaches the OS (this is not
+        // needed in case of sync IO as Write::write there calls directly write
+        // syscall, but needed in case of async). It does *not* fsyncs the file.
+        file.flush().await?;
 
         if xlogoff + buf.len() == self.wal_seg_size {
             // If we reached the end of a WAL segment, flush and close it.
@@ -716,6 +720,7 @@ async fn write_zeroes(file: &mut File, mut count: usize) -> Result<()> {
         count -= XLOG_BLCKSZ;
     }
     file.write_all(&ZERO_BLOCK[0..count]).await?;
+    file.flush().await?;
     Ok(())
 }
 

From 20137d9588c78b8b119087dcab2b4f00099f9463 Mon Sep 17 00:00:00 2001
From: Dmitry Rodionov <dmitry@neon.tech>
Date: Thu, 6 Jul 2023 19:49:14 +0300
Subject: [PATCH 29/32] Polish tracing helpers (#4651)

Context: comments here: https://github.com/neondatabase/neon/pull/4645
---
 pageserver/src/tenant/span.rs          | 22 +++++++++------------
 pageserver/src/tenant/timeline/span.rs | 27 +++++++++++++-------------
 2 files changed, 22 insertions(+), 27 deletions(-)

diff --git a/pageserver/src/tenant/span.rs b/pageserver/src/tenant/span.rs
index 1ee8aac31e..3728dd9dd8 100644
--- a/pageserver/src/tenant/span.rs
+++ b/pageserver/src/tenant/span.rs
@@ -1,24 +1,20 @@
+#[cfg(debug_assertions)]
+use utils::tracing_span_assert::{check_fields_present, MultiNameExtractor};
+
 #[cfg(not(debug_assertions))]
-#[inline]
 pub(crate) fn debug_assert_current_span_has_tenant_id() {}
 
 #[cfg(debug_assertions)]
-pub static TENANT_ID_EXTRACTOR: once_cell::sync::Lazy<
-    utils::tracing_span_assert::MultiNameExtractor<2>,
-> = once_cell::sync::Lazy::new(|| {
-    utils::tracing_span_assert::MultiNameExtractor::new("TenantId", ["tenant_id", "tenant"])
-});
+pub(crate) static TENANT_ID_EXTRACTOR: once_cell::sync::Lazy<MultiNameExtractor<2>> =
+    once_cell::sync::Lazy::new(|| MultiNameExtractor::new("TenantId", ["tenant_id", "tenant"]));
 
 #[cfg(debug_assertions)]
-#[inline]
+#[track_caller]
 pub(crate) fn debug_assert_current_span_has_tenant_id() {
-    use utils::tracing_span_assert;
-
-    match tracing_span_assert::check_fields_present([&*TENANT_ID_EXTRACTOR]) {
-        Ok(()) => (),
-        Err(missing) => panic!(
+    if let Err(missing) = check_fields_present([&*TENANT_ID_EXTRACTOR]) {
+        panic!(
             "missing extractors: {:?}",
             missing.into_iter().map(|e| e.name()).collect::<Vec<_>>()
-        ),
+        )
     }
 }
diff --git a/pageserver/src/tenant/timeline/span.rs b/pageserver/src/tenant/timeline/span.rs
index 3cd405dc31..9ef0d5f92a 100644
--- a/pageserver/src/tenant/timeline/span.rs
+++ b/pageserver/src/tenant/timeline/span.rs
@@ -1,26 +1,25 @@
+#[cfg(debug_assertions)]
+use utils::tracing_span_assert::{check_fields_present, Extractor, MultiNameExtractor};
+
 #[cfg(not(debug_assertions))]
-#[inline]
 pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() {}
 
 #[cfg(debug_assertions)]
-#[inline]
+#[track_caller]
 pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() {
-    use utils::tracing_span_assert;
+    static TIMELINE_ID_EXTRACTOR: once_cell::sync::Lazy<MultiNameExtractor<2>> =
+        once_cell::sync::Lazy::new(|| {
+            MultiNameExtractor::new("TimelineId", ["timeline_id", "timeline"])
+        });
 
-    pub static TIMELINE_ID_EXTRACTOR: once_cell::sync::Lazy<
-        tracing_span_assert::MultiNameExtractor<2>,
-    > = once_cell::sync::Lazy::new(|| {
-        tracing_span_assert::MultiNameExtractor::new("TimelineId", ["timeline_id", "timeline"])
-    });
-
-    match tracing_span_assert::check_fields_present([
+    let fields: [&dyn Extractor; 2] = [
         &*crate::tenant::span::TENANT_ID_EXTRACTOR,
         &*TIMELINE_ID_EXTRACTOR,
-    ]) {
-        Ok(()) => (),
-        Err(missing) => panic!(
+    ];
+    if let Err(missing) = check_fields_present(fields) {
+        panic!(
             "missing extractors: {:?}",
             missing.into_iter().map(|e| e.name()).collect::<Vec<_>>()
-        ),
+        )
     }
 }

From 4f280c2953f3a45adefe50c0309018ee49a73c67 Mon Sep 17 00:00:00 2001
From: arpad-m <arpad-m@users.noreply.github.com>
Date: Fri, 7 Jul 2023 16:53:14 +0200
Subject: [PATCH 30/32] Small pageserver cleanups (#4657)

## Problem

I was reading the code of the page server today and found these minor
things that I thought could be cleaned up.

## Summary of changes

* remove a redundant indentation layer and continue in the flushing loop
* use the builtin `PartialEq` check instead of hand-rolling a `range_eq`
function
* Add a missing `>` to a prominent doc comment
---
 pageserver/src/tenant/layer_map.rs                 |  5 ++---
 pageserver/src/tenant/storage_layer.rs             |  7 -------
 pageserver/src/tenant/storage_layer/delta_layer.rs |  2 +-
 pageserver/src/tenant/timeline.rs                  | 12 ++++--------
 pageserver/src/walredo.rs                          |  4 ++--
 5 files changed, 9 insertions(+), 21 deletions(-)

diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs
index dee02ac433..1c407d7133 100644
--- a/pageserver/src/tenant/layer_map.rs
+++ b/pageserver/src/tenant/layer_map.rs
@@ -60,7 +60,6 @@ use utils::lsn::Lsn;
 use historic_layer_coverage::BufferedHistoricLayerCoverage;
 pub use historic_layer_coverage::LayerKey;
 
-use super::storage_layer::range_eq;
 use super::storage_layer::PersistentLayerDesc;
 
 ///
@@ -365,7 +364,7 @@ impl LayerMap {
     }
 
     pub fn is_l0(layer: &PersistentLayerDesc) -> bool {
-        range_eq(&layer.get_key_range(), &(Key::MIN..Key::MAX))
+        layer.get_key_range() == (Key::MIN..Key::MAX)
     }
 
     /// This function determines which layers are counted in `count_deltas`:
@@ -397,7 +396,7 @@ impl LayerMap {
         }
 
         // Case 2
-        if range_eq(partition_range, &(Key::MIN..Key::MAX)) {
+        if partition_range == &(Key::MIN..Key::MAX) {
             return true;
         }
 
diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs
index d35eccfc72..f2aaa7cec6 100644
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -54,13 +54,6 @@ where
     }
 }
 
-pub fn range_eq<T>(a: &Range<T>, b: &Range<T>) -> bool
-where
-    T: PartialEq<T>,
-{
-    a.start == b.start && a.end == b.end
-}
-
 /// Struct used to communicate across calls to 'get_value_reconstruct_data'.
 ///
 /// Before first call, you can fill in 'page_img' if you have an older cached
diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs
index 2591beef3e..c92a2898e4 100644
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -10,7 +10,7 @@
 //! The delta files are stored in timelines/<timeline_id> directory.  Currently,
 //! there are no subdirectories, and each delta file is named like this:
 //!
-//!    <key start>-<key end>__<start LSN>-<end LSN
+//!    <key start>-<key end>__<start LSN>-<end LSN>
 //!
 //! For example:
 //!
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 4a80708b4a..626729859c 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -2808,14 +2808,10 @@ impl Timeline {
                     layers.frozen_layers.front().cloned()
                     // drop 'layers' lock to allow concurrent reads and writes
                 };
-                if let Some(layer_to_flush) = layer_to_flush {
-                    if let Err(err) = self.flush_frozen_layer(layer_to_flush, ctx).await {
-                        error!("could not flush frozen layer: {err:?}");
-                        break Err(err);
-                    }
-                    continue;
-                } else {
-                    break Ok(());
+                let Some(layer_to_flush) = layer_to_flush else { break Ok(()) };
+                if let Err(err) = self.flush_frozen_layer(layer_to_flush, ctx).await {
+                    error!("could not flush frozen layer: {err:?}");
+                    break Err(err);
                 }
             };
             // Notify any listeners that we're done
diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs
index 98730a7637..8bac8ed42d 100644
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -175,8 +175,8 @@ impl WalRedoManager for PostgresRedoManager {
         let mut img = base_img.map(|p| p.1);
         let mut batch_neon = can_apply_in_neon(&records[0].1);
         let mut batch_start = 0;
-        for i in 1..records.len() {
-            let rec_neon = can_apply_in_neon(&records[i].1);
+        for (i, record) in records.iter().enumerate().skip(1) {
+            let rec_neon = can_apply_in_neon(&record.1);
 
             if rec_neon != batch_neon {
                 let result = if batch_neon {

From ac758e4f5170fe8b3c89f5da4549eb003324901b Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Fri, 7 Jul 2023 17:50:50 +0100
Subject: [PATCH 31/32] allow repeated IO errors from compute node (#4624)

## Problem

#4598 compute nodes are not accessible some time after wake up due to
kubernetes DNS not being fully propagated.

## Summary of changes

Update connect retry mechanism to support handling IO errors and
sleeping for 100ms

## Checklist before requesting a review

- [x] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.
---
 proxy/src/http/conn_pool.rs | 69 +++++++++++++++++-----------
 proxy/src/proxy.rs          | 92 ++++++++++++++++++++++++++++++-------
 proxy/src/proxy/tests.rs    | 17 +++++++
 3 files changed, 136 insertions(+), 42 deletions(-)

diff --git a/proxy/src/http/conn_pool.rs b/proxy/src/http/conn_pool.rs
index 52c1e2f2ce..27950d3a20 100644
--- a/proxy/src/http/conn_pool.rs
+++ b/proxy/src/http/conn_pool.rs
@@ -2,16 +2,15 @@ use parking_lot::Mutex;
 use pq_proto::StartupMessageParams;
 use std::fmt;
 use std::{collections::HashMap, sync::Arc};
-
-use futures::TryFutureExt;
+use tokio::time;
 
 use crate::config;
 use crate::{auth, console};
 
 use super::sql_over_http::MAX_RESPONSE_SIZE;
 
-use crate::proxy::invalidate_cache;
-use crate::proxy::NUM_RETRIES_WAKE_COMPUTE;
+use crate::proxy::try_wake;
+use crate::proxy::{BASE_RETRY_WAIT_DURATION, NUM_RETRIES_WAKE_COMPUTE};
 
 use tracing::error;
 use tracing::info;
@@ -223,32 +222,59 @@ async fn connect_to_compute(
 
     // This code is a copy of `connect_to_compute` from `src/proxy.rs` with
     // the difference that it uses `tokio_postgres` for the connection.
-    let mut num_retries: usize = NUM_RETRIES_WAKE_COMPUTE;
+    let mut num_retries = 0;
+    let mut should_wake = true;
     loop {
         match connect_to_compute_once(node_info, conn_info).await {
-            Err(e) if num_retries > 0 => {
-                info!("compute node's state has changed; requesting a wake-up");
-                match creds.wake_compute(&extra).await? {
-                    // Update `node_info` and try one more time.
-                    Some(new) => {
-                        *node_info = new;
+            Err(e) if num_retries == NUM_RETRIES_WAKE_COMPUTE => {
+                if let Some(wait_duration) = retry_connect_in(&e, num_retries) {
+                    error!(error = ?e, "could not connect to compute node");
+                    if should_wake {
+                        match try_wake(node_info, &extra, &creds).await {
+                            Ok(Some(x)) => should_wake = x,
+                            Ok(None) => return Err(e.into()),
+                            Err(e) => return Err(e.into()),
+                        }
                     }
-                    // Link auth doesn't work that way, so we just exit.
-                    None => return Err(e),
+                    if !wait_duration.is_zero() {
+                        time::sleep(wait_duration).await;
+                    }
+                } else {
+                    return Err(e.into());
                 }
             }
-            other => return other,
+            other => return Ok(other?),
         }
 
-        num_retries -= 1;
-        info!("retrying after wake-up ({num_retries} attempts left)");
+        num_retries += 1;
+        info!(retries_left = num_retries, "retrying connect");
+    }
+}
+
+fn retry_connect_in(err: &tokio_postgres::Error, num_retries: u32) -> Option<time::Duration> {
+    use tokio_postgres::error::SqlState;
+    match err.code() {
+        // retry all errors at least once immediately
+        _ if num_retries == 0 => Some(time::Duration::ZERO),
+        // keep retrying connection errors every 100ms
+        Some(
+            &SqlState::CONNECTION_FAILURE
+            | &SqlState::CONNECTION_EXCEPTION
+            | &SqlState::CONNECTION_DOES_NOT_EXIST
+            | &SqlState::SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION,
+        ) => {
+            // 3/2 = 1.5 which seems to be an ok growth factor heuristic
+            Some(BASE_RETRY_WAIT_DURATION * 3_u32.pow(num_retries) / 2_u32.pow(num_retries))
+        }
+        // otherwise, don't retry
+        _ => None,
     }
 }
 
 async fn connect_to_compute_once(
     node_info: &console::CachedNodeInfo,
     conn_info: &ConnInfo,
-) -> anyhow::Result<tokio_postgres::Client> {
+) -> Result<tokio_postgres::Client, tokio_postgres::Error> {
     let mut config = (*node_info.config).clone();
 
     let (client, connection) = config
@@ -257,15 +283,6 @@ async fn connect_to_compute_once(
         .dbname(&conn_info.dbname)
         .max_backend_message_size(MAX_RESPONSE_SIZE)
         .connect(tokio_postgres::NoTls)
-        .inspect_err(|e: &tokio_postgres::Error| {
-            error!(
-                "failed to connect to compute node hosts={:?} ports={:?}: {}",
-                node_info.config.get_hosts(),
-                node_info.config.get_ports(),
-                e
-            );
-            invalidate_cache(node_info)
-        })
         .await?;
 
     tokio::spawn(async move {
diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs
index 2433412c4c..5c5353a63e 100644
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -6,12 +6,17 @@ use crate::{
     cancellation::{self, CancelMap},
     compute::{self, PostgresConnection},
     config::{ProxyConfig, TlsConfig},
-    console::{self, messages::MetricsAuxInfo},
+    console::{
+        self,
+        errors::{ApiError, WakeComputeError},
+        messages::MetricsAuxInfo,
+    },
     error::io_error,
     stream::{PqStream, Stream},
 };
 use anyhow::{bail, Context};
 use futures::TryFutureExt;
+use hyper::StatusCode;
 use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec};
 use once_cell::sync::Lazy;
 use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams};
@@ -25,7 +30,9 @@ use tracing::{error, info, warn};
 use utils::measured_stream::MeasuredStream;
 
 /// Number of times we should retry the `/proxy_wake_compute` http request.
-pub const NUM_RETRIES_WAKE_COMPUTE: usize = 1;
+/// Retry duration is BASE_RETRY_WAIT_DURATION * 1.5^n
+pub const NUM_RETRIES_WAKE_COMPUTE: u32 = 10;
+pub const BASE_RETRY_WAIT_DURATION: time::Duration = time::Duration::from_millis(100);
 
 const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)";
 const ERR_PROTO_VIOLATION: &str = "protocol violation";
@@ -315,7 +322,6 @@ async fn connect_to_compute_once(
     node_info
         .config
         .connect(allow_self_signed_compute, timeout)
-        .inspect_err(|_: &compute::ConnectionError| invalidate_cache(node_info))
         .await
 }
 
@@ -328,7 +334,8 @@ async fn connect_to_compute(
     extra: &console::ConsoleReqExtra<'_>,
     creds: &auth::BackendType<'_, auth::ClientCredentials<'_>>,
 ) -> Result<PostgresConnection, compute::ConnectionError> {
-    let mut num_retries: usize = NUM_RETRIES_WAKE_COMPUTE;
+    let mut num_retries = 0;
+    let mut should_wake = true;
     loop {
         // Apply startup params to the (possibly, cached) compute node info.
         node_info.config.set_startup_params(params);
@@ -346,30 +353,83 @@ async fn connect_to_compute(
         // We only use caching in case of scram proxy backed by the console, so reduce
         // the timeout only in that case.
         let is_scram_proxy = matches!(creds, auth::BackendType::Console(_, _));
-        let timeout = if is_scram_proxy && num_retries == NUM_RETRIES_WAKE_COMPUTE {
+        let timeout = if is_scram_proxy && num_retries == 0 {
             time::Duration::from_secs(2)
         } else {
             time::Duration::from_secs(10)
         };
 
         match connect_to_compute_once(node_info, timeout).await {
-            Err(e) if num_retries > 0 => {
-                info!("compute node's state has changed; requesting a wake-up");
-                match creds.wake_compute(extra).map_err(io_error).await? {
-                    // Update `node_info` and try one more time.
-                    Some(mut new) => {
-                        new.config.reuse_password(&node_info.config);
-                        *node_info = new;
+            Err(e) if num_retries < NUM_RETRIES_WAKE_COMPUTE => {
+                if let Some(wait_duration) = retry_connect_in(&e, num_retries) {
+                    error!(error = ?e, "could not connect to compute node");
+                    if should_wake {
+                        match try_wake(node_info, extra, creds).await {
+                            Ok(Some(x)) => {
+                                should_wake = x;
+                            }
+                            Ok(None) => return Err(e),
+                            Err(e) => return Err(io_error(e).into()),
+                        }
                     }
-                    // Link auth doesn't work that way, so we just exit.
-                    None => return Err(e),
+                    if !wait_duration.is_zero() {
+                        time::sleep(wait_duration).await;
+                    }
+                } else {
+                    return Err(e);
                 }
             }
             other => return other,
         }
 
-        num_retries -= 1;
-        info!("retrying after wake-up ({num_retries} attempts left)");
+        num_retries += 1;
+        info!(retries_left = num_retries, "retrying connect");
+    }
+}
+
+/// Attempts to wake up the compute node.
+/// * Returns Ok(Some(true)) if there was an error waking but retries are acceptable
+/// * Returns Ok(Some(false)) if the wakeup succeeded
+/// * Returns Ok(None) or Err(e) if there was an error
+pub async fn try_wake(
+    node_info: &mut console::CachedNodeInfo,
+    extra: &console::ConsoleReqExtra<'_>,
+    creds: &auth::BackendType<'_, auth::ClientCredentials<'_>>,
+) -> Result<Option<bool>, WakeComputeError> {
+    info!("compute node's state has likely changed; requesting a wake-up");
+    invalidate_cache(node_info);
+    match creds.wake_compute(extra).await {
+        // retry wake if the compute was in an invalid state
+        Err(WakeComputeError::ApiError(ApiError::Console {
+            status: StatusCode::BAD_REQUEST,
+            ..
+        })) => Ok(Some(true)),
+        // Update `node_info` and try again.
+        Ok(Some(mut new)) => {
+            new.config.reuse_password(&node_info.config);
+            *node_info = new;
+            Ok(Some(false))
+        }
+        Err(e) => Err(e),
+        Ok(None) => Ok(None),
+    }
+}
+
+fn retry_connect_in(err: &compute::ConnectionError, num_retries: u32) -> Option<time::Duration> {
+    use std::io::ErrorKind;
+    match err {
+        // retry all errors at least once immediately
+        _ if num_retries == 0 => Some(time::Duration::ZERO),
+        // keep retrying connection errors every 100ms
+        compute::ConnectionError::CouldNotConnect(io_err) => match io_err.kind() {
+            ErrorKind::ConnectionRefused | ErrorKind::AddrNotAvailable => {
+                // 3/2 = 1.5 which seems to be an ok growth factor heuristic
+                Some(BASE_RETRY_WAIT_DURATION * 3_u32.pow(num_retries) / 2_u32.pow(num_retries))
+            }
+            _ => None,
+        },
+        // otherwise, don't retry
+        _ => None,
     }
 }
 
diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs
index 3373c49676..a1f6cd3ed4 100644
--- a/proxy/src/proxy/tests.rs
+++ b/proxy/src/proxy/tests.rs
@@ -1,4 +1,6 @@
 //! A group of high-level tests for connection establishing logic and auth.
+use std::io;
+
 use super::*;
 use crate::{auth, sasl, scram};
 use async_trait::async_trait;
@@ -294,3 +296,18 @@ async fn scram_auth_mock() -> anyhow::Result<()> {
 
     Ok(())
 }
+
+#[test]
+fn connect_compute_total_wait() {
+    let err = compute::ConnectionError::CouldNotConnect(io::Error::new(
+        io::ErrorKind::ConnectionRefused,
+        "conn refused",
+    ));
+
+    let mut total_wait = tokio::time::Duration::ZERO;
+    for num_retries in 0..10 {
+        total_wait += retry_connect_in(&err, num_retries).unwrap();
+    }
+    assert!(total_wait < tokio::time::Duration::from_secs(12));
+    assert!(total_wait > tokio::time::Duration::from_secs(10));
+}

From 1f151d03d8410f6fe87622ff0bb8901dbaf7875d Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Fri, 7 Jul 2023 18:21:15 +0100
Subject: [PATCH 32/32] Dockerfile.compute-node: support arm64 (#4660)

## Problem

`docker build ... -f Dockerfile.compute-node ...` fails on ARM (I'm
checking on macOS).

## Summary of changes
- Download the arm version of cmake on arm
---
 Dockerfile.compute-node | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node
index 7208024d63..1b5db2af81 100644
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -132,10 +132,20 @@ RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.5.tar.gz -O plv8.ta
 FROM build-deps AS h3-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
-# packaged cmake is too old
-RUN wget https://github.com/Kitware/CMake/releases/download/v3.24.2/cmake-3.24.2-linux-x86_64.sh \
+RUN case "$(uname -m)" in \
+      "x86_64") \
+        export CMAKE_CHECKSUM=739d372726cb23129d57a539ce1432453448816e345e1545f6127296926b6754 \
+        ;; \
+      "aarch64") \
+        export CMAKE_CHECKSUM=281b42627c9a1beed03e29706574d04c6c53fae4994472e90985ef018dd29c02 \
+        ;; \
+      *) \
+        echo "Unsupported architecture '$(uname -m)'. Supported are x86_64 and aarch64" && exit 1 \
+        ;; \
+    esac && \
+    wget https://github.com/Kitware/CMake/releases/download/v3.24.2/cmake-3.24.2-linux-$(uname -m).sh \
       -q -O /tmp/cmake-install.sh \
-      && echo "739d372726cb23129d57a539ce1432453448816e345e1545f6127296926b6754 /tmp/cmake-install.sh" | sha256sum --check \
+      && echo "${CMAKE_CHECKSUM} /tmp/cmake-install.sh" | sha256sum --check \
       && chmod u+x /tmp/cmake-install.sh \
       && /tmp/cmake-install.sh --skip-license --prefix=/usr/local/ \
       && rm /tmp/cmake-install.sh