add plumber tool

Fix safekeeper recovery with auth (#5035)
Fix the missing password in walrcv_connect for safekeeper recovery. Add a test that restarts the endpoint and triggers a recovery.
2026-07-19 20:10:37 +00:00 · 2023-08-18 19:33:45 +03:00 · 2023-08-18 16:48:55 +01:00 · 2023-08-18 16:36:31 +02:00 · 2023-08-18 11:44:08 +01:00 · 2023-08-17 19:27:30 +03:00
46 changed files with 1871 additions and 802 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -639,6 +639,12 @@ dependencies = [
 "vsimd",
 ]

+[[package]]
+name = "base64ct"
+version = "1.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b"
+
 [[package]]
 name = "bincode"
 version = "1.3.3"
@@ -886,6 +892,8 @@ version = "0.1.0"
 dependencies = [
 "anyhow",
 "chrono",
+ "regex",
+ "remote_storage",
 "serde",
 "serde_json",
 "serde_with",
@@ -1010,9 +1018,9 @@ checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa"

 [[package]]
 name = "cpufeatures"
-version = "0.2.7"
+version = "0.2.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3e4c1eaa2012c47becbbad2ab175484c2a84d1185b566fb2cc5b8707343dfe58"
+checksum = "a17b76ff3a4162b0b27f354a0c87015ddad39d35f9c0c36607a3bdd175dde1f1"
 dependencies = [
 "libc",
 ]
@@ -1192,15 +1200,15 @@ dependencies = [

 [[package]]
 name = "dashmap"
-version = "5.4.0"
+version = "5.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "907076dfda823b0b36d2a1bb5f90c96660a5bbcd7729e10727f07858f22c4edc"
+checksum = "6943ae99c34386c84a470c499d3414f66502a41340aa895406e0d2e4a207b91d"
 dependencies = [
 "cfg-if",
- "hashbrown 0.12.3",
+ "hashbrown 0.14.0",
 "lock_api",
 "once_cell",
- "parking_lot_core 0.9.7",
+ "parking_lot_core 0.9.8",
 ]

 [[package]]
@@ -1649,6 +1657,12 @@ dependencies = [
 "ahash",
 ]

+[[package]]
+name = "hashbrown"
+version = "0.14.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2c6201b9ff9fd90a5a3bac2e56a830d0caa509576f0e503818ee82c181b3437a"
+
 [[package]]
 name = "hashlink"
 version = "0.8.2"
@@ -2073,9 +2087,9 @@ checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519"

 [[package]]
 name = "lock_api"
-version = "0.4.9"
+version = "0.4.10"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "435011366fe56583b16cf956f9df0095b405b82d76425bc8981c0e22e60ec4df"
+checksum = "c1cc9717a20b1bb222f333e6a92fd32f7d8a18ddc5a3191a11af45dcbf4dcd16"
 dependencies = [
 "autocfg",
 "scopeguard",
@@ -2339,9 +2353,9 @@ dependencies = [

 [[package]]
 name = "once_cell"
-version = "1.17.1"
+version = "1.18.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b7e5500299e16ebb147ae15a00a942af264cf3688f47923b8fc2cd5858f23ad3"
+checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d"

 [[package]]
 name = "oorandom"
@@ -2640,7 +2654,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f"
 dependencies = [
 "lock_api",
- "parking_lot_core 0.9.7",
+ "parking_lot_core 0.9.8",
 ]

 [[package]]
@@ -2659,15 +2673,26 @@ dependencies = [

 [[package]]
 name = "parking_lot_core"
-version = "0.9.7"
+version = "0.9.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9069cbb9f99e3a5083476ccb29ceb1de18b9118cafa53e90c9551235de2b9521"
+checksum = "93f00c865fe7cabf650081affecd3871070f26767e7b2070a3ffae14c654b447"
 dependencies = [
 "cfg-if",
 "libc",
- "redox_syscall 0.2.16",
+ "redox_syscall 0.3.5",
 "smallvec",
- "windows-sys 0.45.0",
+ "windows-targets 0.48.0",
+]
+
+[[package]]
+name = "password-hash"
+version = "0.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "346f04948ba92c43e8469c1ee6736c7563d71012b17d40745260fe106aac2166"
+dependencies = [
+ "base64ct",
+ "rand_core",
+ "subtle",
 ]

 [[package]]
@@ -2678,6 +2703,8 @@ checksum = "f0ca0b5a68607598bf3bad68f32227a8164f6254833f84eafaac409cd6746c31"
 dependencies = [
 "digest",
 "hmac",
+ "password-hash",
+ "sha2",
 ]

 [[package]]
@@ -3056,6 +3083,7 @@ dependencies = [
 "chrono",
 "clap",
 "consumption_metrics",
+ "dashmap",
 "futures",
 "git-version",
 "hashbrown 0.13.2",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -54,6 +54,7 @@ comfy-table = "6.1"
 const_format = "0.2"
 crc32c = "0.6"
 crossbeam-utils = "0.8.5"
+dashmap = "5.5.0"
 either = "1.8"
 enum-map = "2.4.2"
 enumset = "1.0.12"
@@ -88,7 +89,7 @@ opentelemetry = "0.19.0"
 opentelemetry-otlp = { version = "0.12.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
 opentelemetry-semantic-conventions = "0.11.0"
 parking_lot = "0.12"
-pbkdf2 = "0.12.1"
+pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
 pin-project-lite = "0.2"
 prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
 prost = "0.11"
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -38,7 +38,7 @@ use std::fs::File;
 use std::panic;
 use std::path::Path;
 use std::process::exit;
-use std::sync::{mpsc, Arc, Condvar, Mutex, OnceLock, RwLock};
+use std::sync::{mpsc, Arc, Condvar, Mutex, RwLock};
 use std::{thread, time::Duration};

 use anyhow::{Context, Result};
@@ -147,6 +147,7 @@ fn main() -> Result<()> {
    match spec_json {
        // First, try to get cluster spec from the cli argument
        Some(json) => {
+            info!("got spec from cli argument {}", json);
            spec = Some(serde_json::from_str(json)?);
        }
        None => {
@@ -182,6 +183,7 @@ fn main() -> Result<()> {

    if let Some(spec) = spec {
        let pspec = ParsedSpec::try_from(spec).map_err(|msg| anyhow::anyhow!(msg))?;
+        info!("new pspec.spec: {:?}", pspec.spec);
        new_state.pspec = Some(pspec);
        spec_set = true;
    } else {
@@ -196,9 +198,7 @@ fn main() -> Result<()> {
        state: Mutex::new(new_state),
        state_changed: Condvar::new(),
        ext_remote_storage,
-        ext_remote_paths: OnceLock::new(),
        ext_download_progress: RwLock::new(HashMap::new()),
-        library_index: OnceLock::new(),
        build_tag,
    };
    let compute = Arc::new(compute_node);
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -5,7 +5,7 @@ use std::os::unix::fs::PermissionsExt;
 use std::path::Path;
 use std::process::{Command, Stdio};
 use std::str::FromStr;
-use std::sync::{Condvar, Mutex, OnceLock, RwLock};
+use std::sync::{Condvar, Mutex, RwLock};
 use std::time::Instant;

 use anyhow::{Context, Result};
@@ -14,7 +14,6 @@ use futures::future::join_all;
 use futures::stream::FuturesUnordered;
 use futures::StreamExt;
 use postgres::{Client, NoTls};
-use regex::Regex;
 use tokio;
 use tokio_postgres;
 use tracing::{error, info, instrument, warn};
@@ -60,10 +59,6 @@ pub struct ComputeNode {
    pub state_changed: Condvar,
    ///  the S3 bucket that we search for extensions in
    pub ext_remote_storage: Option<GenericRemoteStorage>,
-    // (key: extension name, value: path to extension archive in remote storage)
-    pub ext_remote_paths: OnceLock<HashMap<String, RemotePath>>,
-    // (key: library name, value: name of extension containing this library)
-    pub library_index: OnceLock<HashMap<String, String>>,
    // key: ext_archive_name, value: started download time, download_completed?
    pub ext_download_progress: RwLock<HashMap<String, (DateTime<Utc>, bool)>>,
    pub build_tag: String,
@@ -75,7 +70,6 @@ pub struct RemoteExtensionMetrics {
    num_ext_downloaded: u64,
    largest_ext_size: u64,
    total_ext_download_size: u64,
-    prep_extensions_ms: u64,
 }

 #[derive(Clone, Debug)]
@@ -745,11 +739,19 @@ impl ComputeNode {
            pspec.timeline_id,
        );

+        info!(
+            "start_compute spec.remote_extensions {:?}",
+            pspec.spec.remote_extensions
+        );
+
        // This part is sync, because we need to download
        // remote shared_preload_libraries before postgres start (if any)
-        {
+        if let Some(remote_extensions) = &pspec.spec.remote_extensions {
+            // First, create control files for all available extensions
+            extension_server::create_control_files(remote_extensions, &self.pgbin);
+
            let library_load_start_time = Utc::now();
-            let remote_ext_metrics = self.prepare_preload_libraries(&compute_state)?;
+            let remote_ext_metrics = self.prepare_preload_libraries(&pspec.spec)?;

            let library_load_time = Utc::now()
                .signed_duration_since(library_load_start_time)
@@ -761,7 +763,6 @@ impl ComputeNode {
            state.metrics.num_ext_downloaded = remote_ext_metrics.num_ext_downloaded;
            state.metrics.largest_ext_size = remote_ext_metrics.largest_ext_size;
            state.metrics.total_ext_download_size = remote_ext_metrics.total_ext_download_size;
-            state.metrics.prep_extensions_ms = remote_ext_metrics.prep_extensions_ms;
            info!(
                "Loading shared_preload_libraries took {:?}ms",
                library_load_time
@@ -918,38 +919,11 @@ LIMIT 100",
        }
    }

-    // If remote extension storage is configured,
-    // download extension control files
-    pub async fn prepare_external_extensions(&self, compute_state: &ComputeState) -> Result<()> {
-        if let Some(ref ext_remote_storage) = self.ext_remote_storage {
-            let pspec = compute_state.pspec.as_ref().expect("spec must be set");
-            let spec = &pspec.spec;
-            let custom_ext = spec.custom_extensions.clone().unwrap_or(Vec::new());
-            info!("custom extensions: {:?}", &custom_ext);
-
-            let (ext_remote_paths, library_index) = extension_server::get_available_extensions(
-                ext_remote_storage,
-                &self.pgbin,
-                &self.pgversion,
-                &custom_ext,
-                &self.build_tag,
-            )
-            .await?;
-            self.ext_remote_paths
-                .set(ext_remote_paths)
-                .expect("this is the only time we set ext_remote_paths");
-            self.library_index
-                .set(library_index)
-                .expect("this is the only time we set library_index");
-        }
-        Ok(())
-    }
-
    // download an archive, unzip and place files in correct locations
    pub async fn download_extension(
        &self,
-        ext_name: &str,
-        is_library: bool,
+        real_ext_name: String,
+        ext_path: RemotePath,
    ) -> Result<u64, DownloadError> {
        let remote_storage = self
            .ext_remote_storage
@@ -958,35 +932,6 @@ LIMIT 100",
                "Remote extensions storage is not configured",
            )))?;

-        let mut real_ext_name = ext_name;
-        if is_library {
-            // sometimes library names might have a suffix like
-            // library.so or library.so.3. We strip this off
-            // because library_index is based on the name without the file extension
-            let strip_lib_suffix = Regex::new(r"\.so.*").unwrap();
-            let lib_raw_name = strip_lib_suffix.replace(real_ext_name, "").to_string();
-
-            real_ext_name = self
-                .library_index
-                .get()
-                .expect("must have already downloaded the library_index")
-                .get(&lib_raw_name)
-                .ok_or(DownloadError::BadInput(anyhow::anyhow!(
-                    "library {} is not found",
-                    lib_raw_name
-                )))?;
-        }
-
-        let ext_path = &self
-            .ext_remote_paths
-            .get()
-            .expect("error accessing ext_remote_paths")
-            .get(real_ext_name)
-            .ok_or(DownloadError::BadInput(anyhow::anyhow!(
-                "real_ext_name {} is not found",
-                real_ext_name
-            )))?;
-
        let ext_archive_name = ext_path.object_name().expect("bad path");

        let mut first_try = false;
@@ -1039,8 +984,8 @@ LIMIT 100",
        info!("downloading new extension {ext_archive_name}");

        let download_size = extension_server::download_extension(
-            real_ext_name,
-            ext_path,
+            &real_ext_name,
+            &ext_path,
            remote_storage,
            &self.pgbin,
        )
@@ -1058,18 +1003,19 @@ LIMIT 100",
    #[tokio::main]
    pub async fn prepare_preload_libraries(
        &self,
-        compute_state: &ComputeState,
+        spec: &ComputeSpec,
    ) -> Result<RemoteExtensionMetrics> {
        if self.ext_remote_storage.is_none() {
            return Ok(RemoteExtensionMetrics {
                num_ext_downloaded: 0,
                largest_ext_size: 0,
                total_ext_download_size: 0,
-                prep_extensions_ms: 0,
            });
        }
-        let pspec = compute_state.pspec.as_ref().expect("spec must be set");
-        let spec = &pspec.spec;
+        let remote_extensions = spec
+            .remote_extensions
+            .as_ref()
+            .ok_or(anyhow::anyhow!("Remote extensions are not configured",))?;

        info!("parse shared_preload_libraries from spec.cluster.settings");
        let mut libs_vec = Vec::new();
@@ -1081,6 +1027,7 @@ LIMIT 100",
                .collect();
        }
        info!("parse shared_preload_libraries from provided postgresql.conf");
+
        // that is used in neon_local and python tests
        if let Some(conf) = &spec.cluster.postgresql_conf {
            let conf_lines = conf.split('\n').collect::<Vec<&str>>();
@@ -1101,30 +1048,16 @@ LIMIT 100",
            libs_vec.extend(preload_libs_vec);
        }

-        info!("Download ext_index.json, find the extension paths");
-        let prep_ext_start_time = Utc::now();
-        self.prepare_external_extensions(compute_state).await?;
-        let prep_ext_time_delta = Utc::now()
-            .signed_duration_since(prep_ext_start_time)
-            .to_std()
-            .unwrap()
-            .as_millis() as u64;
-        info!("Prepare extensions took {prep_ext_time_delta}ms");
-
        // Don't try to download libraries that are not in the index.
        // Assume that they are already present locally.
-        libs_vec.retain(|lib| {
-            self.library_index
-                .get()
-                .expect("error accessing ext_remote_paths")
-                .contains_key(lib)
-        });
+        libs_vec.retain(|lib| remote_extensions.library_index.contains_key(lib));

        info!("Downloading to shared preload libraries: {:?}", &libs_vec);

        let mut download_tasks = Vec::new();
        for library in &libs_vec {
-            download_tasks.push(self.download_extension(library, true));
+            let (ext_name, ext_path) = remote_extensions.get_ext(library, true)?;
+            download_tasks.push(self.download_extension(ext_name, ext_path));
        }
        let results = join_all(download_tasks).await;

@@ -1132,7 +1065,6 @@ LIMIT 100",
            num_ext_downloaded: 0,
            largest_ext_size: 0,
            total_ext_download_size: 0,
-            prep_extensions_ms: prep_ext_time_delta,
        };
        for result in results {
            let download_size = match result {
--- a/compute_tools/src/extension_server.rs
+++ b/compute_tools/src/extension_server.rs
@@ -73,10 +73,9 @@ More specifically, here is an example ext_index.json
 */
 use anyhow::Context;
 use anyhow::{self, Result};
-use futures::future::join_all;
+use compute_api::spec::RemoteExtSpec;
 use remote_storage::*;
 use serde_json;
-use std::collections::HashMap;
 use std::io::Read;
 use std::num::{NonZeroU32, NonZeroUsize};
 use std::path::Path;
@@ -117,81 +116,6 @@ pub fn get_pg_version(pgbin: &str) -> String {
    panic!("Unsuported postgres version {human_version}");
 }

-// download control files for enabled_extensions
-// return Hashmaps converting library names to extension names (library_index)
-// and specifying the remote path to the archive for each extension name
-pub async fn get_available_extensions(
-    remote_storage: &GenericRemoteStorage,
-    pgbin: &str,
-    pg_version: &str,
-    custom_extensions: &[String],
-    build_tag: &str,
-) -> Result<(HashMap<String, RemotePath>, HashMap<String, String>)> {
-    let local_sharedir = Path::new(&get_pg_config("--sharedir", pgbin)).join("extension");
-    let index_path = format!("{build_tag}/{pg_version}/ext_index.json");
-    let index_path = RemotePath::new(Path::new(&index_path)).context("error forming path")?;
-    info!("download ext_index.json from: {:?}", &index_path);
-
-    let mut download = remote_storage.download(&index_path).await?;
-    let mut ext_idx_buffer = Vec::new();
-    download
-        .download_stream
-        .read_to_end(&mut ext_idx_buffer)
-        .await?;
-    info!("ext_index downloaded");
-
-    #[derive(Debug, serde::Deserialize)]
-    struct Index {
-        public_extensions: Vec<String>,
-        library_index: HashMap<String, String>,
-        extension_data: HashMap<String, ExtensionData>,
-    }
-
-    #[derive(Debug, serde::Deserialize)]
-    struct ExtensionData {
-        control_data: HashMap<String, String>,
-        archive_path: String,
-    }
-
-    let ext_index_full = serde_json::from_slice::<Index>(&ext_idx_buffer)?;
-    let mut enabled_extensions = ext_index_full.public_extensions;
-    enabled_extensions.extend_from_slice(custom_extensions);
-    let mut library_index = ext_index_full.library_index;
-    let all_extension_data = ext_index_full.extension_data;
-    info!("library_index: {:?}", library_index);
-
-    info!("enabled_extensions: {:?}", enabled_extensions);
-    let mut ext_remote_paths = HashMap::new();
-    let mut file_create_tasks = Vec::new();
-    for extension in enabled_extensions {
-        let ext_data = &all_extension_data[&extension];
-        for (control_file, control_contents) in &ext_data.control_data {
-            let extension_name = control_file
-                .strip_suffix(".control")
-                .expect("control files must end in .control");
-            let control_path = local_sharedir.join(control_file);
-            if !control_path.exists() {
-                ext_remote_paths.insert(
-                    extension_name.to_string(),
-                    RemotePath::from_string(&ext_data.archive_path)?,
-                );
-                info!("writing file {:?}{:?}", control_path, control_contents);
-                file_create_tasks.push(tokio::fs::write(control_path, control_contents));
-            } else {
-                warn!("control file {:?} exists both locally and remotely. ignoring the remote version.", control_file);
-                // also delete this from library index
-                library_index.retain(|_, value| value != extension_name);
-            }
-        }
-    }
-    let results = join_all(file_create_tasks).await;
-    for result in results {
-        result?;
-    }
-    info!("ext_remote_paths {:?}", ext_remote_paths);
-    Ok((ext_remote_paths, library_index))
-}
-
 // download the archive for a given extension,
 // unzip it, and place files in the appropriate locations (share/lib)
 pub async fn download_extension(
@@ -253,6 +177,22 @@ pub async fn download_extension(
    Ok(download_size)
 }

+// Create extension control files from spec
+pub fn create_control_files(remote_extensions: &RemoteExtSpec, pgbin: &str) {
+    let local_sharedir = Path::new(&get_pg_config("--sharedir", pgbin)).join("extension");
+    for ext_data in remote_extensions.extension_data.values() {
+        for (control_name, control_content) in &ext_data.control_data {
+            let control_path = local_sharedir.join(control_name);
+            if !control_path.exists() {
+                info!("writing file {:?}{:?}", control_path, control_content);
+                std::fs::write(control_path, control_content).unwrap();
+            } else {
+                warn!("control file {:?} exists both locally and remotely. ignoring the remote version.", control_path);
+            }
+        }
+    }
+}
+
 // This function initializes the necessary structs to use remote storage
 pub fn init_remote_storage(remote_ext_config: &str) -> anyhow::Result<GenericRemoteStorage> {
    #[derive(Debug, serde::Deserialize)]
--- a/compute_tools/src/http/api.rs
+++ b/compute_tools/src/http/api.rs
@@ -13,7 +13,7 @@ use hyper::{Body, Method, Request, Response, Server, StatusCode};
 use num_cpus;
 use serde_json;
 use tokio::task;
-use tracing::{error, info};
+use tracing::{error, info, warn};
 use tracing_utils::http::OtelName;

 fn status_response_from_state(state: &ComputeState) -> ComputeStatusResponse {
@@ -126,6 +126,15 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
            info!("serving {:?} POST request", route);
            info!("req.uri {:?}", req.uri());

+            // don't even try to download extensions
+            // if no remote storage is configured
+            if compute.ext_remote_storage.is_none() {
+                info!("no extensions remote storage configured");
+                let mut resp = Response::new(Body::from("no remote storage configured"));
+                *resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
+                return resp;
+            }
+
            let mut is_library = false;
            if let Some(params) = req.uri().query() {
                info!("serving {:?} POST request with params: {}", route, params);
@@ -137,24 +146,47 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
                    return resp;
                }
            }
-
            let filename = route.split('/').last().unwrap().to_string();
            info!("serving /extension_server POST request, filename: {filename:?} is_library: {is_library}");

-            // don't even try to download extensions
-            // if no remote storage is configured
-            if compute.ext_remote_storage.is_none() {
-                info!("no extensions remote storage configured");
-                let mut resp = Response::new(Body::from("no remote storage configured"));
-                *resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
-                return resp;
-            }
+            // get ext_name and path from spec
+            // don't lock compute_state for too long
+            let ext = {
+                let compute_state = compute.state.lock().unwrap();
+                let pspec = compute_state.pspec.as_ref().expect("spec must be set");
+                let spec = &pspec.spec;

-            match compute.download_extension(&filename, is_library).await {
-                Ok(_) => Response::new(Body::from("OK")),
+                // debug only
+                info!("spec: {:?}", spec);
+
+                let remote_extensions = match spec.remote_extensions.as_ref() {
+                    Some(r) => r,
+                    None => {
+                        info!("no remote extensions spec was provided");
+                        let mut resp = Response::new(Body::from("no remote storage configured"));
+                        *resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
+                        return resp;
+                    }
+                };
+
+                remote_extensions.get_ext(&filename, is_library)
+            };
+
+            match ext {
+                Ok((ext_name, ext_path)) => {
+                    match compute.download_extension(ext_name, ext_path).await {
+                        Ok(_) => Response::new(Body::from("OK")),
+                        Err(e) => {
+                            error!("extension download failed: {}", e);
+                            let mut resp = Response::new(Body::from(e.to_string()));
+                            *resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
+                            resp
+                        }
+                    }
+                }
                Err(e) => {
-                    error!("extension download failed: {}", e);
-                    let mut resp = Response::new(Body::from(e.to_string()));
+                    warn!("extension download failed to find extension: {}", e);
+                    let mut resp = Response::new(Body::from("failed to find file"));
                    *resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
                    resp
                }
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -493,7 +493,7 @@ impl Endpoint {
            pageserver_connstring: Some(pageserver_connstring),
            safekeeper_connstrings,
            storage_auth_token: auth_token.clone(),
-            custom_extensions: Some(vec![]),
+            remote_extensions: None,
        };
        let spec_path = self.endpoint_path().join("spec.json");
        std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
--- a/libs/compute_api/Cargo.toml
+++ b/libs/compute_api/Cargo.toml
@@ -10,6 +10,9 @@ chrono.workspace = true
 serde.workspace = true
 serde_with.workspace = true
 serde_json.workspace = true
+regex.workspace = true

 utils = { path = "../utils" }
+remote_storage = { version = "0.1", path = "../remote_storage/" }
+
 workspace_hack.workspace = true
--- a/libs/compute_api/src/responses.rs
+++ b/libs/compute_api/src/responses.rs
@@ -107,7 +107,6 @@ pub struct ComputeMetrics {
    pub num_ext_downloaded: u64,
    pub largest_ext_size: u64, // these are measured in bytes
    pub total_ext_download_size: u64,
-    pub prep_extensions_ms: u64,
 }

 /// Response of the `/computes/{compute_id}/spec` control-plane API.
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -3,11 +3,16 @@
 //! The spec.json file is used to pass information to 'compute_ctl'. It contains
 //! all the information needed to start up the right version of PostgreSQL,
 //! and connect it to the storage nodes.
+use std::collections::HashMap;
+
 use serde::{Deserialize, Serialize};
 use serde_with::{serde_as, DisplayFromStr};
 use utils::id::{TenantId, TimelineId};
 use utils::lsn::Lsn;

+use regex::Regex;
+use remote_storage::RemotePath;
+
 /// String type alias representing Postgres identifier and
 /// intended to be used for DB / role names.
 pub type PgIdent = String;
@@ -61,8 +66,55 @@ pub struct ComputeSpec {
    /// the pageserver and safekeepers.
    pub storage_auth_token: Option<String>,

-    // list of prefixes to search for custom extensions in remote extension storage
+    // information about available remote extensions
+    pub remote_extensions: Option<RemoteExtSpec>,
+}
+
+#[derive(Clone, Debug, Default, Deserialize, Serialize)]
+pub struct RemoteExtSpec {
+    pub public_extensions: Option<Vec<String>>,
    pub custom_extensions: Option<Vec<String>>,
+    pub library_index: HashMap<String, String>,
+    pub extension_data: HashMap<String, ExtensionData>,
+}
+
+#[derive(Clone, Debug, Serialize, Deserialize)]
+pub struct ExtensionData {
+    pub control_data: HashMap<String, String>,
+    pub archive_path: String,
+}
+
+impl RemoteExtSpec {
+    pub fn get_ext(
+        &self,
+        ext_name: &str,
+        is_library: bool,
+    ) -> anyhow::Result<(String, RemotePath)> {
+        let mut real_ext_name = ext_name;
+        if is_library {
+            // sometimes library names might have a suffix like
+            // library.so or library.so.3. We strip this off
+            // because library_index is based on the name without the file extension
+            let strip_lib_suffix = Regex::new(r"\.so.*").unwrap();
+            let lib_raw_name = strip_lib_suffix.replace(real_ext_name, "").to_string();
+
+            real_ext_name = self
+                .library_index
+                .get(&lib_raw_name)
+                .ok_or(anyhow::anyhow!("library {} is not found", lib_raw_name))?;
+        }
+
+        match self.extension_data.get(real_ext_name) {
+            Some(ext_data) => Ok((
+                real_ext_name.to_string(),
+                RemotePath::from_string(&ext_data.archive_path)?,
+            )),
+            None => Err(anyhow::anyhow!(
+                "real_ext_name {} is not found",
+                real_ext_name
+            )),
+        }
+    }
 }

 #[serde_as]
--- a/libs/compute_api/tests/cluster_spec.json
+++ b/libs/compute_api/tests/cluster_spec.json
@@ -205,5 +205,43 @@
            "name": "zenith new",
            "new_name": "zenith \"new\""
        }
-    ]
+    ],
+    "remote_extensions": {
+        "library_index": {
+          "anon": "anon",
+          "postgis-3": "postgis",
+          "libpgrouting-3.4": "postgis",
+          "postgis_raster-3": "postgis",
+          "postgis_sfcgal-3": "postgis",
+          "postgis_topology-3": "postgis",
+          "address_standardizer-3": "postgis"
+        },
+        "extension_data": {
+          "anon": {
+            "archive_path": "5834329303/v15/extensions/anon.tar.zst",
+            "control_data": {
+              "anon.control": "# PostgreSQL Anonymizer (anon) extension\ncomment = ''Data anonymization tools''\ndefault_version = ''1.1.0''\ndirectory=''extension/anon''\nrelocatable = false\nrequires = ''pgcrypto''\nsuperuser = false\nmodule_pathname = ''$libdir/anon''\ntrusted = true\n"
+            }
+          },
+          "postgis": {
+            "archive_path": "5834329303/v15/extensions/postgis.tar.zst",
+            "control_data": {
+              "postgis.control": "# postgis extension\ncomment = ''PostGIS geometry and geography spatial types and functions''\ndefault_version = ''3.3.2''\nmodule_pathname = ''$libdir/postgis-3''\nrelocatable = false\ntrusted = true\n",
+              "pgrouting.control": "# pgRouting Extension\ncomment = ''pgRouting Extension''\ndefault_version = ''3.4.2''\nmodule_pathname = ''$libdir/libpgrouting-3.4''\nrelocatable = true\nrequires = ''plpgsql''\nrequires = ''postgis''\ntrusted = true\n",
+              "postgis_raster.control": "# postgis_raster extension\ncomment = ''PostGIS raster types and functions''\ndefault_version = ''3.3.2''\nmodule_pathname = ''$libdir/postgis_raster-3''\nrelocatable = false\nrequires = postgis\ntrusted = true\n",
+              "postgis_sfcgal.control": "# postgis topology extension\ncomment = ''PostGIS SFCGAL functions''\ndefault_version = ''3.3.2''\nrelocatable = true\nrequires = postgis\ntrusted = true\n",
+              "postgis_topology.control": "# postgis topology extension\ncomment = ''PostGIS topology spatial types and functions''\ndefault_version = ''3.3.2''\nrelocatable = false\nschema = topology\nrequires = postgis\ntrusted = true\n",
+              "address_standardizer.control": "# address_standardizer extension\ncomment = ''Used to parse an address into constituent elements. Generally used to support geocoding address normalization step.''\ndefault_version = ''3.3.2''\nrelocatable = true\ntrusted = true\n",
+              "postgis_tiger_geocoder.control": "# postgis tiger geocoder extension\ncomment = ''PostGIS tiger geocoder and reverse geocoder''\ndefault_version = ''3.3.2''\nrelocatable = false\nschema = tiger\nrequires = ''postgis,fuzzystrmatch''\nsuperuser= false\ntrusted = true\n",
+              "address_standardizer_data_us.control": "# address standardizer us dataset\ncomment = ''Address Standardizer US dataset example''\ndefault_version = ''3.3.2''\nrelocatable = true\ntrusted = true\n"
+            }
+          }
+        },
+        "custom_extensions": [
+          "anon"
+        ],
+        "public_extensions": [
+          "postgis"
+        ]
+      }
 }
--- a/libs/postgres_ffi/src/pg_constants.rs
+++ b/libs/postgres_ffi/src/pg_constants.rs
@@ -145,13 +145,6 @@ pub const XLH_INSERT_ALL_VISIBLE_CLEARED: u8 = (1 << 0) as u8;
 pub const XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED: u8 = (1 << 0) as u8;
 pub const XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED: u8 = (1 << 1) as u8;
 pub const XLH_DELETE_ALL_VISIBLE_CLEARED: u8 = (1 << 0) as u8;
-pub const XLH_INSERT_STORE_CID: u8 = (1 << 7) as u8;
-pub const XLH_UPDATE_STORE_CID: u8 = (1 << 7) as u8;
-pub const XLH_DELETE_STORE_CID: u8 = (1 << 7) as u8;
-pub const XLH_LOCK_STORE_CID: u8 = (1 << 7) as u8;
-
-pub const SIZE_OF_HEAP_LOCK: usize = 14;
-pub const SIZE_OF_HEAP_DELETE: usize = 14;

 // From replication/message.h
 pub const XLOG_LOGICAL_MESSAGE: u8 = 0x00;
--- a/libs/remote_storage/src/simulate_failures.rs
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -71,6 +71,13 @@ impl UnreliableWrapper {
            }
        }
    }
+
+    async fn delete_inner(&self, path: &RemotePath, attempt: bool) -> anyhow::Result<()> {
+        if attempt {
+            self.attempt(RemoteOp::Delete(path.clone()))?;
+        }
+        self.inner.delete(path).await
+    }
 }

 #[async_trait::async_trait]
@@ -122,15 +129,15 @@ impl RemoteStorage for UnreliableWrapper {
    }

    async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
-        self.attempt(RemoteOp::Delete(path.clone()))?;
-        self.inner.delete(path).await
+        self.delete_inner(path, true).await
    }

    async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
        self.attempt(RemoteOp::DeleteObjects(paths.to_vec()))?;
        let mut error_counter = 0;
        for path in paths {
-            if (self.delete(path).await).is_err() {
+            // Don't record an attempt here because it was already recorded above
+            if (self.delete_inner(path, false).await).is_err() {
                error_counter += 1;
            }
        }
--- a/pageserver/src/page_cache.rs
+++ b/pageserver/src/page_cache.rs
@@ -53,7 +53,7 @@ use utils::{
    lsn::Lsn,
 };

-use crate::tenant::writeback_ephemeral_file;
+use crate::tenant::{block_io, ephemeral_file, writeback_ephemeral_file};
 use crate::{metrics::PageCacheSizeMetrics, repository::Key};

 static PAGE_CACHE: OnceCell<PageCache> = OnceCell::new();
@@ -98,11 +98,11 @@ enum CacheKey {
        lsn: Lsn,
    },
    EphemeralPage {
-        file_id: u64,
+        file_id: ephemeral_file::FileId,
        blkno: u32,
    },
    ImmutableFilePage {
-        file_id: u64,
+        file_id: block_io::FileId,
        blkno: u32,
    },
 }
@@ -177,9 +177,9 @@ pub struct PageCache {
    /// can have a separate mapping map, next to this field.
    materialized_page_map: RwLock<HashMap<MaterializedPageHashKey, Vec<Version>>>,

-    ephemeral_page_map: RwLock<HashMap<(u64, u32), usize>>,
+    ephemeral_page_map: RwLock<HashMap<(ephemeral_file::FileId, u32), usize>>,

-    immutable_page_map: RwLock<HashMap<(u64, u32), usize>>,
+    immutable_page_map: RwLock<HashMap<(block_io::FileId, u32), usize>>,

    /// The actual buffers with their metadata.
    slots: Box<[Slot]>,
@@ -390,20 +390,28 @@ impl PageCache {

    // Section 1.2: Public interface functions for working with Ephemeral pages.

-    pub fn read_ephemeral_buf(&self, file_id: u64, blkno: u32) -> anyhow::Result<ReadBufResult> {
+    pub fn read_ephemeral_buf(
+        &self,
+        file_id: ephemeral_file::FileId,
+        blkno: u32,
+    ) -> anyhow::Result<ReadBufResult> {
        let mut cache_key = CacheKey::EphemeralPage { file_id, blkno };

        self.lock_for_read(&mut cache_key)
    }

-    pub fn write_ephemeral_buf(&self, file_id: u64, blkno: u32) -> anyhow::Result<WriteBufResult> {
+    pub fn write_ephemeral_buf(
+        &self,
+        file_id: ephemeral_file::FileId,
+        blkno: u32,
+    ) -> anyhow::Result<WriteBufResult> {
        let cache_key = CacheKey::EphemeralPage { file_id, blkno };

        self.lock_for_write(&cache_key)
    }

    /// Immediately drop all buffers belonging to given file, without writeback
-    pub fn drop_buffers_for_ephemeral(&self, drop_file_id: u64) {
+    pub fn drop_buffers_for_ephemeral(&self, drop_file_id: ephemeral_file::FileId) {
        for slot_idx in 0..self.slots.len() {
            let slot = &self.slots[slot_idx];

@@ -424,14 +432,18 @@ impl PageCache {

    // Section 1.3: Public interface functions for working with immutable file pages.

-    pub fn read_immutable_buf(&self, file_id: u64, blkno: u32) -> anyhow::Result<ReadBufResult> {
+    pub fn read_immutable_buf(
+        &self,
+        file_id: block_io::FileId,
+        blkno: u32,
+    ) -> anyhow::Result<ReadBufResult> {
        let mut cache_key = CacheKey::ImmutableFilePage { file_id, blkno };

        self.lock_for_read(&mut cache_key)
    }

    /// Immediately drop all buffers belonging to given file, without writeback
-    pub fn drop_buffers_for_immutable(&self, drop_file_id: u64) {
+    pub fn drop_buffers_for_immutable(&self, drop_file_id: block_io::FileId) {
        for slot_idx in 0..self.slots.len() {
            let slot = &self.slots[slot_idx];

--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -1104,8 +1104,9 @@ impl Tenant {
            {
                match e {
                    LoadLocalTimelineError::Load(source) => {
-                        return Err(anyhow::anyhow!(source)
-                            .context("Failed to load local timeline: {timeline_id}"))
+                        return Err(anyhow::anyhow!(source)).with_context(|| {
+                            format!("Failed to load local timeline: {timeline_id}")
+                        })
                    }
                    LoadLocalTimelineError::ResumeDeletion(source) => {
                        // Make sure resumed deletion wont fail loading for entire tenant.
--- a/pageserver/src/tenant/block_io.rs
+++ b/pageserver/src/tenant/block_io.rs
@@ -117,6 +117,12 @@ where
    }
 }
 static NEXT_ID: AtomicU64 = AtomicU64::new(1);
+#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
+pub struct FileId(u64);
+
+fn next_file_id() -> FileId {
+    FileId(NEXT_ID.fetch_add(1, std::sync::atomic::Ordering::Relaxed))
+}

 /// An adapter for reading a (virtual) file using the page cache.
 ///
@@ -126,7 +132,7 @@ pub struct FileBlockReader<F> {
    pub file: F,

    /// Unique ID of this file, used as key in the page cache.
-    file_id: u64,
+    file_id: FileId,
 }

 impl<F> FileBlockReader<F>
@@ -134,7 +140,7 @@ where
    F: FileExt,
 {
    pub fn new(file: F) -> Self {
-        let file_id = NEXT_ID.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
+        let file_id = next_file_id();

        FileBlockReader { file_id, file }
    }
--- a/pageserver/src/tenant/delete.rs
+++ b/pageserver/src/tenant/delete.rs
@@ -238,6 +238,30 @@ async fn cleanup_remaining_fs_traces(
    Ok(())
 }

+pub(crate) async fn remote_delete_mark_exists(
+    conf: &PageServerConf,
+    tenant_id: &TenantId,
+    remote_storage: &GenericRemoteStorage,
+) -> anyhow::Result<bool> {
+    // If remote storage is there we rely on it
+    let remote_mark_path = remote_tenant_delete_mark_path(conf, tenant_id).context("path")?;
+
+    let result = backoff::retry(
+        || async { remote_storage.download(&remote_mark_path).await },
+        |e| matches!(e, DownloadError::NotFound),
+        SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS,
+        SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS,
+        "fetch_tenant_deletion_mark",
+    )
+    .await;
+
+    match result {
+        Ok(_) => Ok(true),
+        Err(DownloadError::NotFound) => Ok(false),
+        Err(e) => Err(anyhow::anyhow!(e)).context("remote_delete_mark_exists")?,
+    }
+}
+
 /// Orchestrates tenant shut down of all tasks, removes its in-memory structures,
 /// and deletes its data from both disk and s3.
 /// The sequence of steps:
@@ -372,22 +396,10 @@ impl DeleteTenantFlow {
            None => return Ok(None),
        };

-        // If remote storage is there we rely on it
-        let remote_mark_path = remote_tenant_delete_mark_path(conf, &tenant_id)?;
-
-        let result = backoff::retry(
-            || async { remote_storage.download(&remote_mark_path).await },
-            |e| matches!(e, DownloadError::NotFound),
-            SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS,
-            SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS,
-            "fetch_tenant_deletion_mark",
-        )
-        .await;
-
-        match result {
-            Ok(_) => Ok(acquire(tenant)),
-            Err(DownloadError::NotFound) => Ok(None),
-            Err(e) => Err(anyhow::anyhow!(e)).context("should_resume_deletion")?,
+        if remote_delete_mark_exists(conf, &tenant_id, remote_storage).await? {
+            Ok(acquire(tenant))
+        } else {
+            Ok(None)
        }
    }

--- a/pageserver/src/tenant/ephemeral_file.rs
+++ b/pageserver/src/tenant/ephemeral_file.rs
@@ -12,31 +12,39 @@ use std::collections::HashMap;
 use std::fs::OpenOptions;
 use std::io::{self, ErrorKind};
 use std::ops::DerefMut;
+use std::os::unix::prelude::FileExt;
 use std::path::PathBuf;
 use std::sync::{Arc, RwLock};
 use tracing::*;
 use utils::id::{TenantId, TimelineId};

-use std::os::unix::fs::FileExt;
-
 ///
 /// This is the global cache of file descriptors (File objects).
 ///
 static EPHEMERAL_FILES: Lazy<RwLock<EphemeralFiles>> = Lazy::new(|| {
    RwLock::new(EphemeralFiles {
-        next_file_id: 1,
+        next_file_id: FileId(1),
        files: HashMap::new(),
    })
 });

-pub struct EphemeralFiles {
-    next_file_id: u64,
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub struct FileId(u64);

-    files: HashMap<u64, Arc<VirtualFile>>,
+impl std::fmt::Display for FileId {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}", self.0)
+    }
+}
+
+pub struct EphemeralFiles {
+    next_file_id: FileId,
+
+    files: HashMap<FileId, Arc<VirtualFile>>,
 }

 pub struct EphemeralFile {
-    file_id: u64,
+    file_id: FileId,
    _tenant_id: TenantId,
    _timeline_id: TimelineId,
    file: Arc<VirtualFile>,
@@ -52,7 +60,7 @@ impl EphemeralFile {
    ) -> Result<EphemeralFile, io::Error> {
        let mut l = EPHEMERAL_FILES.write().unwrap();
        let file_id = l.next_file_id;
-        l.next_file_id += 1;
+        l.next_file_id = FileId(l.next_file_id.0 + 1);

        let filename = conf
            .timeline_path(&tenant_id, &timeline_id)
@@ -94,7 +102,10 @@ impl EphemeralFile {
        Ok(())
    }

-    fn get_buf_for_write(&self, blkno: u32) -> Result<page_cache::PageWriteGuard, io::Error> {
+    fn get_buf_for_write(
+        &self,
+        blkno: u32,
+    ) -> Result<page_cache::PageWriteGuard<'static>, io::Error> {
        // Look up the right page
        let cache = page_cache::get();
        let mut write_guard = match cache
@@ -127,121 +138,79 @@ pub fn is_ephemeral_file(filename: &str) -> bool {
    }
 }

-impl FileExt for EphemeralFile {
-    fn read_at(&self, dstbuf: &mut [u8], offset: u64) -> Result<usize, io::Error> {
-        // Look up the right page
-        let blkno = (offset / PAGE_SZ as u64) as u32;
-        let off = offset as usize % PAGE_SZ;
-        let len = min(PAGE_SZ - off, dstbuf.len());
-
-        let read_guard;
-        let mut write_guard;
-
-        let cache = page_cache::get();
-        let buf = match cache
-            .read_ephemeral_buf(self.file_id, blkno)
-            .map_err(|e| to_io_error(e, "Failed to read ephemeral buf"))?
-        {
-            ReadBufResult::Found(guard) => {
-                read_guard = guard;
-                read_guard.as_ref()
-            }
-            ReadBufResult::NotFound(guard) => {
-                // Read the page from disk into the buffer
-                write_guard = guard;
-                self.fill_buffer(write_guard.deref_mut(), blkno)?;
-                write_guard.mark_valid();
-
-                // And then fall through to read the requested slice from the
-                // buffer.
-                write_guard.as_ref()
-            }
-        };
-
-        dstbuf[0..len].copy_from_slice(&buf[off..(off + len)]);
-        Ok(len)
-    }
-
-    fn write_at(&self, srcbuf: &[u8], offset: u64) -> Result<usize, io::Error> {
-        // Look up the right page
-        let blkno = (offset / PAGE_SZ as u64) as u32;
-        let off = offset as usize % PAGE_SZ;
-        let len = min(PAGE_SZ - off, srcbuf.len());
-
-        let mut write_guard;
-        let cache = page_cache::get();
-        let buf = match cache
-            .write_ephemeral_buf(self.file_id, blkno)
-            .map_err(|e| to_io_error(e, "Failed to write ephemeral buf"))?
-        {
-            WriteBufResult::Found(guard) => {
-                write_guard = guard;
-                write_guard.deref_mut()
-            }
-            WriteBufResult::NotFound(guard) => {
-                // Read the page from disk into the buffer
-                // TODO: if we're overwriting the whole page, no need to read it in first
-                write_guard = guard;
-                self.fill_buffer(write_guard.deref_mut(), blkno)?;
-                write_guard.mark_valid();
-
-                // And then fall through to modify it.
-                write_guard.deref_mut()
-            }
-        };
-
-        buf[off..(off + len)].copy_from_slice(&srcbuf[0..len]);
-        write_guard.mark_dirty();
-        Ok(len)
-    }
-}
-
 impl BlobWriter for EphemeralFile {
    fn write_blob(&mut self, srcbuf: &[u8]) -> Result<u64, io::Error> {
+        struct Writer<'a> {
+            ephemeral_file: &'a mut EphemeralFile,
+            /// The block to which the next [`push_bytes`] will write.
+            blknum: u32,
+            /// The offset inside the block identified by [`blknum`] to which [`push_bytes`] will write.
+            off: usize,
+            /// Used by [`push_bytes`] to memoize the page cache write guard across calls to it.
+            memo_page_guard: MemoizedPageWriteGuard,
+        }
+        struct MemoizedPageWriteGuard {
+            guard: page_cache::PageWriteGuard<'static>,
+            /// The block number of the page in `guard`.
+            blknum: u32,
+        }
+        impl<'a> Writer<'a> {
+            fn new(ephemeral_file: &'a mut EphemeralFile) -> io::Result<Writer<'a>> {
+                let blknum = (ephemeral_file.size / PAGE_SZ as u64) as u32;
+                Ok(Writer {
+                    blknum,
+                    off: (ephemeral_file.size % PAGE_SZ as u64) as usize,
+                    memo_page_guard: MemoizedPageWriteGuard {
+                        guard: ephemeral_file.get_buf_for_write(blknum)?,
+                        blknum,
+                    },
+                    ephemeral_file,
+                })
+            }
+            #[inline(always)]
+            fn push_bytes(&mut self, src: &[u8]) -> Result<(), io::Error> {
+                // `src_remaining` is the remaining bytes to be written
+                let mut src_remaining = src;
+                while !src_remaining.is_empty() {
+                    let page = if self.memo_page_guard.blknum == self.blknum {
+                        &mut self.memo_page_guard.guard
+                    } else {
+                        self.memo_page_guard.guard =
+                            self.ephemeral_file.get_buf_for_write(self.blknum)?;
+                        self.memo_page_guard.blknum = self.blknum;
+                        &mut self.memo_page_guard.guard
+                    };
+                    let dst_remaining = &mut page[self.off..];
+                    let n = min(dst_remaining.len(), src_remaining.len());
+                    dst_remaining[..n].copy_from_slice(&src_remaining[..n]);
+                    self.off += n;
+                    src_remaining = &src_remaining[n..];
+                    if self.off == PAGE_SZ {
+                        // This block is done, move to next one.
+                        self.blknum += 1;
+                        self.off = 0;
+                    }
+                }
+                Ok(())
+            }
+        }
+
        let pos = self.size;
-
-        let mut blknum = (self.size / PAGE_SZ as u64) as u32;
-        let mut off = (pos % PAGE_SZ as u64) as usize;
-
-        let mut buf = self.get_buf_for_write(blknum)?;
+        let mut writer = Writer::new(self)?;

        // Write the length field
        if srcbuf.len() < 0x80 {
-            buf[off] = srcbuf.len() as u8;
-            off += 1;
+            // short one-byte length header
+            let len_buf = [srcbuf.len() as u8];
+            writer.push_bytes(&len_buf)?;
        } else {
            let mut len_buf = u32::to_be_bytes(srcbuf.len() as u32);
            len_buf[0] |= 0x80;
-            let thislen = PAGE_SZ - off;
-            if thislen < 4 {
-                // it needs to be split across pages
-                buf[off..(off + thislen)].copy_from_slice(&len_buf[..thislen]);
-                blknum += 1;
-                buf = self.get_buf_for_write(blknum)?;
-                buf[0..4 - thislen].copy_from_slice(&len_buf[thislen..]);
-                off = 4 - thislen;
-            } else {
-                buf[off..off + 4].copy_from_slice(&len_buf);
-                off += 4;
-            }
+            writer.push_bytes(&len_buf)?;
        }

        // Write the payload
-        let mut buf_remain = srcbuf;
-        while !buf_remain.is_empty() {
-            let mut page_remain = PAGE_SZ - off;
-            if page_remain == 0 {
-                blknum += 1;
-                buf = self.get_buf_for_write(blknum)?;
-                off = 0;
-                page_remain = PAGE_SZ;
-            }
-            let this_blk_len = min(page_remain, buf_remain.len());
-            buf[off..(off + this_blk_len)].copy_from_slice(&buf_remain[..this_blk_len]);
-            off += this_blk_len;
-            buf_remain = &buf_remain[this_blk_len..];
-        }
-        drop(buf);
+        writer.push_bytes(srcbuf)?;

        if srcbuf.len() < 0x80 {
            self.size += 1;
@@ -281,7 +250,7 @@ impl Drop for EphemeralFile {
    }
 }

-pub fn writeback(file_id: u64, blkno: u32, buf: &[u8]) -> Result<(), io::Error> {
+pub fn writeback(file_id: FileId, blkno: u32, buf: &[u8]) -> Result<(), io::Error> {
    if let Some(file) = EPHEMERAL_FILES.read().unwrap().files.get(&file_id) {
        match file.write_all_at(buf, blkno as u64 * PAGE_SZ as u64) {
            Ok(_) => Ok(()),
@@ -334,7 +303,7 @@ mod tests {
    use super::*;
    use crate::tenant::blob_io::BlobWriter;
    use crate::tenant::block_io::BlockCursor;
-    use rand::{seq::SliceRandom, thread_rng, RngCore};
+    use rand::{thread_rng, RngCore};
    use std::fs;
    use std::str::FromStr;

@@ -355,50 +324,6 @@ mod tests {
        Ok((conf, tenant_id, timeline_id))
    }

-    // Helper function to slurp contents of a file, starting at the current position,
-    // into a string
-    fn read_string(efile: &EphemeralFile, offset: u64, len: usize) -> Result<String, io::Error> {
-        let mut buf = Vec::new();
-        buf.resize(len, 0u8);
-
-        efile.read_exact_at(&mut buf, offset)?;
-
-        Ok(String::from_utf8_lossy(&buf)
-            .trim_end_matches('\0')
-            .to_string())
-    }
-
-    #[test]
-    fn test_ephemeral_files() -> Result<(), io::Error> {
-        let (conf, tenant_id, timeline_id) = harness("ephemeral_files")?;
-
-        let file_a = EphemeralFile::create(conf, tenant_id, timeline_id)?;
-
-        file_a.write_all_at(b"foo", 0)?;
-        assert_eq!("foo", read_string(&file_a, 0, 20)?);
-
-        file_a.write_all_at(b"bar", 3)?;
-        assert_eq!("foobar", read_string(&file_a, 0, 20)?);
-
-        // Open a lot of files, enough to cause some page evictions.
-        let mut efiles = Vec::new();
-        for fileno in 0..100 {
-            let efile = EphemeralFile::create(conf, tenant_id, timeline_id)?;
-            efile.write_all_at(format!("file {}", fileno).as_bytes(), 0)?;
-            assert_eq!(format!("file {}", fileno), read_string(&efile, 0, 10)?);
-            efiles.push((fileno, efile));
-        }
-
-        // Check that all the files can still be read from. Use them in random order for
-        // good measure.
-        efiles.as_mut_slice().shuffle(&mut thread_rng());
-        for (fileno, efile) in efiles.iter_mut() {
-            assert_eq!(format!("file {}", fileno), read_string(efile, 0, 10)?);
-        }
-
-        Ok(())
-    }
-
    #[tokio::test]
    async fn test_ephemeral_blobs() -> Result<(), io::Error> {
        let (conf, tenant_id, timeline_id) = harness("ephemeral_blobs")?;
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -27,7 +27,7 @@ use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME};
 use utils::fs_ext::PathExt;
 use utils::id::{TenantId, TimelineId};

-use super::delete::DeleteTenantError;
+use super::delete::{remote_delete_mark_exists, DeleteTenantError};
 use super::timeline::delete::DeleteTimelineFlow;

 /// The tenants known to the pageserver.
@@ -591,6 +591,12 @@ pub async fn attach_tenant(
    remote_storage: GenericRemoteStorage,
    ctx: &RequestContext,
 ) -> Result<(), TenantMapInsertError> {
+    // Temporary solution, proper one would be to resume deletion, but that needs more plumbing around Tenant::load/Tenant::attach
+    // Corresponding issue https://github.com/neondatabase/neon/issues/5006
+    if remote_delete_mark_exists(conf, &tenant_id, &remote_storage).await? {
+        return Err(anyhow::anyhow!("Tenant is marked as deleted on remote storage").into());
+    }
+
    tenant_map_insert(tenant_id, || {
        let tenant_dir = create_tenant_files(conf, tenant_conf, &tenant_id, CreateTenantFilesMode::Attach)?;
        // TODO: tenant directory remains on disk if we bail out from here on.
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -222,7 +222,6 @@ use std::sync::{Arc, Mutex};

 use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
 use std::ops::DerefMut;
-use tokio::runtime::Runtime;
 use tracing::{debug, error, info, instrument, warn};
 use tracing::{info_span, Instrument};
 use utils::lsn::Lsn;
@@ -311,7 +310,7 @@ pub enum PersistIndexPartWithDeletedFlagError {
 pub struct RemoteTimelineClient {
    conf: &'static PageServerConf,

-    runtime: &'static Runtime,
+    runtime: tokio::runtime::Handle,

    tenant_id: TenantId,
    timeline_id: TimelineId,
@@ -338,7 +337,7 @@ impl RemoteTimelineClient {
    ) -> RemoteTimelineClient {
        RemoteTimelineClient {
            conf,
-            runtime: &BACKGROUND_RUNTIME,
+            runtime: BACKGROUND_RUNTIME.handle().to_owned(),
            tenant_id,
            timeline_id,
            storage_impl: remote_storage,
@@ -851,7 +850,7 @@ impl RemoteTimelineClient {
        let remaining = backoff::retry(
            || async {
                self.storage_impl
-                    .list_prefixes(Some(&timeline_storage_path))
+                    .list_files(Some(&timeline_storage_path))
                    .await
            },
            |_e| false,
@@ -994,7 +993,7 @@ impl RemoteTimelineClient {
            let tenant_id = self.tenant_id;
            let timeline_id = self.timeline_id;
            task_mgr::spawn(
-                self.runtime.handle(),
+                &self.runtime,
                TaskKind::RemoteUploadTask,
                Some(self.tenant_id),
                Some(self.timeline_id),
@@ -1347,7 +1346,7 @@ mod tests {
        context::RequestContext,
        tenant::{
            harness::{TenantHarness, TIMELINE_ID},
-            Tenant,
+            Tenant, Timeline,
        },
        DEFAULT_PG_VERSION,
    };
@@ -1356,7 +1355,6 @@ mod tests {
        collections::HashSet,
        path::{Path, PathBuf},
    };
-    use tokio::runtime::EnterGuard;
    use utils::lsn::Lsn;

    pub(super) fn dummy_contents(name: &str) -> Vec<u8> {
@@ -1406,35 +1404,25 @@ mod tests {
    }

    struct TestSetup {
-        runtime: &'static tokio::runtime::Runtime,
-        entered_runtime: EnterGuard<'static>,
        harness: TenantHarness,
        tenant: Arc<Tenant>,
+        timeline: Arc<Timeline>,
        tenant_ctx: RequestContext,
        remote_fs_dir: PathBuf,
        client: Arc<RemoteTimelineClient>,
    }

    impl TestSetup {
-        fn new(test_name: &str) -> anyhow::Result<Self> {
+        async fn new(test_name: &str) -> anyhow::Result<Self> {
            // Use a current-thread runtime in the test
-            let runtime = Box::leak(Box::new(
-                tokio::runtime::Builder::new_current_thread()
-                    .enable_all()
-                    .build()?,
-            ));
-            let entered_runtime = runtime.enter();
-
            let test_name = Box::leak(Box::new(format!("remote_timeline_client__{test_name}")));
            let harness = TenantHarness::create(test_name)?;
-            let (tenant, ctx) = runtime.block_on(harness.load());
+            let (tenant, ctx) = harness.load().await;
+
            // create an empty timeline directory
-            let _ = runtime.block_on(tenant.create_test_timeline(
-                TIMELINE_ID,
-                Lsn(8),
-                DEFAULT_PG_VERSION,
-                &ctx,
-            ))?;
+            let timeline = tenant
+                .create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx)
+                .await?;

            let remote_fs_dir = harness.conf.workdir.join("remote_fs");
            std::fs::create_dir_all(remote_fs_dir)?;
@@ -1456,7 +1444,7 @@ mod tests {

            let client = Arc::new(RemoteTimelineClient {
                conf: harness.conf,
-                runtime,
+                runtime: tokio::runtime::Handle::current(),
                tenant_id: harness.tenant_id,
                timeline_id: TIMELINE_ID,
                storage_impl: storage,
@@ -1468,10 +1456,9 @@ mod tests {
            });

            Ok(Self {
-                runtime,
-                entered_runtime,
                harness,
                tenant,
+                timeline,
                tenant_ctx: ctx,
                remote_fs_dir,
                client,
@@ -1480,8 +1467,8 @@ mod tests {
    }

    // Test scheduling
-    #[test]
-    fn upload_scheduling() -> anyhow::Result<()> {
+    #[tokio::test]
+    async fn upload_scheduling() {
        // Test outline:
        //
        // Schedule upload of a bunch of layers. Check that they are started immediately, not queued
@@ -1497,25 +1484,26 @@ mod tests {
        // Schedule index upload. Check that it's queued

        let TestSetup {
-            runtime,
-            entered_runtime: _entered_runtime,
            harness,
            tenant: _tenant,
+            timeline: _timeline,
            tenant_ctx: _tenant_ctx,
            remote_fs_dir,
            client,
-        } = TestSetup::new("upload_scheduling").unwrap();
+        } = TestSetup::new("upload_scheduling").await.unwrap();

        let timeline_path = harness.timeline_path(&TIMELINE_ID);

        println!("workdir: {}", harness.conf.workdir.display());

        let remote_timeline_dir =
-            remote_fs_dir.join(timeline_path.strip_prefix(&harness.conf.workdir)?);
+            remote_fs_dir.join(timeline_path.strip_prefix(&harness.conf.workdir).unwrap());
        println!("remote_timeline_dir: {}", remote_timeline_dir.display());

        let metadata = dummy_metadata(Lsn(0x10));
-        client.init_upload_queue_for_empty_remote(&metadata)?;
+        client
+            .init_upload_queue_for_empty_remote(&metadata)
+            .unwrap();

        // Create a couple of dummy files,  schedule upload for them
        let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
@@ -1524,26 +1512,32 @@ mod tests {
        let content_1 = dummy_contents("foo");
        let content_2 = dummy_contents("bar");
        let content_3 = dummy_contents("baz");
-        std::fs::write(
-            timeline_path.join(layer_file_name_1.file_name()),
-            &content_1,
-        )?;
-        std::fs::write(
-            timeline_path.join(layer_file_name_2.file_name()),
-            &content_2,
-        )?;
-        std::fs::write(timeline_path.join(layer_file_name_3.file_name()), content_3)?;

-        client.schedule_layer_file_upload(
-            &layer_file_name_1,
-            &LayerFileMetadata::new(content_1.len() as u64),
-        )?;
-        client.schedule_layer_file_upload(
-            &layer_file_name_2,
-            &LayerFileMetadata::new(content_2.len() as u64),
-        )?;
+        for (filename, content) in [
+            (&layer_file_name_1, &content_1),
+            (&layer_file_name_2, &content_2),
+            (&layer_file_name_3, &content_3),
+        ] {
+            std::fs::write(timeline_path.join(filename.file_name()), content).unwrap();
+        }
+
+        client
+            .schedule_layer_file_upload(
+                &layer_file_name_1,
+                &LayerFileMetadata::new(content_1.len() as u64),
+            )
+            .unwrap();
+        client
+            .schedule_layer_file_upload(
+                &layer_file_name_2,
+                &LayerFileMetadata::new(content_2.len() as u64),
+            )
+            .unwrap();

        // Check that they are started immediately, not queued
+        //
+        // this works because we are running within block_on, so any futures are only queued
+        // up until our next await point.
        {
            let mut guard = client.upload_queue.lock().unwrap();
            let upload_queue = guard.initialized_mut().unwrap();
@@ -1557,7 +1551,9 @@ mod tests {

        // Schedule upload of index. Check that it is queued
        let metadata = dummy_metadata(Lsn(0x20));
-        client.schedule_index_upload_for_metadata_update(&metadata)?;
+        client
+            .schedule_index_upload_for_metadata_update(&metadata)
+            .unwrap();
        {
            let mut guard = client.upload_queue.lock().unwrap();
            let upload_queue = guard.initialized_mut().unwrap();
@@ -1566,7 +1562,7 @@ mod tests {
        }

        // Wait for the uploads to finish
-        runtime.block_on(client.wait_completion())?;
+        client.wait_completion().await.unwrap();
        {
            let mut guard = client.upload_queue.lock().unwrap();
            let upload_queue = guard.initialized_mut().unwrap();
@@ -1576,7 +1572,7 @@ mod tests {
        }

        // Download back the index.json, and check that the list of files is correct
-        let index_part = match runtime.block_on(client.download_index_file())? {
+        let index_part = match client.download_index_file().await.unwrap() {
            MaybeDeletedIndexPart::IndexPart(index_part) => index_part,
            MaybeDeletedIndexPart::Deleted(_) => panic!("unexpectedly got deleted index part"),
        };
@@ -1588,17 +1584,19 @@ mod tests {
                &layer_file_name_2.file_name(),
            ],
        );
-        let downloaded_metadata = index_part.parse_metadata()?;
+        let downloaded_metadata = index_part.parse_metadata().unwrap();
        assert_eq!(downloaded_metadata, metadata);

        // Schedule upload and then a deletion. Check that the deletion is queued
-        let content_baz = dummy_contents("baz");
-        std::fs::write(timeline_path.join("baz"), &content_baz)?;
-        client.schedule_layer_file_upload(
-            &layer_file_name_3,
-            &LayerFileMetadata::new(content_baz.len() as u64),
-        )?;
-        client.schedule_layer_file_deletion(&[layer_file_name_1.clone()])?;
+        client
+            .schedule_layer_file_upload(
+                &layer_file_name_3,
+                &LayerFileMetadata::new(content_3.len() as u64),
+            )
+            .unwrap();
+        client
+            .schedule_layer_file_deletion(&[layer_file_name_1.clone()])
+            .unwrap();
        {
            let mut guard = client.upload_queue.lock().unwrap();
            let upload_queue = guard.initialized_mut().unwrap();
@@ -1620,7 +1618,7 @@ mod tests {
        );

        // Finish them
-        runtime.block_on(client.wait_completion())?;
+        client.wait_completion().await.unwrap();

        assert_remote_files(
            &[
@@ -1630,23 +1628,24 @@ mod tests {
            ],
            &remote_timeline_dir,
        );
-
-        Ok(())
    }

-    #[test]
-    fn bytes_unfinished_gauge_for_layer_file_uploads() -> anyhow::Result<()> {
+    #[tokio::test]
+    async fn bytes_unfinished_gauge_for_layer_file_uploads() {
        // Setup

        let TestSetup {
-            runtime,
            harness,
+            tenant: _tenant,
+            timeline: _timeline,
            client,
            ..
-        } = TestSetup::new("metrics")?;
+        } = TestSetup::new("metrics").await.unwrap();

        let metadata = dummy_metadata(Lsn(0x10));
-        client.init_upload_queue_for_empty_remote(&metadata)?;
+        client
+            .init_upload_queue_for_empty_remote(&metadata)
+            .unwrap();

        let timeline_path = harness.timeline_path(&TIMELINE_ID);

@@ -1655,7 +1654,8 @@ mod tests {
        std::fs::write(
            timeline_path.join(layer_file_name_1.file_name()),
            &content_1,
-        )?;
+        )
+        .unwrap();

        #[derive(Debug, PartialEq)]
        struct BytesStartedFinished {
@@ -1681,14 +1681,16 @@ mod tests {

        let init = get_bytes_started_stopped();

-        client.schedule_layer_file_upload(
-            &layer_file_name_1,
-            &LayerFileMetadata::new(content_1.len() as u64),
-        )?;
+        client
+            .schedule_layer_file_upload(
+                &layer_file_name_1,
+                &LayerFileMetadata::new(content_1.len() as u64),
+            )
+            .unwrap();

        let pre = get_bytes_started_stopped();

-        runtime.block_on(client.wait_completion())?;
+        client.wait_completion().await.unwrap();

        let post = get_bytes_started_stopped();

@@ -1716,7 +1718,5 @@ mod tests {
                finished: Some(content_1.len())
            }
        );
-
-        Ok(())
    }
 }
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -549,30 +549,20 @@ impl DeltaLayer {
            &self.layer_name(),
        )
    }
-
-    /// Obtains all keys and value references stored in the layer
+    /// Loads all keys stored in the layer. Returns key, lsn, value size and value reference.
    ///
    /// The value can be obtained via the [`ValueRef::load`] function.
-    pub async fn load_val_refs(
+    pub(crate) async fn load_keys(
        &self,
        ctx: &RequestContext,
-    ) -> Result<Vec<(Key, Lsn, ValueRef<Arc<DeltaLayerInner>>)>> {
-        let inner = self
-            .load(LayerAccessKind::Iter, ctx)
-            .await
-            .context("load delta layer")?;
-        DeltaLayerInner::load_val_refs(inner)
-            .await
-            .context("Layer index is corrupted")
-    }
-
-    /// Loads all keys stored in the layer. Returns key, lsn and value size.
-    pub async fn load_keys(&self, ctx: &RequestContext) -> Result<Vec<(Key, Lsn, u64)>> {
+    ) -> Result<Vec<DeltaEntry<Ref<&'_ DeltaLayerInner>>>> {
        let inner = self
            .load(LayerAccessKind::KeyIter, ctx)
            .await
            .context("load delta layer keys")?;
-        DeltaLayerInner::load_keys(inner)
+
+        let inner = Ref(&**inner);
+        DeltaLayerInner::load_keys(&inner)
            .await
            .context("Layer index is corrupted")
    }
@@ -711,6 +701,17 @@ impl DeltaLayerWriterInner {
            .metadata()
            .context("get file metadata to determine size")?;

+        // S3 limits objects uploaded without multipart upload (which we don't want to use) to 5GB.
+        // Stay a little bit below that to account for differing GB units.
+        // https://docs.aws.amazon.com/AmazonS3/latest/userguide/upload-objects.html
+        const S3_UPLOAD_LIMIT: u64 = 4_500_000_000;
+        ensure!(
+            metadata.len() <= S3_UPLOAD_LIMIT,
+            "Created delta layer file at {} of size {} above limit {S3_UPLOAD_LIMIT}!",
+            file.path.display(),
+            metadata.len()
+        );
+
        // Note: Because we opened the file in write-only mode, we cannot
        // reuse the same VirtualFile for reading later. That's why we don't
        // set inner.file here. The first read will have to re-open it.
@@ -955,15 +956,17 @@ impl DeltaLayerInner {
        }
    }

-    pub(super) async fn load_val_refs<T: AsRef<DeltaLayerInner> + Clone>(
+    pub(super) async fn load_keys<T: AsRef<DeltaLayerInner> + Clone>(
        this: &T,
-    ) -> Result<Vec<(Key, Lsn, ValueRef<T>)>> {
+    ) -> Result<Vec<DeltaEntry<T>>> {
        let dl = this.as_ref();
        let file = &dl.file;
+
        let tree_reader =
            DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(dl.index_start_blk, dl.index_root_blk, file);

-        let mut all_offsets = Vec::<(Key, Lsn, ValueRef<T>)>::new();
+        let mut all_keys: Vec<DeltaEntry<T>> = Vec::new();
+
        tree_reader
            .visit(
                &[0u8; DELTA_KEY_SIZE],
@@ -974,54 +977,63 @@ impl DeltaLayerInner {
                        blob_ref: BlobRef(value),
                        reader: BlockCursor::new(Adapter(this.clone())),
                    };
-                    all_offsets.push((delta_key.key(), delta_key.lsn(), val_ref));
-                    true
-                },
-            )
-            .await?;
-
-        Ok(all_offsets)
-    }
-
-    pub(super) async fn load_keys(&self) -> Result<Vec<(Key, Lsn, u64)>> {
-        let file = &self.file;
-        let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
-            self.index_start_blk,
-            self.index_root_blk,
-            file,
-        );
-
-        let mut all_keys: Vec<(Key, Lsn, u64)> = Vec::new();
-        tree_reader
-            .visit(
-                &[0u8; DELTA_KEY_SIZE],
-                VisitDirection::Forwards,
-                |key, value| {
-                    let delta_key = DeltaKey::from_slice(key);
                    let pos = BlobRef(value).pos();
                    if let Some(last) = all_keys.last_mut() {
-                        if last.0 == delta_key.key() {
-                            return true;
-                        } else {
-                            // subtract offset of new key BLOB and first blob of this key
-                            // to get total size if values associated with this key
-                            let first_pos = last.2;
-                            last.2 = pos - first_pos;
-                        }
+                        // `last.size` still holds the previous entry's start offset: subtract it
+                        // from the current offset to get the size of the previous entry's value
+                        let first_pos = last.size;
+                        last.size = pos - first_pos;
                    }
-                    all_keys.push((delta_key.key(), delta_key.lsn(), pos));
+                    let entry = DeltaEntry {
+                        key: delta_key.key(),
+                        lsn: delta_key.lsn(),
+                        size: pos,
+                        val: val_ref,
+                    };
+                    all_keys.push(entry);
                    true
                },
            )
            .await?;
        if let Some(last) = all_keys.last_mut() {
-            // Last key occupies all space till end of layer
-            last.2 = std::fs::metadata(&file.file.path)?.len() - last.2;
+            // The last entry occupies all space up to the end of the value storage,
+            // which corresponds to the beginning of the index
+            last.size = dl.index_start_blk as u64 * PAGE_SZ as u64 - last.size;
        }
        Ok(all_keys)
    }
 }

+/// Cloneable borrow wrapper to make borrows behave like smart pointers.
+///
+/// Shared references are trivially copyable. This wrapper makes it explicit that cloning copies
+/// the borrow, avoiding confusion with an attempt to clone `DeltaLayerInner` itself.
+pub(crate) struct Ref<T>(T);
+
+impl<'a, T> AsRef<T> for Ref<&'a T> {
+    fn as_ref(&self) -> &T {
+        self.0
+    }
+}
+
+impl<'a, T> Clone for Ref<&'a T> {
+    fn clone(&self) -> Self {
+        *self
+    }
+}
+
+impl<'a, T> Copy for Ref<&'a T> {}
+
+/// A set of data associated with a delta layer key and its value
+pub struct DeltaEntry<T: AsRef<DeltaLayerInner>> {
+    pub key: Key,
+    pub lsn: Lsn,
+    /// Size of the stored value
+    pub size: u64,
+    /// Reference to the on-disk value
+    pub val: ValueRef<T>,
+}
+
 /// Reference to an on-disk value
 pub struct ValueRef<T: AsRef<DeltaLayerInner>> {
    blob_ref: BlobRef,
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -39,6 +39,7 @@ use crate::context::{
    AccessStatsBehavior, DownloadBehavior, RequestContext, RequestContextBuilder,
 };
 use crate::tenant::remote_timeline_client::{self, index::LayerFileMetadata};
+use crate::tenant::storage_layer::delta_layer::DeltaEntry;
 use crate::tenant::storage_layer::{
    DeltaFileName, DeltaLayerWriter, ImageFileName, ImageLayerWriter, InMemoryLayer,
    LayerAccessStats, LayerFileName, RemoteLayer,
@@ -3312,10 +3313,10 @@ struct CompactLevel0Phase1StatsBuilder {
    timeline_id: Option<TimelineId>,
    read_lock_acquisition_micros: DurationRecorder,
    read_lock_held_spawn_blocking_startup_micros: DurationRecorder,
+    read_lock_held_key_sort_micros: DurationRecorder,
    read_lock_held_prerequisites_micros: DurationRecorder,
    read_lock_held_compute_holes_micros: DurationRecorder,
    read_lock_drop_micros: DurationRecorder,
-    prepare_iterators_micros: DurationRecorder,
    write_layer_files_micros: DurationRecorder,
    level0_deltas_count: Option<usize>,
    new_deltas_count: Option<usize>,
@@ -3332,10 +3333,10 @@ struct CompactLevel0Phase1Stats {
    timeline_id: TimelineId,
    read_lock_acquisition_micros: RecordedDuration,
    read_lock_held_spawn_blocking_startup_micros: RecordedDuration,
+    read_lock_held_key_sort_micros: RecordedDuration,
    read_lock_held_prerequisites_micros: RecordedDuration,
    read_lock_held_compute_holes_micros: RecordedDuration,
    read_lock_drop_micros: RecordedDuration,
-    prepare_iterators_micros: RecordedDuration,
    write_layer_files_micros: RecordedDuration,
    level0_deltas_count: usize,
    new_deltas_count: usize,
@@ -3362,6 +3363,10 @@ impl TryFrom<CompactLevel0Phase1StatsBuilder> for CompactLevel0Phase1Stats {
                .read_lock_held_spawn_blocking_startup_micros
                .into_recorded()
                .ok_or_else(|| anyhow!("read_lock_held_spawn_blocking_startup_micros not set"))?,
+            read_lock_held_key_sort_micros: value
+                .read_lock_held_key_sort_micros
+                .into_recorded()
+                .ok_or_else(|| anyhow!("read_lock_held_key_sort_micros not set"))?,
            read_lock_held_prerequisites_micros: value
                .read_lock_held_prerequisites_micros
                .into_recorded()
@@ -3374,10 +3379,6 @@ impl TryFrom<CompactLevel0Phase1StatsBuilder> for CompactLevel0Phase1Stats {
                .read_lock_drop_micros
                .into_recorded()
                .ok_or_else(|| anyhow!("read_lock_drop_micros not set"))?,
-            prepare_iterators_micros: value
-                .prepare_iterators_micros
-                .into_recorded()
-                .ok_or_else(|| anyhow!("prepare_iterators_micros not set"))?,
            write_layer_files_micros: value
                .write_layer_files_micros
                .into_recorded()
@@ -3547,28 +3548,24 @@ impl Timeline {
        let mut heap: BinaryHeap<Hole> = BinaryHeap::with_capacity(max_holes + 1);
        let mut prev: Option<Key> = None;

-        let mut all_value_refs = Vec::new();
        let mut all_keys = Vec::new();

-        for l in deltas_to_compact.iter() {
+        let downcast_deltas: Vec<_> = deltas_to_compact
+            .iter()
+            .map(|l| l.clone().downcast_delta_layer().expect("delta layer"))
+            .collect();
+        for dl in downcast_deltas.iter() {
            // TODO: replace this with an await once we fully go async
-            let delta = l.clone().downcast_delta_layer().expect("delta layer");
-            Handle::current().block_on(async {
-                all_value_refs.extend(delta.load_val_refs(ctx).await?);
-                all_keys.extend(delta.load_keys(ctx).await?);
-                anyhow::Ok(())
-            })?;
+            all_keys.extend(Handle::current().block_on(DeltaLayer::load_keys(dl, ctx))?);
        }

        // The current stdlib sorting implementation is designed in a way where it is
        // particularly fast where the slice is made up of sorted sub-ranges.
-        all_value_refs.sort_by_key(|(key, lsn, _value_ref)| (*key, *lsn));
+        all_keys.sort_by_key(|DeltaEntry { key, lsn, .. }| (*key, *lsn));

-        // The current stdlib sorting implementation is designed in a way where it is
-        // particularly fast where the slice is made up of sorted sub-ranges.
-        all_keys.sort_by_key(|(key, lsn, _size)| (*key, *lsn));
+        stats.read_lock_held_key_sort_micros = stats.read_lock_held_prerequisites_micros.till_now();

-        for (next_key, _next_lsn, _size) in all_keys.iter() {
+        for DeltaEntry { key: next_key, .. } in all_keys.iter() {
            let next_key = *next_key;
            if let Some(prev_key) = prev {
                // just first fast filter
@@ -3592,8 +3589,7 @@ impl Timeline {
            }
            prev = Some(next_key.next());
        }
-        stats.read_lock_held_compute_holes_micros =
-            stats.read_lock_held_prerequisites_micros.till_now();
+        stats.read_lock_held_compute_holes_micros = stats.read_lock_held_key_sort_micros.till_now();
        drop_rlock(guard);
        stats.read_lock_drop_micros = stats.read_lock_held_compute_holes_micros.till_now();
        let mut holes = heap.into_vec();
@@ -3602,12 +3598,26 @@ impl Timeline {

        // This iterator walks through all key-value pairs from all the layers
        // we're compacting, in key, LSN order.
-        let all_values_iter = all_value_refs.into_iter();
+        let all_values_iter = all_keys.iter();

        // This iterator walks through all keys and is needed to calculate size used by each key
-        let mut all_keys_iter = all_keys.into_iter();
-
-        stats.prepare_iterators_micros = stats.read_lock_drop_micros.till_now();
+        let mut all_keys_iter = all_keys
+            .iter()
+            .map(|DeltaEntry { key, lsn, size, .. }| (*key, *lsn, *size))
+            .coalesce(|mut prev, cur| {
+                // Coalesce entries that belong to the same key.
+                // This ensures that compaction doesn't put them
+                // into different layer files.
+                // Still limit this by the target file size,
+                // so that we keep the size of the files in
+                // check.
+                if prev.0 == cur.0 && prev.2 < target_file_size {
+                    prev.2 += cur.2;
+                    Ok(prev)
+                } else {
+                    Err((prev, cur))
+                }
+            });

        // Merge the contents of all the input delta layers into a new set
        // of delta layers, based on the current partitioning.
@@ -3662,8 +3672,11 @@ impl Timeline {

        // TODO remove this block_on wrapper once we fully go async
        Handle::current().block_on(async {
-            for (key, lsn, value_ref) in all_values_iter {
-                let value = value_ref.load().await?;
+            for &DeltaEntry {
+                key, lsn, ref val, ..
+            } in all_values_iter
+            {
+                let value = val.load().await?;
                let same_key = prev_key.map_or(false, |prev_key| prev_key == key);
                // We need to check key boundaries once we reach next key or end of layer with the same key
                if !same_key || lsn == dup_end_lsn {
@@ -3764,6 +3777,19 @@ impl Timeline {

        // Sync layers
        if !new_layers.is_empty() {
+            // Print a warning if a created layer is larger than double the target size.
+            // Add two pages for potential overhead. This should in theory be already
+            // accounted for in the target calculation, but for very small targets,
+            // we still might easily hit the limit otherwise.
+            let warn_limit = target_file_size * 2 + page_cache::PAGE_SZ as u64 * 2;
+            for layer in new_layers.iter() {
+                if layer.desc.file_size > warn_limit {
+                    warn!(
+                        %layer,
+                        "created delta file of size {} larger than double of target of {target_file_size}", layer.desc.file_size
+                    );
+                }
+            }
            let mut layer_paths: Vec<PathBuf> = new_layers.iter().map(|l| l.path()).collect();

            // Fsync all the layer files and directory using multiple threads to
@@ -3776,12 +3802,10 @@ impl Timeline {
            layer_paths.pop().unwrap();
        }

-        stats.write_layer_files_micros = stats.prepare_iterators_micros.till_now();
+        stats.write_layer_files_micros = stats.read_lock_drop_micros.till_now();
        stats.new_deltas_count = Some(new_layers.len());
        stats.new_deltas_size = Some(new_layers.iter().map(|l| l.desc.file_size).sum());

-        drop(all_keys_iter); // So that deltas_to_compact is no longer borrowed
-
        match TryInto::<CompactLevel0Phase1Stats>::try_into(stats)
            .and_then(|stats| serde_json::to_string(&stats).context("serde_json::to_string"))
        {
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -450,6 +450,15 @@ impl<'a> WalIngest<'a> {
            let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK;
            if info == pg_constants::XLOG_HEAP2_MULTI_INSERT {
                let xlrec = XlHeapMultiInsert::decode(buf);
+
+                let offset_array_len = if decoded.xl_info & pg_constants::XLOG_HEAP_INIT_PAGE > 0 {
+                    // the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set
+                    0
+                } else {
+                    std::mem::size_of::<u16>() * xlrec.ntuples as usize
+                };
+                assert_eq!(offset_array_len, buf.remaining());
+
                if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 {
                    new_heap_blkno = Some(decoded.blocks[0].blkno);
                }
--- a/pageserver/src/walrecord.rs
+++ b/pageserver/src/walrecord.rs
@@ -270,67 +270,13 @@ pub struct XlHeapDelete {

 impl XlHeapDelete {
    pub fn decode(buf: &mut Bytes) -> XlHeapDelete {
-        let neon_format = buf.remaining() == pg_constants::SIZE_OF_HEAP_DELETE;
-        let xmax = buf.get_u32_le();
-        let offnum = buf.get_u16_le();
-        let _padding;
-        let t_cid;
-        if neon_format {
-            _padding = buf.get_u16_le();
-            t_cid = buf.get_u32_le();
-        } else {
-            _padding = 0;
-            t_cid = 0;
-        }
-        let infobits_set = buf.get_u8();
-        let flags = buf.get_u8();
-        assert!(((flags & pg_constants::XLH_DELETE_STORE_CID) == 0) ^ neon_format);
        XlHeapDelete {
-            xmax,
-            offnum,
-            _padding,
-            t_cid,
-            infobits_set,
-            flags,
-        }
-    }
-}
-
-#[repr(C)]
-#[derive(Debug)]
-pub struct XlHeapLock {
-    pub locking_xid: TransactionId,
-    pub offnum: OffsetNumber,
-    pub _padding: u16,
-    pub t_cid: u32,
-    pub infobits_set: u8,
-    pub flags: u8,
-}
-
-impl XlHeapLock {
-    pub fn decode(buf: &mut Bytes) -> XlHeapLock {
-        let neon_format = buf.remaining() == pg_constants::SIZE_OF_HEAP_LOCK;
-        let locking_xid = buf.get_u32_le();
-        let offnum = buf.get_u16_le();
-        let _padding;
-        let t_cid;
-        if neon_format {
-            _padding = buf.get_u16_le();
-            t_cid = buf.get_u32_le();
-        } else {
-            _padding = 0;
-            t_cid = 0;
-        }
-        let infobits_set = buf.get_u8();
-        let flags = buf.get_u8();
-        assert!(((flags & pg_constants::XLH_LOCK_STORE_CID) == 0) ^ neon_format);
-        XlHeapLock {
-            locking_xid,
-            offnum,
-            _padding,
-            t_cid,
-            infobits_set,
-            flags,
+            xmax: buf.get_u32_le(),
+            offnum: buf.get_u16_le(),
+            _padding: buf.get_u16_le(),
+            t_cid: buf.get_u32_le(),
+            infobits_set: buf.get_u8(),
+            flags: buf.get_u8(),
        }
    }
 }
@@ -349,21 +295,12 @@ pub struct XlHeapUpdate {

 impl XlHeapUpdate {
    pub fn decode(buf: &mut Bytes) -> XlHeapUpdate {
-        let old_xmax = buf.get_u32_le();
-        let old_offnum = buf.get_u16_le();
-        let old_infobits_set = buf.get_u8();
-        let flags = buf.get_u8();
-        let t_cid = if (flags & pg_constants::XLH_UPDATE_STORE_CID) != 0 {
-            buf.get_u32()
-        } else {
-            0
-        };
        XlHeapUpdate {
-            old_xmax,
-            old_offnum,
-            old_infobits_set,
-            flags,
-            t_cid,
+            old_xmax: buf.get_u32_le(),
+            old_offnum: buf.get_u16_le(),
+            old_infobits_set: buf.get_u8(),
+            flags: buf.get_u8(),
+            t_cid: buf.get_u32(),
            new_xmax: buf.get_u32_le(),
            new_offnum: buf.get_u16_le(),
        }
--- a/pgxn/neon/libpqwalproposer.c
+++ b/pgxn/neon/libpqwalproposer.c
@@ -74,7 +74,7 @@ walprop_connect_start(char *conninfo, char *password)
 	if (password)
 	{
 		keywords[n] = "password";
-		values[n] = neon_auth_token;
+		values[n] = password;
 		n++;
 	}
 	keywords[n] = "dbname";
--- a/pgxn/neon/walproposer.c
+++ b/pgxn/neon/walproposer.c
@@ -1393,8 +1393,22 @@ WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRec
 	char	   *err;
 	WalReceiverConn *wrconn;
 	WalRcvStreamOptions options;
+	char conninfo[MAXCONNINFO];

-	wrconn = walrcv_connect(safekeeper[donor].conninfo, false, "wal_proposer_recovery", &err);
+	if (!neon_auth_token)
+	{
+		memcpy(conninfo, safekeeper[donor].conninfo, MAXCONNINFO);
+	}
+	else
+	{
+		int written = 0;
+
+		written = snprintf((char *) conninfo, MAXCONNINFO, "password=%s %s", neon_auth_token, safekeeper[donor].conninfo);
+		if (written > MAXCONNINFO || written < 0)
+			elog(FATAL, "could not append password to the safekeeper connection string");
+	}
+
+	wrconn = walrcv_connect(conninfo, false, "wal_proposer_recovery", &err);
 	if (!wrconn)
 	{
 		ereport(WARNING,
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -13,6 +13,7 @@ bytes = { workspace = true, features = ["serde"] }
 chrono.workspace = true
 clap.workspace = true
 consumption_metrics.workspace = true
+dashmap.workspace = true
 futures.workspace = true
 git-version.workspace = true
 hashbrown.workspace = true
@@ -29,7 +30,7 @@ metrics.workspace = true
 once_cell.workspace = true
 opentelemetry.workspace = true
 parking_lot.workspace = true
-pbkdf2.workspace = true
+pbkdf2 = { workspace = true, features = ["simple", "std"] }
 pin-project-lite.workspace = true
 postgres_backend.workspace = true
 pq_proto.workspace = true
--- a/proxy/src/auth/backend/classic.rs
+++ b/proxy/src/auth/backend/classic.rs
@@ -36,7 +36,18 @@ pub(super) async fn authenticate(
        AuthInfo::Scram(secret) => {
            info!("auth endpoint chooses SCRAM");
            let scram = auth::Scram(&secret);
-            let client_key = match flow.begin(scram).await?.authenticate().await? {
+
+            let auth_flow = flow.begin(scram).await.map_err(|error| {
+                warn!(?error, "error sending scram acknowledgement");
+                error
+            })?;
+
+            let auth_outcome = auth_flow.authenticate().await.map_err(|error| {
+                warn!(?error, "error processing scram messages");
+                error
+            })?;
+
+            let client_key = match auth_outcome {
                sasl::Outcome::Success(key) => key,
                sasl::Outcome::Failure(reason) => {
                    info!("auth backend failed with an error: {reason}");
@@ -51,7 +62,6 @@ pub(super) async fn authenticate(
        }
    };

-    info!("compute node's state has likely changed; requesting a wake-up");
    let mut num_retries = 0;
    let mut node = loop {
        let wake_res = api.wake_compute(extra, creds).await;
--- a/proxy/src/http/conn_pool.rs
+++ b/proxy/src/http/conn_pool.rs
@@ -1,10 +1,21 @@
 use anyhow::Context;
 use async_trait::async_trait;
-use parking_lot::Mutex;
+use dashmap::DashMap;
+use futures::future::poll_fn;
+use parking_lot::RwLock;
+use pbkdf2::{
+    password_hash::{PasswordHashString, PasswordHasher, PasswordVerifier, SaltString},
+    Params, Pbkdf2,
+};
 use pq_proto::StartupMessageParams;
-use std::fmt;
+use std::sync::atomic::{self, AtomicUsize};
 use std::{collections::HashMap, sync::Arc};
+use std::{
+    fmt,
+    task::{ready, Poll},
+};
 use tokio::time;
+use tokio_postgres::AsyncMessage;

 use crate::{auth, console};
 use crate::{compute, config};
@@ -13,8 +24,8 @@ use super::sql_over_http::MAX_RESPONSE_SIZE;

 use crate::proxy::ConnectMechanism;

-use tracing::error;
-use tracing::info;
+use tracing::{error, warn};
+use tracing::{info, info_span, Instrument};

 pub const APP_NAME: &str = "sql_over_http";
 const MAX_CONNS_PER_ENDPOINT: usize = 20;
@@ -42,23 +53,44 @@ impl fmt::Display for ConnInfo {
 }

 struct ConnPoolEntry {
-    conn: tokio_postgres::Client,
+    conn: Client,
    _last_access: std::time::Instant,
 }

-// Per-endpoint connection pool, (dbname, username) -> Vec<ConnPoolEntry>
+// Per-endpoint connection pool, (dbname, username) -> DbUserConnPool
 // Number of open connections is limited by the `max_conns_per_endpoint`.
 pub struct EndpointConnPool {
-    pools: HashMap<(String, String), Vec<ConnPoolEntry>>,
+    pools: HashMap<(String, String), DbUserConnPool>,
    total_conns: usize,
 }

+/// These parameters are cheap and not hugely secure,
+/// but probably good enough for in-memory-only hashes.
+///
+/// Hashing still takes 3.5ms on my hardware.
+/// We don't want to ruin the latency improvements of using the pool by making password verification take too long.
+const PARAMS: Params = Params {
+    rounds: 10_000,
+    output_length: 32,
+};
+
+#[derive(Default)]
+pub struct DbUserConnPool {
+    conns: Vec<ConnPoolEntry>,
+    password_hash: Option<PasswordHashString>,
+}
+
 pub struct GlobalConnPool {
    // endpoint -> per-endpoint connection pool
    //
    // That should be a fairly conteded map, so return reference to the per-endpoint
    // pool as early as possible and release the lock.
-    global_pool: Mutex<HashMap<String, Arc<Mutex<EndpointConnPool>>>>,
+    global_pool: DashMap<String, Arc<RwLock<EndpointConnPool>>>,
+
+    /// [`DashMap::len`] iterates over all inner pools and acquires a read lock on each.
+    /// That seems like far too much effort, so we're using a relaxed increment counter instead.
+    /// It's only used for diagnostics.
+    global_pool_size: AtomicUsize,

    // Maximum number of connections per one endpoint.
    // Can mix different (dbname, username) connections.
@@ -72,7 +104,8 @@ pub struct GlobalConnPool {
 impl GlobalConnPool {
    pub fn new(config: &'static crate::config::ProxyConfig) -> Arc<Self> {
        Arc::new(Self {
-            global_pool: Mutex::new(HashMap::new()),
+            global_pool: DashMap::new(),
+            global_pool_size: AtomicUsize::new(0),
            max_conns_per_endpoint: MAX_CONNS_PER_ENDPOINT,
            proxy_config: config,
        })
@@ -82,70 +115,125 @@ impl GlobalConnPool {
        &self,
        conn_info: &ConnInfo,
        force_new: bool,
-    ) -> anyhow::Result<tokio_postgres::Client> {
-        let mut client: Option<tokio_postgres::Client> = None;
+        session_id: uuid::Uuid,
+    ) -> anyhow::Result<Client> {
+        let mut client: Option<Client> = None;

+        let mut hash_valid = false;
        if !force_new {
-            let pool = self.get_endpoint_pool(&conn_info.hostname).await;
+            let pool = self.get_or_create_endpoint_pool(&conn_info.hostname);
+            let mut hash = None;

            // find a pool entry by (dbname, username) if exists
-            let mut pool = pool.lock();
-            let pool_entries = pool.pools.get_mut(&conn_info.db_and_user());
-            if let Some(pool_entries) = pool_entries {
-                if let Some(entry) = pool_entries.pop() {
-                    client = Some(entry.conn);
-                    pool.total_conns -= 1;
+            {
+                let pool = pool.read();
+                if let Some(pool_entries) = pool.pools.get(&conn_info.db_and_user()) {
+                    if !pool_entries.conns.is_empty() {
+                        hash = pool_entries.password_hash.clone();
+                    }
+                }
+            }
+
+            // a connection exists in the pool, verify the password hash
+            if let Some(hash) = hash {
+                let pw = conn_info.password.clone();
+                let validate = tokio::task::spawn_blocking(move || {
+                    Pbkdf2.verify_password(pw.as_bytes(), &hash.password_hash())
+                })
+                .await?;
+
+                // if the hash is invalid, don't error
+                // we will continue with the regular connection flow
+                if validate.is_ok() {
+                    hash_valid = true;
+                    let mut pool = pool.write();
+                    if let Some(pool_entries) = pool.pools.get_mut(&conn_info.db_and_user()) {
+                        if let Some(entry) = pool_entries.conns.pop() {
+                            client = Some(entry.conn);
+                            pool.total_conns -= 1;
+                        }
+                    }
                }
            }
        }

        // ok return cached connection if found and establish a new one otherwise
-        if let Some(client) = client {
-            if client.is_closed() {
+        let new_client = if let Some(client) = client {
+            if client.inner.is_closed() {
                info!("pool: cached connection '{conn_info}' is closed, opening a new one");
-                connect_to_compute(self.proxy_config, conn_info).await
+                connect_to_compute(self.proxy_config, conn_info, session_id).await
            } else {
                info!("pool: reusing connection '{conn_info}'");
-                Ok(client)
+                client.session.send(session_id)?;
+                return Ok(client);
            }
        } else {
            info!("pool: opening a new connection '{conn_info}'");
-            connect_to_compute(self.proxy_config, conn_info).await
+            connect_to_compute(self.proxy_config, conn_info, session_id).await
+        };
+
+        match &new_client {
+            // clear the hash. it's no longer valid
+            // TODO: update tokio-postgres fork to allow access to this error kind directly
+            Err(err)
+                if hash_valid && err.to_string().contains("password authentication failed") =>
+            {
+                let pool = self.get_or_create_endpoint_pool(&conn_info.hostname);
+                let mut pool = pool.write();
+                if let Some(entry) = pool.pools.get_mut(&conn_info.db_and_user()) {
+                    entry.password_hash = None;
+                }
+            }
+            // new password is valid and we should insert/update it
+            Ok(_) if !force_new && !hash_valid => {
+                let pw = conn_info.password.clone();
+                let new_hash = tokio::task::spawn_blocking(move || {
+                    let salt = SaltString::generate(rand::rngs::OsRng);
+                    Pbkdf2
+                        .hash_password_customized(pw.as_bytes(), None, None, PARAMS, &salt)
+                        .map(|s| s.serialize())
+                })
+                .await??;
+
+                let pool = self.get_or_create_endpoint_pool(&conn_info.hostname);
+                let mut pool = pool.write();
+                pool.pools
+                    .entry(conn_info.db_and_user())
+                    .or_default()
+                    .password_hash = Some(new_hash);
+            }
+            _ => {}
        }
+
+        new_client
    }

-    pub async fn put(
-        &self,
-        conn_info: &ConnInfo,
-        client: tokio_postgres::Client,
-    ) -> anyhow::Result<()> {
-        let pool = self.get_endpoint_pool(&conn_info.hostname).await;
+    pub async fn put(&self, conn_info: &ConnInfo, client: Client) -> anyhow::Result<()> {
+        let pool = self.get_or_create_endpoint_pool(&conn_info.hostname);

        // return connection to the pool
-        let mut total_conns;
        let mut returned = false;
        let mut per_db_size = 0;
-        {
-            let mut pool = pool.lock();
-            total_conns = pool.total_conns;
+        let total_conns = {
+            let mut pool = pool.write();

-            let pool_entries: &mut Vec<ConnPoolEntry> = pool
-                .pools
-                .entry(conn_info.db_and_user())
-                .or_insert_with(|| Vec::with_capacity(1));
-            if total_conns < self.max_conns_per_endpoint {
-                pool_entries.push(ConnPoolEntry {
-                    conn: client,
-                    _last_access: std::time::Instant::now(),
-                });
+            if pool.total_conns < self.max_conns_per_endpoint {
+                // we create this db-user entry in get, so it should not be None
+                if let Some(pool_entries) = pool.pools.get_mut(&conn_info.db_and_user()) {
+                    pool_entries.conns.push(ConnPoolEntry {
+                        conn: client,
+                        _last_access: std::time::Instant::now(),
+                    });

-                total_conns += 1;
-                returned = true;
-                per_db_size = pool_entries.len();
+                    returned = true;
+                    per_db_size = pool_entries.conns.len();

-                pool.total_conns += 1;
+                    pool.total_conns += 1;
+                }
            }
-        }
+
+            pool.total_conns
+        };

        // do logging outside of the mutex
        if returned {
@@ -157,25 +245,35 @@ impl GlobalConnPool {
        Ok(())
    }

-    async fn get_endpoint_pool(&self, endpoint: &String) -> Arc<Mutex<EndpointConnPool>> {
+    fn get_or_create_endpoint_pool(&self, endpoint: &String) -> Arc<RwLock<EndpointConnPool>> {
+        // fast path
+        if let Some(pool) = self.global_pool.get(endpoint) {
+            return pool.clone();
+        }
+
+        // slow path
+        let new_pool = Arc::new(RwLock::new(EndpointConnPool {
+            pools: HashMap::new(),
+            total_conns: 0,
+        }));
+
        // find or create a pool for this endpoint
        let mut created = false;
-        let mut global_pool = self.global_pool.lock();
-        let pool = global_pool
+        let pool = self
+            .global_pool
            .entry(endpoint.clone())
            .or_insert_with(|| {
                created = true;
-                Arc::new(Mutex::new(EndpointConnPool {
-                    pools: HashMap::new(),
-                    total_conns: 0,
-                }))
+                new_pool
            })
            .clone();
-        let global_pool_size = global_pool.len();
-        drop(global_pool);

        // log new global pool size
        if created {
+            let global_pool_size = self
+                .global_pool_size
+                .fetch_add(1, atomic::Ordering::Relaxed)
+                + 1;
            info!(
                "pool: created new pool for '{endpoint}', global pool size now {global_pool_size}"
            );
@@ -187,11 +285,12 @@ impl GlobalConnPool {

+/// Connection mechanism that carries the per-request connection info and the id of the
+/// session on whose behalf the compute connection is opened.
 struct TokioMechanism<'a> {
+    /// Connection parameters of the request being served.
    conn_info: &'a ConnInfo,
+    /// Session id forwarded to `connect_to_compute_once`.
+    session_id: uuid::Uuid,
 }

 #[async_trait]
 impl ConnectMechanism for TokioMechanism<'_> {
-    type Connection = tokio_postgres::Client;
+    type Connection = Client;
    type ConnectError = tokio_postgres::Error;
    type Error = anyhow::Error;

@@ -200,7 +299,7 @@ impl ConnectMechanism for TokioMechanism<'_> {
        node_info: &console::CachedNodeInfo,
        timeout: time::Duration,
    ) -> Result<Self::Connection, Self::ConnectError> {
-        connect_to_compute_once(node_info, self.conn_info, timeout).await
+        connect_to_compute_once(node_info, self.conn_info, timeout, self.session_id).await
    }

    fn update_connect_config(&self, _config: &mut compute::ConnCfg) {}
@@ -213,7 +312,8 @@ impl ConnectMechanism for TokioMechanism<'_> {
 async fn connect_to_compute(
    config: &config::ProxyConfig,
    conn_info: &ConnInfo,
-) -> anyhow::Result<tokio_postgres::Client> {
+    session_id: uuid::Uuid,
+) -> anyhow::Result<Client> {
    let tls = config.tls_config.as_ref();
    let common_names = tls.and_then(|tls| tls.common_names.clone());

@@ -244,17 +344,27 @@ async fn connect_to_compute(
        .await?
        .context("missing cache entry from wake_compute")?;

-    crate::proxy::connect_to_compute(&TokioMechanism { conn_info }, node_info, &extra, &creds).await
+    crate::proxy::connect_to_compute(
+        &TokioMechanism {
+            conn_info,
+            session_id,
+        },
+        node_info,
+        &extra,
+        &creds,
+    )
+    .await
 }

 async fn connect_to_compute_once(
    node_info: &console::CachedNodeInfo,
    conn_info: &ConnInfo,
    timeout: time::Duration,
-) -> Result<tokio_postgres::Client, tokio_postgres::Error> {
+    mut session: uuid::Uuid,
+) -> Result<Client, tokio_postgres::Error> {
    let mut config = (*node_info.config).clone();

-    let (client, connection) = config
+    let (client, mut connection) = config
        .user(&conn_info.username)
        .password(&conn_info.password)
        .dbname(&conn_info.dbname)
@@ -263,11 +373,53 @@ async fn connect_to_compute_once(
        .connect(tokio_postgres::NoTls)
        .await?;

-    tokio::spawn(async move {
-        if let Err(e) = connection.await {
-            error!("connection error: {}", e);
-        }
+    let (tx, mut rx) = tokio::sync::watch::channel(session);
+
+    let conn_id = uuid::Uuid::new_v4();
+    let span = info_span!(parent: None, "connection", %conn_info, %conn_id);
+    span.in_scope(|| {
+        info!(%session, "new connection");
    });

-    Ok(client)
+    tokio::spawn(
+        poll_fn(move |cx| {
+            if matches!(rx.has_changed(), Ok(true)) {
+                session = *rx.borrow_and_update();
+                info!(%session, "changed session");
+            }
+
+            // Drain messages until `poll_message` itself reports `Pending`. Returning
+            // `Poll::Pending` right after a message was delivered would park this task
+            // without a waker registered for further socket input, so later notices and
+            // connection errors could go unnoticed.
+            loop {
+                match ready!(connection.poll_message(cx)) {
+                    Some(Ok(AsyncMessage::Notice(notice))) => {
+                        info!(%session, "notice: {}", notice);
+                    }
+                    Some(Ok(AsyncMessage::Notification(notif))) => {
+                        warn!(%session, pid = notif.process_id(), channel = notif.channel(), "notification received");
+                    }
+                    Some(Ok(_)) => {
+                        warn!(%session, "unknown message");
+                    }
+                    Some(Err(e)) => {
+                        error!(%session, "connection error: {}", e);
+                        return Poll::Ready(());
+                    }
+                    None => return Poll::Ready(()),
+                }
+            }
+        })
+        .instrument(span)
+    );
+
+    Ok(Client {
+        inner: client,
+        session: tx,
+    })
+}
+
+/// A compute connection together with the channel used to tell its background
+/// connection task which session the connection currently serves.
+pub struct Client {
+    /// The underlying `tokio_postgres` client used to run queries.
+    pub inner: tokio_postgres::Client,
+    /// Sending half of the watch channel whose receiver is polled by the spawned connection
+    /// task; a new value makes that task log under the new session id.
+    session: tokio::sync::watch::Sender<uuid::Uuid>,
 }
--- a/proxy/src/http/sql_over_http.rs
+++ b/proxy/src/http/sql_over_http.rs
@@ -16,6 +16,7 @@ use tokio_postgres::types::Type;
 use tokio_postgres::GenericClient;
 use tokio_postgres::IsolationLevel;
 use tokio_postgres::Row;
+use tracing::Instrument;
 use url::Url;

 use super::conn_pool::ConnInfo;
@@ -44,6 +45,7 @@ const MAX_REQUEST_SIZE: u64 = 1024 * 1024; // 1 MB

 static RAW_TEXT_OUTPUT: HeaderName = HeaderName::from_static("neon-raw-text-output");
 static ARRAY_MODE: HeaderName = HeaderName::from_static("neon-array-mode");
+static ALLOW_POOL: HeaderName = HeaderName::from_static("neon-pool-opt-in");
 static TXN_ISOLATION_LEVEL: HeaderName = HeaderName::from_static("neon-batch-isolation-level");
 static TXN_READ_ONLY: HeaderName = HeaderName::from_static("neon-batch-read-only");
 static TXN_DEFERRABLE: HeaderName = HeaderName::from_static("neon-batch-deferrable");
@@ -180,6 +182,7 @@ pub async fn handle(
    request: Request<Body>,
    sni_hostname: Option<String>,
    conn_pool: Arc<GlobalConnPool>,
+    session_id: uuid::Uuid,
 ) -> anyhow::Result<(Value, HashMap<HeaderName, HeaderValue>)> {
    //
    // Determine the destination and connection params
@@ -193,7 +196,7 @@ pub async fn handle(
    let array_mode = headers.get(&ARRAY_MODE) == Some(&HEADER_VALUE_TRUE);

    // Allow connection pooling only if explicitly requested
-    let allow_pool = false;
+    let allow_pool = headers.get(&ALLOW_POOL) == Some(&HEADER_VALUE_TRUE);

    // isolation level, read only and deferrable

@@ -229,18 +232,18 @@ pub async fn handle(
    let body = hyper::body::to_bytes(request.into_body()).await?;
    let payload: Payload = serde_json::from_slice(&body)?;

-    let mut client = conn_pool.get(&conn_info, !allow_pool).await?;
+    let mut client = conn_pool.get(&conn_info, !allow_pool, session_id).await?;

    //
    // Now execute the query and return the result
    //
    let result = match payload {
-        Payload::Single(query) => query_to_json(&client, query, raw_output, array_mode)
+        Payload::Single(query) => query_to_json(&client.inner, query, raw_output, array_mode)
            .await
            .map(|x| (x, HashMap::default())),
        Payload::Batch(batch_query) => {
            let mut results = Vec::new();
-            let mut builder = client.build_transaction();
+            let mut builder = client.inner.build_transaction();
            if let Some(isolation_level) = txn_isolation_level {
                builder = builder.isolation_level(isolation_level);
            }
@@ -284,9 +287,12 @@ pub async fn handle(

    if allow_pool {
        // return connection to the pool
-        tokio::task::spawn(async move {
-            let _ = conn_pool.put(&conn_info, client).await;
-        });
+        tokio::task::spawn(
+            async move {
+                let _ = conn_pool.put(&conn_info, client).await;
+            }
+            .in_current_span(),
+        );
    }

    result
--- a/proxy/src/http/websocket.rs
+++ b/proxy/src/http/websocket.rs
@@ -203,7 +203,7 @@ async fn ws_handler(
    // TODO: that deserves a refactor as now this function also handles http json client besides websockets.
    // Right now I don't want to blow up sql-over-http patch with file renames and do that as a follow up instead.
    } else if request.uri().path() == "/sql" && request.method() == Method::POST {
-        let result = sql_over_http::handle(request, sni_hostname, conn_pool)
+        let result = sql_over_http::handle(request, sni_hostname, conn_pool, session_id)
            .instrument(info_span!("sql-over-http"))
            .await;
        let status_code = match result {
@@ -307,7 +307,7 @@ pub async fn task_main(
                        ws_handler(req, config, conn_pool, cancel_map, session_id, sni_name)
                            .instrument(info_span!(
                                "ws-client",
-                                session = format_args!("{session_id}")
+                                session = %session_id
                            ))
                            .await
                    }
--- a/proxy/src/sasl/stream.rs
+++ b/proxy/src/sasl/stream.rs
@@ -4,6 +4,7 @@ use super::{messages::ServerMessage, Mechanism};
 use crate::stream::PqStream;
 use std::io;
 use tokio::io::{AsyncRead, AsyncWrite};
+use tracing::info;

 /// Abstracts away all peculiarities of the libpq's protocol.
 pub struct SaslStream<'a, S> {
@@ -68,7 +69,10 @@ impl<S: AsyncRead + AsyncWrite + Unpin> SaslStream<'_, S> {
    ) -> super::Result<Outcome<M::Output>> {
        loop {
            let input = self.recv().await?;
-            let step = mechanism.exchange(input)?;
+            let step = mechanism.exchange(input).map_err(|error| {
+                info!(?error, "error during SASL exchange");
+                error
+            })?;

            use super::Step;
            return Ok(match step {
--- a/scripts/plumber.py
+++ b/scripts/plumber.py
@@ -0,0 +1,581 @@
+import argparse
+import asyncio
+import enum
+import json
+import os
+import pprint
+import tempfile
+from asyncio import subprocess
+from datetime import date, datetime
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Set
+
+"""
+This is the automation tool that was mostly helpful during our big aws account migration,
+but may be helpful in other day to day tasks and concentrate knowledge about operations
+that can help during on-call.
+
+
+This script executes commands on remote using ssh multiplexing. See references:
+    https://blog.scottlowe.org/2015/12/11/using-ssh-multiplexing/
+    https://github.com/openssh-rust/openssh/blob/master/src/builder.rs
+    https://github.com/openssh-rust/openssh/blob/master/src/process_impl/session.rs
+    https://en.wikibooks.org/wiki/OpenSSH/Cookbook/Multiplexing
+    https://docs.rs/openssh/0.9.8/openssh/
+
+For use with teleport you'll need to setup nsh script mentioned here:
+https://github.com/neondatabase/cloud/wiki/Cloud%3A-access#3-access-the-nodes-with-ssm
+"""
+
+
+def show_line(output_label: Optional[str], line: str):
+    """
+    Print one line of subprocess output, prefixed with "(label)" when a label is
+    given and with an indent otherwise.
+
+    `line` is expected to carry its own trailing newline; an empty `line` is
+    terminated explicitly so that the prefix does not run into the next output.
+    """
+    prefix = "    " if output_label is None else f"({output_label})"
+    print(prefix, line, end="")
+    if line == "":
+        print()
+
+
+async def exec_checked(
+    program: str,
+    args: List[str],
+    err_msg: Optional[str] = None,
+    output_label: Optional[str] = None,
+    show_output: bool = True,
+    expected_exit_codes=frozenset((0,)),
+) -> List[str]:
+    """
+    Run `program` with `args`, collect its stdout and check the exit code.
+
+    Only stdout is captured; stderr is not redirected. When `show_output` is set,
+    the command line itself and every output line are echoed (lines are labelled
+    with `output_label` if given).
+
+    Returns the stdout lines, each still carrying its trailing newline. A process
+    that prints nothing yields an empty list.
+
+    Raises AssertionError (with `err_msg` if provided) when the exit code is not
+    in `expected_exit_codes`.
+    """
+    if show_output:
+        print("+", program, *args)
+    proc = await subprocess.create_subprocess_exec(
+        program,
+        *args,
+        stdout=asyncio.subprocess.PIPE,
+        limit=10 << 20,
+    )
+
+    assert proc.stdout is not None
+
+    out: List[str] = []
+    while True:
+        line = (await proc.stdout.readline()).decode()
+        # empty line means eof, actual empty line from the program is represented by "\n"
+        if not line:
+            break
+
+        if show_output:
+            show_line(output_label, line)
+        out.append(line)
+
+    exit_code = await proc.wait()
+    assert exit_code in expected_exit_codes, err_msg or f"{program} failed with {exit_code}"
+    return out
+
+
+class Connection:
+    """
+    Handle to a multiplexed ssh session.
+
+    Every operation shells out to `ssh -S <tempdir>/master ...`, i.e. it reuses the
+    control socket that lives inside `tempdir`.
+    """
+
+    def __init__(
+        self,
+        tempdir: tempfile.TemporaryDirectory,  # type: ignore
+        target: str,
+    ):
+        # the directory object is kept referenced here; the control socket path is derived from it
+        self.tempdir = tempdir
+        self.target = target
+
+    def get_args(self, extra_args: List[str]):
+        """Build the ssh argument list: control socket, batch mode, `extra_args`, destination."""
+        ctl_path = os.path.join(self.tempdir.name, "master")
+        # NOTE(review): "none" is passed in the destination position; presumably a placeholder
+        # because the control socket determines the actual host -- confirm.
+        return ["-S", ctl_path, "-o", "BatchMode=yes", *extra_args, "none"]
+
+    async def check(self):
+        """Run `ssh -O check` against the control socket; raises AssertionError on failure."""
+        args = self.get_args(["-O", "check"])
+        await exec_checked("ssh", args, err_msg="master check operation failed")
+
+    async def spawn(self, cmd: str):
+        """
+        Run `cmd` through `bash -c` on the remote side and return its stdout lines.
+
+        Output is echoed labelled with the connection target.
+        """
+        import shlex
+
+        # https://github.com/openssh-rust/openssh/blob/cd8f174fafc530d8e55c2aa63add14a24cb2b94c/src/process_impl/session.rs#L72
+        local_args = self.get_args(["-T", "-p", "9"])
+        # quote the command so that single quotes inside `cmd` cannot terminate the bash -c argument
+        local_args.extend(["--", f"bash -c {shlex.quote(cmd)}"])
+        return await exec_checked(
+            "ssh", local_args, err_msg="spawn failed", output_label=self.target
+        )
+
+    async def close(self):
+        """Run `ssh -O exit` against the control socket; raises AssertionError on failure."""
+        args = self.get_args(["-O", "exit"])
+        await exec_checked("ssh", args, err_msg="master exit operation failed")
+
+
+async def connect(target: str) -> Connection:
+    """
+    Start a backgrounded ssh master process for `target` and return a Connection
+    bound to its control socket.
+
+    target is directly passed to ssh command
+    """
+    # NOTE: it is mentioned that this setup is not secure
+    #     For better security it should be placed somewhere in ~/.ssh
+    #     or in other directory with proper permissions
+    #     openssh-rust does it the same way
+    #     https://github.com/openssh-rust/openssh/blob/master/src/builder.rs
+    connection_dir = tempfile.TemporaryDirectory(suffix=".ssh-multiplexed")
+    control_socket = os.path.join(connection_dir.name, "master")
+
+    # "-E logfile"
+    master_args = ["-S", control_socket]
+    # -M: Places the ssh client into “master” mode for connection sharing.
+    # -f: Requests ssh to go to background just before command execution.
+    # -N: Do not execute a remote command. This is useful for just forwarding ports.
+    master_args += ["-M", "-f", "-N"]
+    master_args += ["-o", "BatchMode=yes", target]
+
+    await exec_checked("ssh", master_args, err_msg="starting master process failed")
+    return Connection(tempdir=connection_dir, target=target)
+
+
+class Timer:
+    """
+    Context manager that prints `msg` followed by the elapsed time when the block exits.
+
+    The clock starts when the Timer is constructed, not when the block is entered.
+    """
+
+    def __init__(self, msg: str) -> None:
+        self.msg = msg
+        self.t0 = datetime.now()
+
+    def __enter__(self):
+        return None
+
+    def __exit__(self, *_):
+        elapsed = datetime.now() - self.t0
+        print(self.msg, elapsed)
+
+
+def parse_date(s: str) -> date:
+    """Parse a `YYYY-MM-DD` string into a `date`; raises ValueError on any other format."""
+    parsed = datetime.strptime(s, "%Y-%m-%d")
+    return parsed.date()
+
+
+def write_line(f, line: str):
+    """Write `line` to the file-like object `f`, followed by a newline."""
+    for chunk in (line, "\n"):
+        f.write(chunk)
+
+
+async def pageserver_tenant_sizes(
+    pageserver_target: str, tenants_of_interest: Optional[List[str]] = None
+) -> Dict[str, int]:
+    """
+    Return a mapping of tenant -> size in bytes as reported by `du -sb` on the
+    pageserver's tenants directory. When `tenants_of_interest` is given, only
+    those tenants are included.
+
+    With ondemand it should rather look at physical size api
+    For old projects since we dont have eviction yet,
+    we can look at local fs state.
+    """
+    wanted: Optional[Set[str]] = (
+        None if tenants_of_interest is None else set(tenants_of_interest)
+    )
+
+    ps_connection = await connect(pageserver_target)
+    try:
+        out = await ps_connection.spawn("du -sb /storage/pageserver/data/tenants/* | sort -rh")
+    finally:
+        # shut the backgrounded ssh master down, otherwise it outlives this call
+        await ps_connection.close()
+
+    tenants = {}
+
+    for line in out:
+        # a blank line carries no "<size> <path>" pair and would break the unpacking below
+        if not line.strip():
+            continue
+
+        if line.startswith("du: cannot read directory"):
+            continue
+
+        size, tenant_path = line.split()
+        tenant = Path(tenant_path).stem
+        if wanted is not None and tenant not in wanted:
+            continue
+
+        tenants[tenant] = int(size)
+    return tenants
+
+
+async def fetch_ps_size(args):
+    """
+    `fetch-ps-size` command: print per-tenant sizes on the pageserver `args.target`,
+    largest first, followed by the total. `args.input`, when set, names a file with
+    one tenant per line to restrict the report to.
+    """
+    tenants = None if args.input is None else Path(args.input).read_text().splitlines()
+
+    sizes = await pageserver_tenant_sizes(args.target, tenants_of_interest=tenants)
+
+    total = 0
+    largest_first = sorted(sizes.items(), key=lambda item: item[1], reverse=True)
+    for tenant, size in largest_first:
+        print(tenant, size)
+        total += size
+    print("total", total)
+
+
+# Deployment environment selector; the values match the choices of the `--env` CLI option.
+@enum.unique
+class Env(enum.Enum):
+    STAGING = "staging"
+    PRODUCTION = "production"
+
+
+class ConsoleAdminShortcuts:
+    """
+    Shortcuts for the console admin and management HTTP APIs, invoked through `curl`.
+
+    The bearer token is read from the CONSOLE_ADMIN_API_TOKEN environment variable.
+    With verbose=True the curl command lines (including the Authorization header)
+    and their output are echoed.
+    """
+
+    def __init__(self, env: Env, verbose: bool = False):
+        # NOTE(review): the staging admin URL uses the same host as production, only with an
+        # extra "/api/v1" suffix -- confirm this is intended.
+        if env is Env.STAGING:
+            self.admin_base_url = "https://console.neon.tech/api/v1"
+            self.management_base_url = "http://console-staging.local:3440/management/api/v2"
+        elif env is Env.PRODUCTION:
+            self.admin_base_url = "https://console.neon.tech"
+            self.management_base_url = "http://console-release.local:3441/management/api/v2"
+
+        self.api_token = os.getenv("CONSOLE_ADMIN_API_TOKEN")
+        assert self.api_token, '"CONSOLE_ADMIN_API_TOKEN" is missing in env'
+
+        self.verbose = verbose
+
+    def _auth_headers(self) -> List[str]:
+        """curl arguments carrying the bearer token and the JSON Accept header."""
+        return [
+            "-H",
+            f"Authorization: Bearer {self.api_token}",
+            "-H",
+            "Accept: application/json",
+        ]
+
+    async def _curl_json(self, curl_args: List[str]) -> Any:
+        """Run curl with `curl_args` and decode its single-line JSON output."""
+        output = await exec_checked(
+            "curl",
+            curl_args,
+            show_output=self.verbose,
+        )
+        assert len(output) == 1  # output should be one line of json
+        return json.loads(output.pop())
+
+    async def _admin_get(self, url: str) -> Any:
+        """Silent, failing-on-HTTP-error GET of `url` returning the decoded JSON."""
+        return await self._curl_json(["--silent", "--fail", url, *self._auth_headers()])
+
+    async def check_availability(self, project_id: str):
+        """POST an availability check for `project_id` and return the decoded response."""
+        url = f"{self.admin_base_url}/admin/projects/{project_id}/check_availability"
+        return await self._curl_json(
+            ["--silent", "--fail", "-XPOST", url, *self._auth_headers()]
+        )
+
+    async def get_operation(self, operation_id: str):
+        """Fetch the operation `operation_id` and return the decoded response."""
+        url = f"{self.admin_base_url}/admin/operations/{operation_id}"
+        return await self._admin_get(url)
+
+    async def get_pageservers(self):
+        """Fetch the list of pageservers and return the decoded response."""
+        url = f"{self.admin_base_url}/admin/pageservers"
+        return await self._admin_get(url)
+
+    async def set_maintenance(self, project_id: str, maintenance: bool) -> Dict[str, Any]:
+        """
+        Switch maintenance mode of `project_id` on or off via the management API.
+
+        Example response:
+        {
+            "project": {
+                "id": "tight-wood-864662",
+                "maintenance_set_at": "2023-01-31T13:36:45.90346Z"
+            },
+            "operations": [
+                {
+                "id": "216142e0-fbb7-4f41-a470-e63408d4d6b4"
+                }
+            ]
+        }
+        """
+        url = f"{self.management_base_url}/projects/{project_id}/maintenance"
+        data = json.dumps({"maintenance": maintenance})
+        # unlike the other calls, curl's progress output is kept in verbose mode
+        args = [] if self.verbose else ["--silent"]
+        # NOTE(review): the JSON body is sent without an explicit Content-Type header --
+        # confirm that the management API accepts that.
+        args.extend(["--fail", "-XPUT", url, *self._auth_headers(), "-d", data])
+        # the assembled arguments must actually reach curl; an empty argument list makes
+        # curl fail before any request is sent
+        ret = await self._curl_json(args)
+        assert isinstance(ret, Dict)
+        return ret
+
+    async def fetch_branches(self, project_id: str):
+        """Fetch the branches of `project_id` and return the decoded response."""
+        url = f"{self.admin_base_url}/admin/branches?project_id={project_id}"
+        return await self._admin_get(url)
+
+
+async def poll_pending_ops(console: ConsoleAdminShortcuts, pending_ops: Set[str]):
+    """
+    Query every operation in `pending_ops` once, report its state and remove the
+    finished ones from the set in place.
+
+    NOTE(review): operations reported as "failed", or with a non-zero failures_count,
+    are only logged and stay in `pending_ops` -- confirm callers that loop until the
+    set is empty are expected to keep polling them.
+    """
+    done = set()  # needed because sets cannot be changed during iteration
+    for op_id in pending_ops:
+        operation = (await console.get_operation(op_id))["operation"]
+        status = operation["status"]
+
+        if status == "failed":
+            print(f"ERROR: operation {op_id} failed")
+        elif operation["failures_count"] != 0:
+            print(f"WARN: operation {op_id} has failures != 0")
+        elif status == "finished":
+            print(f"operation {op_id} finished")
+            done.add(op_id)
+        else:
+            print(f"operation {op_id} is still pending: {status}")
+
+    pending_ops.difference_update(done)
+
+
+async def check_availability(args):
+    """
+    `check-availability` command: start availability checks for every project listed
+    in the file `args.input` (one project per line), keeping the number of pending
+    operations below `args.max_concurrent_checks`, then poll until all operations
+    have left the pending set.
+    """
+    console = ConsoleAdminShortcuts(env=Env(args.env))
+    max_concurrent_checks = args.max_concurrent_checks
+
+    # reverse to keep the order because we will be popping from the end
+    projects: List[str] = list(reversed(Path(args.input).read_text().splitlines()))
+    print("n_projects", len(projects))
+
+    pending_ops: Set[str] = set()
+    while projects:
+        # walk through pending ops
+        if pending_ops:
+            print("pending", len(pending_ops), pending_ops)
+            await poll_pending_ops(console, pending_ops)
+
+        # schedule new ops if limit allows
+        while len(pending_ops) < max_concurrent_checks and len(projects) > 0:
+            project = projects.pop()
+            print("starting:", project, len(projects))
+            # there can be many operations, one for each endpoint
+            data = await console.check_availability(project)
+            for operation in data["operations"]:
+                # NOTE(review): upper-case "ID" here, while `maintain` reads lower-case "id"
+                # from the management API response -- confirm the admin API casing.
+                pending_ops.add(operation["ID"])
+            # wait a bit before starting next one
+            await asyncio.sleep(2)
+
+        if projects:
+            # sleep a little bit to give operations time to finish
+            await asyncio.sleep(5)
+
+    print("all scheduled, poll pending", len(pending_ops), pending_ops, projects)
+    # everything is scheduled by now; keep polling until the pending set drains
+    while pending_ops:
+        await poll_pending_ops(console, pending_ops)
+        await asyncio.sleep(5)
+
+
+async def maintain(args):
+    """
+    `maintain` command: switch maintenance mode on (or off with `args.finish`) for
+    every project listed in the file `args.input`, one project per line.
+
+    When enabling maintenance, the operations returned by the console are polled
+    until none is pending. When finishing, no operations are expected at all.
+    """
+    console = ConsoleAdminShortcuts(env=Env(args.env))
+    finish_flag = args.finish
+
+    projects: List[str] = Path(args.input).read_text().splitlines()
+    print("n_projects", len(projects))
+
+    pending_ops: Set[str] = set()
+
+    for project in projects:
+        data = await console.set_maintenance(project, maintenance=not finish_flag)
+        operations = data["operations"]
+        print(project, len(operations))
+        pending_ops.update(operation["id"] for operation in operations)
+
+    if finish_flag:
+        assert len(pending_ops) == 0
+        return
+
+    print("all scheduled, poll pending", len(pending_ops), pending_ops)
+    while pending_ops:
+        await poll_pending_ops(console, pending_ops)
+        print("n pending ops:", len(pending_ops))
+        # only wait when another polling round is actually needed
+        if pending_ops:
+            await asyncio.sleep(5)
+
+
+# Location of safekeeper WAL objects in S3; bucket and prefix are combined by
+# fetch_sk_s3_size into s3://<bucket>/<prefix>/<tenant>.
+SOURCE_BUCKET = "zenith-storage-oregon"
+AWS_REGION = "us-west-2"
+SAFEKEEPER_SOURCE_PREFIX_IN_BUCKET = "prod-1/wal"
+
+
+async def fetch_sk_s3_size(args):
+    """
+    `fetch-sk-s3-size` command: for every tenant listed in the file `args.input`
+    print the object count and total size found under its WAL prefix in S3, and
+    the grand totals at the end.
+
+    The numbers are taken from the last two lines of `aws s3 ls --summarize`.
+    """
+    tenants: List[str] = Path(args.input).read_text().splitlines()
+
+    def trailing_int(summary_line: str) -> int:
+        # the number is the last whitespace-separated field of the summary line
+        return int(summary_line.rsplit(maxsplit=1).pop())
+
+    total_objects = 0
+    total_size = 0
+    for tenant in tenants:
+        wal_prefix = f"s3://{SOURCE_BUCKET}/{SAFEKEEPER_SOURCE_PREFIX_IN_BUCKET}/{tenant}"
+        aws_args = ["--profile", "neon_main", "s3", "ls", "--recursive", "--summarize", wal_prefix]
+        result = await exec_checked(
+            "aws",
+            aws_args,
+            show_output=False,
+            expected_exit_codes={0, 1},
+        )
+        objects = trailing_int(result[-2])
+        total_objects += objects
+
+        size = trailing_int(result[-1])
+        total_size += size
+
+        print(tenant, "objects", objects, "size", size)
+
+    print("total_objects", total_objects, "total_size", total_size)
+
+
+async def fetch_branches(args):
+    """`fetch-branches` command: pretty-print the branches of project `args.project_id`."""
+    console = ConsoleAdminShortcuts(env=Env(args.env))
+    project_id = args.project_id
+
+    branches = await console.fetch_branches(project_id=project_id)
+    pprint.pprint(branches)
+
+
+async def get_pageservers(args):
+    """`get-pageservers` command: pretty-print the pageservers known to the console."""
+    console = ConsoleAdminShortcuts(env=Env(args.env))
+
+    pageservers = await console.get_pageservers()
+    pprint.pprint(pageservers)
+
+
+async def main():
+    """
+    Command line entry point: parse the arguments and dispatch to the handler of the
+    selected subcommand. Help is printed when no handler is registered for it.
+    """
+    parser = argparse.ArgumentParser("migrator")
+    sub = parser.add_subparsers(title="commands", dest="subparser_name")
+
+    def add_env_argument(subparser):
+        # console environment selector shared by all console-backed commands
+        subparser.add_argument("--env", choices=["staging", "production"], default="staging")
+
+    # NOTE(review): "split" is declared here but has no entry in `handlers` below, so
+    # selecting it only prints the help -- confirm whether a handler is missing.
+    split_parser = sub.add_parser(
+        "split",
+    )
+    split_parser.add_argument(
+        "--input",
+        help="CSV file with results from snowflake query mentioned in README.",
+        required=True,
+    )
+    split_parser.add_argument(
+        "--out",
+        help="Directory to store groups of projects. Directory name is pageserver id.",
+        required=True,
+    )
+    split_parser.add_argument(
+        "--last-usage-cutoff",
+        dest="last_usage_cutoff",
+        help="Projects which do not have compute time starting from passed date (e g 2022-12-01) wil be considered not used recently",
+        required=True,
+    )
+    split_parser.add_argument(
+        "--select-pageserver-id",
+        help="Filter input for this pageserver id",
+        required=True,
+    )
+
+    fetch_ps_size_parser = sub.add_parser("fetch-ps-size")
+    fetch_ps_size_parser.add_argument(
+        "--target",
+        help="Target pageserver host as resolvable by ssh",
+        required=True,
+    )
+    # optional on purpose: without it all tenants are reported
+    fetch_ps_size_parser.add_argument(
+        "--input",
+        help="File containing list of tenants to include",
+    )
+
+    # The handlers of the commands below read the input file unconditionally, so the
+    # option is required; otherwise they fail later with a TypeError on a missing path.
+    check_availability_parser = sub.add_parser("check-availability")
+    check_availability_parser.add_argument(
+        "--input",
+        help="File containing list of projects to run availability checks for",
+        required=True,
+    )
+    add_env_argument(check_availability_parser)
+    check_availability_parser.add_argument(
+        "--max-concurrent-checks",
+        help="Max number of simultaneously active availability checks",
+        type=int,
+        default=50,
+    )
+
+    maintain_parser = sub.add_parser("maintain")
+    maintain_parser.add_argument(
+        "--input",
+        help="File containing list of projects",
+        required=True,
+    )
+    add_env_argument(maintain_parser)
+    maintain_parser.add_argument(
+        "--finish",
+        action="store_true",
+    )
+
+    fetch_sk_s3_size_parser = sub.add_parser("fetch-sk-s3-size")
+    fetch_sk_s3_size_parser.add_argument(
+        "--input",
+        help="File containing list of tenants",
+        required=True,
+    )
+
+    fetch_branches_parser = sub.add_parser("fetch-branches")
+    # without a project id the request would be issued for the literal project "None"
+    fetch_branches_parser.add_argument("--project-id", required=True)
+    add_env_argument(fetch_branches_parser)
+
+    get_pageservers_parser = sub.add_parser("get-pageservers")
+    add_env_argument(get_pageservers_parser)
+
+    args = parser.parse_args()
+
+    handlers = {
+        "fetch-ps-size": fetch_ps_size,
+        "check-availability": check_availability,
+        "maintain": maintain,
+        "fetch-sk-s3-size": fetch_sk_s3_size,
+        "fetch-branches": fetch_branches,
+        "get-pageservers": get_pageservers,
+    }
+
+    handler = handlers.get(args.subparser_name)
+    if handler:
+        await handler(args)
+    else:
+        parser.print_help()
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1769,6 +1769,15 @@ class VanillaPostgres(PgProtocol):
        with open(os.path.join(self.pgdatadir, "postgresql.conf"), "a") as conf_file:
            conf_file.write("\n".join(options))

+    def edit_hba(self, hba: List[str]):
+        """
+        Prepend hba lines into pg_hba.conf file.
+
+        Must be called while the server is not running.
+        """
+        assert not self.running
+        hba_path = os.path.join(self.pgdatadir, "pg_hba.conf")
+        with open(hba_path, "r+") as conf_file:
+            existing = conf_file.read()
+            # rewind and write the new lines first, then the previous contents after them
+            conf_file.seek(0)
+            conf_file.write("\n".join(hba) + "\n")
+            conf_file.write(existing)
+
    def start(self, log_path: Optional[str] = None):
        assert not self.running
        self.running = True
@@ -2166,15 +2175,18 @@ def static_proxy(
 ) -> Iterator[NeonProxy]:
    """Neon proxy that routes directly to vanilla postgres."""

-    # For simplicity, we use the same user for both `--auth-endpoint` and `safe_psql`
-    vanilla_pg.start()
-    vanilla_pg.safe_psql("create user proxy with login superuser password 'password'")
-
    port = vanilla_pg.default_options["port"]
    host = vanilla_pg.default_options["host"]
    dbname = vanilla_pg.default_options["dbname"]
    auth_endpoint = f"postgres://proxy:password@{host}:{port}/{dbname}"

+    # require password for 'http_auth' user
+    vanilla_pg.edit_hba([f"host {dbname} http_auth {host} password"])
+
+    # For simplicity, we use the same user for both `--auth-endpoint` and `safe_psql`
+    vanilla_pg.start()
+    vanilla_pg.safe_psql("create user proxy with login superuser password 'password'")
+
    proxy_port = port_distributor.get_port()
    mgmt_port = port_distributor.get_port()
    http_port = port_distributor.get_port()
--- a/test_runner/fixtures/pageserver/utils.py
+++ b/test_runner/fixtures/pageserver/utils.py
@@ -1,6 +1,8 @@
 import time
 from typing import TYPE_CHECKING, Any, Dict, Optional

+from mypy_boto3_s3.type_defs import ListObjectsV2OutputTypeDef
+
 from fixtures.log_helper import log
 from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
 from fixtures.remote_storage import RemoteStorageKind, S3Storage
@@ -230,6 +232,24 @@ if TYPE_CHECKING:


 def assert_prefix_empty(neon_env_builder: "NeonEnvBuilder", prefix: Optional[str] = None):
+    response = list_prefix(neon_env_builder, prefix)
+    objects = response.get("Contents")
+    assert (
+        response["KeyCount"] == 0
+    ), f"remote dir with prefix {prefix} is not empty after deletion: {objects}"
+
+
+def assert_prefix_not_empty(neon_env_builder: "NeonEnvBuilder", prefix: Optional[str] = None):
+    response = list_prefix(neon_env_builder, prefix)
+    assert response["KeyCount"] != 0, f"remote dir with prefix {prefix} is empty: {response}"
+
+
+def list_prefix(
+    neon_env_builder: "NeonEnvBuilder", prefix: Optional[str] = None
+) -> ListObjectsV2OutputTypeDef:
+    """
+    Note that this function takes into account prefix_in_bucket.
+    """
    # For local_fs we need to properly handle empty directories, which we currently dont, so for simplicity stick to s3 api.
    assert neon_env_builder.remote_storage_kind in (
        RemoteStorageKind.MOCK_S3,
@@ -239,15 +259,21 @@ def assert_prefix_empty(neon_env_builder: "NeonEnvBuilder", prefix: Optional[str
    assert isinstance(neon_env_builder.remote_storage, S3Storage)
    assert neon_env_builder.remote_storage_client is not None

+    prefix_in_bucket = neon_env_builder.remote_storage.prefix_in_bucket or ""
+    if not prefix:
+        prefix = prefix_in_bucket
+    else:
+        # real s3 tests have a unique per-test prefix
+        # mock_s3 tests use special pageserver prefix for pageserver stuff
+        prefix = "/".join((prefix_in_bucket, prefix))
+
    # Note that this doesnt use pagination, so list is not guaranteed to be exhaustive.
    response = neon_env_builder.remote_storage_client.list_objects_v2(
+        Delimiter="/",
        Bucket=neon_env_builder.remote_storage.bucket_name,
-        Prefix=prefix or neon_env_builder.remote_storage.prefix_in_bucket or "",
+        Prefix=prefix,
    )
-    objects = response.get("Contents")
-    assert (
-        response["KeyCount"] == 0
-    ), f"remote dir with prefix {prefix} is not empty after deletion: {objects}"
+    return response


 def wait_tenant_status_404(
@@ -289,4 +315,4 @@ MANY_SMALL_LAYERS_TENANT_CONFIG = {


 def poll_for_remote_storage_iterations(remote_storage_kind: RemoteStorageKind) -> int:
-    return 30 if remote_storage_kind is RemoteStorageKind.REAL_S3 else 10
+    return 40 if remote_storage_kind is RemoteStorageKind.REAL_S3 else 10
--- a/test_runner/fixtures/remote_storage.py
+++ b/test_runner/fixtures/remote_storage.py
@@ -7,6 +7,9 @@ from pathlib import Path
 from typing import Dict, List, Optional, Union

 from fixtures.log_helper import log
+from fixtures.types import TenantId, TimelineId
+
+TIMELINE_INDEX_PART_FILE_NAME = "index_part.json"


 class MockS3Server:
@@ -89,6 +92,19 @@ def available_s3_storages() -> List[RemoteStorageKind]:
 class LocalFsStorage:
    root: Path

+    def tenant_path(self, tenant_id: TenantId) -> Path:
+        return self.root / "tenants" / str(tenant_id)
+
+    def timeline_path(self, tenant_id: TenantId, timeline_id: TimelineId) -> Path:
+        return self.tenant_path(tenant_id) / "timelines" / str(timeline_id)
+
+    def index_path(self, tenant_id: TenantId, timeline_id: TimelineId) -> Path:
+        return self.timeline_path(tenant_id, timeline_id) / TIMELINE_INDEX_PART_FILE_NAME
+
+    def index_content(self, tenant_id: TenantId, timeline_id: TimelineId):
+        with self.index_path(tenant_id, timeline_id).open("r") as f:
+            return json.load(f)
+

@dataclass
 class S3Storage:
--- a/test_runner/regress/test_compatibility.py
+++ b/test_runner/regress/test_compatibility.py
@@ -394,13 +394,7 @@ def check_neon_works(
        test_output_dir / "dump-from-wal.filediff",
    )

-    # TODO: Run pg_amcheck unconditionally after the next release
-    try:
-        pg_bin.run(["psql", connstr, "--command", "CREATE EXTENSION IF NOT EXISTS amcheck"])
-    except subprocess.CalledProcessError:
-        log.info("Extension amcheck is not available, skipping pg_amcheck")
-    else:
-        pg_bin.run_capture(["pg_amcheck", connstr, "--install-missing", "--verbose"])
+    pg_bin.run_capture(["pg_amcheck", connstr, "--install-missing", "--verbose"])

    # Check that we can interract with the data
    pg_bin.run_capture(["pgbench", "--time=10", "--progress=2", connstr])
--- a/test_runner/regress/test_proxy.py
+++ b/test_runner/regress/test_proxy.py
@@ -340,3 +340,50 @@ def test_sql_over_http_batch(static_proxy: NeonProxy):
    assert headers["Neon-Batch-Deferrable"] == "true"

    assert result[0]["rows"] == [{"answer": 42}]
+
+
+def test_sql_over_http_pool(static_proxy: NeonProxy):
+    static_proxy.safe_psql("create user http_auth with password 'http' superuser")
+
+    def get_pid(status: int, pw: str) -> Any:
+        connstr = (
+            f"postgresql://http_auth:{pw}@{static_proxy.domain}:{static_proxy.proxy_port}/postgres"
+        )
+        response = requests.post(
+            f"https://{static_proxy.domain}:{static_proxy.external_http_port}/sql",
+            data=json.dumps(
+                {"query": "SELECT pid FROM pg_stat_activity WHERE state = 'active'", "params": []}
+            ),
+            headers={
+                "Content-Type": "application/sql",
+                "Neon-Connection-String": connstr,
+                "Neon-Pool-Opt-In": "true",
+            },
+            verify=str(static_proxy.test_output_dir / "proxy.crt"),
+        )
+        assert response.status_code == status
+        return response.json()
+
+    pid1 = get_pid(200, "http")["rows"][0]["pid"]
+
+    # query should be on the same connection
+    rows = get_pid(200, "http")["rows"]
+    assert rows == [{"pid": pid1}]
+
+    # incorrect password should not work
+    res = get_pid(400, "foobar")
+    assert "password authentication failed for user" in res["message"]
+
+    static_proxy.safe_psql("alter user http_auth with password 'http2'")
+
+    # after password change, should open a new connection to verify it
+    pid2 = get_pid(200, "http2")["rows"][0]["pid"]
+    assert pid1 != pid2
+
+    # query should be on an existing connection
+    pid = get_pid(200, "http2")["rows"][0]["pid"]
+    assert pid in [pid1, pid2]
+
+    # old password should not work
+    res = get_pid(400, "http")
+    assert "password authentication failed for user" in res["message"]
--- a/test_runner/regress/test_remote_storage.py
+++ b/test_runner/regress/test_remote_storage.py
@@ -24,6 +24,7 @@ from fixtures.pageserver.utils import (
    wait_until_tenant_state,
 )
 from fixtures.remote_storage import (
+    TIMELINE_INDEX_PART_FILE_NAME,
    LocalFsStorage,
    RemoteStorageKind,
    available_remote_storages,
@@ -269,7 +270,7 @@ def test_remote_storage_upload_queue_retries(
                f"""
               INSERT INTO foo (id, val)
               SELECT g, '{data}'
-               FROM generate_series(1, 10000) g
+               FROM generate_series(1, 20000) g
               ON CONFLICT (id) DO UPDATE
               SET val = EXCLUDED.val
               """,
@@ -370,7 +371,7 @@ def test_remote_storage_upload_queue_retries(
    log.info("restarting postgres to validate")
    endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
    with endpoint.cursor() as cur:
-        assert query_scalar(cur, "SELECT COUNT(*) FROM foo WHERE val = 'd'") == 10000
+        assert query_scalar(cur, "SELECT COUNT(*) FROM foo WHERE val = 'd'") == 20000


@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
@@ -418,7 +419,7 @@ def test_remote_timeline_client_calls_started_metric(
                f"""
               INSERT INTO foo (id, val)
               SELECT g, '{data}'
-               FROM generate_series(1, 10000) g
+               FROM generate_series(1, 20000) g
               ON CONFLICT (id) DO UPDATE
               SET val = EXCLUDED.val
               """,
@@ -509,7 +510,7 @@ def test_remote_timeline_client_calls_started_metric(
    log.info("restarting postgres to validate")
    endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
    with endpoint.cursor() as cur:
-        assert query_scalar(cur, "SELECT COUNT(*) FROM foo WHERE val = 'd'") == 10000
+        assert query_scalar(cur, "SELECT COUNT(*) FROM foo WHERE val = 'd'") == 20000

    # ensure that we updated the calls_started download metric
    fetch_calls_started()
@@ -615,9 +616,7 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue(

    # to please mypy
    assert isinstance(env.remote_storage, LocalFsStorage)
-    remote_timeline_path = (
-        env.remote_storage.root / "tenants" / str(tenant_id) / "timelines" / str(timeline_id)
-    )
+    remote_timeline_path = env.remote_storage.timeline_path(tenant_id, timeline_id)

    assert not list(remote_timeline_path.iterdir())

@@ -722,15 +721,14 @@ def test_empty_branch_remote_storage_upload_on_restart(
    # index upload is now hitting the failpoint, it should block the shutdown
    env.pageserver.stop(immediate=True)

-    timeline_path = (
-        Path("tenants") / str(env.initial_tenant) / "timelines" / str(new_branch_timeline_id)
-    )
-
-    local_metadata = env.repo_dir / timeline_path / "metadata"
+    local_metadata = env.timeline_dir(env.initial_tenant, new_branch_timeline_id) / "metadata"
    assert local_metadata.is_file()

    assert isinstance(env.remote_storage, LocalFsStorage)
-    new_branch_on_remote_storage = env.remote_storage.root / timeline_path
+
+    new_branch_on_remote_storage = env.remote_storage.timeline_path(
+        env.initial_tenant, new_branch_timeline_id
+    )
    assert (
        not new_branch_on_remote_storage.exists()
    ), "failpoint should had prohibited index_part.json upload"
@@ -779,7 +777,7 @@ def test_empty_branch_remote_storage_upload_on_restart(
        assert_nothing_to_upload(client, env.initial_tenant, new_branch_timeline_id)

        assert (
-            new_branch_on_remote_storage / "index_part.json"
+            new_branch_on_remote_storage / TIMELINE_INDEX_PART_FILE_NAME
        ).is_file(), "uploads scheduled during initial load should had been awaited for"
    finally:
        create_thread.join()
--- a/test_runner/regress/test_tenant_delete.py
+++ b/test_runner/regress/test_tenant_delete.py
@@ -1,5 +1,7 @@
 import enum
 import os
+import shutil
+from pathlib import Path

 import pytest
 from fixtures.log_helper import log
@@ -13,13 +15,18 @@ from fixtures.pageserver.http import PageserverApiException
 from fixtures.pageserver.utils import (
    MANY_SMALL_LAYERS_TENANT_CONFIG,
    assert_prefix_empty,
+    assert_prefix_not_empty,
    poll_for_remote_storage_iterations,
    tenant_delete_wait_completed,
    wait_tenant_status_404,
    wait_until_tenant_active,
    wait_until_tenant_state,
 )
-from fixtures.remote_storage import RemoteStorageKind, available_remote_storages
+from fixtures.remote_storage import (
+    RemoteStorageKind,
+    available_remote_storages,
+    available_s3_storages,
+)
 from fixtures.types import TenantId
 from fixtures.utils import run_pg_bench_small

@@ -64,6 +71,17 @@ def test_tenant_delete_smoke(
            run_pg_bench_small(pg_bin, endpoint.connstr())
            wait_for_last_flush_lsn(env, endpoint, tenant=tenant_id, timeline=timeline_id)

+            if remote_storage_kind in available_s3_storages():
+                assert_prefix_not_empty(
+                    neon_env_builder,
+                    prefix="/".join(
+                        (
+                            "tenants",
+                            str(tenant_id),
+                        )
+                    ),
+                )
+
        parent = timeline

    iterations = poll_for_remote_storage_iterations(remote_storage_kind)
@@ -73,7 +91,7 @@ def test_tenant_delete_smoke(
    tenant_path = env.tenant_dir(tenant_id=tenant_id)
    assert not tenant_path.exists()

-    if remote_storage_kind in [RemoteStorageKind.MOCK_S3, RemoteStorageKind.REAL_S3]:
+    if remote_storage_kind in available_s3_storages():
        assert_prefix_empty(
            neon_env_builder,
            prefix="/".join(
@@ -189,6 +207,17 @@ def test_delete_tenant_exercise_crash_safety_failpoints(
        else:
            last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id)

+            if remote_storage_kind in available_s3_storages():
+                assert_prefix_not_empty(
+                    neon_env_builder,
+                    prefix="/".join(
+                        (
+                            "tenants",
+                            str(tenant_id),
+                        )
+                    ),
+                )
+
    ps_http.configure_failpoints((failpoint, "return"))

    iterations = poll_for_remote_storage_iterations(remote_storage_kind)
@@ -241,8 +270,12 @@ def test_delete_tenant_exercise_crash_safety_failpoints(

        tenant_delete_wait_completed(ps_http, tenant_id, iterations=iterations)

-    # Check remote is impty
-    if remote_storage_kind is RemoteStorageKind.MOCK_S3:
+    tenant_dir = env.tenant_dir(tenant_id)
+    # Check local is empty
+    assert not tenant_dir.exists()
+
+    # Check remote is empty
+    if remote_storage_kind in available_s3_storages():
        assert_prefix_empty(
            neon_env_builder,
            prefix="/".join(
@@ -253,10 +286,118 @@ def test_delete_tenant_exercise_crash_safety_failpoints(
            ),
        )

-    tenant_dir = env.tenant_dir(tenant_id)
-    # Check local is empty
-    assert not tenant_dir.exists()
+
+# TODO resume deletion (https://github.com/neondatabase/neon/issues/5006)
+@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
+def test_deleted_tenant_ignored_on_attach(
+    neon_env_builder: NeonEnvBuilder,
+    remote_storage_kind: RemoteStorageKind,
+    pg_bin: PgBin,
+):
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=remote_storage_kind,
+        test_name="test_deleted_tenant_ignored_on_attach",
+    )
+
+    env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG)
+
+    tenant_id = env.initial_tenant
+
+    ps_http = env.pageserver.http_client()
+    # create two timelines
+    for timeline in ["first", "second"]:
+        timeline_id = env.neon_cli.create_timeline(timeline, tenant_id=tenant_id)
+        with env.endpoints.create_start(timeline, tenant_id=tenant_id) as endpoint:
+            run_pg_bench_small(pg_bin, endpoint.connstr())
+            wait_for_last_flush_lsn(env, endpoint, tenant=tenant_id, timeline=timeline_id)
+
+    # sanity check, data should be there
+    if remote_storage_kind in available_s3_storages():
+        assert_prefix_not_empty(
+            neon_env_builder,
+            prefix="/".join(
+                (
+                    "tenants",
+                    str(tenant_id),
+                )
+            ),
+        )
+
+    # failpoint before we remove index_part from s3
+    failpoint = "timeline-delete-before-index-delete"
+    ps_http.configure_failpoints((failpoint, "return"))
+
+    env.pageserver.allowed_errors.extend(
+        (
+            # allow errors caused by failpoints
+            f".*failpoint: {failpoint}",
+            # It appears when we stopped flush loop during deletion (attempt) and then pageserver is stopped
+            ".*freeze_and_flush_on_shutdown.*failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
+            # error from http response is also logged
+            ".*InternalServerError\\(Tenant is marked as deleted on remote storage.*",
+            '.*shutdown_pageserver{exit_code=0}: stopping left-over name="remote upload".*',
+        )
+    )
+
+    iterations = poll_for_remote_storage_iterations(remote_storage_kind)
+
+    ps_http.tenant_delete(tenant_id)
+
+    tenant_info = wait_until_tenant_state(
+        pageserver_http=ps_http,
+        tenant_id=tenant_id,
+        expected_state="Broken",
+        iterations=iterations,
+    )
+
+    if remote_storage_kind in available_s3_storages():
+        assert_prefix_not_empty(
+            neon_env_builder,
+            prefix="/".join(
+                (
+                    "tenants",
+                    str(tenant_id),
+                )
+            ),
+        )
+
+    reason = tenant_info["state"]["data"]["reason"]
+    # failpoint may not be the only error in the stack
+    assert reason.endswith(f"failpoint: {failpoint}"), reason
+
+    # now we stop pageserver and remove local tenant state
+    env.endpoints.stop_all()
+    env.pageserver.stop()
+
+    dir_to_clear = Path(env.repo_dir) / "tenants"
+    shutil.rmtree(dir_to_clear)
+    os.mkdir(dir_to_clear)
+
+    env.pageserver.start()
+
+    # now we call attach
+    with pytest.raises(
+        PageserverApiException, match="Tenant is marked as deleted on remote storage"
+    ):
+        ps_http.tenant_attach(tenant_id=tenant_id)
+
+    # delete should be resumed (not yet)
+    # wait_tenant_status_404(ps_http, tenant_id, iterations)
+
+    # we shouldn't have created tenant dir on disk
+    tenant_path = env.tenant_dir(tenant_id=tenant_id)
+    assert not tenant_path.exists()
+
+    if remote_storage_kind in available_s3_storages():
+        assert_prefix_not_empty(
+            neon_env_builder,
+            prefix="/".join(
+                (
+                    "tenants",
+                    str(tenant_id),
+                )
+            ),
+        )


 # TODO test concurrent deletions with "hang" failpoint
-# TODO test tenant delete continues after attach
--- a/test_runner/regress/test_tenants_with_remote_storage.py
+++ b/test_runner/regress/test_tenants_with_remote_storage.py
@@ -7,7 +7,6 @@
 #

 import asyncio
-import json
 import os
 from pathlib import Path
 from typing import List, Tuple
@@ -225,10 +224,11 @@ def test_tenants_attached_after_download(
 # FIXME: test index_part.json getting downgraded from imaginary new version


-@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
 def test_tenant_redownloads_truncated_file_on_startup(
-    neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
+    neon_env_builder: NeonEnvBuilder,
 ):
+    remote_storage_kind = RemoteStorageKind.LOCAL_FS
+
    # since we now store the layer file length metadata, we notice on startup that a layer file is of wrong size, and proceed to redownload it.
    neon_env_builder.enable_remote_storage(
        remote_storage_kind=remote_storage_kind,
@@ -237,6 +237,8 @@ def test_tenant_redownloads_truncated_file_on_startup(

    env = neon_env_builder.init_start()

+    assert isinstance(env.remote_storage, LocalFsStorage)
+
    env.pageserver.allowed_errors.append(
        ".*removing local file .* because it has unexpected length.*"
    )
@@ -279,7 +281,7 @@ def test_tenant_redownloads_truncated_file_on_startup(
    (path, expected_size) = local_layer_truncated

    # ensure the same size is found from the index_part.json
-    index_part = local_fs_index_part(env, tenant_id, timeline_id)
+    index_part = env.remote_storage.index_content(tenant_id, timeline_id)
    assert index_part["layer_metadata"][path.name]["file_size"] == expected_size

    ## Start the pageserver. It will notice that the file size doesn't match, and
@@ -309,7 +311,7 @@ def test_tenant_redownloads_truncated_file_on_startup(
    assert os.stat(path).st_size == expected_size, "truncated layer should had been re-downloaded"

    # the remote side of local_layer_truncated
-    remote_layer_path = local_fs_index_part_path(env, tenant_id, timeline_id).parent / path.name
+    remote_layer_path = env.remote_storage.timeline_path(tenant_id, timeline_id) / path.name

    # if the upload ever was ongoing, this check would be racy, but at least one
    # extra http request has been made in between so assume it's enough delay
@@ -334,27 +336,3 @@ def test_tenant_redownloads_truncated_file_on_startup(
    assert (
        os.stat(remote_layer_path).st_size == expected_size
    ), "truncated file should not had been uploaded after next checkpoint"
-
-
-def local_fs_index_part(env, tenant_id, timeline_id):
-    """
-    Return json.load parsed index_part.json of tenant and timeline from LOCAL_FS
-    """
-    timeline_path = local_fs_index_part_path(env, tenant_id, timeline_id)
-    with open(timeline_path, "r") as timeline_file:
-        return json.load(timeline_file)
-
-
-def local_fs_index_part_path(env, tenant_id, timeline_id):
-    """
-    Return path to the LOCAL_FS index_part.json of the tenant and timeline.
-    """
-    assert isinstance(env.remote_storage, LocalFsStorage)
-    return (
-        env.remote_storage.root
-        / "tenants"
-        / str(tenant_id)
-        / "timelines"
-        / str(timeline_id)
-        / "index_part.json"
-    )
--- a/test_runner/regress/test_timeline_delete.py
+++ b/test_runner/regress/test_timeline_delete.py
@@ -18,6 +18,7 @@ from fixtures.neon_fixtures import (
 from fixtures.pageserver.http import PageserverApiException
 from fixtures.pageserver.utils import (
    assert_prefix_empty,
+    assert_prefix_not_empty,
    poll_for_remote_storage_iterations,
    timeline_delete_wait_completed,
    wait_for_last_record_lsn,
@@ -27,8 +28,10 @@ from fixtures.pageserver.utils import (
    wait_until_timeline_state,
 )
 from fixtures.remote_storage import (
+    LocalFsStorage,
    RemoteStorageKind,
    available_remote_storages,
+    available_s3_storages,
 )
 from fixtures.types import Lsn, TenantId, TimelineId
 from fixtures.utils import query_scalar, wait_until
@@ -211,6 +214,19 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
        else:
            last_flush_lsn_upload(env, endpoint, env.initial_tenant, timeline_id)

+            if remote_storage_kind in available_s3_storages():
+                assert_prefix_not_empty(
+                    neon_env_builder,
+                    prefix="/".join(
+                        (
+                            "tenants",
+                            str(env.initial_tenant),
+                            "timelines",
+                            str(timeline_id),
+                        )
+                    ),
+                )
+
    env.pageserver.allowed_errors.append(f".*{timeline_id}.*failpoint: {failpoint}")
    # It appears when we stopped flush loop during deletion and then pageserver is stopped
    env.pageserver.allowed_errors.append(
@@ -297,7 +313,7 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
            ps_http, env.initial_tenant, timeline_id, iterations=iterations
        )

-    # Check remote is impty
+    # Check remote is empty
    if remote_storage_kind is RemoteStorageKind.MOCK_S3:
        assert_prefix_empty(
            neon_env_builder,
@@ -738,6 +754,19 @@ def test_timeline_delete_works_for_remote_smoke(

        timeline_ids.append(timeline_id)

+    for timeline_id in timeline_ids:
+        assert_prefix_not_empty(
+            neon_env_builder,
+            prefix="/".join(
+                (
+                    "tenants",
+                    str(env.initial_tenant),
+                    "timelines",
+                    str(timeline_id),
+                )
+            ),
+        )
+
    for timeline_id in reversed(timeline_ids):
        # note that we need to finish previous deletion before scheduling next one
        # otherwise we can get an "HasChildren" error if deletion is not fast enough (real_s3)
@@ -757,8 +786,65 @@ def test_timeline_delete_works_for_remote_smoke(

    # for some reason the check above doesnt immediately take effect for the below.
    # Assume it is mock server inconsistency and check twice.
-    wait_until(
-        2,
-        0.5,
-        lambda: assert_prefix_empty(neon_env_builder),
+    wait_until(2, 0.5, lambda: assert_prefix_empty(neon_env_builder))
+
+
+def test_delete_orphaned_objects(
+    neon_env_builder: NeonEnvBuilder,
+    pg_bin: PgBin,
+):
+    remote_storage_kind = RemoteStorageKind.LOCAL_FS
+    neon_env_builder.enable_remote_storage(remote_storage_kind, "test_delete_orphaned_objects")
+
+    env = neon_env_builder.init_start(
+        initial_tenant_conf={
+            "gc_period": "0s",
+            "compaction_period": "0s",
+            "checkpoint_distance": f"{1024 ** 2}",
+            "image_creation_threshold": "100",
+        }
    )
+
+    assert isinstance(env.remote_storage, LocalFsStorage)
+
+    ps_http = env.pageserver.http_client()
+
+    timeline_id = env.neon_cli.create_timeline("delete")
+    with env.endpoints.create_start("delete") as endpoint:
+        # generate enough layers
+        pg_bin.run(["pgbench", "-i", "-I dtGvp", "-s1", endpoint.connstr()])
+        last_flush_lsn_upload(env, endpoint, env.initial_tenant, timeline_id)
+
+    # write orphaned file that is missing from the index
+    remote_timeline_path = env.remote_storage.timeline_path(env.initial_tenant, timeline_id)
+    orphans = [remote_timeline_path / f"orphan_{i}" for i in range(3)]
+    for orphan in orphans:
+        orphan.write_text("I shouldnt be there")
+
+    # trigger failpoint after orphaned file deletion to check that index_part is not deleted as well.
+    failpoint = "timeline-delete-before-index-delete"
+    ps_http.configure_failpoints((failpoint, "return"))
+
+    env.pageserver.allowed_errors.append(f".*failpoint: {failpoint}")
+
+    iterations = poll_for_remote_storage_iterations(remote_storage_kind)
+
+    ps_http.timeline_delete(env.initial_tenant, timeline_id)
+    timeline_info = wait_until_timeline_state(
+        pageserver_http=ps_http,
+        tenant_id=env.initial_tenant,
+        timeline_id=timeline_id,
+        expected_state="Broken",
+        iterations=iterations,
+    )
+
+    reason = timeline_info["state"]["Broken"]["reason"]
+    assert reason.endswith(f"failpoint: {failpoint}"), reason
+
+    for orphan in orphans:
+        assert not orphan.exists()
+        assert env.pageserver.log_contains(
+            f"deleting a file not referenced from index_part.json name={orphan.stem}"
+        )
+
+    assert env.remote_storage.index_path(env.initial_tenant, timeline_id).exists()
--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -543,8 +543,13 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Re
            last_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))

            for sk in env.safekeepers:
-                # require WAL to be trimmed, so no more than one segment is left on disk
-                target_size_mb = 16 * 1.5
+                # require WAL to be trimmed, so no more than one segment is left
+                # on disk
+                # TODO: WAL removal uses persistent values and control
+                # file is fsynced roughly once in a segment, so there is a small
+                # chance that two segments are left on disk, not one. We can
+                # force-persist the control file and use 16 instead of 32 here.
+                target_size_mb = 32 * 1.5
                wait(
                    partial(is_wal_trimmed, sk, tenant_id, timeline_id, target_size_mb),
                    f"sk_id={sk.id} to trim WAL to {target_size_mb:.2f}MB",
@@ -976,6 +981,35 @@ def test_sk_auth(neon_env_builder: NeonEnvBuilder):
    connector.safe_psql("IDENTIFY_SYSTEM", port=sk.port.pg_tenant_only, password=tenant_token)


+# Try restarting endpoint with enabled auth.
+def test_restart_endpoint(neon_env_builder: NeonEnvBuilder):
+    neon_env_builder.auth_enabled = True
+    neon_env_builder.num_safekeepers = 3
+    env = neon_env_builder.init_start()
+
+    env.neon_cli.create_branch("test_sk_auth_restart_endpoint")
+    endpoint = env.endpoints.create_start("test_sk_auth_restart_endpoint")
+
+    with closing(endpoint.connect()) as conn:
+        with conn.cursor() as cur:
+            cur.execute("create table t(i int)")
+
+    # Restarting endpoints and random safekeepers, to trigger recovery.
+    for _i in range(3):
+        random_sk = random.choice(env.safekeepers)
+        random_sk.stop()
+
+        with closing(endpoint.connect()) as conn:
+            with conn.cursor() as cur:
+                start = random.randint(1, 100000)
+                end = start + random.randint(1, 10000)
+                cur.execute("insert into t select generate_series(%s,%s)", (start, end))
+
+        endpoint.stop()
+        random_sk.start()
+        endpoint.start()
+
+
 class SafekeeperEnv:
    def __init__(
        self,
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,4 +1,4 @@
 {
-    "postgres-v15": "2c76abf4d54b4d9e7ef5f4a86184f15747fb7138",
-    "postgres-v14": "71126b905c5000e1a12d96640c94df8c3ec7384a"
+    "postgres-v15": "026d6b093d49e25cec44dd04598152329ceac027",
+    "postgres-v14": "5d5cfee12783f0989a9c9fe13bb40b5585812568"
 }
Author	SHA1	Message	Date
Dmitry Rodionov	daac088c5e	add plumber tool	2023-08-18 19:33:45 +03:00
Arthur Petukhovsky	0b90411380	Fix safekeeper recovery with auth (#5035 ) Fix missing a password in walrcv_connect for a safekeeper recovery. Add a test which restarts endpoint and triggers a recovery.	2023-08-18 16:48:55 +01:00
Arpad Müller	f4da010aee	Make the compaction warning more tolerant (#5024 ) ## Problem The performance benchmark in `test_runner/performance/test_layer_map.py` is currently failing due to the warning added in #4888. ## Summary of changes The test mentioned has a `compaction_target_size` of 8192, which is just one page size. This is an unattainable goal, as we generate at least three pages: one for the header, one for the b-tree (minimally sized ones have just the root node in a single page), one for the data. Therefore, we add two pages to the warning limit. The warning text becomes a bit less accurate but I think this is okay.	2023-08-18 16:36:31 +02:00
Conrad Ludgate	ec10838aa4	proxy: pool connection logs (#5020 ) ## Problem Errors and notices that happen during a pooled connection lifecycle have no session identifiers ## Summary of changes Using a watch channel, we set the session ID whenever it changes. This way we can see the status of a connection for that session Also, adding a connection id to be able to search the entire connection lifecycle	2023-08-18 11:44:08 +01:00
Joonas Koivunen	67af24191e	test: cleanup remote_timeline_client tests (#5013 ) I will have to change these as I change remote_timeline_client api in #4938. So a bit of cleanup, handle my comments which were just resolved during initial review. Cleanup: - use unwrap in tests instead of mixed `?` and `unwrap` - use `Handle` instead of `&'static Reactor` to make the RemoteTimelineClient more natural - use arrays in tests - use plain `#[tokio::test]`	2023-08-17 19:27:30 +03:00
Joonas Koivunen	6af5f9bfe0	fix: format context (#5022 ) We return an error with unformatted `{timeline_id}`.	2023-08-17 14:30:25 +00:00
Dmitry Rodionov	64fc7eafcd	Increase timeout once again. (#5021 ) When failpoint is early in deletion process it takes longer to complete after failpoint is removed. Example was: https://neon-github-public-dev.s3.amazonaws.com/reports/main/5889544346/index.html#suites/3556ed71f2d69272a7014df6dcb02317/49826c68ce8492b1	2023-08-17 15:37:28 +03:00
Conrad Ludgate	3e4710c59e	proxy: add more sasl logs (#5012 ) ## Problem A customer is having trouble connecting to neon from their production environment. The logs show a mix of "Internal error" and "authentication protocol violation" but not the full error ## Summary of changes Make sure we don't miss any logs during SASL/SCRAM	2023-08-17 12:05:54 +01:00
Dmitry Rodionov	d8b0a298b7	Do not attach deleted tenants (#5008 ) Rather temporary solution before proper: https://github.com/neondatabase/neon/issues/5006 It requires more plumbing so let's not attach deleted tenants first and then implement resume. Additionally fix `assert_prefix_empty`. It had a buggy prefix calculation, and since we always asserted for absence of stuff it worked. Here I started to assert for presence of stuff too and it failed. Added more "presence" asserts to other places to be confident that it works. Resolves [#5016](https://github.com/neondatabase/neon/issues/5016)	2023-08-17 13:46:49 +03:00
Alexander Bayandin	c8094ee51e	test_compatibility: run amcheck unconditionally (#4985 ) ## Problem The previous version of neon (that we use in the forward compatibility test) has installed `amcheck` extension now. We can run `pg_amcheck` unconditionally. ## Summary of changes - Run `pg_amcheck` in compatibility tests unconditionally	2023-08-17 11:46:00 +01:00
Christian Schwarz	957af049c2	ephemeral file: refactor write_blob impl to concentrate mutable state (#5004 ) Before this patch, we had the `off` and `blknum` as function-wide mutable state. Now it's contained in the `Writer` struct. The use of `push_bytes` instead of index-based filling of the buffer also makes it easier to reason about what's going on. This is prep for https://github.com/neondatabase/neon/pull/4994	2023-08-17 13:07:25 +03:00
Anastasia Lubennikova	786c7b3708	Refactor remote extensions index download. Don't download ext_index.json from s3, but instead receive it as a part of spec from control plane. This eliminates s3 access for most compute starts, and also allows us to update extensions spec on the fly	2023-08-17 12:48:33 +03:00
Joonas Koivunen	d3612ce266	delta_layer: Restore generic from last week (#5014 ) Restores #4937 work relating to the ability to use `ResidentDeltaLayer` (which is an Arc wrapper) in #4938 for the ValueRef's by removing the borrow from `ValueRef` and providing it from an upper layer. This should not have any functional changes, most importantly, the `main` will continue to use the borrowed `DeltaLayerInner`. It might be that I can change #4938 to be like this. If that is so, I'll gladly rip out the `Ref` and move the borrow back. But I'll first want to look at the current test failures.	2023-08-17 11:47:31 +03:00
Christian Schwarz	994411f5c2	page cache: newtype the blob_io and ephemeral_file file ids (#5005 ) This makes it more explicit that these are different u64-sized namespaces. Re-using one in place of the other would be catastrophic. Prep for https://github.com/neondatabase/neon/pull/4994 which will eliminate the ephemeral_file::FileId and move the blob_io::FileId into page_cache. It makes sense to have this preliminary commit though, to minimize amount of new concept in #4994 and other preliminaries that depend on that work.	2023-08-16 18:33:47 +02:00
Conrad Ludgate	25934ec1ba	proxy: reduce global conn pool contention (#4747 ) ## Problem As documented, the global connection pool will be high contention. ## Summary of changes Use DashMap rather than Mutex<HashMap>. Of note, DashMap currently uses a RwLock internally, but it's partially sharded to reduce contention by a factor of N. We could potentially use flurry which is a port of Java's concurrent hashmap, but I have no good understanding of its performance characteristics. Dashmap is at least equivalent to hashmap but less contention. See the read heavy benchmark to analyse our expected performance <https://github.com/xacrimon/conc-map-bench#ready-heavy> I also spoke with the developer of dashmap recently, and they are working on porting the implementation to use concurrent HAMT FWIW	2023-08-16 17:20:28 +01:00
Arpad Müller	0bdbc39cb1	Compaction: unify key and value reference vecs (#4888 ) ## Problem PR #4839 has already reduced the number of b-tree traversals and vec creations from 3 to 2, but as pointed out in https://github.com/neondatabase/neon/pull/4839#discussion_r1279167815 , we would ideally just traverse the b-tree once during compaction. After #4836, the two vecs created are one for the list of keys, lsns and sizes, and one for the list of `(key, lsn, value reference)`. However, they are not equal, as pointed out in https://github.com/neondatabase/neon/pull/4839#issuecomment-1660418012 and the following comment: the key vec creation combines multiple entries for which the lsn is changing but the key stays the same into one, with the size being the sum of the sub-sizes. In SQL, this would correspond to something like `SELECT key, lsn, SUM(size) FROM b_tree GROUP BY key;` and `SELECT key, lsn, val_ref FROM b_tree;`. Therefore, the join operation is non-trivial. ## Summary of changes This PR merges the two lists of keys and value references into one. It's not a trivial change and affects the size pattern of the resulting files, which is why this is in a separate PR from #4839 . The key vec is used in compaction for determining when to start a new layer file. The loop uses various thresholds to come to this conclusion, but the grouping via the key has led to the behaviour that regardless of the threshold, it only starts a new file when either a new key is encountered, or a new delta file. The new code now does the combination after the merging and sorting of the various keys from the delta files. This mostly does the same as the old code, except for a detail: with the grouping done on a per-delta-layer basis, the sorted and merged vec would still have multiple entries for multiple delta files, but now, we don't have an easy way to tell when a new input delta layer file is encountered, so we cannot create multiple entries on that basis easily. 
To prevent possibly infinite growth, our new grouping code compares the combined size with the threshold, and if it is exceeded, it cuts a new entry so that the downstream code can cut a new output file. Here, we perform a tradeoff however, as if the threshold is too small, we risk putting entries for the same key into multiple layer files, but if the threshold is too big, we can in some instances exceed the target size. Currently, we set the threshold to the target size, so in theory we would stay below or roughly at double the `target_file_size`. We also fix the way the size was calculated for the last key. The calculation was wrong and accounted for the old layer's btree, even though we already account for the overhead of the in-construction btree. Builds on top of #4839 .	2023-08-16 18:27:18 +03:00
Dmitry Rodionov	96b84ace89	Correctly remove orphaned objects in RemoteTimelineClient::delete_all (#5000 ) Previously list_prefixes was incorrectly used for that purpose. Change to use list_files. Add a test. Some drive by refactorings on python side to move helpers out of specific test file to be widely accessible resolves https://github.com/neondatabase/neon/issues/4499	2023-08-16 17:31:16 +03:00
Christian Schwarz	368b783ada	ephemeral_file: remove FileExt impl (was only used by tests) (#5003 ) Extracted from https://github.com/neondatabase/neon/pull/4994	2023-08-16 15:41:25 +02:00
Dmitry Rodionov	0f47bc03eb	Fix delete_objects in UnreliableWrapper (#5002 ) For `delete_objects` it was injecting failures for whole delete_objects operation and then for every delete it contains. Make it fail once for the whole operation.	2023-08-16 14:08:53 +03:00
Arseny Sher	fdbe8dc8e0	Fix test_s3_wal_replay flakiness. ref https://github.com/neondatabase/neon/issues/4466	2023-08-16 12:57:43 +03:00